Skip to content

Commit 8ffdf18

Browse files
Merge branch 'master' into GH940-fix-region-start-zero-truthiness
2 parents 66375af + 79c38ce commit 8ffdf18

21 files changed

Lines changed: 983 additions & 319 deletions

.github/actions/setup-python/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,4 +19,4 @@ runs:
1919
shell: bash
2020
run: |
2121
poetry env use ${{ inputs.python-version }}
22-
poetry install --extras dev
22+
poetry install --with dev,test,docs

CONTRIBUTING.md

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ This package provides Python tools for accessing and analyzing genomic data from
1212

1313
You'll need:
1414

15-
- [pipx](https://python-poetry.org/) for installing Python tools
15+
- [pipx](https://pipx.pypa.io/) for installing Python tools
1616
- [git](https://git-scm.com/) for version control
1717

1818
Both of these can be installed using your distribution's package manager or [Homebrew](https://brew.sh/) on Mac.
@@ -52,9 +52,13 @@ Both of these can be installed using your distribution's package manager or [Hom
5252

5353
```bash
5454
poetry env use 3.12
55-
poetry install --extras dev
55+
poetry install --with dev,test,docs
5656
```
5757

58+
This installs the runtime dependencies along with the `dev`, `test`, and `docs`
59+
[dependency groups](https://python-poetry.org/docs/managing-dependencies/#dependency-groups).
60+
If you only need to run tests, `poetry install --with test` is sufficient.
61+
5862
**Recommended**: Use `poetry run` to run commands inside the virtual environment:
5963

6064
```bash

malariagen_data/anoph/base_params.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,10 @@
6969
str,
7070
"""
7171
A pandas query string to be evaluated against the sample metadata, to
72-
select samples to be included in the returned data.
72+
select samples to be included in the returned data. E.g.,
73+
"country == 'Uganda'". If the query returns zero results, a warning
74+
will be emitted with fuzzy-match suggestions for possible typos or
75+
case mismatches.
7376
""",
7477
]
7578

malariagen_data/anoph/frq_base.py

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,13 @@ def _prep_samples_for_cohort_grouping(
2929
# Users can explicitly override with True/False.
3030
filter_unassigned = taxon_by == "taxon"
3131

32+
# Validate taxon_by.
33+
if taxon_by not in df_samples.columns:
34+
raise ValueError(
35+
f"Invalid value for `taxon_by`: {taxon_by!r}. "
36+
f"Must be the name of an existing column in the sample metadata."
37+
)
38+
3239
if filter_unassigned:
3340
# Remove samples with "intermediate" or "unassigned" taxon values,
3441
# as we only want cohorts with clean taxon calls.
@@ -78,6 +85,13 @@ def _prep_samples_for_cohort_grouping(
7885
# Apply the matching period_by function to create a new "period" column.
7986
df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
8087

88+
# Validate area_by.
89+
if area_by not in df_samples.columns:
90+
raise ValueError(
91+
f"Invalid value for `area_by`: {area_by!r}. "
92+
f"Must be the name of an existing column in the sample metadata."
93+
)
94+
8195
# Copy the specified area_by column to a new "area" column.
8296
df_samples["area"] = df_samples[area_by]
8397

malariagen_data/anoph/sample_metadata.py

Lines changed: 166 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,7 @@
1+
import difflib
12
import io
3+
import json
4+
import re
25
from itertools import cycle
36
from typing import (
47
Any,
@@ -81,6 +84,8 @@ def __init__(
8184

8285
# Initialize cache attributes.
8386
self._cache_sample_metadata: Dict = dict()
87+
self._cache_cohorts: Dict = dict()
88+
self._cache_cohort_geometries: Dict = dict()
8489

8590
def _metadata_paths(
8691
self,
@@ -702,6 +707,17 @@ def clear_extra_metadata(self):
702707
@doc(
703708
summary="Access sample metadata for one or more sample sets.",
704709
returns="A dataframe of sample metadata, one row per sample.",
710+
notes="""
711+
Some samples in the dataset are lab crosses — mosquitoes bred in
712+
the laboratory that have no real collection date. These samples
713+
use ``year=-1`` and ``month=-1`` as sentinel values. They may
714+
cause unexpected results in date-based analyses (e.g.,
715+
``pd.to_datetime`` will fail on negative year values).
716+
717+
To exclude lab cross samples, use::
718+
719+
df = api.sample_metadata(sample_query="year >= 0")
720+
""",
705721
)
706722
def sample_metadata(
707723
self,
@@ -781,12 +797,65 @@ def sample_metadata(
781797
if prepared_sample_query is not None:
782798
# Assume a pandas query string.
783799
sample_query_options = sample_query_options or {}
800+
801+
# Save a reference to the pre-query DataFrame so we can detect
802+
# zero-result queries and provide a helpful warning.
803+
df_before_query = df_samples
804+
784805
# Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
785806
df_samples = df_samples.query(
786807
prepared_sample_query, **sample_query_options, engine="python"
787808
)
788809
df_samples = df_samples.reset_index(drop=True)
789810

811+
# Warn if query returned zero results on a non-empty dataset.
812+
# Provide fuzzy-match suggestions so users can spot typos,
813+
# case mismatches, or partial-value issues.
814+
if len(df_samples) == 0 and len(df_before_query) > 0:
815+
hint_lines = [
816+
f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.",
817+
]
818+
819+
# Extract column == 'value' pairs from the query.
820+
col_val_pairs = re.findall(
821+
r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]",
822+
prepared_sample_query,
823+
)
824+
825+
for col_name, queried_val in col_val_pairs:
826+
# If the column name is not recognised, suggest
827+
# close column names.
828+
if col_name not in df_before_query.columns:
829+
close_cols = difflib.get_close_matches(
830+
col_name,
831+
df_before_query.columns.tolist(),
832+
n=3,
833+
cutoff=0.6,
834+
)
835+
if close_cols:
836+
hint_lines.append(
837+
f"Column {col_name!r} not found. "
838+
f"Did you mean: {close_cols}?"
839+
)
840+
continue
841+
842+
# For string columns, suggest close values.
843+
if df_before_query[col_name].dtype == object:
844+
valid_vals = (
845+
df_before_query[col_name].dropna().unique().tolist()
846+
)
847+
close_vals = difflib.get_close_matches(
848+
queried_val, valid_vals, n=5, cutoff=0.6
849+
)
850+
if close_vals:
851+
hint_lines.append(
852+
f"Value {queried_val!r} not found in "
853+
f"column {col_name!r}. "
854+
f"Did you mean: {close_vals}?"
855+
)
856+
857+
warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
858+
790859
# Apply the sample_indices, if there are any.
791860
# Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
792861
if sample_indices is not None:
@@ -1485,7 +1554,11 @@ def _setup_cohort_queries(
14851554
A cohort set name. Accepted values are:
14861555
"admin1_month", "admin1_quarter", "admin1_year",
14871556
"admin2_month", "admin2_quarter", "admin2_year".
1488-
"""
1557+
""",
1558+
query="""
1559+
An optional pandas query string to filter the resulting
1560+
dataframe, e.g., "country == 'Burkina Faso'".
1561+
""",
14891562
),
14901563
returns="""A dataframe of cohort data, one row per cohort. There are up to 18 columns:
14911564
`cohort_id` is the identifier of the cohort,
@@ -1512,20 +1585,98 @@ def _setup_cohort_queries(
15121585
def cohorts(
15131586
self,
15141587
cohort_set: base_params.cohorts,
1588+
query: Optional[str] = None,
15151589
) -> pd.DataFrame:
1516-
major_version_path = self._major_version_path
1590+
valid_cohort_sets = {
1591+
"admin1_month",
1592+
"admin1_quarter",
1593+
"admin1_year",
1594+
"admin2_month",
1595+
"admin2_quarter",
1596+
"admin2_year",
1597+
}
1598+
if cohort_set not in valid_cohort_sets:
1599+
raise ValueError(
1600+
f"{cohort_set!r} is not a valid cohort set. "
1601+
f"Accepted values are: {sorted(valid_cohort_sets)}."
1602+
)
1603+
1604+
cohorts_analysis = self._cohorts_analysis
1605+
1606+
# Cache to avoid repeated reads.
1607+
cache_key = (cohorts_analysis, cohort_set)
1608+
try:
1609+
df_cohorts = self._cache_cohorts[cache_key]
1610+
except KeyError:
1611+
major_version_path = self._major_version_path
1612+
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
1613+
1614+
with self.open_file(path) as f:
1615+
df_cohorts = pd.read_csv(f, sep=",", na_values="")
1616+
1617+
# Ensure all column names are lower case.
1618+
df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore
1619+
1620+
self._cache_cohorts[cache_key] = df_cohorts
1621+
1622+
if query is not None:
1623+
df_cohorts = df_cohorts.query(query)
1624+
df_cohorts = df_cohorts.reset_index(drop=True)
1625+
1626+
return df_cohorts.copy()
1627+
1628+
@_check_types
1629+
@doc(
1630+
summary="""
1631+
Read GeoJSON geometry data for a specific cohort set,
1632+
providing boundary geometries for each cohort.
1633+
""",
1634+
parameters=dict(
1635+
cohort_set="""
1636+
A cohort set name. Accepted values are:
1637+
"admin1_month", "admin1_quarter", "admin1_year",
1638+
"admin2_month", "admin2_quarter", "admin2_year".
1639+
""",
1640+
),
1641+
returns="""
1642+
A dict containing the parsed GeoJSON FeatureCollection,
1643+
with boundary geometries for each cohort in the set.
1644+
""",
1645+
)
1646+
def cohort_geometries(
1647+
self,
1648+
cohort_set: base_params.cohorts,
1649+
) -> dict:
1650+
valid_cohort_sets = {
1651+
"admin1_month",
1652+
"admin1_quarter",
1653+
"admin1_year",
1654+
"admin2_month",
1655+
"admin2_quarter",
1656+
"admin2_year",
1657+
}
1658+
if cohort_set not in valid_cohort_sets:
1659+
raise ValueError(
1660+
f"{cohort_set!r} is not a valid cohort set. "
1661+
f"Accepted values are: {sorted(valid_cohort_sets)}."
1662+
)
1663+
15171664
cohorts_analysis = self._cohorts_analysis
15181665

1519-
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
1666+
# Cache to avoid repeated reads.
1667+
cache_key = (cohorts_analysis, cohort_set)
1668+
try:
1669+
geojson_data = self._cache_cohort_geometries[cache_key]
1670+
except KeyError:
1671+
major_version_path = self._major_version_path
1672+
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.geojson"
15201673

1521-
# Read the manifest into a pandas dataframe.
1522-
with self.open_file(path) as f:
1523-
df_cohorts = pd.read_csv(f, sep=",", na_values="")
1674+
with self.open_file(path) as f:
1675+
geojson_data = json.load(f)
15241676

1525-
# Ensure all column names are lower case.
1526-
df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore
1677+
self._cache_cohort_geometries[cache_key] = geojson_data
15271678

1528-
return df_cohorts
1679+
return geojson_data
15291680

15301681
@_check_types
15311682
@doc(
@@ -1688,7 +1839,12 @@ def _locate_cohorts(*, cohorts, data, min_cohort_size):
16881839
# to pandas queries.
16891840

16901841
for coh, query in cohorts.items():
1691-
loc_coh = data.eval(query).values
1842+
try:
1843+
loc_coh = data.eval(query).values
1844+
except Exception as e:
1845+
raise ValueError(
1846+
f"Invalid query for cohort {coh!r}: {query!r}. Error: {e}"
1847+
) from e
16921848
coh_dict[coh] = loc_coh
16931849

16941850
else:

malariagen_data/anoph/snp_frq.py

Lines changed: 50 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -345,11 +345,55 @@ def aa_allele_frequencies(
345345
# We just want aa change.
346346
df_ns_snps = df_snps.query(AA_CHANGE_QUERY).copy()
347347

348-
# Early check for no matching SNPs.
349-
if len(df_ns_snps) == 0: # pragma: no cover
350-
raise ValueError(
351-
"No amino acid change SNPs found for the given transcript and site mask."
348+
# Handle case where no amino acid change SNPs are found.
349+
# N.B., this can legitimately happen for some transcript/site_mask/query
350+
# combinations. Return a well-formed empty DataFrame rather than raising,
351+
# to avoid transient test failures and to allow downstream code to handle
352+
# the empty result gracefully. See also:
353+
# https://github.com/malariagen/malariagen-data-python/issues/1064
354+
if len(df_ns_snps) == 0:
355+
warnings.warn(
356+
"No amino acid change SNPs found for the given transcript "
357+
"and site mask. Returning an empty DataFrame.",
358+
stacklevel=2,
359+
)
360+
# Build an empty DataFrame with the expected schema.
361+
freq_cols = [col for col in df_snps.columns if col.startswith("frq_")]
362+
count_cols = [col for col in df_snps.columns if col.startswith("count_")]
363+
nobs_cols = [col for col in df_snps.columns if col.startswith("nobs_")]
364+
keep_cols = [
365+
"contig",
366+
"transcript",
367+
"aa_pos",
368+
"ref_allele",
369+
"ref_aa",
370+
"alt_aa",
371+
"effect",
372+
"impact",
373+
"grantham_score",
374+
"sneath_score",
375+
]
376+
all_cols = (
377+
["aa_change"]
378+
+ freq_cols
379+
+ ["max_af"]
380+
+ keep_cols
381+
+ ["alt_allele", "label", "position"]
352382
)
383+
if include_counts:
384+
all_cols = all_cols + count_cols + nobs_cols
385+
df_empty = pd.DataFrame(columns=all_cols)
386+
df_empty.set_index(["aa_change", "contig", "position"], inplace=True)
387+
388+
# Add metadata.
389+
gene_name = self._transcript_to_parent_name(transcript)
390+
title = transcript
391+
if gene_name:
392+
title += f" ({gene_name})"
393+
title += " SNP frequencies"
394+
df_empty.attrs["title"] = title
395+
396+
return df_empty
353397

354398
# N.B., we need to worry about the possibility of the
355399
# same aa change due to SNPs at different positions. We cannot
@@ -375,7 +419,7 @@ def np_sum(g):
375419
for c in nobs_cols:
376420
agg[c] = "first"
377421

378-
keep_cols = (
422+
keep_cols = [
379423
"contig",
380424
"transcript",
381425
"aa_pos",
@@ -386,7 +430,7 @@ def np_sum(g):
386430
"impact",
387431
"grantham_score",
388432
"sneath_score",
389-
)
433+
]
390434
for c in keep_cols:
391435
agg[c] = "first"
392436
agg["alt_allele"] = lambda v: "{" + ",".join(v) + "}" if len(v) > 1 else v

0 commit comments

Comments (0)