malariagen
diff --git a/.github/actions/setup-python/action.yaml b/.github/actions/setup-python/action.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
Lines changed: 6 additions & 2 deletions
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
Lines changed: 4 additions & 1 deletion
diff --git a/malariagen_data/anoph/dipclust.py b/malariagen_data/anoph/dipclust.py
Lines changed: 7 additions & 4 deletions
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
Lines changed: 14 additions & 0 deletions
diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py
Lines changed: 7 additions & 4 deletions
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
Lines changed: 75 additions & 3 deletions
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
Lines changed: 50 additions & 6 deletions
diff --git a/malariagen_data/veff.py b/malariagen_data/veff.py
Lines changed: 7 additions & 2 deletions
@@ -19,4 +19,4 @@ runs:
       shell: bash
       run: |
         poetry env use ${{ inputs.python-version }}
-        poetry install --extras dev
+        poetry install --with dev,test,docs
@@ -12,7 +12,7 @@ This package provides Python tools for accessing and analyzing genomic data from
 
 You'll need:
 
-- [pipx](https://python-poetry.org/) for installing Python tools
+- [pipx](https://pipx.pypa.io/) for installing Python tools
 - [git](https://git-scm.com/) for version control
 
 Both of these can be installed using your distribution's package manager or [Homebrew](https://brew.sh/) on Mac.
@@ -52,9 +52,13 @@ Both of these can be installed using your distribution's package manager or [Hom
 
    ```bash
    poetry env use 3.12
-   poetry install --extras dev
+   poetry install --with dev,test,docs
    ```
 
+   This installs the runtime dependencies along with the `dev`, `test`, and `docs`
+   [dependency groups](https://python-poetry.org/docs/managing-dependencies/#dependency-groups).
+   If you only need to run tests, `poetry install --with test` is sufficient.
+
    **Recommended**: Use `poetry run` to run commands inside the virtual environment:
 
    ```bash
 
@@ -69,7 +69,10 @@
     str,
     """
     A pandas query string to be evaluated against the sample metadata, to
-    select samples to be included in the returned data.
+    select samples to be included in the returned data. E.g.,
+    "country == 'Uganda'". If the query returns zero results, a warning
+    will be emitted with fuzzy-match suggestions for possible typos or
+    case mismatches.
     """,
 ]
 
 
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional, Tuple
 
 import allel  # type: ignore
@@ -540,8 +541,9 @@ def _insert_dipclust_snp_trace(
             figures.append(snp_trace)
             subplot_heights.append(snp_row_height * n_snps_transcript)
         else:
-            print(
-                f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot."
+            warnings.warn(
+                f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot.",
+                stacklevel=2,
             )
         return figures, subplot_heights, n_snps_transcript
 
@@ -607,8 +609,9 @@ def plot_diplotype_clustering_advanced(
             cnv_colorscale = cnv_params.colorscale_default
         if cohort_size and snp_transcript:
             cohort_size = None
-            print(
-                "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None."
+            warnings.warn(
+                "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None.",
+                stacklevel=2,
             )
 
         res = self.plot_diplotype_clustering(
 
@@ -29,6 +29,13 @@ def _prep_samples_for_cohort_grouping(
         # Users can explicitly override with True/False.
         filter_unassigned = taxon_by == "taxon"
 
+    # Validate taxon_by.
+    if taxon_by not in df_samples.columns:
+        raise ValueError(
+            f"Invalid value for `taxon_by`: {taxon_by!r}. "
+            f"Must be the name of an existing column in the sample metadata."
+        )
+
     if filter_unassigned:
         # Remove samples with "intermediate" or "unassigned" taxon values,
         # as we only want cohorts with clean taxon calls.
@@ -78,6 +85,13 @@ def _prep_samples_for_cohort_grouping(
         # Apply the matching period_by function to create a new "period" column.
         df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
 
+    # Validate area_by.
+    if area_by not in df_samples.columns:
+        raise ValueError(
+            f"Invalid value for `area_by`: {area_by!r}. "
+            f"Must be the name of an existing column in the sample metadata."
+        )
+
     # Copy the specified area_by column to a new "area" column.
     df_samples["area"] = df_samples[area_by]
 
 
@@ -1,3 +1,4 @@
+import warnings
 from typing import Optional, Tuple
 
 import allel  # type: ignore
@@ -402,8 +403,9 @@ def plot_haplotype_clustering_advanced(
 
         if cohort_size and snp_transcript:
             cohort_size = None
-            print(
-                "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None."
+            warnings.warn(
+                "Cohort size is not supported with amino acid heatmap. Overriding cohort size to None.",
+                stacklevel=2,
             )
 
         res = self.plot_haplotype_clustering(
@@ -709,8 +711,9 @@ def _insert_hapclust_snp_trace(
             figures.append(snp_trace)
             subplot_heights.append(snp_row_height * df_haps.shape[0])
         else:
-            print(
-                f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot."
+            warnings.warn(
+                f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot.",
+                stacklevel=2,
             )
         return figures, subplot_heights, n_snps_transcript
 
 
@@ -1,5 +1,7 @@
+import difflib
 import io
 import json
+import re
 from itertools import cycle
 from typing import (
     Any,
@@ -705,6 +707,17 @@ def clear_extra_metadata(self):
     @doc(
         summary="Access sample metadata for one or more sample sets.",
         returns="A dataframe of sample metadata, one row per sample.",
+        notes="""
+            Some samples in the dataset are lab crosses — mosquitoes bred in
+            the laboratory that have no real collection date. These samples
+            use ``year=-1`` and ``month=-1`` as sentinel values. They may
+            cause unexpected results in date-based analyses (e.g.,
+            ``pd.to_datetime`` will fail on negative year values).
+
+            To exclude lab cross samples, use::
+
+                df = api.sample_metadata(sample_query="year >= 0")
+        """,
     )
     def sample_metadata(
         self,
@@ -784,12 +797,65 @@ def sample_metadata(
         if prepared_sample_query is not None:
             # Assume a pandas query string.
             sample_query_options = sample_query_options or {}
+
+            # Save a reference to the pre-query DataFrame so we can detect
+            # zero-result queries and provide a helpful warning.
+            df_before_query = df_samples
+
             # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
             df_samples = df_samples.query(
                 prepared_sample_query, **sample_query_options, engine="python"
             )
             df_samples = df_samples.reset_index(drop=True)
 
+            # Warn if query returned zero results on a non-empty dataset.
+            # Provide fuzzy-match suggestions so users can spot typos,
+            # case mismatches, or partial-value issues.
+            if len(df_samples) == 0 and len(df_before_query) > 0:
+                hint_lines = [
+                    f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.",
+                ]
+
+                # Extract column == 'value' pairs from the query.
+                col_val_pairs = re.findall(
+                    r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]",
+                    prepared_sample_query,
+                )
+
+                for col_name, queried_val in col_val_pairs:
+                    # If the column name is not recognised, suggest
+                    # close column names.
+                    if col_name not in df_before_query.columns:
+                        close_cols = difflib.get_close_matches(
+                            col_name,
+                            df_before_query.columns.tolist(),
+                            n=3,
+                            cutoff=0.6,
+                        )
+                        if close_cols:
+                            hint_lines.append(
+                                f"Column {col_name!r} not found. "
+                                f"Did you mean: {close_cols}?"
+                            )
+                        continue
+
+                    # For string columns, suggest close values.
+                    if df_before_query[col_name].dtype == object:
+                        valid_vals = (
+                            df_before_query[col_name].dropna().unique().tolist()
+                        )
+                        close_vals = difflib.get_close_matches(
+                            queried_val, valid_vals, n=5, cutoff=0.6
+                        )
+                        if close_vals:
+                            hint_lines.append(
+                                f"Value {queried_val!r} not found in "
+                                f"column {col_name!r}. "
+                                f"Did you mean: {close_vals}?"
+                            )
+
+                warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
+
         # Apply the sample_indices, if there are any.
         # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
         if sample_indices is not None:
@@ -1468,8 +1534,9 @@ def _setup_cohort_queries(
             if min_cohort_size is not None:
                 cohort_size = min_cohort_size
             if cohort_size is not None and n_samples < cohort_size:
-                print(
-                    f"Cohort ({cohort_label}) has insufficient samples ({n_samples}) for requested cohort size ({cohort_size}), dropping."
+                warnings.warn(
+                    f"Cohort ({cohort_label}) has insufficient samples ({n_samples}) for requested cohort size ({cohort_size}), dropping.",
+                    stacklevel=2,
                 )
             else:
                 cohort_queries_checked[cohort_label] = cohort_query
@@ -1773,7 +1840,12 @@ def _locate_cohorts(*, cohorts, data, min_cohort_size):
         # to pandas queries.
 
         for coh, query in cohorts.items():
-            loc_coh = data.eval(query).values
+            try:
+                loc_coh = data.eval(query).values
+            except Exception as e:
+                raise ValueError(
+                    f"Invalid query for cohort {coh!r}: {query!r}. Error: {e}"
+                ) from e
             coh_dict[coh] = loc_coh
 
     else:
 
@@ -345,11 +345,55 @@ def aa_allele_frequencies(
         # We just want aa change.
         df_ns_snps = df_snps.query(AA_CHANGE_QUERY).copy()
 
-        # Early check for no matching SNPs.
-        if len(df_ns_snps) == 0:  # pragma: no cover
-            raise ValueError(
-                "No amino acid change SNPs found for the given transcript and site mask."
+        # Handle case where no amino acid change SNPs are found.
+        # N.B., this can legitimately happen for some transcript/site_mask/query
+        # combinations. Return a well-formed empty DataFrame rather than raising,
+        # to avoid transient test failures and to allow downstream code to handle
+        # the empty result gracefully. See also:
+        # https://github.com/malariagen/malariagen-data-python/issues/1064
+        if len(df_ns_snps) == 0:
+            warnings.warn(
+                "No amino acid change SNPs found for the given transcript "
+                "and site mask. Returning an empty DataFrame.",
+                stacklevel=2,
+            )
+            # Build an empty DataFrame with the expected schema.
+            freq_cols = [col for col in df_snps.columns if col.startswith("frq_")]
+            count_cols = [col for col in df_snps.columns if col.startswith("count_")]
+            nobs_cols = [col for col in df_snps.columns if col.startswith("nobs_")]
+            keep_cols = [
+                "contig",
+                "transcript",
+                "aa_pos",
+                "ref_allele",
+                "ref_aa",
+                "alt_aa",
+                "effect",
+                "impact",
+                "grantham_score",
+                "sneath_score",
+            ]
+            all_cols = (
+                ["aa_change"]
+                + freq_cols
+                + ["max_af"]
+                + keep_cols
+                + ["alt_allele", "label", "position"]
             )
+            if include_counts:
+                all_cols = all_cols + count_cols + nobs_cols
+            df_empty = pd.DataFrame(columns=all_cols)
+            df_empty.set_index(["aa_change", "contig", "position"], inplace=True)
+
+            # Add metadata.
+            gene_name = self._transcript_to_parent_name(transcript)
+            title = transcript
+            if gene_name:
+                title += f" ({gene_name})"
+            title += " SNP frequencies"
+            df_empty.attrs["title"] = title
+
+            return df_empty
 
         # N.B., we need to worry about the possibility of the
         # same aa change due to SNPs at different positions. We cannot
@@ -375,7 +419,7 @@ def np_sum(g):
             for c in nobs_cols:
                 agg[c] = "first"
 
-        keep_cols = (
+        keep_cols = [
             "contig",
             "transcript",
             "aa_pos",
@@ -386,7 +430,7 @@ def np_sum(g):
             "impact",
             "grantham_score",
             "sneath_score",
-        )
+        ]
         for c in keep_cols:
             agg[c] = "first"
         agg["alt_allele"] = lambda v: "{" + ",".join(v) + "}" if len(v) > 1 else v
 
@@ -356,10 +356,15 @@ def _get_within_cds_effect(ann, base_effect, cds, cdss):
             effect = base_effect._replace(effect="STOP_GAINED", impact="HIGH")
 
         else:
-            # TODO NON_SYNONYMOUS_START and NON_SYNONYMOUS_STOP
-
             # variant causes a codon that produces a different amino acid
             # e.g.: Tgg/Cgg, W/R
+            # N.B. NON_SYNONYMOUS_START and NON_SYNONYMOUS_STOP from the SnpEff
+            # taxonomy do not require separate handling here. Any start codon
+            # mutation that changes the amino acid is already classified as
+            # START_LOST (when ref_aa == "M"). Any stop codon mutation that
+            # changes the amino acid is already classified as STOP_LOST or
+            # STOP_GAINED. There is no reachable case that falls through to here
+            # with ref or alt at a start or stop codon position.
             effect = base_effect._replace(
                 effect="NON_SYNONYMOUS_CODING", impact="MODERATE"
             )