Merge branch 'master' into GH1097-migrate-extras-to-groups

khushthecoder · web-flow · commit f7d09a5a0063 · 2026-03-17T16:11:20.000+05:30
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
@@ -69,7 +69,10 @@
     str,
     """
     A pandas query string to be evaluated against the sample metadata, to
-    select samples to be included in the returned data.
+    select samples to be included in the returned data. E.g.,
+    "country == 'Uganda'". If the query returns zero results, a warning
+    is emitted; for ``column == 'value'`` terms it includes fuzzy-match
+    suggestions (close column names or values) when any are found.
     """,
 ]
 
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -1,5 +1,7 @@
+import difflib
 import io
 import json
+import re
 from itertools import cycle
 from typing import (
     Any,
@@ -705,6 +707,17 @@ def clear_extra_metadata(self):
     @doc(
         summary="Access sample metadata for one or more sample sets.",
         returns="A dataframe of sample metadata, one row per sample.",
+        notes="""
+            Some samples in the dataset are lab crosses — mosquitoes bred in
+            the laboratory that have no real collection date. These samples
+            use ``year=-1`` and ``month=-1`` as sentinel values. They may
+            cause unexpected results in date-based analyses (e.g.,
+            ``pd.to_datetime`` will fail on negative year values).
+
+            To exclude lab cross samples, use::
+
+                df = api.sample_metadata(sample_query="year >= 0")
+        """,
     )
     def sample_metadata(
         self,
@@ -784,12 +797,65 @@ def sample_metadata(
         if prepared_sample_query is not None:
             # Assume a pandas query string.
             sample_query_options = sample_query_options or {}
+
+            # Save a reference to the pre-query DataFrame so we can detect
+            # zero-result queries and provide a helpful warning.
+            df_before_query = df_samples
+
             # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
             df_samples = df_samples.query(
                 prepared_sample_query, **sample_query_options, engine="python"
             )
             df_samples = df_samples.reset_index(drop=True)
 
+            # Warn if query returned zero results on a non-empty dataset.
+            # Provide fuzzy-match suggestions so users can spot typos,
+            # case mismatches, or partial-value issues.
+            if len(df_samples) == 0 and len(df_before_query) > 0:
+                hint_lines = [
+                    f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.",
+                ]
+
+                # Extract column == 'value' pairs from the query.
+                col_val_pairs = re.findall(
+                    r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]",
+                    prepared_sample_query,
+                )
+
+                for col_name, queried_val in col_val_pairs:
+                    # If the column name is not recognised, suggest
+                    # close column names.
+                    if col_name not in df_before_query.columns:
+                        close_cols = difflib.get_close_matches(
+                            col_name,
+                            df_before_query.columns.tolist(),
+                            n=3,
+                            cutoff=0.6,
+                        )
+                        if close_cols:
+                            hint_lines.append(
+                                f"Column {col_name!r} not found. "
+                                f"Did you mean: {close_cols}?"
+                            )
+                        continue
+
+                    # For string columns, suggest close values.
+                    if df_before_query[col_name].dtype == object:
+                        valid_vals = (
+                            df_before_query[col_name].dropna().unique().tolist()
+                        )
+                        close_vals = difflib.get_close_matches(
+                            queried_val, valid_vals, n=5, cutoff=0.6
+                        )
+                        if close_vals:
+                            hint_lines.append(
+                                f"Value {queried_val!r} not found in "
+                                f"column {col_name!r}. "
+                                f"Did you mean: {close_vals}?"
+                            )
+
+                warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
+
         # Apply the sample_indices, if there are any.
         # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
         if sample_indices is not None:
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
@@ -1508,6 +1508,53 @@ def test_cohort_data(fixture, api):
     validate_cohort_data(df_cohorts, cohort_data_expected_columns())
 
 
+@parametrize_with_cases("fixture,api", cases=".")
+def test_sample_metadata_warns_on_zero_results_with_suggestions(
+    fixture, api: AnophelesSampleMetadata
+):
+    """Test that a UserWarning with fuzzy suggestions is emitted when a query
+    returns 0 results because the queried value has the wrong letter case.
+
+    Regression test for https://github.com/malariagen/malariagen-data-python/issues/1083
+    """
+    # Get a valid country name so we can build a deliberately wrong-cased query.
+    df_all = api.sample_metadata()
+    if "country" not in df_all.columns or df_all["country"].dropna().empty:
+        pytest.skip("No 'country' column with data in this fixture.")
+
+    # Pick an actual country value and change its case.
+    real_country = df_all["country"].dropna().iloc[0]
+    wrong_case_country = real_country.lower()
+    # If already all lower-case, title-case instead. Not upper(): difflib is
+    # case-sensitive, so an all-caps value falls below the 0.6 match cutoff.
+    if wrong_case_country == real_country:
+        wrong_case_country = real_country.title()
+
+    # The wrong-cased query should emit a UserWarning with fuzzy suggestions.
+    with pytest.warns(UserWarning, match="Did you mean"):
+        df = api.sample_metadata(sample_query=f"country == '{wrong_case_country}'")
+    assert len(df) == 0
+
+
+@parametrize_with_cases("fixture,api", cases=".")
+def test_sample_metadata_no_warning_on_valid_query(
+    fixture, api: AnophelesSampleMetadata
+):
+    """Check that a query matching existing samples emits no UserWarning."""
+    import warnings
+
+    all_samples = api.sample_metadata()
+    if "country" not in all_samples.columns or all_samples["country"].dropna().empty:
+        pytest.skip("No 'country' column with data in this fixture.")
+    known_country = all_samples["country"].dropna().iloc[0]
+
+    # Escalate UserWarning to an exception so a spurious warning fails the test.
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        matched = api.sample_metadata(sample_query=f"country == '{known_country}'")
+    assert len(matched) > 0
+
+
 @parametrize_with_cases("fixture,api", cases=case_ag3_sim)
 def test_cohort_data_admin1_year(fixture, api):
     df_cohorts = api.cohorts("admin1_year")