fix: warn when sample_query returns 0 results due to case mismatch

khushthecoder · khushthecoder · commit c12e397d330e · 2026-03-12T23:01:19.000+05:30
Fixes #1083 ## Problem sample_metadata() silently returns an empty DataFrame when a sample_query contains case-mismatched string values (e.g., 'uganda' instead of 'Uganda'). pandas query() performs strict case-sensitive comparisons, and since all country/location names are title-cased, wrong-case queries silently yield zero rows with no feedback to the user. ## Changes ### malariagen_data/anoph/sample_metadata.py - Added import re - After df_samples.query(), check if result is empty on a non-empty input - If empty: parse query for column names in comparison expressions, collect valid unique values for string (object dtype) columns, emit UserWarning with case-sensitivity note and valid values list ### malariagen_data/anoph/base_params.py - Updated sample_query docstring to note case-sensitivity and exact-match requirement ### tests/anoph/test_sample_metadata.py - test_sample_metadata_warns_on_case_mismatch: verifies UserWarning is emitted with 'case-sensitive' in the message - test_sample_metadata_no_warning_on_valid_query: verifies no spurious warning on a correct query
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
@@ -69,7 +69,11 @@
     str,
     """
     A pandas query string to be evaluated against the sample metadata, to
-    select samples to be included in the returned data.
+    select samples to be included in the returned data. E.g.,
+    "country == 'Uganda'". Note: string comparisons are case-sensitive —
+    column values must match the exact casing stored in the metadata
+    (e.g., "Uganda" not "uganda"). A warning will be emitted if the query
+    returns zero results.
     """,
 ]
 
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -1,4 +1,5 @@
 import io
+import re
 from itertools import cycle
 from typing import (
     Any,
@@ -781,12 +782,48 @@ def sample_metadata(
         if prepared_sample_query is not None:
             # Assume a pandas query string.
             sample_query_options = sample_query_options or {}
+
+            # Save a reference to the pre-query DataFrame so we can detect
+            # zero-result queries and provide a helpful warning.
+            df_before_query = df_samples
+
             # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
             df_samples = df_samples.query(
                 prepared_sample_query, **sample_query_options, engine="python"
             )
             df_samples = df_samples.reset_index(drop=True)
 
+            # Warn if query returned zero results on a non-empty dataset.
+            # This helps users catch case-sensitivity issues in string queries,
+            # e.g. "country == 'uganda'" instead of "country == 'Uganda'".
+            if len(df_samples) == 0 and len(df_before_query) > 0:
+                # Extract column names from comparison expressions in the query.
+                # Match patterns like: column == 'value' or column == "value"
+                referenced_cols = re.findall(
+                    r"\b(\w+)\s*[=!<>]+\s*['\"]" , prepared_sample_query
+                )
+
+                hint_lines = [
+                    f"sample_metadata() returned 0 samples for the given "
+                    f"query: {prepared_sample_query!r}.",
+                    "Note: string comparisons in sample_query are "
+                    "case-sensitive.",
+                ]
+                # For each referenced string column, list valid values.
+                for col in dict.fromkeys(referenced_cols):  # deduplicate
+                    if (
+                        col in df_before_query.columns
+                        and df_before_query[col].dtype == object
+                    ):
+                        valid_vals = sorted(
+                            df_before_query[col].dropna().unique().tolist()
+                        )
+                        hint_lines.append(
+                            f"Valid values for column {col!r}: {valid_vals}"
+                        )
+
+                warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
+
         # Apply the sample_indices, if there are any.
         # Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
         if sample_indices is not None:
diff --git a/tests/anoph/test_sample_metadata.py b/tests/anoph/test_sample_metadata.py
@@ -1465,3 +1465,48 @@ def test_cohort_data(fixture, api):
     df_cohorts = api.cohorts(cohort_name)
     # Check output.
     validate_cohort_data(df_cohorts, cohort_data_expected_columns())
+
+
+@parametrize_with_cases("fixture,api", cases=".")
+def test_sample_metadata_warns_on_case_mismatch(fixture, api: AnophelesSampleMetadata):
+    """Check that a wrong-cased sample_query yields 0 rows and a UserWarning.
+
+    Regression test for https://github.com/malariagen/malariagen-data-python/issues/1083
+    """
+    # Look up a real country value to derive a wrong-cased query from.
+    df_all = api.sample_metadata()
+    if "country" not in df_all.columns or df_all["country"].dropna().empty:
+        pytest.skip("No 'country' column with data in this fixture.")
+
+    # Flip the case of an actual country value: lower-case it, or
+    # upper-case it if it was already entirely lower-case.
+    real_country = df_all["country"].dropna().iloc[0]
+    wrong_case_country = real_country.lower()
+    if wrong_case_country == real_country:
+        wrong_case_country = real_country.upper()
+
+    # Quote the value with !r so that a name containing an apostrophe
+    # still produces a syntactically valid query string.
+    with pytest.warns(UserWarning, match="case-sensitive"):
+        df = api.sample_metadata(sample_query=f"country == {wrong_case_country!r}")
+    assert len(df) == 0
+
+
+@parametrize_with_cases("fixture,api", cases=".")
+def test_sample_metadata_no_warning_on_valid_query(
+    fixture, api: AnophelesSampleMetadata
+):
+    """Check that a correctly-cased sample_query returns rows without a UserWarning."""
+    import warnings
+
+    df_all = api.sample_metadata()
+    if "country" not in df_all.columns or df_all["country"].dropna().empty:
+        pytest.skip("No 'country' column with data in this fixture.")
+
+    # Escalate UserWarning to an error; !r safely quotes values with an apostrophe.
+    real_country = df_all["country"].dropna().iloc[0]
+    with warnings.catch_warnings():
+        warnings.simplefilter("error", UserWarning)
+        df = api.sample_metadata(sample_query=f"country == {real_country!r}")
+    assert len(df) > 0
+