Skip to content

Commit c12e397

Browse files
committed
fix: warn when sample_query returns 0 results due to case mismatch
Fixes #1083

## Problem

`sample_metadata()` silently returns an empty DataFrame when a `sample_query` contains case-mismatched string values (e.g., `'uganda'` instead of `'Uganda'`). pandas `query()` performs strict case-sensitive comparisons, and since all country/location names are title-cased, wrong-case queries silently yield zero rows with no feedback to the user.

## Changes

### malariagen_data/anoph/sample_metadata.py

- Added `import re`
- After `df_samples.query()`, check if the result is empty on a non-empty input
- If empty: parse the query for column names in comparison expressions, collect valid unique values for string (object dtype) columns, and emit a `UserWarning` with a case-sensitivity note and the list of valid values

### malariagen_data/anoph/base_params.py

- Updated the `sample_query` docstring to note case-sensitivity and the exact-match requirement

### tests/anoph/test_sample_metadata.py

- `test_sample_metadata_warns_on_case_mismatch`: verifies a `UserWarning` is emitted with 'case-sensitive' in the message
- `test_sample_metadata_no_warning_on_valid_query`: verifies no spurious warning on a correct query
1 parent c269768 commit c12e397

3 files changed

Lines changed: 87 additions & 1 deletion

File tree

malariagen_data/anoph/base_params.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@
6969
str,
7070
"""
7171
A pandas query string to be evaluated against the sample metadata, to
72-
select samples to be included in the returned data.
72+
select samples to be included in the returned data. E.g.,
73+
"country == 'Uganda'". Note: string comparisons are case-sensitive —
74+
column values must match the exact casing stored in the metadata
75+
(e.g., "Uganda" not "uganda"). A warning will be emitted if the query
76+
returns zero results.
7377
""",
7478
]
7579

malariagen_data/anoph/sample_metadata.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
import re
23
from itertools import cycle
34
from typing import (
45
Any,
@@ -781,12 +782,48 @@ def sample_metadata(
781782
if prepared_sample_query is not None:
782783
# Assume a pandas query string.
783784
sample_query_options = sample_query_options or {}
785+
786+
# Save a reference to the pre-query DataFrame so we can detect
787+
# zero-result queries and provide a helpful warning.
788+
df_before_query = df_samples
789+
784790
# Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
785791
df_samples = df_samples.query(
786792
prepared_sample_query, **sample_query_options, engine="python"
787793
)
788794
df_samples = df_samples.reset_index(drop=True)
789795

796+
# Warn if query returned zero results on a non-empty dataset.
797+
# This helps users catch case-sensitivity issues in string queries,
798+
# e.g. "country == 'uganda'" instead of "country == 'Uganda'".
799+
if len(df_samples) == 0 and len(df_before_query) > 0:
800+
# Extract column names from comparison expressions in the query.
801+
# Match patterns like: column == 'value' or column == "value"
802+
referenced_cols = re.findall(
803+
r"\b(\w+)\s*[=!<>]+\s*['\"]" , prepared_sample_query
804+
)
805+
806+
hint_lines = [
807+
f"sample_metadata() returned 0 samples for the given "
808+
f"query: {prepared_sample_query!r}.",
809+
"Note: string comparisons in sample_query are "
810+
"case-sensitive.",
811+
]
812+
# For each referenced string column, list valid values.
813+
for col in dict.fromkeys(referenced_cols): # deduplicate
814+
if (
815+
col in df_before_query.columns
816+
and df_before_query[col].dtype == object
817+
):
818+
valid_vals = sorted(
819+
df_before_query[col].dropna().unique().tolist()
820+
)
821+
hint_lines.append(
822+
f"Valid values for column {col!r}: {valid_vals}"
823+
)
824+
825+
warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
826+
790827
# Apply the sample_indices, if there are any.
791828
# Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
792829
if sample_indices is not None:

tests/anoph/test_sample_metadata.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1465,3 +1465,48 @@ def test_cohort_data(fixture, api):
14651465
df_cohorts = api.cohorts(cohort_name)
14661466
# Check output.
14671467
validate_cohort_data(df_cohorts, cohort_data_expected_columns())
1468+
1469+
1470+
@parametrize_with_cases("fixture,api", cases=".")
1471+
def test_sample_metadata_warns_on_case_mismatch(fixture, api: AnophelesSampleMetadata):
1472+
"""Test that a UserWarning is raised when a case-mismatched query returns 0 results.
1473+
1474+
Regression test for https://github.com/malariagen/malariagen-data-python/issues/1083
1475+
"""
1476+
# Get a valid country name from the metadata so we can construct
1477+
# a deliberately wrong-cased query.
1478+
df_all = api.sample_metadata()
1479+
if "country" not in df_all.columns or df_all["country"].dropna().empty:
1480+
pytest.skip("No 'country' column with data in this fixture.")
1481+
1482+
# Pick an actual country value and change its case.
1483+
real_country = df_all["country"].dropna().iloc[0]
1484+
wrong_case_country = real_country.lower()
1485+
# If lowercasing didn't actually change the string, use upper instead.
1486+
if wrong_case_country == real_country:
1487+
wrong_case_country = real_country.upper()
1488+
1489+
# The wrong-cased query should emit a UserWarning mentioning "case-sensitive".
1490+
with pytest.warns(UserWarning, match="case-sensitive"):
1491+
df = api.sample_metadata(sample_query=f"country == '{wrong_case_country}'")
1492+
assert len(df) == 0
1493+
1494+
1495+
@parametrize_with_cases("fixture,api", cases=".")
1496+
def test_sample_metadata_no_warning_on_valid_query(
1497+
fixture, api: AnophelesSampleMetadata
1498+
):
1499+
"""Test that no spurious warning is emitted when a valid query returns results."""
1500+
df_all = api.sample_metadata()
1501+
if "country" not in df_all.columns or df_all["country"].dropna().empty:
1502+
pytest.skip("No 'country' column with data in this fixture.")
1503+
1504+
real_country = df_all["country"].dropna().iloc[0]
1505+
1506+
import warnings
1507+
1508+
with warnings.catch_warnings():
1509+
warnings.simplefilter("error", UserWarning)
1510+
df = api.sample_metadata(sample_query=f"country == '{real_country}'")
1511+
assert len(df) > 0
1512+

0 commit comments

Comments
 (0)