Skip to content

Commit 5334be8

Browse files
Merge branch 'master' into GH1120-fix-pipx-link-in-contributing
2 parents 428d678 + 84d190d commit 5334be8

19 files changed

Lines changed: 882 additions & 289 deletions

.github/actions/setup-python/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@ runs:
1919
shell: bash
2020
run: |
2121
poetry env use ${{ inputs.python-version }}
22-
poetry install --extras dev
22+
poetry install --with dev,test,docs

CONTRIBUTING.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,13 @@ Both of these can be installed using your distribution's package manager or [Hom
5252

5353
```bash
5454
poetry env use 3.12
55-
poetry install --extras dev
55+
poetry install --with dev,test,docs
5656
```
5757

58+
This installs the runtime dependencies along with the `dev`, `test`, and `docs`
59+
[dependency groups](https://python-poetry.org/docs/managing-dependencies/#dependency-groups).
60+
If you only need to run tests, `poetry install --with test` is sufficient.
61+
5862
**Recommended**: Use `poetry run` to run commands inside the virtual environment:
5963

6064
```bash

malariagen_data/anoph/base_params.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@
6969
str,
7070
"""
7171
A pandas query string to be evaluated against the sample metadata, to
72-
select samples to be included in the returned data.
72+
select samples to be included in the returned data. E.g.,
73+
"country == 'Uganda'". If the query returns zero results, a warning
74+
will be emitted with fuzzy-match suggestions for possible typos or
75+
case mismatches.
7376
""",
7477
]
7578

malariagen_data/anoph/frq_base.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ def _prep_samples_for_cohort_grouping(
2929
# Users can explicitly override with True/False.
3030
filter_unassigned = taxon_by == "taxon"
3131

32+
# Validate taxon_by.
33+
if taxon_by not in df_samples.columns:
34+
raise ValueError(
35+
f"Invalid value for `taxon_by`: {taxon_by!r}. "
36+
f"Must be the name of an existing column in the sample metadata."
37+
)
38+
3239
if filter_unassigned:
3340
# Remove samples with "intermediate" or "unassigned" taxon values,
3441
# as we only want cohorts with clean taxon calls.
@@ -78,6 +85,13 @@ def _prep_samples_for_cohort_grouping(
7885
# Apply the matching period_by function to create a new "period" column.
7986
df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
8087

88+
# Validate area_by.
89+
if area_by not in df_samples.columns:
90+
raise ValueError(
91+
f"Invalid value for `area_by`: {area_by!r}. "
92+
f"Must be the name of an existing column in the sample metadata."
93+
)
94+
8195
# Copy the specified area_by column to a new "area" column.
8296
df_samples["area"] = df_samples[area_by]
8397

malariagen_data/anoph/sample_metadata.py

Lines changed: 160 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1+
import difflib
12
import io
3+
import json
4+
import re
25
from itertools import cycle
36
from typing import (
47
Any,
@@ -81,6 +84,8 @@ def __init__(
8184

8285
# Initialize cache attributes.
8386
self._cache_sample_metadata: Dict = dict()
87+
self._cache_cohorts: Dict = dict()
88+
self._cache_cohort_geometries: Dict = dict()
8489

8590
def _metadata_paths(
8691
self,
@@ -702,6 +707,17 @@ def clear_extra_metadata(self):
702707
@doc(
703708
summary="Access sample metadata for one or more sample sets.",
704709
returns="A dataframe of sample metadata, one row per sample.",
710+
notes="""
711+
Some samples in the dataset are lab crosses — mosquitoes bred in
712+
the laboratory that have no real collection date. These samples
713+
use ``year=-1`` and ``month=-1`` as sentinel values. They may
714+
cause unexpected results in date-based analyses (e.g.,
715+
``pd.to_datetime`` will fail on negative year values).
716+
717+
To exclude lab cross samples, use::
718+
719+
df = api.sample_metadata(sample_query="year >= 0")
720+
""",
705721
)
706722
def sample_metadata(
707723
self,
@@ -781,12 +797,65 @@ def sample_metadata(
781797
if prepared_sample_query is not None:
782798
# Assume a pandas query string.
783799
sample_query_options = sample_query_options or {}
800+
801+
# Save a reference to the pre-query DataFrame so we can detect
802+
# zero-result queries and provide a helpful warning.
803+
df_before_query = df_samples
804+
784805
# Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
785806
df_samples = df_samples.query(
786807
prepared_sample_query, **sample_query_options, engine="python"
787808
)
788809
df_samples = df_samples.reset_index(drop=True)
789810

811+
# Warn if query returned zero results on a non-empty dataset.
812+
# Provide fuzzy-match suggestions so users can spot typos,
813+
# case mismatches, or partial-value issues.
814+
if len(df_samples) == 0 and len(df_before_query) > 0:
815+
hint_lines = [
816+
f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.",
817+
]
818+
819+
# Extract column == 'value' pairs from the query.
820+
col_val_pairs = re.findall(
821+
r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]",
822+
prepared_sample_query,
823+
)
824+
825+
for col_name, queried_val in col_val_pairs:
826+
# If the column name is not recognised, suggest
827+
# close column names.
828+
if col_name not in df_before_query.columns:
829+
close_cols = difflib.get_close_matches(
830+
col_name,
831+
df_before_query.columns.tolist(),
832+
n=3,
833+
cutoff=0.6,
834+
)
835+
if close_cols:
836+
hint_lines.append(
837+
f"Column {col_name!r} not found. "
838+
f"Did you mean: {close_cols}?"
839+
)
840+
continue
841+
842+
# For string columns, suggest close values.
843+
if df_before_query[col_name].dtype == object:
844+
valid_vals = (
845+
df_before_query[col_name].dropna().unique().tolist()
846+
)
847+
close_vals = difflib.get_close_matches(
848+
queried_val, valid_vals, n=5, cutoff=0.6
849+
)
850+
if close_vals:
851+
hint_lines.append(
852+
f"Value {queried_val!r} not found in "
853+
f"column {col_name!r}. "
854+
f"Did you mean: {close_vals}?"
855+
)
856+
857+
warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
858+
790859
# Apply the sample_indices, if there are any.
791860
# Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
792861
if sample_indices is not None:
@@ -1485,7 +1554,11 @@ def _setup_cohort_queries(
14851554
A cohort set name. Accepted values are:
14861555
"admin1_month", "admin1_quarter", "admin1_year",
14871556
"admin2_month", "admin2_quarter", "admin2_year".
1488-
"""
1557+
""",
1558+
query="""
1559+
An optional pandas query string to filter the resulting
1560+
dataframe, e.g., "country == 'Burkina Faso'".
1561+
""",
14891562
),
14901563
returns="""A dataframe of cohort data, one row per cohort. There are up to 18 columns:
14911564
`cohort_id` is the identifier of the cohort,
@@ -1512,20 +1585,98 @@ def _setup_cohort_queries(
15121585
def cohorts(
15131586
self,
15141587
cohort_set: base_params.cohorts,
1588+
query: Optional[str] = None,
15151589
) -> pd.DataFrame:
1516-
major_version_path = self._major_version_path
1590+
valid_cohort_sets = {
1591+
"admin1_month",
1592+
"admin1_quarter",
1593+
"admin1_year",
1594+
"admin2_month",
1595+
"admin2_quarter",
1596+
"admin2_year",
1597+
}
1598+
if cohort_set not in valid_cohort_sets:
1599+
raise ValueError(
1600+
f"{cohort_set!r} is not a valid cohort set. "
1601+
f"Accepted values are: {sorted(valid_cohort_sets)}."
1602+
)
1603+
15171604
cohorts_analysis = self._cohorts_analysis
15181605

1519-
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
1606+
# Cache to avoid repeated reads.
1607+
cache_key = (cohorts_analysis, cohort_set)
1608+
try:
1609+
df_cohorts = self._cache_cohorts[cache_key]
1610+
except KeyError:
1611+
major_version_path = self._major_version_path
1612+
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.csv"
1613+
1614+
with self.open_file(path) as f:
1615+
df_cohorts = pd.read_csv(f, sep=",", na_values="")
1616+
1617+
# Ensure all column names are lower case.
1618+
df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore
1619+
1620+
self._cache_cohorts[cache_key] = df_cohorts
1621+
1622+
if query is not None:
1623+
df_cohorts = df_cohorts.query(query)
1624+
df_cohorts = df_cohorts.reset_index(drop=True)
1625+
1626+
return df_cohorts.copy()
1627+
1628+
@_check_types
1629+
@doc(
1630+
summary="""
1631+
Read GeoJSON geometry data for a specific cohort set,
1632+
providing boundary geometries for each cohort.
1633+
""",
1634+
parameters=dict(
1635+
cohort_set="""
1636+
A cohort set name. Accepted values are:
1637+
"admin1_month", "admin1_quarter", "admin1_year",
1638+
"admin2_month", "admin2_quarter", "admin2_year".
1639+
""",
1640+
),
1641+
returns="""
1642+
A dict containing the parsed GeoJSON FeatureCollection,
1643+
with boundary geometries for each cohort in the set.
1644+
""",
1645+
)
1646+
def cohort_geometries(
1647+
self,
1648+
cohort_set: base_params.cohorts,
1649+
) -> dict:
1650+
valid_cohort_sets = {
1651+
"admin1_month",
1652+
"admin1_quarter",
1653+
"admin1_year",
1654+
"admin2_month",
1655+
"admin2_quarter",
1656+
"admin2_year",
1657+
}
1658+
if cohort_set not in valid_cohort_sets:
1659+
raise ValueError(
1660+
f"{cohort_set!r} is not a valid cohort set. "
1661+
f"Accepted values are: {sorted(valid_cohort_sets)}."
1662+
)
1663+
1664+
cohorts_analysis = self._cohorts_analysis
1665+
1666+
# Cache to avoid repeated reads.
1667+
cache_key = (cohorts_analysis, cohort_set)
1668+
try:
1669+
geojson_data = self._cache_cohort_geometries[cache_key]
1670+
except KeyError:
1671+
major_version_path = self._major_version_path
1672+
path = f"{major_version_path[:2]}_cohorts/cohorts_{cohorts_analysis}/cohorts_{cohort_set}.geojson"
15201673

1521-
# Read the manifest into a pandas dataframe.
1522-
with self.open_file(path) as f:
1523-
df_cohorts = pd.read_csv(f, sep=",", na_values="")
1674+
with self.open_file(path) as f:
1675+
geojson_data = json.load(f)
15241676

1525-
# Ensure all column names are lower case.
1526-
df_cohorts.columns = [c.lower() for c in df_cohorts.columns] # type: ignore
1677+
self._cache_cohort_geometries[cache_key] = geojson_data
15271678

1528-
return df_cohorts
1679+
return geojson_data
15291680

15301681
@_check_types
15311682
@doc(

malariagen_data/util.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -855,9 +855,7 @@ def _value_error(
855855
value,
856856
expectation,
857857
):
858-
message = (
859-
f"Bad value for parameter {name}; expected {expectation}, " f"found {value!r}"
860-
)
858+
message = f"Bad value for parameter {name}; expected {expectation}, found {value!r}"
861859
raise ValueError(message)
862860

863861

@@ -935,6 +933,7 @@ def info(self, msg):
935933
self.flush()
936934

937935
def set_level(self, level):
936+
self._logger.setLevel(level)
938937
if self._handler is not None:
939938
self._handler.setLevel(level)
940939

0 commit comments

Comments (0)