Add filter_unassigned parameter to _prep_samples_for_cohort_grouping

shauryam2807 · shauryam2807 · commit b89e20ba4048 · 2026-02-28T20:34:53.000+05:30
Fixes #806. When building cohorts, _prep_samples_for_cohort_grouping() previously always filtered out samples whose taxon value starts with 'intermediate' or 'unassigned' (by setting that value to None). This was surprising when users specified a custom taxon_by column. The new filter_unassigned parameter (default None) auto-detects what to do: (1) when taxon_by='taxon' (the default), it filters as before, for backward compatibility; (2) when taxon_by is a custom column, it preserves all values; (3) users can explicitly override the auto-detection by passing True or False. The parameter is propagated through snp_allele_frequencies_advanced(), aa_allele_frequencies_advanced(), gene_cnv_frequencies_advanced(), and haplotypes_frequencies_advanced().
diff --git a/malariagen_data/anoph/cnv_frq.py b/malariagen_data/anoph/cnv_frq.py
@@ -446,6 +446,7 @@ def gene_cnv_frequencies_advanced(
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
+        filter_unassigned: Optional[frq_params.filter_unassigned] = None,
     ) -> xr.Dataset:
         regions: List[Region] = _parse_multi_region(self, region)
         del region
@@ -468,6 +469,7 @@ def gene_cnv_frequencies_advanced(
                     chunks=chunks,
                     inline_array=inline_array,
                     taxon_by=taxon_by,
+                    filter_unassigned=filter_unassigned,
                 )
                 for r in regions
             ],
@@ -497,6 +499,7 @@ def _gene_cnv_frequencies_advanced(
         chunks,
         inline_array,
         taxon_by,
+        filter_unassigned,
     ):
         debug = self._log.debug
 
@@ -527,6 +530,7 @@ def _gene_cnv_frequencies_advanced(
             area_by=area_by,
             period_by=period_by,
             taxon_by=taxon_by,
+            filter_unassigned=filter_unassigned,
         )
 
         debug("group samples to make cohorts")
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -16,20 +16,29 @@
 from .base import AnophelesBase
 
 
-def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by, taxon_by):
+def _prep_samples_for_cohort_grouping(
+    *, df_samples, area_by, period_by, taxon_by, filter_unassigned=None
+):
     # Take a copy, as we will modify the dataframe.
     df_samples = df_samples.copy()
 
-    # Fix "intermediate" or "unassigned" taxon values - we only want to build
-    # cohorts with clean taxon calls, so we set other values to None.
-    loc_intermediate_taxon = (
-        df_samples[taxon_by].str.startswith("intermediate").fillna(False)
-    )
-    df_samples.loc[loc_intermediate_taxon, taxon_by] = None
-    loc_unassigned_taxon = (
-        df_samples[taxon_by].str.startswith("unassigned").fillna(False)
-    )
-    df_samples.loc[loc_unassigned_taxon, taxon_by] = None
+    # Determine whether to filter "intermediate"/"unassigned" taxon values.
+    # When filter_unassigned is None (default), auto-apply filtering only
+    # when using the default "taxon" column. Users can explicitly override
+    # with True/False.
+    # See: https://github.com/malariagen/malariagen-data-python/issues/806
+    if filter_unassigned is None:
+        filter_unassigned = taxon_by == "taxon"
+
+    if filter_unassigned:
+        loc_intermediate_taxon = (
+            df_samples[taxon_by].str.startswith("intermediate").fillna(False)
+        )
+        df_samples.loc[loc_intermediate_taxon, taxon_by] = None
+        loc_unassigned_taxon = (
+            df_samples[taxon_by].str.startswith("unassigned").fillna(False)
+        )
+        df_samples.loc[loc_unassigned_taxon, taxon_by] = None
 
     # Add period column.
 
diff --git a/malariagen_data/anoph/frq_params.py b/malariagen_data/anoph/frq_params.py
@@ -87,3 +87,13 @@
 ]
 
 taxon_by_default: taxon_by = "taxon"
+
+filter_unassigned: TypeAlias = Annotated[
+    Optional[bool],  # tri-state: None leaves the decision to the callee
+    """
+    Whether to filter out samples with "intermediate" or "unassigned" taxon
+    values before building cohorts. If None (default), filtering is applied
+    only when using the default "taxon" column. Set True to always filter,
+    or False to never filter.
+    """,
+]
diff --git a/malariagen_data/anoph/hap_frq.py b/malariagen_data/anoph/hap_frq.py
@@ -154,6 +154,7 @@ def haplotypes_frequencies_advanced(
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
+        filter_unassigned: Optional[frq_params.filter_unassigned] = None,
     ) -> xr.Dataset:
         # Load sample metadata.
         df_samples = self.sample_metadata(
@@ -168,6 +169,7 @@ def haplotypes_frequencies_advanced(
             area_by=area_by,
             period_by=period_by,
             taxon_by=taxon_by,
+            filter_unassigned=filter_unassigned,
         )
 
         # Group samples to make cohorts.
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
@@ -453,6 +453,7 @@ def snp_allele_frequencies_advanced(
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
+        filter_unassigned: Optional[frq_params.filter_unassigned] = None,
     ) -> xr.Dataset:
         # Load sample metadata.
         df_samples = self.sample_metadata(
@@ -467,6 +468,7 @@ def snp_allele_frequencies_advanced(
             area_by=area_by,
             period_by=period_by,
             taxon_by=taxon_by,
+            filter_unassigned=filter_unassigned,
         )
 
         # Group samples to make cohorts.
@@ -684,6 +686,7 @@ def aa_allele_frequencies_advanced(
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
+        filter_unassigned: Optional[frq_params.filter_unassigned] = None,
     ) -> xr.Dataset:
         # Begin by computing SNP allele frequencies.
         ds_snp_frq = self.snp_allele_frequencies_advanced(
@@ -702,6 +705,7 @@ def aa_allele_frequencies_advanced(
             chunks=chunks,
             inline_array=inline_array,
             taxon_by=taxon_by,
+            filter_unassigned=filter_unassigned,
         )
 
         # N.B., we need to worry about the possibility of the
diff --git a/tests/anoph/test_frq_base.py b/tests/anoph/test_frq_base.py
@@ -0,0 +1,101 @@
+"""Tests for _prep_samples_for_cohort_grouping filter_unassigned behavior.
+
+See: https://github.com/malariagen/malariagen-data-python/issues/806
+"""
+
+import pandas as pd
+
+from malariagen_data.anoph.frq_base import _prep_samples_for_cohort_grouping
+
+
+def _make_test_df(taxon_col="taxon"):
+    """Build a four-sample frame mixing clean, intermediate and unassigned taxa."""
+    taxon_values = [
+        "gambiae",
+        "intermediate_gambcolu_arabiensis",
+        "unassigned",
+        "coluzzii",
+    ]
+    columns = {
+        taxon_col: taxon_values,
+        "admin1_iso": ["KE-01", "KE-01", "KE-02", "KE-02"],
+        "year": [2020] * 4,
+        "month": [1] * 4,
+    }
+    return pd.DataFrame(columns)
+
+
+class TestPrepSamplesFilterUnassigned:
+    """Check filter_unassigned; nulled taxa are tested with pd.isna (None or NaN)."""
+
+    def test_default_taxon_column_filters(self):
+        """With taxon_by='taxon' and filter_unassigned left at None, values starting
+        with intermediate/unassigned must come back missing (backward compat)."""
+        df = _make_test_df(taxon_col="taxon")
+        result = _prep_samples_for_cohort_grouping(
+            df_samples=df,
+            area_by="admin1_iso",
+            period_by="year",
+            taxon_by="taxon",
+        )
+        assert result["taxon"].iloc[0] == "gambiae"
+        assert pd.isna(result["taxon"].iloc[1])
+        assert pd.isna(result["taxon"].iloc[2])
+        assert result["taxon"].iloc[3] == "coluzzii"
+
+    def test_custom_column_preserves(self):
+        """With a custom taxon_by column and filter_unassigned left at None,
+        intermediate/unassigned values must be passed through unchanged."""
+        df = _make_test_df(taxon_col="custom_taxon")
+        result = _prep_samples_for_cohort_grouping(
+            df_samples=df,
+            area_by="admin1_iso",
+            period_by="year",
+            taxon_by="custom_taxon",
+        )
+        assert result["custom_taxon"].iloc[0] == "gambiae"
+        assert result["custom_taxon"].iloc[1] == "intermediate_gambcolu_arabiensis"
+        assert result["custom_taxon"].iloc[2] == "unassigned"
+        assert result["custom_taxon"].iloc[3] == "coluzzii"
+
+    def test_explicit_filter_true(self):
+        """filter_unassigned=True nulls the values even for a custom column name."""
+        df = _make_test_df(taxon_col="custom_taxon")
+        result = _prep_samples_for_cohort_grouping(
+            df_samples=df,
+            area_by="admin1_iso",
+            period_by="year",
+            taxon_by="custom_taxon",
+            filter_unassigned=True,
+        )
+        assert result["custom_taxon"].iloc[0] == "gambiae"
+        assert pd.isna(result["custom_taxon"].iloc[1])
+        assert pd.isna(result["custom_taxon"].iloc[2])
+        assert result["custom_taxon"].iloc[3] == "coluzzii"
+
+    def test_explicit_filter_false(self):
+        """filter_unassigned=False keeps every value even for the 'taxon' column."""
+        df = _make_test_df(taxon_col="taxon")
+        result = _prep_samples_for_cohort_grouping(
+            df_samples=df,
+            area_by="admin1_iso",
+            period_by="year",
+            taxon_by="taxon",
+            filter_unassigned=False,
+        )
+        assert result["taxon"].iloc[0] == "gambiae"
+        assert result["taxon"].iloc[1] == "intermediate_gambcolu_arabiensis"
+        assert result["taxon"].iloc[2] == "unassigned"
+        assert result["taxon"].iloc[3] == "coluzzii"
+
+    def test_does_not_modify_original(self):
+        """The caller's DataFrame must be left untouched (the function copies it)."""
+        df = _make_test_df(taxon_col="taxon")
+        original_values = df["taxon"].tolist()
+        _prep_samples_for_cohort_grouping(
+            df_samples=df,
+            area_by="admin1_iso",
+            period_by="year",
+            taxon_by="taxon",
+        )
+        assert df["taxon"].tolist() == original_values