|
| 1 | +"""Tests for _prep_samples_for_cohort_grouping filter_unassigned behavior. |
| 2 | +
|
| 3 | +See: https://github.com/malariagen/malariagen-data-python/issues/806 |
| 4 | +""" |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | + |
| 8 | +from malariagen_data.anoph.frq_base import _prep_samples_for_cohort_grouping |
| 9 | + |
| 10 | + |
| 11 | +def _make_test_df(taxon_col="taxon"): |
| 12 | + """Create a test DataFrame with intermediate and unassigned taxon values.""" |
| 13 | + return pd.DataFrame( |
| 14 | + { |
| 15 | + taxon_col: [ |
| 16 | + "gambiae", |
| 17 | + "intermediate_gambcolu_arabiensis", |
| 18 | + "unassigned", |
| 19 | + "coluzzii", |
| 20 | + ], |
| 21 | + "admin1_iso": ["KE-01", "KE-01", "KE-02", "KE-02"], |
| 22 | + "year": [2020, 2020, 2020, 2020], |
| 23 | + "month": [1, 1, 1, 1], |
| 24 | + } |
| 25 | + ) |
| 26 | + |
| 27 | + |
class TestPrepSamplesFilterUnassigned:
    """Tests for the filter_unassigned parameter in _prep_samples_for_cohort_grouping.

    Each test builds the fixture from ``_make_test_df``, runs the function
    with ``area_by="admin1_iso"`` and ``period_by="year"``, and inspects the
    taxon column of the result.
    """

    @staticmethod
    def _prep(taxon_col, **kwargs):
        """Run the function under test and return ``(input_df, result_taxa)``.

        ``kwargs`` is forwarded untouched, so a test that wants the default
        ``filter_unassigned`` behaviour simply passes nothing and the
        argument is omitted from the call.
        """
        df = _make_test_df(taxon_col=taxon_col)
        result = _prep_samples_for_cohort_grouping(
            df_samples=df,
            area_by="admin1_iso",
            period_by="year",
            taxon_by=taxon_col,
            **kwargs,
        )
        return df, result[taxon_col]

    @staticmethod
    def _assert_filtered(taxa):
        """Assert the intermediate/unassigned rows were blanked out.

        ``pd.isna`` is used rather than ``is None`` because the value read
        back from a masked cell depends on the column dtype: ``None`` for
        object columns, but NaN / ``pd.NA`` for pandas string dtypes. All of
        these mean "missing", which is what the test is about.
        """
        assert taxa.iloc[0] == "gambiae"
        assert pd.isna(taxa.iloc[1])
        assert pd.isna(taxa.iloc[2])
        assert taxa.iloc[3] == "coluzzii"

    @staticmethod
    def _assert_preserved(taxa):
        """Assert every taxon label came through unchanged."""
        assert taxa.iloc[0] == "gambiae"
        assert taxa.iloc[1] == "intermediate_gambcolu_arabiensis"
        assert taxa.iloc[2] == "unassigned"
        assert taxa.iloc[3] == "coluzzii"

    def test_default_taxon_column_filters(self):
        """When taxon_by='taxon' and filter_unassigned=None (default),
        intermediate/unassigned values should be set to missing (backward compat)."""
        _, taxa = self._prep("taxon")
        self._assert_filtered(taxa)

    def test_custom_column_preserves(self):
        """When taxon_by is a custom column and filter_unassigned=None (default),
        intermediate/unassigned values should be preserved."""
        _, taxa = self._prep("custom_taxon")
        self._assert_preserved(taxa)

    def test_explicit_filter_true(self):
        """When filter_unassigned=True, always filter regardless of column name."""
        _, taxa = self._prep("custom_taxon", filter_unassigned=True)
        self._assert_filtered(taxa)

    def test_explicit_filter_false(self):
        """When filter_unassigned=False, never filter even for default 'taxon' column."""
        _, taxa = self._prep("taxon", filter_unassigned=False)
        self._assert_preserved(taxa)

    def test_does_not_modify_original(self):
        """Ensure the original DataFrame is not modified."""
        df = _make_test_df(taxon_col="taxon")
        # Snapshot before the call; the default path filters the "taxon"
        # column, so in-place mutation of the input would show up here.
        original_values = df["taxon"].tolist()
        _prep_samples_for_cohort_grouping(
            df_samples=df,
            area_by="admin1_iso",
            period_by="year",
            taxon_by="taxon",
        )
        assert df["taxon"].tolist() == original_values
0 commit comments