|
16 | 16 | from .base import AnophelesBase |
17 | 17 |
|
18 | 18 |
|
19 | | -def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by, taxon_by): |
| 19 | +def _prep_samples_for_cohort_grouping( |
| 20 | + *, df_samples, area_by, period_by, taxon_by, filter_unassigned=None |
| 21 | +): |
20 | 22 | # Take a copy, as we will modify the dataframe. |
21 | 23 | df_samples = df_samples.copy() |
22 | 24 |
|
23 | | - # Fix "intermediate" or "unassigned" taxon values - we only want to build |
24 | | - # cohorts with clean taxon calls, so we set other values to None. |
25 | | - loc_intermediate_taxon = ( |
26 | | - df_samples[taxon_by].str.startswith("intermediate").fillna(False) |
27 | | - ) |
28 | | - df_samples.loc[loc_intermediate_taxon, taxon_by] = None |
29 | | - loc_unassigned_taxon = ( |
30 | | - df_samples[taxon_by].str.startswith("unassigned").fillna(False) |
31 | | - ) |
32 | | - df_samples.loc[loc_unassigned_taxon, taxon_by] = None |
| 25 | + # Determine whether to filter "intermediate"/"unassigned" taxon values. |
| 26 | + # See: https://github.com/malariagen/malariagen-data-python/issues/806 |
| 27 | + if filter_unassigned is None: |
| 28 | + # Auto-apply filtering only when using the default "taxon" column. |
| 29 | + # Users can explicitly override with True/False. |
| 30 | + filter_unassigned = taxon_by == "taxon" |
| 31 | + |
| 32 | + if filter_unassigned: |
| 33 | + # Set "intermediate" or "unassigned" taxon values to None, as we only
| 34 | + # want to build cohorts with clean taxon calls. |
| 35 | + loc_intermediate_taxon = ( |
| 36 | + df_samples[taxon_by].str.startswith("intermediate").fillna(False) |
| 37 | + ) |
| 38 | + df_samples.loc[loc_intermediate_taxon, taxon_by] = None |
| 39 | + loc_unassigned_taxon = ( |
| 40 | + df_samples[taxon_by].str.startswith("unassigned").fillna(False) |
| 41 | + ) |
| 42 | + df_samples.loc[loc_unassigned_taxon, taxon_by] = None |
33 | 43 |
|
34 | 44 | # Add period column. |
35 | 45 |
|
|
0 commit comments