Add taxon_by parameter to gene_cnv_frequencies_advanced() and haplotypes_frequencies_advanced(). Merge the util.py copy of prep_samples_for_cohort_grouping() into frq_base.py.

leehart · leehart · commit f347969de001 · 2025-02-20T17:30:57.000Z
diff --git a/malariagen_data/anoph/cnv_frq.py b/malariagen_data/anoph/cnv_frq.py
@@ -445,6 +445,7 @@ def gene_cnv_frequencies_advanced(
         ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
+        taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
     ) -> xr.Dataset:
         regions: List[Region] = parse_multi_region(self, region)
         del region
@@ -466,6 +467,7 @@ def gene_cnv_frequencies_advanced(
                     ci_method=ci_method,
                     chunks=chunks,
                     inline_array=inline_array,
+                    taxon_by=taxon_by,
                 )
                 for r in regions
             ],
@@ -494,6 +496,7 @@ def _gene_cnv_frequencies_advanced(
         ci_method,
         chunks,
         inline_array,
+        taxon_by,
     ):
         debug = self._log.debug
 
@@ -523,6 +526,7 @@ def _gene_cnv_frequencies_advanced(
             df_samples=df_samples,
             area_by=area_by,
             period_by=period_by,
+            taxon_by=taxon_by,
         )
 
         debug("group samples to make cohorts")
@@ -532,6 +536,7 @@ def _gene_cnv_frequencies_advanced(
         df_cohorts = build_cohorts_from_sample_grouping(
             group_samples_by_cohort=group_samples_by_cohort,
             min_cohort_size=min_cohort_size,
+            taxon_by=taxon_by,
         )
 
         debug("figure out expected copy number")
@@ -556,7 +561,8 @@ def _gene_cnv_frequencies_advanced(
         debug("build event count and nobs for each cohort")
         for cohort_index, cohort in enumerate(df_cohorts.itertuples()):
             # construct grouping key
-            cohort_key = cohort.taxon, cohort.area, cohort.period
+            cohort_taxon = getattr(cohort, taxon_by)
+            cohort_key = cohort_taxon, cohort.area, cohort.period
 
             # obtain sample indices for cohort
             sample_indices = group_samples_by_cohort.indices[cohort_key]
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -14,20 +14,22 @@
 from .base import AnophelesBase
 
 
-def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
+def prep_samples_for_cohort_grouping(
+    *, df_samples, area_by, period_by, taxon_by="taxon"
+):
     # Take a copy, as we will modify the dataframe.
     df_samples = df_samples.copy()
 
     # Fix "intermediate" or "unassigned" taxon values - we only want to build
     # cohorts with clean taxon calls, so we set other values to None.
     loc_intermediate_taxon = (
-        df_samples["taxon"].str.startswith("intermediate").fillna(False)
+        df_samples[taxon_by].str.startswith("intermediate").fillna(False)
     )
-    df_samples.loc[loc_intermediate_taxon, "taxon"] = None
+    df_samples.loc[loc_intermediate_taxon, taxon_by] = None
     loc_unassigned_taxon = (
-        df_samples["taxon"].str.startswith("unassigned").fillna(False)
+        df_samples[taxon_by].str.startswith("unassigned").fillna(False)
     )
-    df_samples.loc[loc_unassigned_taxon, "taxon"] = None
+    df_samples.loc[loc_unassigned_taxon, taxon_by] = None
 
     # Add period column.
     if period_by == "year":
diff --git a/malariagen_data/anoph/hap_frq.py b/malariagen_data/anoph/hap_frq.py
@@ -153,6 +153,7 @@ def haplotypes_frequencies_advanced(
         ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
+        taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
     ) -> xr.Dataset:
         # Load sample metadata.
         df_samples = self.sample_metadata(
@@ -166,15 +167,17 @@ def haplotypes_frequencies_advanced(
             df_samples=df_samples,
             area_by=area_by,
             period_by=period_by,
+            taxon_by=taxon_by,
         )
 
         # Group samples to make cohorts.
-        group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
+        group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])
 
         # Build cohorts dataframe.
         df_cohorts = build_cohorts_from_sample_grouping(
             group_samples_by_cohort=group_samples_by_cohort,
             min_cohort_size=min_cohort_size,
+            taxon_by=taxon_by,
         )
 
         # Access haplotypes.
@@ -211,8 +214,9 @@ def haplotypes_frequencies_advanced(
             df_cohorts.itertuples(), desc="Compute allele frequencies"
         )
         for cohort in cohorts_iterator:
-            cohort_key = cohort.taxon, cohort.area, cohort.period
-            cohort_key_str = cohort.taxon + "_" + cohort.area + "_" + str(cohort.period)
+            cohort_taxon = getattr(cohort, taxon_by)
+            cohort_key = cohort_taxon, cohort.area, cohort.period
+            cohort_key_str = cohort_taxon + "_" + cohort.area + "_" + str(cohort.period)
             # We reset all frequencies, counts to 0 for each cohort, nobs is set to the number of haplotypes
             n_samples = cohort.size
             hap_freq = {k: 0 for k in f_all.keys()}
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
@@ -1591,43 +1591,6 @@ def distributed_client():
     return client
 
 
-def prep_samples_for_cohort_grouping(
-    *, df_samples, area_by, period_by, taxon_by="taxon"
-):
-    # Take a copy, as we will modify the dataframe.
-    df_samples = df_samples.copy()
-
-    # Fix "intermediate" or "unassigned" taxon values - we only want to build
-    # cohorts with clean taxon calls, so we set other values to None.
-    loc_intermediate_taxon = (
-        df_samples[taxon_by].str.startswith("intermediate").fillna(False)
-    )
-    df_samples.loc[loc_intermediate_taxon, taxon_by] = None
-    loc_unassigned_taxon = (
-        df_samples[taxon_by].str.startswith("unassigned").fillna(False)
-    )
-    df_samples.loc[loc_unassigned_taxon, taxon_by] = None
-
-    # Add period column.
-    if period_by == "year":
-        make_period = _make_sample_period_year
-    elif period_by == "quarter":
-        make_period = _make_sample_period_quarter
-    elif period_by == "month":
-        make_period = _make_sample_period_month
-    else:  # pragma: no cover
-        raise ValueError(
-            f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
-        )
-    sample_period = df_samples.apply(make_period, axis="columns")
-    df_samples["period"] = sample_period
-
-    # Add area column for consistent output.
-    df_samples["area"] = df_samples[area_by]
-
-    return df_samples
-
-
 def add_frequency_ci(*, ds, ci_method):
     from statsmodels.stats.proportion import proportion_confint  # type: ignore