Add taxon_by param to snp_allele_frequencies_advanced(). Add example usage to notebook.

leehart · leehart · commit 2a7577ec19ff · 2024-12-10T16:33:48.000Z
diff --git a/malariagen_data/anoph/frq_params.py b/malariagen_data/anoph/frq_params.py
@@ -80,3 +80,10 @@
     Optional[Union[str, List[str], Tuple[str, ...]]],
     "The area or areas to restrict the dataset to.",
 ]
+
+taxon_by: TypeAlias = Annotated[
+    str,
+    "The column to use for taxon stratification.",
+]
+
+taxon_by_default: taxon_by = "taxon"
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
@@ -452,6 +452,7 @@ def snp_allele_frequencies_advanced(
         ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
+        taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
     ) -> xr.Dataset:
         # Load sample metadata.
         df_samples = self.sample_metadata(
@@ -465,15 +466,17 @@ def snp_allele_frequencies_advanced(
             df_samples=df_samples,
             area_by=area_by,
             period_by=period_by,
+            taxon_by=taxon_by,
         )
 
         # Group samples to make cohorts.
-        group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
+        group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])
 
         # Build cohorts dataframe.
         df_cohorts = build_cohorts_from_sample_grouping(
             group_samples_by_cohort=group_samples_by_cohort,
             min_cohort_size=min_cohort_size,
+            taxon_by=taxon_by,
         )
 
         # Early check for no cohorts.
@@ -529,7 +532,8 @@ def snp_allele_frequencies_advanced(
             desc="Compute SNP allele frequencies",
         )
         for cohort_index, cohort in cohorts_iterator:
-            cohort_key = cohort.taxon, cohort.area, cohort.period
+            cohort_taxon = getattr(cohort, taxon_by)
+            cohort_key = cohort_taxon, cohort.area, cohort.period
             sample_indices = group_samples_by_cohort.indices[cohort_key]
 
             cohort_ac, cohort_an = _cohort_alt_allele_counts_melt(
@@ -601,7 +605,11 @@ def snp_allele_frequencies_advanced(
 
         # Cohort variables.
         for coh_col in df_cohorts.columns:
-            ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]
+            if coh_col == taxon_by:
+                # Always name this variable cohort_taxon, whatever taxon_by is: other functions expect it, e.g. plot_frequencies_interactive_map()
+                ds_out["cohort_taxon"] = "cohorts", df_cohorts[coh_col]
+            else:
+                ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]
 
         # Variant variables.
         for snp_col in df_variants.columns:
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
@@ -1612,20 +1612,22 @@ def _karyotype_tags_n_alt(gt, alts, inversion_alts):
     return inv_n_alt
 
 
-def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
+def prep_samples_for_cohort_grouping(
+    *, df_samples, area_by, period_by, taxon_by="taxon"
+):
     # Take a copy, as we will modify the dataframe.
     df_samples = df_samples.copy()
 
     # Fix "intermediate" or "unassigned" taxon values - we only want to build
     # cohorts with clean taxon calls, so we set other values to None.
     loc_intermediate_taxon = (
-        df_samples["taxon"].str.startswith("intermediate").fillna(False)
+        df_samples[taxon_by].str.startswith("intermediate").fillna(False)
     )
-    df_samples.loc[loc_intermediate_taxon, "taxon"] = None
+    df_samples.loc[loc_intermediate_taxon, taxon_by] = None
     loc_unassigned_taxon = (
-        df_samples["taxon"].str.startswith("unassigned").fillna(False)
+        df_samples[taxon_by].str.startswith("unassigned").fillna(False)
     )
-    df_samples.loc[loc_unassigned_taxon, "taxon"] = None
+    df_samples.loc[loc_unassigned_taxon, taxon_by] = None
 
     # Add period column.
     if period_by == "year":
@@ -1647,7 +1649,9 @@ def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
     return df_samples
 
 
-def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
+def build_cohorts_from_sample_grouping(
+    *, group_samples_by_cohort, min_cohort_size, taxon_by="taxon"
+):
     # Build cohorts dataframe.
     df_cohorts = group_samples_by_cohort.agg(
         size=("sample_id", len),
@@ -1669,7 +1673,7 @@ def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_si
     # Create a label that is similar to the cohort metadata,
     # although this won't be perfect.
     df_cohorts["label"] = df_cohorts.apply(
-        lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns"
+        lambda v: f"{v.area}_{v[taxon_by][:4]}_{v.period}", axis="columns"
     )
 
     # Apply minimum cohort size.
diff --git a/notebooks/plot_frequencies_space_time.ipynb b/notebooks/plot_frequencies_space_time.ipynb
@@ -382,6 +382,48 @@
    "source": [
     "af1.plot_frequencies_interactive_map(ds)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a94d6521",
+   "metadata": {},
+   "source": [
+    "### SNP allele frequencies using `taxon_by`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0cf222ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = ag3.snp_allele_frequencies_advanced(\n",
+    "    transcript=\"AGAP004707-RD\",\n",
+    "    area_by=\"admin1_iso\",\n",
+    "    period_by=\"year\",\n",
+    "    sample_sets=[\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-UG\", \"AG1000G-TZ\"],\n",
+    "    sample_query=\"aim_species in ['gambiae', 'coluzzii']\",\n",
+    "    min_cohort_size=10,\n",
+    "    drop_invariant=True,\n",
+    "    variant_query=\"max_af > 0.05 and effect == 'NON_SYNONYMOUS_CODING'\",\n",
+    "    site_mask=None,\n",
+    "    nobs_mode=\"called\",\n",
+    "    ci_method=\"wilson\",\n",
+    "    taxon_by=\"aim_species\",\n",
+    ")\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b7e9acc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_interactive_map(ds)"
+   ]
   }
  ],
  "metadata": {