Skip to content

Commit d53e326

Browse files
authored
Merge pull request #694 from malariagen/GH391_add_params_to_allele_frequencies_advanced
Add "taxon_by" param to `*_frequencies_advanced()` functions and allow the "period_by" param to specify a column name
2 parents 2dba673 + ad20786 commit d53e326

File tree

11 files changed

+348
-136
lines changed

11 files changed

+348
-136
lines changed

malariagen_data/anoph/cnv_frq.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,7 @@ def gene_cnv_frequencies_advanced(
445445
ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
446446
chunks: base_params.chunks = base_params.native_chunks,
447447
inline_array: base_params.inline_array = base_params.inline_array_default,
448+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
448449
) -> xr.Dataset:
449450
regions: List[Region] = parse_multi_region(self, region)
450451
del region
@@ -466,6 +467,7 @@ def gene_cnv_frequencies_advanced(
466467
ci_method=ci_method,
467468
chunks=chunks,
468469
inline_array=inline_array,
470+
taxon_by=taxon_by,
469471
)
470472
for r in regions
471473
],
@@ -494,6 +496,7 @@ def _gene_cnv_frequencies_advanced(
494496
ci_method,
495497
chunks,
496498
inline_array,
499+
taxon_by,
497500
):
498501
debug = self._log.debug
499502

@@ -523,15 +526,17 @@ def _gene_cnv_frequencies_advanced(
523526
df_samples=df_samples,
524527
area_by=area_by,
525528
period_by=period_by,
529+
taxon_by=taxon_by,
526530
)
527531

528532
debug("group samples to make cohorts")
529-
group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
533+
group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])
530534

531535
debug("build cohorts dataframe")
532536
df_cohorts = build_cohorts_from_sample_grouping(
533537
group_samples_by_cohort=group_samples_by_cohort,
534538
min_cohort_size=min_cohort_size,
539+
taxon_by=taxon_by,
535540
)
536541

537542
debug("figure out expected copy number")
@@ -556,7 +561,8 @@ def _gene_cnv_frequencies_advanced(
556561
debug("build event count and nobs for each cohort")
557562
for cohort_index, cohort in enumerate(df_cohorts.itertuples()):
558563
# construct grouping key
559-
cohort_key = cohort.taxon, cohort.area, cohort.period
564+
cohort_taxon = getattr(cohort, taxon_by)
565+
cohort_key = cohort_taxon, cohort.area, cohort.period
560566

561567
# obtain sample indices for cohort
562568
sample_indices = group_samples_by_cohort.indices[cohort_key]

malariagen_data/anoph/frq_base.py

Lines changed: 55 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import numpy as np
22
import pandas as pd
3+
import re
34
import xarray as xr
45
import plotly.express as px
56
from textwrap import dedent
@@ -14,42 +15,67 @@
1415
from .base import AnophelesBase
1516

1617

17-
def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
18+
def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by, taxon_by):
1819
# Take a copy, as we will modify the dataframe.
1920
df_samples = df_samples.copy()
2021

2122
# Fix "intermediate" or "unassigned" taxon values - we only want to build
2223
# cohorts with clean taxon calls, so we set other values to None.
2324
loc_intermediate_taxon = (
24-
df_samples["taxon"].str.startswith("intermediate").fillna(False)
25+
df_samples[taxon_by].str.startswith("intermediate").fillna(False)
2526
)
26-
df_samples.loc[loc_intermediate_taxon, "taxon"] = None
27+
df_samples.loc[loc_intermediate_taxon, taxon_by] = None
2728
loc_unassigned_taxon = (
28-
df_samples["taxon"].str.startswith("unassigned").fillna(False)
29+
df_samples[taxon_by].str.startswith("unassigned").fillna(False)
2930
)
30-
df_samples.loc[loc_unassigned_taxon, "taxon"] = None
31+
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
3132

3233
# Add period column.
33-
if period_by == "year":
34-
make_period = _make_sample_period_year
35-
elif period_by == "quarter":
36-
make_period = _make_sample_period_quarter
37-
elif period_by == "month":
38-
make_period = _make_sample_period_month
39-
else: # pragma: no cover
40-
raise ValueError(
41-
f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
42-
)
43-
sample_period = df_samples.apply(make_period, axis="columns")
44-
df_samples["period"] = sample_period
4534

46-
# Add area column for consistent output.
35+
# Map supported period_by values to functions that return either the relevant pd.Period or pd.NaT per row.
36+
period_by_funcs = {
37+
"year": _make_sample_period_year,
38+
"quarter": _make_sample_period_quarter,
39+
"month": _make_sample_period_month,
40+
}
41+
42+
# Get the matching function for the specified period_by value, or None.
43+
period_by_func = period_by_funcs.get(period_by)
44+
45+
# If there were no matching functions for the specified period_by value...
46+
if period_by_func is None:
47+
# Raise a ValueError if the specified period_by value is not a column in the DataFrame.
48+
if period_by not in df_samples.columns:
49+
raise ValueError(
50+
f"Invalid value for `period_by`: {period_by!r}. Either specify the name of an existing column "
51+
"or a supported period: 'year', 'quarter', or 'month'."
52+
)
53+
54+
# Raise a TypeError if the specified period_by column does not contain instances of pd.Period.
55+
if not all(
56+
df_samples[period_by].apply(
57+
lambda value: pd.isnull(value) or isinstance(value, pd.Period)
58+
)
59+
):
60+
raise TypeError(
61+
f"Invalid values in {period_by!r} column. Must be either pandas.Period or null."
62+
)
63+
64+
# Copy the specified period_by column to a new "period" column.
65+
df_samples["period"] = df_samples[period_by]
66+
else:
67+
# Apply the matching period_by function to create a new "period" column.
68+
df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
69+
70+
# Copy the specified area_by column to a new "area" column.
4771
df_samples["area"] = df_samples[area_by]
4872

4973
return df_samples
5074

5175

52-
def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
76+
def build_cohorts_from_sample_grouping(
77+
*, group_samples_by_cohort, min_cohort_size, taxon_by
78+
):
5379
# Build cohorts dataframe.
5480
df_cohorts = group_samples_by_cohort.agg(
5581
size=("sample_id", len),
@@ -70,9 +96,16 @@ def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_si
7096
df_cohorts["period_end"] = cohort_period_end
7197
# Create a label that is similar to the cohort metadata,
7298
# although this won't be perfect.
73-
df_cohorts["label"] = df_cohorts.apply(
74-
lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns"
75-
)
99+
if taxon_by == frq_params.taxon_by_default:
100+
df_cohorts["label"] = df_cohorts.apply(
101+
lambda v: f"{v.area}_{v[taxon_by][:4]}_{v.period}", axis="columns"
102+
)
103+
else:
104+
# Replace non-alphanumeric characters in the taxon with underscores.
105+
df_cohorts["label"] = df_cohorts.apply(
106+
lambda v: f"{v.area}_{re.sub(r'[^A-Za-z0-9]+', '_', str(v[taxon_by]))}_{v.period}",
107+
axis="columns",
108+
)
76109

77110
# Apply minimum cohort size.
78111
df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)

malariagen_data/anoph/frq_params.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
]
2626

2727
period_by: TypeAlias = Annotated[
28-
Literal["year", "quarter", "month"],
29-
"Length of time to group samples temporally.",
28+
Union[str, Literal["year", "quarter", "month"]],
29+
"Either the length of time to group samples temporally or the name of the column to use.",
3030
]
3131

3232
variant_query: TypeAlias = Annotated[
@@ -80,3 +80,10 @@
8080
Optional[Union[str, List[str], Tuple[str, ...]]],
8181
"The area or areas to restrict the dataset to.",
8282
]
83+
84+
taxon_by: TypeAlias = Annotated[
85+
str,
86+
"The column to use for taxon stratification.",
87+
]
88+
89+
taxon_by_default: taxon_by = "taxon"

malariagen_data/anoph/hap_frq.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def haplotypes_frequencies_advanced(
153153
ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
154154
chunks: base_params.chunks = base_params.native_chunks,
155155
inline_array: base_params.inline_array = base_params.inline_array_default,
156+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
156157
) -> xr.Dataset:
157158
# Load sample metadata.
158159
df_samples = self.sample_metadata(
@@ -166,15 +167,17 @@ def haplotypes_frequencies_advanced(
166167
df_samples=df_samples,
167168
area_by=area_by,
168169
period_by=period_by,
170+
taxon_by=taxon_by,
169171
)
170172

171173
# Group samples to make cohorts.
172-
group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
174+
group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])
173175

174176
# Build cohorts dataframe.
175177
df_cohorts = build_cohorts_from_sample_grouping(
176178
group_samples_by_cohort=group_samples_by_cohort,
177179
min_cohort_size=min_cohort_size,
180+
taxon_by=taxon_by,
178181
)
179182

180183
# Access haplotypes.
@@ -211,8 +214,9 @@ def haplotypes_frequencies_advanced(
211214
df_cohorts.itertuples(), desc="Compute allele frequencies"
212215
)
213216
for cohort in cohorts_iterator:
214-
cohort_key = cohort.taxon, cohort.area, cohort.period
215-
cohort_key_str = cohort.taxon + "_" + cohort.area + "_" + str(cohort.period)
217+
cohort_taxon = getattr(cohort, taxon_by)
218+
cohort_key = cohort_taxon, cohort.area, cohort.period
219+
cohort_key_str = cohort_taxon + "_" + cohort.area + "_" + str(cohort.period)
216220
# We reset all frequencies, counts to 0 for each cohort, nobs is set to the number of haplotypes
217221
n_samples = cohort.size
218222
hap_freq = {k: 0 for k in f_all.keys()}

malariagen_data/anoph/snp_frq.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,7 @@ def snp_allele_frequencies_advanced(
452452
ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
453453
chunks: base_params.chunks = base_params.native_chunks,
454454
inline_array: base_params.inline_array = base_params.inline_array_default,
455+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
455456
) -> xr.Dataset:
456457
# Load sample metadata.
457458
df_samples = self.sample_metadata(
@@ -465,15 +466,17 @@ def snp_allele_frequencies_advanced(
465466
df_samples=df_samples,
466467
area_by=area_by,
467468
period_by=period_by,
469+
taxon_by=taxon_by,
468470
)
469471

470472
# Group samples to make cohorts.
471-
group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
473+
group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])
472474

473475
# Build cohorts dataframe.
474476
df_cohorts = build_cohorts_from_sample_grouping(
475477
group_samples_by_cohort=group_samples_by_cohort,
476478
min_cohort_size=min_cohort_size,
479+
taxon_by=taxon_by,
477480
)
478481

479482
# Early check for no cohorts.
@@ -529,7 +532,8 @@ def snp_allele_frequencies_advanced(
529532
desc="Compute SNP allele frequencies",
530533
)
531534
for cohort_index, cohort in cohorts_iterator:
532-
cohort_key = cohort.taxon, cohort.area, cohort.period
535+
cohort_taxon = getattr(cohort, taxon_by)
536+
cohort_key = cohort_taxon, cohort.area, cohort.period
533537
sample_indices = group_samples_by_cohort.indices[cohort_key]
534538

535539
cohort_ac, cohort_an = _cohort_alt_allele_counts_melt(
@@ -601,7 +605,11 @@ def snp_allele_frequencies_advanced(
601605

602606
# Cohort variables.
603607
for coh_col in df_cohorts.columns:
604-
ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]
608+
if coh_col == taxon_by:
609+
# Other functions expect cohort_taxon, e.g. plot_frequencies_interactive_map()
610+
ds_out["cohort_taxon"] = "cohorts", df_cohorts[coh_col]
611+
else:
612+
ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]
605613

606614
# Variant variables.
607615
for snp_col in df_variants.columns:
@@ -673,6 +681,7 @@ def aa_allele_frequencies_advanced(
673681
ci_method: Optional[frq_params.ci_method] = "wilson",
674682
chunks: base_params.chunks = base_params.native_chunks,
675683
inline_array: base_params.inline_array = base_params.inline_array_default,
684+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
676685
) -> xr.Dataset:
677686
# Begin by computing SNP allele frequencies.
678687
ds_snp_frq = self.snp_allele_frequencies_advanced(
@@ -690,6 +699,7 @@ def aa_allele_frequencies_advanced(
690699
ci_method=None, # we will recompute confidence intervals later
691700
chunks=chunks,
692701
inline_array=inline_array,
702+
taxon_by=taxon_by,
693703
)
694704

695705
# N.B., we need to worry about the possibility of the

0 commit comments

Comments
 (0)