Allow period_by to specify column name in prep_samples_for_cohort_grouping(). Add nb test. Remove old _make_sample_period_...() funcs from util.py.

leehart · leehart · commit 3c2ce1d49580 · 2025-02-21T17:15:46.000Z
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -30,20 +30,43 @@ def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by, taxon_by
     df_samples.loc[loc_unassigned_taxon, taxon_by] = None
 
     # Add period column.
-    if period_by == "year":
-        make_period = _make_sample_period_year
-    elif period_by == "quarter":
-        make_period = _make_sample_period_quarter
-    elif period_by == "month":
-        make_period = _make_sample_period_month
-    else:  # pragma: no cover
-        raise ValueError(
-            f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
-        )
-    sample_period = df_samples.apply(make_period, axis="columns")
-    df_samples["period"] = sample_period
 
-    # Add area column for consistent output.
+    # Map supported period_by values to functions that return either the relevant pd.Period or pd.NaT per row.
+    period_by_funcs = {
+        "year": _make_sample_period_year,
+        "quarter": _make_sample_period_quarter,
+        "month": _make_sample_period_month,
+    }
+
+    # Get the matching function for the specified period_by value, or None.
+    period_by_func = period_by_funcs.get(period_by)
+
+    # If there were no matching functions for the specified period_by value...
+    if period_by_func is None:
+        # Raise a ValueError if the specified period_by value is not a column in the DataFrame.
+        if period_by not in df_samples.columns:
+            raise ValueError(
+                f"Invalid value for `period_by`: {period_by!r}. Either specify the name of an existing column "
+                "or a supported period: 'year', 'quarter', or 'month'."
+            )
+
+        # Raise a TypeError if the specified period_by column contains values that are neither null nor instances of pd.Period.
+        if not all(
+            df_samples[period_by].apply(
+                lambda value: pd.isnull(value) or isinstance(value, pd.Period)
+            )
+        ):
+            raise TypeError(
+                "Invalid values in {period_by!r} column. Must be either pandas.Period or null."
+            )
+
+        # Copy the specified period_by column to a new "period" column.
+        df_samples["period"] = df_samples[period_by]
+    else:
+        # Apply the matching period_by function to create a new "period" column.
+        df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
+
+    # Copy the specified area_by column to a new "area" column.
     df_samples["area"] = df_samples[area_by]
 
     return df_samples
diff --git a/malariagen_data/anoph/frq_params.py b/malariagen_data/anoph/frq_params.py
@@ -25,8 +25,8 @@
 ]
 
 period_by: TypeAlias = Annotated[
-    Literal["year", "quarter", "month"],
-    "Length of time to group samples temporally.",
+    Union[str, Literal["year", "quarter", "month"]],
+    "Either the length of time to group samples temporally or the name of the column to use.",
 ]
 
 variant_query: TypeAlias = Annotated[
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
@@ -1603,29 +1603,3 @@ def add_frequency_ci(*, ds, ci_method):
             )
         ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low
         ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp
-
-
-def _make_sample_period_month(row):
-    year = row.year
-    month = row.month
-    if year > 0 and month > 0:
-        return pd.Period(freq="M", year=year, month=month)
-    else:
-        return pd.NaT
-
-
-def _make_sample_period_quarter(row):
-    year = row.year
-    month = row.month
-    if year > 0 and month > 0:
-        return pd.Period(freq="Q", year=year, month=month)
-    else:
-        return pd.NaT
-
-
-def _make_sample_period_year(row):
-    year = row.year
-    if year > 0:
-        return pd.Period(freq="Y", year=year)
-    else:
-        return pd.NaT
diff --git a/notebooks/plot_frequencies_space_time.ipynb b/notebooks/plot_frequencies_space_time.ipynb
@@ -424,6 +424,155 @@
    "source": [
     "ag3.plot_frequencies_interactive_map(ds)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c3227e4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb89088f",
+   "metadata": {},
+   "source": [
+    "### SNP allele frequencies using a custom `period_by`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd7856d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sample_metadata_df = ag3.sample_metadata()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a54c978",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a36cd399",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "random_years_as_list = np.random.choice(range(1900, 2100), len(sample_metadata_df))\n",
+    "random_years_as_list[:3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1087bada",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "random_years_as_period_index = pd.PeriodIndex(random_years_as_list, freq=\"Y\")\n",
+    "random_years_as_period_index[:3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "018e9a59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extra_metadata_df = pd.DataFrame(\n",
+    "    {\n",
+    "        \"sample_id\": sample_metadata_df[\"sample_id\"],\n",
+    "        \"random_year_as_period\": random_years_as_period_index,\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "454c36d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(extra_metadata_df['random_year_as_period'][0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b078bd1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extra_metadata_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "33d392d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.add_extra_metadata(extra_metadata_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "71bf535c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extra_sample_metadata_df = ag3.sample_metadata()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72feb026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "extra_sample_metadata_df['random_year_as_period'][:3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31b9acee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = ag3.snp_allele_frequencies_advanced(\n",
+    "    transcript=\"AGAP004707-RD\",\n",
+    "    area_by=\"admin1_iso\",\n",
+    "    period_by=\"random_year_as_period\",\n",
+    ")\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16ac5eb6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ag3.plot_frequencies_interactive_map(ds)"
+   ]
   }
  ],
  "metadata": {

Original file line number	Diff line number	Diff line change
`@@ -25,8 +25,8 @@`
`25`	`25`	`]`
`26`	`26`
`27`	`27`	`period_by: TypeAlias = Annotated[`
`28`		`- Literal["year", "quarter", "month"],`
`29`		`- "Length of time to group samples temporally.",`
	`28`	`+ Union[str, Literal["year", "quarter", "month"]],`
	`29`	`+ "Either the length of time to group samples temporally or the name of the column to use.",`
`30`	`30`	`]`
`31`	`31`
`32`	`32`	`variant_query: TypeAlias = Annotated[`