Skip to content

Commit 3c2ce1d

Browse files
committed
Allow period_by to specify column name in prep_samples_for_cohort_grouping(). Add nb test. Remove old _make_sample_period_...() funcs from util.py.
1 parent 6d36546 commit 3c2ce1d

4 files changed

Lines changed: 187 additions & 41 deletions

File tree

malariagen_data/anoph/frq_base.py

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,43 @@ def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by, taxon_by
3030
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
3131

3232
# Add period column.
33-
if period_by == "year":
34-
make_period = _make_sample_period_year
35-
elif period_by == "quarter":
36-
make_period = _make_sample_period_quarter
37-
elif period_by == "month":
38-
make_period = _make_sample_period_month
39-
else: # pragma: no cover
40-
raise ValueError(
41-
f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
42-
)
43-
sample_period = df_samples.apply(make_period, axis="columns")
44-
df_samples["period"] = sample_period
4533

46-
# Add area column for consistent output.
34+
# Map supported period_by values to functions that return either the relevant pd.Period or pd.NaT per row.
35+
period_by_funcs = {
36+
"year": _make_sample_period_year,
37+
"quarter": _make_sample_period_quarter,
38+
"month": _make_sample_period_month,
39+
}
40+
41+
# Get the matching function for the specified period_by value, or None.
42+
period_by_func = period_by_funcs.get(period_by)
43+
44+
# If there were no matching functions for the specified period_by value...
45+
if period_by_func is None:
46+
# Raise a ValueError if the specified period_by value is not a column in the DataFrame.
47+
if period_by not in df_samples.columns:
48+
raise ValueError(
49+
f"Invalid value for `period_by`: {period_by!r}. Either specify the name of an existing column "
50+
"or a supported period: 'year', 'quarter', or 'month'."
51+
)
52+
53+
# Raise a TypeError if the specified period_by column contains values that are neither null nor instances of pd.Period.
54+
if not all(
55+
df_samples[period_by].apply(
56+
lambda value: pd.isnull(value) or isinstance(value, pd.Period)
57+
)
58+
):
59+
raise TypeError(
60+
"Invalid values in {period_by!r} column. Must be either pandas.Period or null."
61+
)
62+
63+
# Copy the specified period_by column to a new "period" column.
64+
df_samples["period"] = df_samples[period_by]
65+
else:
66+
# Apply the matching period_by function to create a new "period" column.
67+
df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
68+
69+
# Copy the specified area_by column to a new "area" column.
4770
df_samples["area"] = df_samples[area_by]
4871

4972
return df_samples

malariagen_data/anoph/frq_params.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
]
2626

2727
period_by: TypeAlias = Annotated[
28-
Literal["year", "quarter", "month"],
29-
"Length of time to group samples temporally.",
28+
Union[str, Literal["year", "quarter", "month"]],
29+
"Either the length of time to group samples temporally or the name of the column to use.",
3030
]
3131

3232
variant_query: TypeAlias = Annotated[

malariagen_data/util.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1603,29 +1603,3 @@ def add_frequency_ci(*, ds, ci_method):
16031603
)
16041604
ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low
16051605
ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp
1606-
1607-
1608-
def _make_sample_period_month(row):
1609-
year = row.year
1610-
month = row.month
1611-
if year > 0 and month > 0:
1612-
return pd.Period(freq="M", year=year, month=month)
1613-
else:
1614-
return pd.NaT
1615-
1616-
1617-
def _make_sample_period_quarter(row):
1618-
year = row.year
1619-
month = row.month
1620-
if year > 0 and month > 0:
1621-
return pd.Period(freq="Q", year=year, month=month)
1622-
else:
1623-
return pd.NaT
1624-
1625-
1626-
def _make_sample_period_year(row):
1627-
year = row.year
1628-
if year > 0:
1629-
return pd.Period(freq="Y", year=year)
1630-
else:
1631-
return pd.NaT

notebooks/plot_frequencies_space_time.ipynb

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -424,6 +424,155 @@
424424
"source": [
425425
"ag3.plot_frequencies_interactive_map(ds)"
426426
]
427+
},
428+
{
429+
"cell_type": "code",
430+
"execution_count": null,
431+
"id": "2c3227e4",
432+
"metadata": {},
433+
"outputs": [],
434+
"source": []
435+
},
436+
{
437+
"cell_type": "markdown",
438+
"id": "eb89088f",
439+
"metadata": {},
440+
"source": [
441+
"### SNP allele frequencies using a custom `period_by`"
442+
]
443+
},
444+
{
445+
"cell_type": "code",
446+
"execution_count": null,
447+
"id": "bd7856d2",
448+
"metadata": {},
449+
"outputs": [],
450+
"source": [
451+
"sample_metadata_df = ag3.sample_metadata()"
452+
]
453+
},
454+
{
455+
"cell_type": "code",
456+
"execution_count": null,
457+
"id": "8a54c978",
458+
"metadata": {},
459+
"outputs": [],
460+
"source": [
461+
"import pandas as pd\n",
462+
"import numpy as np"
463+
]
464+
},
465+
{
466+
"cell_type": "code",
467+
"execution_count": null,
468+
"id": "a36cd399",
469+
"metadata": {},
470+
"outputs": [],
471+
"source": [
472+
"random_years_as_list = np.random.choice(range(1900, 2100), len(sample_metadata_df))\n",
473+
"random_years_as_list[:3]"
474+
]
475+
},
476+
{
477+
"cell_type": "code",
478+
"execution_count": null,
479+
"id": "1087bada",
480+
"metadata": {},
481+
"outputs": [],
482+
"source": [
483+
"random_years_as_period_index = pd.PeriodIndex(random_years_as_list, freq=\"Y\")\n",
484+
"random_years_as_period_index[:3]"
485+
]
486+
},
487+
{
488+
"cell_type": "code",
489+
"execution_count": null,
490+
"id": "018e9a59",
491+
"metadata": {},
492+
"outputs": [],
493+
"source": [
494+
"extra_metadata_df = pd.DataFrame(\n",
495+
" {\n",
496+
" \"sample_id\": sample_metadata_df[\"sample_id\"],\n",
497+
" \"random_year_as_period\": random_years_as_period_index,\n",
498+
" }\n",
499+
")"
500+
]
501+
},
502+
{
503+
"cell_type": "code",
504+
"execution_count": null,
505+
"id": "454c36d9",
506+
"metadata": {},
507+
"outputs": [],
508+
"source": [
509+
"type(extra_metadata_df['random_year_as_period'][0])"
510+
]
511+
},
512+
{
513+
"cell_type": "code",
514+
"execution_count": null,
515+
"id": "6b078bd1",
516+
"metadata": {},
517+
"outputs": [],
518+
"source": [
519+
"extra_metadata_df.head()"
520+
]
521+
},
522+
{
523+
"cell_type": "code",
524+
"execution_count": null,
525+
"id": "33d392d3",
526+
"metadata": {},
527+
"outputs": [],
528+
"source": [
529+
"ag3.add_extra_metadata(extra_metadata_df)"
530+
]
531+
},
532+
{
533+
"cell_type": "code",
534+
"execution_count": null,
535+
"id": "71bf535c",
536+
"metadata": {},
537+
"outputs": [],
538+
"source": [
539+
"extra_sample_metadata_df = ag3.sample_metadata()"
540+
]
541+
},
542+
{
543+
"cell_type": "code",
544+
"execution_count": null,
545+
"id": "72feb026",
546+
"metadata": {},
547+
"outputs": [],
548+
"source": [
549+
"extra_sample_metadata_df['random_year_as_period'][:3]"
550+
]
551+
},
552+
{
553+
"cell_type": "code",
554+
"execution_count": null,
555+
"id": "31b9acee",
556+
"metadata": {},
557+
"outputs": [],
558+
"source": [
559+
"ds = ag3.snp_allele_frequencies_advanced(\n",
560+
" transcript=\"AGAP004707-RD\",\n",
561+
" area_by=\"admin1_iso\",\n",
562+
" period_by=\"random_year_as_period\",\n",
563+
")\n",
564+
"ds"
565+
]
566+
},
567+
{
568+
"cell_type": "code",
569+
"execution_count": null,
570+
"id": "16ac5eb6",
571+
"metadata": {},
572+
"outputs": [],
573+
"source": [
574+
"ag3.plot_frequencies_interactive_map(ds)"
575+
]
427576
}
428577
],
429578
"metadata": {

0 commit comments

Comments
 (0)