Skip to content

Commit 0e8f12f

Browse files
committed
Moving a lot of frequency functions to frq_funcs.py
1 parent e213d1b commit 0e8f12f

4 files changed

Lines changed: 125 additions & 5 deletions

File tree

malariagen_data/anoph/cnv_frq.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,18 @@
1010
from numpydoc_decorator import doc # type: ignore
1111

1212
from . import base_params, cnv_params, frq_params
13+
from .frq_funcs import (
14+
prep_samples_for_cohort_grouping,
15+
build_cohorts_from_sample_grouping,
16+
add_frequency_ci,
17+
)
1318
from ..util import (
1419
check_types,
1520
pandas_apply,
1621
Region,
1722
parse_multi_region,
1823
region_str,
1924
simple_xarray_concat,
20-
prep_samples_for_cohort_grouping,
21-
build_cohorts_from_sample_grouping,
22-
add_frequency_ci,
2325
)
2426
from .cnv_data import AnophelesCnvData
2527
from .sample_metadata import locate_cohorts

malariagen_data/anoph/frq_funcs.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
import numpy as np
2+
import pandas as pd
3+
4+
5+
def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
    """Prepare a sample metadata dataframe for grouping into cohorts.

    Parameters
    ----------
    df_samples : pandas.DataFrame
        Sample metadata. Must contain a "taxon" column, the column named by
        `area_by`, and the fields read by the selected period helper
        ("year", plus "month" for quarterly or monthly periods).
    area_by : str
        Name of the column whose values are copied into the "area" column.
    period_by : {"year", "quarter", "month"}
        Granularity of the "period" column.

    Returns
    -------
    pandas.DataFrame
        A modified copy of `df_samples` with cleaned "taxon" values and
        added "period" and "area" columns. The input is left untouched.

    Raises
    ------
    ValueError
        If `period_by` is not one of the supported values.

    """
    # Resolve the period factory first, so that an invalid `period_by` fails
    # fast, before any work is done on the dataframe.
    if period_by == "year":
        make_period = _make_sample_period_year
    elif period_by == "quarter":
        make_period = _make_sample_period_quarter
    elif period_by == "month":
        make_period = _make_sample_period_month
    else:  # pragma: no cover
        raise ValueError(
            f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
        )

    # Take a copy, as we will modify the dataframe.
    df_samples = df_samples.copy()

    # Fix "intermediate" or "unassigned" taxon values - we only want to build
    # cohorts with clean taxon calls, so we set other values to None.
    # Passing na=False makes missing taxon values compare as False directly,
    # which avoids a separate fillna() on an object-dtype mask.
    taxon = df_samples["taxon"]
    loc_unclean_taxon = taxon.str.startswith(
        "intermediate", na=False
    ) | taxon.str.startswith("unassigned", na=False)
    df_samples.loc[loc_unclean_taxon, "taxon"] = None

    # Add period column.
    df_samples["period"] = df_samples.apply(make_period, axis="columns")

    # Add area column for consistent output.
    df_samples["area"] = df_samples[area_by]

    return df_samples
38+
39+
40+
def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
    """Build a dataframe of cohorts from a grouping of samples.

    Parameters
    ----------
    group_samples_by_cohort : pandas.core.groupby.DataFrameGroupBy
        Samples grouped by cohort. The grouped data must provide
        "sample_id", "latitude" and "longitude" columns, and the grouping
        keys must yield "taxon", "area" and "period" columns once the index
        is reset.
    min_cohort_size : int
        Cohorts with fewer samples than this are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per retained cohort, with the grouping keys, the cohort
        size, latitude/longitude summaries, "period_start", "period_end"
        and a "label" column.

    Raises
    ------
    ValueError
        If no cohorts remain, either because the grouping is empty or
        because no cohort reaches `min_cohort_size`.

    """
    no_cohorts_message = "No cohorts available for the given sample selection parameters and minimum cohort size."

    # Build cohorts dataframe.
    df_cohorts = group_samples_by_cohort.agg(
        size=("sample_id", len),
        lat_mean=("latitude", "mean"),
        lat_max=("latitude", "max"),
        lat_min=("latitude", "min"),
        lon_mean=("longitude", "mean"),
        lon_max=("longitude", "max"),
        lon_min=("longitude", "min"),
    )
    # Reset index so that the index fields are included as columns.
    df_cohorts = df_cohorts.reset_index()

    # Early check for an empty grouping, before any per-cohort processing,
    # so that the caller sees the intended error message.
    if len(df_cohorts) == 0:
        raise ValueError(no_cohorts_message)

    # Apply minimum cohort size. N.B., the column must be accessed with
    # brackets, because `df_cohorts.size` is the DataFrame element count.
    loc_big_enough = df_cohorts["size"] >= min_cohort_size
    df_cohorts = df_cohorts.loc[loc_big_enough].reset_index(drop=True)

    # Check for no cohorts remaining after the size filter.
    if len(df_cohorts) == 0:
        raise ValueError(no_cohorts_message)

    # Add cohort helper variables.
    df_cohorts["period_start"] = df_cohorts["period"].apply(lambda v: v.start_time)
    df_cohorts["period_end"] = df_cohorts["period"].apply(lambda v: v.end_time)

    # Create a label that is similar to the cohort metadata,
    # although this won't be perfect.
    df_cohorts["label"] = [
        f"{area}_{taxon[:4]}_{period}"
        for area, taxon, period in zip(
            df_cohorts["area"], df_cohorts["taxon"], df_cohorts["period"]
        )
    ]

    return df_cohorts
75+
76+
77+
def add_frequency_ci(*, ds, ci_method):
    """Add frequency confidence interval variables to a dataset, in place.

    Parameters
    ----------
    ds : xarray.Dataset or similar mapping
        Must provide "event_count" and "event_nobs" entries exposing a
        `.values` attribute. When `ci_method` is given, the entries
        "event_frequency_ci_low" and "event_frequency_ci_upp" are assigned
        with dimensions ("variants", "cohorts").
    ci_method : str or None
        Method name passed through to statsmodels' `proportion_confint`.
        If None, nothing is computed and `ds` is left unchanged.

    Returns
    -------
    None
        The dataset is modified in place.

    """
    # Nothing to do if confidence intervals were not requested.
    if ci_method is None:
        return

    # Import lazily, and only once we know it is needed, so that the no-op
    # path above does not depend on statsmodels being importable.
    from statsmodels.stats.proportion import proportion_confint  # type: ignore

    count = ds["event_count"].values
    nobs = ds["event_nobs"].values
    # Suppress numpy warnings for division by zero or invalid values
    # raised while computing the intervals.
    with np.errstate(divide="ignore", invalid="ignore"):
        frq_ci_low, frq_ci_upp = proportion_confint(
            count=count, nobs=nobs, method=ci_method
        )
    ds["event_frequency_ci_low"] = ("variants", "cohorts"), frq_ci_low
    ds["event_frequency_ci_upp"] = ("variants", "cohorts"), frq_ci_upp
89+
90+
91+
def _make_sample_period_month(row):
92+
year = row.year
93+
month = row.month
94+
if year > 0 and month > 0:
95+
return pd.Period(freq="M", year=year, month=month)
96+
else:
97+
return pd.NaT
98+
99+
100+
def _make_sample_period_quarter(row):
101+
year = row.year
102+
month = row.month
103+
if year > 0 and month > 0:
104+
return pd.Period(freq="Q", year=year, month=month)
105+
else:
106+
return pd.NaT
107+
108+
109+
def _make_sample_period_year(row):
110+
year = row.year
111+
if year > 0:
112+
return pd.Period(freq="Y", year=year)
113+
else:
114+
return pd.NaT

malariagen_data/anoph/hap_frq.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@
99
from ..util import (
1010
check_types,
1111
haplotype_frequencies,
12+
)
13+
from .hap_data import AnophelesHapData
14+
from .frq_funcs import (
1215
prep_samples_for_cohort_grouping,
1316
build_cohorts_from_sample_grouping,
1417
add_frequency_ci,
1518
)
16-
from .hap_data import AnophelesHapData
1719
from .sample_metadata import locate_cohorts
1820
from . import base_params, frq_params
1921

malariagen_data/anoph/snp_frq.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@
1414
from ..util import (
1515
check_types,
1616
pandas_apply,
17+
)
18+
from .snp_data import AnophelesSnpData
19+
from .frq_funcs import (
1720
prep_samples_for_cohort_grouping,
1821
build_cohorts_from_sample_grouping,
1922
add_frequency_ci,
2023
)
21-
from .snp_data import AnophelesSnpData
2224
from .sample_metadata import locate_cohorts
2325
from . import base_params, frq_params, map_params, plotly_params
2426

0 commit comments

Comments
 (0)