Skip to content

Commit 63a1cdf

Browse files
committed
Merge changes from util.py build_cohorts_from_sample_grouping() into frq_base.py build_cohorts_from_sample_grouping()
1 parent 2cc9087 commit 63a1cdf

2 files changed

Lines changed: 4 additions & 41 deletions

File tree

malariagen_data/anoph/frq_base.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,9 @@ def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
4949
return df_samples
5050

5151

52-
def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
52+
def build_cohorts_from_sample_grouping(
53+
*, group_samples_by_cohort, min_cohort_size, taxon_by
54+
):
5355
# Build cohorts dataframe.
5456
df_cohorts = group_samples_by_cohort.agg(
5557
size=("sample_id", len),
@@ -71,7 +73,7 @@ def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_si
7173
# Create a label that is similar to the cohort metadata,
7274
# although this won't be perfect.
7375
df_cohorts["label"] = df_cohorts.apply(
74-
lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns"
76+
lambda v: f"{v.area}_{v[taxon_by][:4]}_{v.period}", axis="columns"
7577
)
7678

7779
# Apply minimum cohort size.

malariagen_data/util.py

Lines changed: 0 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1628,45 +1628,6 @@ def prep_samples_for_cohort_grouping(
16281628
return df_samples
16291629

16301630

1631-
def build_cohorts_from_sample_grouping(
1632-
*, group_samples_by_cohort, min_cohort_size, taxon_by="taxon"
1633-
):
1634-
# Build cohorts dataframe.
1635-
df_cohorts = group_samples_by_cohort.agg(
1636-
size=("sample_id", len),
1637-
lat_mean=("latitude", "mean"),
1638-
lat_max=("latitude", "max"),
1639-
lat_min=("latitude", "min"),
1640-
lon_mean=("longitude", "mean"),
1641-
lon_max=("longitude", "max"),
1642-
lon_min=("longitude", "min"),
1643-
)
1644-
# Reset index so that the index fields are included as columns.
1645-
df_cohorts = df_cohorts.reset_index()
1646-
1647-
# Add cohort helper variables.
1648-
cohort_period_start = df_cohorts["period"].apply(lambda v: v.start_time)
1649-
cohort_period_end = df_cohorts["period"].apply(lambda v: v.end_time)
1650-
df_cohorts["period_start"] = cohort_period_start
1651-
df_cohorts["period_end"] = cohort_period_end
1652-
# Create a label that is similar to the cohort metadata,
1653-
# although this won't be perfect.
1654-
df_cohorts["label"] = df_cohorts.apply(
1655-
lambda v: f"{v.area}_{v[taxon_by][:4]}_{v.period}", axis="columns"
1656-
)
1657-
1658-
# Apply minimum cohort size.
1659-
df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)
1660-
1661-
# Early check for no cohorts.
1662-
if len(df_cohorts) == 0:
1663-
raise ValueError(
1664-
"No cohorts available for the given sample selection parameters and minimum cohort size."
1665-
)
1666-
1667-
return df_cohorts
1668-
1669-
16701631
def add_frequency_ci(*, ds, ci_method):
16711632
from statsmodels.stats.proportion import proportion_confint # type: ignore
16721633

0 commit comments

Comments
 (0)