Skip to content

Commit f347969

Browse files
committed
Add taxon_by parameter to gene_cnv_frequencies_advanced() and haplotypes_frequencies_advanced(). Merge util.py's prep_samples_for_cohort_grouping() into frq_base.py.
1 parent 63a1cdf commit f347969

4 files changed

Lines changed: 21 additions & 46 deletions

File tree

malariagen_data/anoph/cnv_frq.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,7 @@ def gene_cnv_frequencies_advanced(
445445
ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
446446
chunks: base_params.chunks = base_params.native_chunks,
447447
inline_array: base_params.inline_array = base_params.inline_array_default,
448+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
448449
) -> xr.Dataset:
449450
regions: List[Region] = parse_multi_region(self, region)
450451
del region
@@ -466,6 +467,7 @@ def gene_cnv_frequencies_advanced(
466467
ci_method=ci_method,
467468
chunks=chunks,
468469
inline_array=inline_array,
470+
taxon_by=taxon_by,
469471
)
470472
for r in regions
471473
],
@@ -494,6 +496,7 @@ def _gene_cnv_frequencies_advanced(
494496
ci_method,
495497
chunks,
496498
inline_array,
499+
taxon_by,
497500
):
498501
debug = self._log.debug
499502

@@ -523,6 +526,7 @@ def _gene_cnv_frequencies_advanced(
523526
df_samples=df_samples,
524527
area_by=area_by,
525528
period_by=period_by,
529+
taxon_by=taxon_by,
526530
)
527531

528532
debug("group samples to make cohorts")
@@ -532,6 +536,7 @@ def _gene_cnv_frequencies_advanced(
532536
df_cohorts = build_cohorts_from_sample_grouping(
533537
group_samples_by_cohort=group_samples_by_cohort,
534538
min_cohort_size=min_cohort_size,
539+
taxon_by=taxon_by,
535540
)
536541

537542
debug("figure out expected copy number")
@@ -556,7 +561,8 @@ def _gene_cnv_frequencies_advanced(
556561
debug("build event count and nobs for each cohort")
557562
for cohort_index, cohort in enumerate(df_cohorts.itertuples()):
558563
# construct grouping key
559-
cohort_key = cohort.taxon, cohort.area, cohort.period
564+
cohort_taxon = getattr(cohort, taxon_by)
565+
cohort_key = cohort_taxon, cohort.area, cohort.period
560566

561567
# obtain sample indices for cohort
562568
sample_indices = group_samples_by_cohort.indices[cohort_key]

malariagen_data/anoph/frq_base.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,22 @@
1414
from .base import AnophelesBase
1515

1616

17-
def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
17+
def prep_samples_for_cohort_grouping(
18+
*, df_samples, area_by, period_by, taxon_by="taxon"
19+
):
1820
# Take a copy, as we will modify the dataframe.
1921
df_samples = df_samples.copy()
2022

2123
# Fix "intermediate" or "unassigned" taxon values - we only want to build
2224
# cohorts with clean taxon calls, so we set other values to None.
2325
loc_intermediate_taxon = (
24-
df_samples["taxon"].str.startswith("intermediate").fillna(False)
26+
df_samples[taxon_by].str.startswith("intermediate").fillna(False)
2527
)
26-
df_samples.loc[loc_intermediate_taxon, "taxon"] = None
28+
df_samples.loc[loc_intermediate_taxon, taxon_by] = None
2729
loc_unassigned_taxon = (
28-
df_samples["taxon"].str.startswith("unassigned").fillna(False)
30+
df_samples[taxon_by].str.startswith("unassigned").fillna(False)
2931
)
30-
df_samples.loc[loc_unassigned_taxon, "taxon"] = None
32+
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
3133

3234
# Add period column.
3335
if period_by == "year":

malariagen_data/anoph/hap_frq.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ def haplotypes_frequencies_advanced(
153153
ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
154154
chunks: base_params.chunks = base_params.native_chunks,
155155
inline_array: base_params.inline_array = base_params.inline_array_default,
156+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
156157
) -> xr.Dataset:
157158
# Load sample metadata.
158159
df_samples = self.sample_metadata(
@@ -166,15 +167,17 @@ def haplotypes_frequencies_advanced(
166167
df_samples=df_samples,
167168
area_by=area_by,
168169
period_by=period_by,
170+
taxon_by=taxon_by,
169171
)
170172

171173
# Group samples to make cohorts.
172-
group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
174+
group_samples_by_cohort = df_samples.groupby(["taxon_by", "area", "period"])
173175

174176
# Build cohorts dataframe.
175177
df_cohorts = build_cohorts_from_sample_grouping(
176178
group_samples_by_cohort=group_samples_by_cohort,
177179
min_cohort_size=min_cohort_size,
180+
taxon_by=taxon_by,
178181
)
179182

180183
# Access haplotypes.
@@ -211,8 +214,9 @@ def haplotypes_frequencies_advanced(
211214
df_cohorts.itertuples(), desc="Compute allele frequencies"
212215
)
213216
for cohort in cohorts_iterator:
214-
cohort_key = cohort.taxon, cohort.area, cohort.period
215-
cohort_key_str = cohort.taxon + "_" + cohort.area + "_" + str(cohort.period)
217+
cohort_taxon = getattr(cohort, taxon_by)
218+
cohort_key = cohort_taxon, cohort.area, cohort.period
219+
cohort_key_str = cohort_taxon + "_" + cohort.area + "_" + str(cohort.period)
216220
# We reset all frequencies, counts to 0 for each cohort, nobs is set to the number of haplotypes
217221
n_samples = cohort.size
218222
hap_freq = {k: 0 for k in f_all.keys()}

malariagen_data/util.py

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1591,43 +1591,6 @@ def distributed_client():
15911591
return client
15921592

15931593

1594-
def prep_samples_for_cohort_grouping(
1595-
*, df_samples, area_by, period_by, taxon_by="taxon"
1596-
):
1597-
# Take a copy, as we will modify the dataframe.
1598-
df_samples = df_samples.copy()
1599-
1600-
# Fix "intermediate" or "unassigned" taxon values - we only want to build
1601-
# cohorts with clean taxon calls, so we set other values to None.
1602-
loc_intermediate_taxon = (
1603-
df_samples[taxon_by].str.startswith("intermediate").fillna(False)
1604-
)
1605-
df_samples.loc[loc_intermediate_taxon, taxon_by] = None
1606-
loc_unassigned_taxon = (
1607-
df_samples[taxon_by].str.startswith("unassigned").fillna(False)
1608-
)
1609-
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
1610-
1611-
# Add period column.
1612-
if period_by == "year":
1613-
make_period = _make_sample_period_year
1614-
elif period_by == "quarter":
1615-
make_period = _make_sample_period_quarter
1616-
elif period_by == "month":
1617-
make_period = _make_sample_period_month
1618-
else: # pragma: no cover
1619-
raise ValueError(
1620-
f"Value for period_by parameter must be one of 'year', 'quarter', 'month'; found {period_by!r}."
1621-
)
1622-
sample_period = df_samples.apply(make_period, axis="columns")
1623-
df_samples["period"] = sample_period
1624-
1625-
# Add area column for consistent output.
1626-
df_samples["area"] = df_samples[area_by]
1627-
1628-
return df_samples
1629-
1630-
16311594
def add_frequency_ci(*, ds, ci_method):
16321595
from statsmodels.stats.proportion import proportion_confint # type: ignore
16331596

0 commit comments

Comments
 (0)