Skip to content

Commit 2a7577e

Browse files
committed
Add taxon_by param to snp_allele_frequencies_advanced(). Add example usage to notebook.
1 parent 70ad53d commit 2a7577e

4 files changed

Lines changed: 71 additions & 10 deletions

File tree

malariagen_data/anoph/frq_params.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,3 +80,10 @@
8080
Optional[Union[str, List[str], Tuple[str, ...]]],
8181
"The area or areas to restrict the dataset to.",
8282
]
83+
84+
taxon_by: TypeAlias = Annotated[
85+
str,
86+
"The column to use for taxon stratification.",
87+
]
88+
89+
taxon_by_default: taxon_by = "taxon"

malariagen_data/anoph/snp_frq.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,7 @@ def snp_allele_frequencies_advanced(
452452
ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
453453
chunks: base_params.chunks = base_params.native_chunks,
454454
inline_array: base_params.inline_array = base_params.inline_array_default,
455+
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
455456
) -> xr.Dataset:
456457
# Load sample metadata.
457458
df_samples = self.sample_metadata(
@@ -465,15 +466,17 @@ def snp_allele_frequencies_advanced(
465466
df_samples=df_samples,
466467
area_by=area_by,
467468
period_by=period_by,
469+
taxon_by=taxon_by,
468470
)
469471

470472
# Group samples to make cohorts.
471-
group_samples_by_cohort = df_samples.groupby(["taxon", "area", "period"])
473+
group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])
472474

473475
# Build cohorts dataframe.
474476
df_cohorts = build_cohorts_from_sample_grouping(
475477
group_samples_by_cohort=group_samples_by_cohort,
476478
min_cohort_size=min_cohort_size,
479+
taxon_by=taxon_by,
477480
)
478481

479482
# Early check for no cohorts.
@@ -529,7 +532,8 @@ def snp_allele_frequencies_advanced(
529532
desc="Compute SNP allele frequencies",
530533
)
531534
for cohort_index, cohort in cohorts_iterator:
532-
cohort_key = cohort.taxon, cohort.area, cohort.period
535+
cohort_taxon = getattr(cohort, taxon_by)
536+
cohort_key = cohort_taxon, cohort.area, cohort.period
533537
sample_indices = group_samples_by_cohort.indices[cohort_key]
534538

535539
cohort_ac, cohort_an = _cohort_alt_allele_counts_melt(
@@ -601,7 +605,11 @@ def snp_allele_frequencies_advanced(
601605

602606
# Cohort variables.
603607
for coh_col in df_cohorts.columns:
604-
ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]
608+
if coh_col == taxon_by:
609+
# Other functions expect cohort_taxon, e.g. plot_frequencies_interactive_map()
610+
ds_out["cohort_taxon"] = "cohorts", df_cohorts[coh_col]
611+
else:
612+
ds_out[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]
605613

606614
# Variant variables.
607615
for snp_col in df_variants.columns:

malariagen_data/util.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1612,20 +1612,22 @@ def _karyotype_tags_n_alt(gt, alts, inversion_alts):
16121612
return inv_n_alt
16131613

16141614

1615-
def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
1615+
def prep_samples_for_cohort_grouping(
1616+
*, df_samples, area_by, period_by, taxon_by="taxon"
1617+
):
16161618
# Take a copy, as we will modify the dataframe.
16171619
df_samples = df_samples.copy()
16181620

16191621
# Fix "intermediate" or "unassigned" taxon values - we only want to build
16201622
# cohorts with clean taxon calls, so we set other values to None.
16211623
loc_intermediate_taxon = (
1622-
df_samples["taxon"].str.startswith("intermediate").fillna(False)
1624+
df_samples[taxon_by].str.startswith("intermediate").fillna(False)
16231625
)
1624-
df_samples.loc[loc_intermediate_taxon, "taxon"] = None
1626+
df_samples.loc[loc_intermediate_taxon, taxon_by] = None
16251627
loc_unassigned_taxon = (
1626-
df_samples["taxon"].str.startswith("unassigned").fillna(False)
1628+
df_samples[taxon_by].str.startswith("unassigned").fillna(False)
16271629
)
1628-
df_samples.loc[loc_unassigned_taxon, "taxon"] = None
1630+
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
16291631

16301632
# Add period column.
16311633
if period_by == "year":
@@ -1647,7 +1649,9 @@ def prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by):
16471649
return df_samples
16481650

16491651

1650-
def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_size):
1652+
def build_cohorts_from_sample_grouping(
1653+
*, group_samples_by_cohort, min_cohort_size, taxon_by="taxon"
1654+
):
16511655
# Build cohorts dataframe.
16521656
df_cohorts = group_samples_by_cohort.agg(
16531657
size=("sample_id", len),
@@ -1669,7 +1673,7 @@ def build_cohorts_from_sample_grouping(*, group_samples_by_cohort, min_cohort_si
16691673
# Create a label that is similar to the cohort metadata,
16701674
# although this won't be perfect.
16711675
df_cohorts["label"] = df_cohorts.apply(
1672-
lambda v: f"{v.area}_{v.taxon[:4]}_{v.period}", axis="columns"
1676+
lambda v: f"{v.area}_{v[taxon_by][:4]}_{v.period}", axis="columns"
16731677
)
16741678

16751679
# Apply minimum cohort size.

notebooks/plot_frequencies_space_time.ipynb

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,48 @@
382382
"source": [
383383
"af1.plot_frequencies_interactive_map(ds)"
384384
]
385+
},
386+
{
387+
"cell_type": "markdown",
388+
"id": "a94d6521",
389+
"metadata": {},
390+
"source": [
391+
"### SNP allele frequencies using `taxon_by`"
392+
]
393+
},
394+
{
395+
"cell_type": "code",
396+
"execution_count": null,
397+
"id": "0cf222ee",
398+
"metadata": {},
399+
"outputs": [],
400+
"source": [
401+
"ds = ag3.snp_allele_frequencies_advanced(\n",
402+
" transcript=\"AGAP004707-RD\",\n",
403+
" area_by=\"admin1_iso\",\n",
404+
" period_by=\"year\",\n",
405+
" sample_sets=[\"AG1000G-BF-A\", \"AG1000G-BF-B\", \"AG1000G-UG\", \"AG1000G-TZ\"],\n",
406+
" sample_query=\"aim_species in ['gambiae', 'coluzzii']\",\n",
407+
" min_cohort_size=10,\n",
408+
" drop_invariant=True,\n",
409+
" variant_query=\"max_af > 0.05 and effect == 'NON_SYNONYMOUS_CODING'\",\n",
410+
" site_mask=None,\n",
411+
" nobs_mode=\"called\",\n",
412+
" ci_method=\"wilson\",\n",
413+
" taxon_by=\"aim_species\",\n",
414+
")\n",
415+
"ds"
416+
]
417+
},
418+
{
419+
"cell_type": "code",
420+
"execution_count": null,
421+
"id": "2b7e9acc",
422+
"metadata": {},
423+
"outputs": [],
424+
"source": [
425+
"ag3.plot_frequencies_interactive_map(ds)"
426+
]
385427
}
386428
],
387429
"metadata": {

0 commit comments

Comments
 (0)