Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
39a5410
It seems to work but I have not created any test yet
jonbrenas Mar 13, 2025
29c5bb4
It seems to work but I have not created any test yet
jonbrenas Mar 13, 2025
1f15221
Making some progress with the tests
jonbrenas Mar 21, 2025
040545d
Merge branch 'master' of github.com:malariagen/malariagen-data-python…
jonbrenas Mar 21, 2025
6a6ad11
More tests
jonbrenas Mar 26, 2025
fca19e0
Added tests
jonbrenas Mar 31, 2025
1f09857
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas Apr 13, 2025
6f9b84b
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas May 20, 2025
5aef685
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas Feb 21, 2026
68c8169
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas Feb 23, 2026
aaa0046
Updating the code
Feb 23, 2026
859f6a7
Removing duplicate xr
Feb 23, 2026
a408126
Adding taxon_by
Feb 23, 2026
72307bb
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas Feb 23, 2026
cf817f0
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas Feb 23, 2026
9a6a2bd
remove duplicate code and add unit tests
31puneet Apr 7, 2026
4c1fceb
fix lint errors
31puneet Apr 10, 2026
97ebd79
Merge branch 'master' into 652-karyotype-frequencies
jonbrenas Apr 11, 2026
72048ff
Merge branch 'master' into 652-karyotype-frequencies
31puneet Apr 13, 2026
7273918
removing tests
31puneet Apr 14, 2026
303f081
Merge branch 'master' into 652-karyotype-frequencies
31puneet Apr 14, 2026
7b2adf5
resolving conflicts
31puneet Apr 22, 2026
def391d
Merge remote-tracking branch 'origin/652-karyotype-frequencies' into …
31puneet Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 283 additions & 0 deletions malariagen_data/anoph/inversion_frq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
from typing import Optional

import numpy as np
import pandas as pd
from numpydoc_decorator import doc # type: ignore
import xarray as xr

from ..util import _check_types
from .karyotype import AnophelesKaryotypeAnalysis
from .frq_base import (
AnophelesFrequencyAnalysis,
_prep_samples_for_cohort_grouping,
_build_cohorts_from_sample_grouping,
_add_frequency_ci,
)
from .sample_metadata import _locate_cohorts
from .karyotype_params import inversions_param
from . import base_params, frq_params


class AnophelesInversionFrequencyAnalysis(
    AnophelesKaryotypeAnalysis, AnophelesFrequencyAnalysis
):
    """Karyotype frequency analyses for chromosomal inversions.

    For each requested inversion, the karyotype call of every selected
    sample is obtained via ``self.karyotype()`` and the number of samples
    called 0 ("hom. ref."), 1 ("het.") or 2 ("hom. alt.") is tallied per
    cohort. Frequencies are always expressed relative to the number of
    samples in the cohort.
    """

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @staticmethod
    def _count_inversion_karyotypes(calls) -> list:
        # Return [n_hom_ref, n_het, n_hom_alt] for a column of karyotype
        # calls. Calls other than 0, 1 or 2 are not counted in any class,
        # although the callers still include those samples in the
        # denominator (cohort size).
        return [int((calls == karyotype).sum()) for karyotype in range(3)]

    @_check_types
    @doc(
        summary="""
            Compute inversion frequencies for a sequence of inversions.
        """,
        returns="""
            A dataframe of inversion frequencies, with three rows per
            inversion (one row per karyotype) and one "frq_" column per
            cohort. If `include_counts` is True, "count_" and "nobs_" columns
            are also included, giving the number of samples with each
            karyotype and the number of samples in the cohort.
        """,
        notes="""
            Cohorts with fewer samples than `min_cohort_size` will be excluded from
            output data frame.
        """,
    )
    def inversion_frequencies(
        self,
        inversions: inversions_param,
        cohorts: base_params.cohorts,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        min_cohort_size: base_params.min_cohort_size = 10,
        sample_sets: Optional[base_params.sample_sets] = None,
        include_counts: frq_params.include_counts = False,
    ) -> pd.DataFrame:
        if not inversions:
            raise ValueError("At least one inversion needs to be provided.")
        if isinstance(inversions, str):
            # Allow a single inversion name as a convenience.
            inversions = [inversions]

        # Sample metadata and cohort membership do not depend on the
        # inversion, so resolve them once rather than once per inversion.
        df_samples = self.sample_metadata(
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        # Build cohort dictionary, maps cohort labels to boolean indexers.
        coh_dict = _locate_cohorts(
            cohorts=cohorts, data=df_samples, min_cohort_size=min_cohort_size
        )

        df_kar_frqs_list = []
        for inversion in inversions:
            # Access karyotypes for the same sample selection.
            kar_df = self.karyotype(
                inversion=inversion,
                sample_sets=sample_sets,
                sample_query=sample_query,
                sample_query_options=sample_query_options,
            )
            kar_col = f"karyotype_{inversion}"

            # Identifying columns; row order matches karyotype calls 0, 1, 2.
            base_df = pd.DataFrame(
                {
                    "inversion": [inversion] * 3,
                    "allele": ["hom. ref.", "het.", "hom. alt."],
                    "label": [
                        f"{inversion} hom. ref.",
                        f"{inversion} het.",
                        f"{inversion} hom. alt.",
                    ],
                }
            )

            # Count karyotypes within each cohort.
            count_cols = dict()
            nobs_cols = dict()
            freq_cols = dict()
            cohorts_iterator = self._progress(
                coh_dict.items(), desc="Compute karyotype frequencies"
            )
            for coh, loc_coh in cohorts_iterator:
                n_samples = np.count_nonzero(loc_coh)
                assert n_samples >= min_cohort_size
                kar_loc = kar_df.loc[loc_coh]

                counts = self._count_inversion_karyotypes(kar_loc[kar_col])
                count_cols[f"count_{coh}"] = counts
                freq_cols[f"frq_{coh}"] = [c / n_samples for c in counts]
                # Each sample contributes exactly one karyotype call, so the
                # number of observations is the number of samples. This
                # keeps count / nobs equal to the reported frequency.
                nobs_cols[f"nobs_{coh}"] = [n_samples] * 3

            # Build a dataframe with the frequency columns.
            df_freqs = pd.DataFrame(freq_cols)
            df_counts = pd.DataFrame(count_cols)
            df_nobs = pd.DataFrame(nobs_cols)

            # Build the per-inversion dataframe.
            if include_counts:
                df_kar_frqs_inv = pd.concat(
                    [base_df, df_freqs, df_counts, df_nobs], axis=1
                )
            else:
                df_kar_frqs_inv = pd.concat([base_df, df_freqs], axis=1)

            df_kar_frqs_list.append(df_kar_frqs_inv)

        # Stack the per-inversion frames; renumber the rows so the index is
        # unique instead of repeating 0, 1, 2 for every inversion.
        df_kar_frqs = pd.concat(df_kar_frqs_list, axis=0, ignore_index=True)

        return df_kar_frqs

    @_check_types
    @doc(
        summary="""
            Group samples by taxon, area (space) and period (time), then compute
            inversion frequencies.
        """,
        returns="""
            The resulting dataset has dimensions "cohorts" and "variants".
            Variables prefixed with "cohort" are 1-dimensional arrays with
            data about the cohorts, such as the area, period and cohort size.
            The variable "variant_label" is a 1-dimensional array naming each
            karyotype of each inversion, three per inversion. Variables
            prefixed with "event" are 2-dimensional arrays with the karyotype
            counts, number of samples observed and frequency calculations.
        """,
    )
    def inversion_frequencies_advanced(
        self,
        inversions: inversions_param,
        area_by: frq_params.area_by,
        period_by: frq_params.period_by,
        taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        min_cohort_size: base_params.min_cohort_size = 10,
        ci_method: Optional[frq_params.ci_method] = frq_params.ci_method_default,
    ) -> xr.Dataset:
        if not inversions:
            raise ValueError("At least one inversion needs to be provided.")
        if isinstance(inversions, str):
            # Allow a single inversion name as a convenience.
            inversions = [inversions]

        # Load sample metadata.
        df_samples = self.sample_metadata(
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
        )

        # Prepare sample metadata for cohort grouping.
        df_samples = _prep_samples_for_cohort_grouping(
            df_samples=df_samples,
            area_by=area_by,
            period_by=period_by,
            taxon_by=taxon_by,
        )

        # Group samples to make cohorts.
        group_samples_by_cohort = df_samples.groupby([taxon_by, "area", "period"])

        # Build cohorts dataframe.
        df_cohorts = _build_cohorts_from_sample_grouping(
            group_samples_by_cohort=group_samples_by_cohort,
            min_cohort_size=min_cohort_size,
            taxon_by=taxon_by,
        )

        # Early check for no cohorts.
        n_cohorts = len(df_cohorts)
        if n_cohorts == 0:
            raise ValueError(
                "No cohorts available for the given sample selection parameters and minimum cohort size."
            )

        ds_list = []
        for inversion in inversions:
            # Access karyotypes.
            kar_df = self.karyotype(
                inversion=inversion,
                sample_sets=sample_sets,
                sample_query=sample_query,
                sample_query_options=sample_query_options,
            )
            kar_col = f"karyotype_{inversion}"

            # Fill (karyotype, cohort) arrays positionally, in df_cohorts row
            # order. This avoids keying columns on a concatenated string,
            # which could collide for distinct (taxon, area, period) cohorts.
            counts = np.zeros((3, n_cohorts), dtype=np.int64)
            nobs = np.zeros((3, n_cohorts), dtype=np.int64)
            cohorts_iterator = self._progress(
                df_cohorts.itertuples(), desc="Compute karyotype frequencies"
            )
            for cohort_index, cohort in enumerate(cohorts_iterator):
                cohort_key = getattr(cohort, taxon_by), cohort.area, cohort.period
                n_samples = cohort.size
                assert n_samples >= min_cohort_size

                # GroupBy.indices holds positional row numbers, so select
                # with iloc rather than by index label.
                sample_indices = group_samples_by_cohort.indices[cohort_key]
                kar_loc = kar_df.iloc[sample_indices]

                counts[:, cohort_index] = self._count_inversion_karyotypes(
                    kar_loc[kar_col]
                )
                # Each sample contributes exactly one karyotype call, so the
                # number of observations is the number of samples.
                nobs[:, cohort_index] = n_samples

            frequencies = counts / nobs

            # Build the output dataset for this inversion.
            ds_tmp = xr.Dataset()

            # Cohort variables.
            for coh_col in df_cohorts.columns:
                ds_tmp[f"cohort_{coh_col}"] = "cohorts", df_cohorts[coh_col]

            # Variant labels; order matches karyotype calls 0, 1, 2.
            ds_tmp["variant_label"] = (
                "variants",
                [f"{inversion}_{allele}" for allele in ["hom_ref", "het", "hom_alt"]],
            )

            # Event variables.
            ds_tmp["event_frequency"] = (("variants", "cohorts"), frequencies)
            ds_tmp["event_count"] = (("variants", "cohorts"), counts)
            ds_tmp["event_nobs"] = (("variants", "cohorts"), nobs)

            # Add confidence intervals.
            # NOTE(review): _add_frequency_ci presumably derives the interval
            # from event_count and event_nobs - confirm. With event_nobs equal
            # to the cohort size, count / nobs matches event_frequency.
            _add_frequency_ci(ds=ds_tmp, ci_method=ci_method)

            ds_list.append(ds_tmp)

        # Stack inversions along "variants"; with data_vars="minimal" only
        # variables that already have that dimension are concatenated.
        ds_out = xr.concat(ds_list, dim="variants", data_vars="minimal")

        return ds_out
6 changes: 6 additions & 0 deletions malariagen_data/anoph/karyotype_params.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
"""Parameter definitions for karyotype analysis functions."""
from typing import Union, Sequence

from typing_extensions import Annotated, TypeAlias

# Annotated alias for the name of a single inversion. The string metadata is
# the human-readable description attached to the parameter type.
# NOTE(review): presumably surfaced in generated docstrings by the `doc`
# decorator used elsewhere in the package - confirm.
inversion_param: TypeAlias = Annotated[
str,
"Name of inversion to infer karyotype for.",
]

# Annotated alias accepting either a single inversion name or a sequence of
# inversion names.
# NOTE(review): the description reads "Names of inversion"; it likely should
# say "inversions" and mention that a single name is accepted. Left unchanged
# here because the text is a runtime string.
inversions_param: TypeAlias = Annotated[
Union[Sequence[inversion_param], inversion_param],
"Names of inversion to infer karyotype for.",
]
2 changes: 2 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from .anoph.hapclust import AnophelesHapClustAnalysis
from .anoph.describe import AnophelesDescribe
from .anoph.dipclust import AnophelesDipClustAnalysis
from .anoph.inversion_frq import AnophelesInversionFrequencyAnalysis
from .anoph.heterozygosity import AnophelesHetAnalysis
from .anoph.xpehh import AnophelesXpehhAnalysis
from .util import (
Expand Down Expand Up @@ -86,6 +87,7 @@ class AnophelesDataResource(
AnophelesH12Analysis,
AnophelesG123Analysis,
AnophelesFstAnalysis,
AnophelesInversionFrequencyAnalysis,
AnophelesHetAnalysis,
AnophelesHapFrequencyAnalysis,
AnophelesDistanceAnalysis,
Expand Down
Loading
Loading