Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 28 additions & 10 deletions malariagen_data/anoph/cnv_frq.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,32 @@
from .sample_metadata import _locate_cohorts


def _expected_copy_number(contig, sex_contig, df_samples):
"""Compute expected copy number per sample for a given contig.

On the sex-linked contig, males are hemizygous (CN=1) and females
are diploid (CN=2). On all other contigs, samples are diploid (CN=2).

If the sex-linked contig is requested but the ``sex_call`` column is
missing, a warning is emitted and all samples are treated as diploid
rather than silently returning a constant that would misrepresent
male hemizygous loci.
"""
if contig != sex_contig:
return 2
if "sex_call" not in df_samples.columns:
warnings.warn(
"sex_call column not found; defaulting to diploid (CN=2) for "
"all samples on the sex-linked contig. Amplification and "
"deletion calls for hemizygous males may be incorrect.",
UserWarning,
stacklevel=3,
)
return 2
is_male = (df_samples["sex_call"] == "M").values
return np.where(is_male, 1, 2)[np.newaxis, :]


class AnophelesCnvFrequencyAnalysis(AnophelesCnvData, AnophelesFrequencyAnalysis):
def __init__(
self,
Expand Down Expand Up @@ -290,11 +316,7 @@ def _gene_cnv_frequencies(
df_samples = df_samples.set_index("sample_id").loc[sample_id].reset_index()

debug("figure out expected copy number")
if region.contig == "X":
is_male = (df_samples["sex_call"] == "M").values
expected_cn = np.where(is_male, 1, 2)[np.newaxis, :]
else:
expected_cn = 2
expected_cn = _expected_copy_number(region.contig, self._sex_contig, df_samples)

debug(
"setup output dataframe - two rows for each gene, one for amplification and one for deletion"
Expand Down Expand Up @@ -557,11 +579,7 @@ def _gene_cnv_frequencies_advanced(
)

debug("figure out expected copy number")
if region.contig == "X":
is_male = (df_samples["sex_call"] == "M").values
expected_cn = np.where(is_male, 1, 2)[np.newaxis, :]
else:
expected_cn = 2
expected_cn = _expected_copy_number(region.contig, self._sex_contig, df_samples)

debug("set up intermediates")
cn = ds_cnv["CN_mode"].values
Expand Down
16 changes: 15 additions & 1 deletion malariagen_data/anoph/genome_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ class AnophelesGenomeSequenceData(AnophelesBase):
_cache_genome: Optional[zarr.hierarchy.Group]

def __init__(
self, virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None, **kwargs
self,
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
sex_contig: str = "X",
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
Expand All @@ -31,6 +34,12 @@ def __init__(
virtual_contigs = dict()
self._virtual_contigs = virtual_contigs

# Store the name of the sex-linked contig. Defaults to "X",
# which is correct for all currently supported assemblies, but
# can be overridden per species to support assemblies that use
# different contig naming conventions.
self._sex_contig = sex_contig

@property
def contigs(self) -> Tuple[str, ...]:
"""Contigs in the reference genome."""
Expand All @@ -41,6 +50,11 @@ def virtual_contigs(self) -> Mapping[str, Sequence[str]]:
"""Virtual contigs made by concatenating contigs in the reference genome."""
return self._virtual_contigs

@property
def sex_contig(self) -> str:
"""Name of the sex-linked contig used for ploidy-aware analyses."""
return self._sex_contig

@property
def _genome_zarr_path(self) -> str:
return self.config["GENOME_ZARR_PATH"]
Expand Down
2 changes: 2 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def __init__(
taxon_colors: Optional[Mapping[str, str]] = None,
aim_species_colors: Optional[Mapping[str, str]] = None,
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
sex_contig: str = "X",
gene_names: Optional[Mapping[str, str]] = None,
inversion_tag_path: Optional[str] = None,
unrestricted_use_only: Optional[bool] = None,
Expand Down Expand Up @@ -179,6 +180,7 @@ def __init__(
taxon_colors=taxon_colors,
aim_species_colors=aim_species_colors,
virtual_contigs=virtual_contigs,
sex_contig=sex_contig,
gene_names=gene_names,
inversion_tag_path=inversion_tag_path,
unrestricted_use_only=unrestricted_use_only,
Expand Down
30 changes: 29 additions & 1 deletion tests/anoph/test_cnv_frq.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

from malariagen_data import af1 as _af1
from malariagen_data import ag3 as _ag3
from malariagen_data.anoph.cnv_frq import AnophelesCnvFrequencyAnalysis
from malariagen_data.anoph.cnv_frq import (
AnophelesCnvFrequencyAnalysis,
_expected_copy_number,
)
from malariagen_data.util import _compare_series_like
from .test_frq import (
check_plot_frequencies_heatmap,
Expand Down Expand Up @@ -883,3 +886,28 @@ def check_gene_cnv_frequencies_advanced(
actual_frq = ds["event_frequency"].values[:, cohort_index]
expect_frq = df_af[f"frq_{cohort_label}"].values
assert_allclose(actual_frq, expect_frq)


def test_expected_copy_number_autosome():
df_samples = pd.DataFrame({"sex_call": ["M", "F", "M"]})
assert _expected_copy_number("2R", "X", df_samples) == 2


def test_expected_copy_number_sex_contig_with_sex_call():
df_samples = pd.DataFrame({"sex_call": ["M", "F", "M", "F"]})
result = _expected_copy_number("X", "X", df_samples)
assert_array_equal(result, np.array([[1, 2, 1, 2]]))


def test_expected_copy_number_sex_contig_missing_sex_call():
df_samples = pd.DataFrame({"sample_id": ["a", "b", "c"]})
with pytest.warns(UserWarning, match="sex_call column not found"):
result = _expected_copy_number("X", "X", df_samples)
assert result == 2


def test_expected_copy_number_custom_sex_contig():
df_samples = pd.DataFrame({"sex_call": ["M", "F"]})
result = _expected_copy_number("Z", "Z", df_samples)
assert_array_equal(result, np.array([[1, 2]]))
assert _expected_copy_number("X", "Z", df_samples) == 2
19 changes: 19 additions & 0 deletions tests/anoph/test_genome_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,25 @@ def test_genome_sequence_region(fixture, api):
assert seq.shape[0] == stop - start + 1


@parametrize_with_cases("fixture,api", cases=".")
def test_sex_contig_default(fixture, api):
assert api.sex_contig == "X"


def test_sex_contig_configurable(ag3_sim_fixture):
api = AnophelesGenomeSequenceData(
url=ag3_sim_fixture.url,
public_url=ag3_sim_fixture.url,
config_path=_ag3.CONFIG_PATH,
major_version_number=_ag3.MAJOR_VERSION_NUMBER,
major_version_path=_ag3.MAJOR_VERSION_PATH,
pre=True,
virtual_contigs=_ag3.VIRTUAL_CONTIGS,
sex_contig="Z",
)
assert api.sex_contig == "Z"


def test_virtual_contigs(ag3_sim_api):
api = ag3_sim_api
contigs = api.contigs
Expand Down
Loading