Skip to content

Commit 087182e

Browse files
committed
Make the sex-linked contig configurable in CNV frequency analysis
Fix #1313. gene_cnv_frequencies() and gene_cnv_frequencies_advanced() hardcoded `region.contig == "X"` when computing expected copy number. That is correct for every currently supported assembly, but bakes in a gambiae-complex assumption: any future species whose sex-linked contig uses a different name would silently fall through to the diploid branch and report incorrect amplification / deletion frequencies for males. Changes: - Add a `sex_contig` constructor parameter to `AnophelesGenomeSequenceData` (defaulting to "X") and expose it as a public `sex_contig` property, matching the pattern already used for `virtual_contigs`. - Thread `sex_contig` through `AnophelesDataResource.__init__` so species subclasses can override it when their assembly differs. - Replace both hardcoded `"X"` comparisons in cnv_frq.py with a shared `_expected_copy_number()` helper that consults `self._sex_contig`. - Handle missing `sex_call` explicitly: if the sex-linked contig is requested but `sex_call` is absent from the sample metadata, emit a UserWarning and fall back to diploid for all samples rather than raising KeyError or silently returning a misleading scalar. The default of "X" keeps behaviour unchanged for Ag3, Af1, As1, and every other existing resource. New species (e.g. An. farauti) can opt in by passing `sex_contig=` during construction.
1 parent c8344ea commit 087182e

File tree

3 files changed

+45
-11
lines changed

3 files changed

+45
-11
lines changed

malariagen_data/anoph/cnv_frq.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,32 @@
2929
from .sample_metadata import _locate_cohorts
3030

3131

32+
def _expected_copy_number(contig, sex_contig, df_samples):
33+
"""Compute expected copy number per sample for a given contig.
34+
35+
On the sex-linked contig, males are hemizygous (CN=1) and females
36+
are diploid (CN=2). On all other contigs, samples are diploid (CN=2).
37+
38+
If the sex-linked contig is requested but the ``sex_call`` column is
39+
missing, a warning is emitted and all samples are treated as diploid
40+
rather than silently returning a constant that would misrepresent
41+
male hemizygous loci.
42+
"""
43+
if contig != sex_contig:
44+
return 2
45+
if "sex_call" not in df_samples.columns:
46+
warnings.warn(
47+
"sex_call column not found; defaulting to diploid (CN=2) for "
48+
"all samples on the sex-linked contig. Amplification and "
49+
"deletion calls for hemizygous males may be incorrect.",
50+
UserWarning,
51+
stacklevel=3,
52+
)
53+
return 2
54+
is_male = (df_samples["sex_call"] == "M").values
55+
return np.where(is_male, 1, 2)[np.newaxis, :]
56+
57+
3258
class AnophelesCnvFrequencyAnalysis(AnophelesCnvData, AnophelesFrequencyAnalysis):
3359
def __init__(
3460
self,
@@ -290,11 +316,7 @@ def _gene_cnv_frequencies(
290316
df_samples = df_samples.set_index("sample_id").loc[sample_id].reset_index()
291317

292318
debug("figure out expected copy number")
293-
if region.contig == "X":
294-
is_male = (df_samples["sex_call"] == "M").values
295-
expected_cn = np.where(is_male, 1, 2)[np.newaxis, :]
296-
else:
297-
expected_cn = 2
319+
expected_cn = _expected_copy_number(region.contig, self._sex_contig, df_samples)
298320

299321
debug(
300322
"setup output dataframe - two rows for each gene, one for amplification and one for deletion"
@@ -557,11 +579,7 @@ def _gene_cnv_frequencies_advanced(
557579
)
558580

559581
debug("figure out expected copy number")
560-
if region.contig == "X":
561-
is_male = (df_samples["sex_call"] == "M").values
562-
expected_cn = np.where(is_male, 1, 2)[np.newaxis, :]
563-
else:
564-
expected_cn = 2
582+
expected_cn = _expected_copy_number(region.contig, self._sex_contig, df_samples)
565583

566584
debug("set up intermediates")
567585
cn = ds_cnv["CN_mode"].values

malariagen_data/anoph/genome_sequence.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@ class AnophelesGenomeSequenceData(AnophelesBase):
1919
_cache_genome: Optional[zarr.hierarchy.Group]
2020

2121
def __init__(
22-
self, virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None, **kwargs
22+
self,
23+
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
24+
sex_contig: str = "X",
25+
**kwargs,
2326
):
2427
# N.B., this class is designed to work cooperatively, and
2528
# so it's important that any remaining parameters are passed
@@ -31,6 +34,12 @@ def __init__(
3134
virtual_contigs = dict()
3235
self._virtual_contigs = virtual_contigs
3336

37+
# Store the name of the sex-linked contig. Defaults to "X",
38+
# which is correct for all currently supported assemblies, but
39+
# can be overridden per species to support assemblies that use
40+
# different contig naming conventions.
41+
self._sex_contig = sex_contig
42+
3443
@property
3544
def contigs(self) -> Tuple[str, ...]:
3645
"""Contigs in the reference genome."""
@@ -41,6 +50,11 @@ def virtual_contigs(self) -> Mapping[str, Sequence[str]]:
4150
"""Virtual contigs made by concatenating contigs in the reference genome."""
4251
return self._virtual_contigs
4352

53+
@property
54+
def sex_contig(self) -> str:
55+
"""Name of the sex-linked contig used for ploidy-aware analyses."""
56+
return self._sex_contig
57+
4458
@property
4559
def _genome_zarr_path(self) -> str:
4660
return self.config["GENOME_ZARR_PATH"]

malariagen_data/anopheles.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def __init__(
141141
taxon_colors: Optional[Mapping[str, str]] = None,
142142
aim_species_colors: Optional[Mapping[str, str]] = None,
143143
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
144+
sex_contig: str = "X",
144145
gene_names: Optional[Mapping[str, str]] = None,
145146
inversion_tag_path: Optional[str] = None,
146147
unrestricted_use_only: Optional[bool] = None,
@@ -179,6 +180,7 @@ def __init__(
179180
taxon_colors=taxon_colors,
180181
aim_species_colors=aim_species_colors,
181182
virtual_contigs=virtual_contigs,
183+
sex_contig=sex_contig,
182184
gene_names=gene_names,
183185
inversion_tag_path=inversion_tag_path,
184186
unrestricted_use_only=unrestricted_use_only,

0 commit comments

Comments
 (0)