diff --git a/malariagen_data/anoph/cnv_frq.py b/malariagen_data/anoph/cnv_frq.py index ee4724e20..5ace5751e 100644 --- a/malariagen_data/anoph/cnv_frq.py +++ b/malariagen_data/anoph/cnv_frq.py @@ -29,6 +29,32 @@ from .sample_metadata import _locate_cohorts +def _expected_copy_number(contig, sex_contig, df_samples): + """Compute expected copy number per sample for a given contig. + + On the sex-linked contig, males are hemizygous (CN=1) and females + are diploid (CN=2). On all other contigs, samples are diploid (CN=2). + + If the sex-linked contig is requested but the ``sex_call`` column is + missing, a warning is emitted and all samples are treated as diploid + rather than silently returning a constant that would misrepresent + male hemizygous loci. + """ + if contig != sex_contig: + return 2 + if "sex_call" not in df_samples.columns: + warnings.warn( + "sex_call column not found; defaulting to diploid (CN=2) for " + "all samples on the sex-linked contig. Amplification and " + "deletion calls for hemizygous males may be incorrect.", + UserWarning, + stacklevel=3, + ) + return 2 + is_male = (df_samples["sex_call"] == "M").values + return np.where(is_male, 1, 2)[np.newaxis, :] + + class AnophelesCnvFrequencyAnalysis(AnophelesCnvData, AnophelesFrequencyAnalysis): def __init__( self, @@ -290,11 +316,7 @@ def _gene_cnv_frequencies( df_samples = df_samples.set_index("sample_id").loc[sample_id].reset_index() debug("figure out expected copy number") - if region.contig == "X": - is_male = (df_samples["sex_call"] == "M").values - expected_cn = np.where(is_male, 1, 2)[np.newaxis, :] - else: - expected_cn = 2 + expected_cn = _expected_copy_number(region.contig, self._sex_contig, df_samples) debug( "setup output dataframe - two rows for each gene, one for amplification and one for deletion" @@ -557,11 +579,7 @@ def _gene_cnv_frequencies_advanced( ) debug("figure out expected copy number") - if region.contig == "X": - is_male = (df_samples["sex_call"] == "M").values - expected_cn = np.where(is_male, 1, 2)[np.newaxis, :] - else: - expected_cn = 2 + expected_cn = _expected_copy_number(region.contig, self._sex_contig, df_samples) debug("set up intermediates") cn = ds_cnv["CN_mode"].values diff --git a/malariagen_data/anoph/genome_sequence.py b/malariagen_data/anoph/genome_sequence.py index 3dc380167..981c8ab8a 100644 --- a/malariagen_data/anoph/genome_sequence.py +++ b/malariagen_data/anoph/genome_sequence.py @@ -19,7 +19,10 @@ class AnophelesGenomeSequenceData(AnophelesBase): _cache_genome: Optional[zarr.hierarchy.Group] def __init__( - self, virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None, **kwargs + self, + virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None, + sex_contig: str = "X", + **kwargs, ): # N.B., this class is designed to work cooperatively, and # so it's important that any remaining parameters are passed @@ -31,6 +34,12 @@ def __init__( virtual_contigs = dict() self._virtual_contigs = virtual_contigs + # Store the name of the sex-linked contig. Defaults to "X", + # which is correct for all currently supported assemblies, but + # can be overridden per species to support assemblies that use + # different contig naming conventions. + self._sex_contig = sex_contig + @property def contigs(self) -> Tuple[str, ...]: """Contigs in the reference genome.""" @@ -41,6 +50,11 @@ def virtual_contigs(self) -> Mapping[str, Sequence[str]]: """Virtual contigs made by concatenating contigs in the reference genome.""" return self._virtual_contigs + @property + def sex_contig(self) -> str: + """Name of the sex-linked contig used for ploidy-aware analyses.""" + return self._sex_contig + @property def _genome_zarr_path(self) -> str: return self.config["GENOME_ZARR_PATH"] diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 8342dbb88..9a520286a 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -141,6 +141,7 @@ def __init__( taxon_colors: Optional[Mapping[str, str]] = None, aim_species_colors: Optional[Mapping[str, str]] = None, virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None, + sex_contig: str = "X", gene_names: Optional[Mapping[str, str]] = None, inversion_tag_path: Optional[str] = None, unrestricted_use_only: Optional[bool] = None, @@ -179,6 +180,7 @@ def __init__( taxon_colors=taxon_colors, aim_species_colors=aim_species_colors, virtual_contigs=virtual_contigs, + sex_contig=sex_contig, gene_names=gene_names, inversion_tag_path=inversion_tag_path, unrestricted_use_only=unrestricted_use_only, diff --git a/tests/anoph/test_cnv_frq.py b/tests/anoph/test_cnv_frq.py index c775ffff0..502bd7d84 100644 --- a/tests/anoph/test_cnv_frq.py +++ b/tests/anoph/test_cnv_frq.py @@ -7,7 +7,10 @@ from malariagen_data import af1 as _af1 from malariagen_data import ag3 as _ag3 -from malariagen_data.anoph.cnv_frq import AnophelesCnvFrequencyAnalysis +from malariagen_data.anoph.cnv_frq import ( + AnophelesCnvFrequencyAnalysis, + _expected_copy_number, +) from malariagen_data.util import _compare_series_like from .test_frq import ( check_plot_frequencies_heatmap, @@ -883,3 +886,28 @@ def check_gene_cnv_frequencies_advanced( actual_frq = ds["event_frequency"].values[:, cohort_index] expect_frq = df_af[f"frq_{cohort_label}"].values assert_allclose(actual_frq, expect_frq) + + +def test_expected_copy_number_autosome(): + df_samples = pd.DataFrame({"sex_call": ["M", "F", "M"]}) + assert _expected_copy_number("2R", "X", df_samples) == 2 + + +def test_expected_copy_number_sex_contig_with_sex_call(): + df_samples = pd.DataFrame({"sex_call": ["M", "F", "M", "F"]}) + result = _expected_copy_number("X", "X", df_samples) + assert_array_equal(result, np.array([[1, 2, 1, 2]])) + + +def test_expected_copy_number_sex_contig_missing_sex_call(): + df_samples = pd.DataFrame({"sample_id": ["a", "b", "c"]}) + with pytest.warns(UserWarning, match="sex_call column not found"): + result = _expected_copy_number("X", "X", df_samples) + assert result == 2 + + +def test_expected_copy_number_custom_sex_contig(): + df_samples = pd.DataFrame({"sex_call": ["M", "F"]}) + result = _expected_copy_number("Z", "Z", df_samples) + assert_array_equal(result, np.array([[1, 2]])) + assert _expected_copy_number("X", "Z", df_samples) == 2 diff --git a/tests/anoph/test_genome_sequence.py b/tests/anoph/test_genome_sequence.py index a50dcfaac..69d7e39c2 100644 --- a/tests/anoph/test_genome_sequence.py +++ b/tests/anoph/test_genome_sequence.py @@ -107,6 +107,25 @@ def test_genome_sequence_region(fixture, api): assert seq.shape[0] == stop - start + 1 +@parametrize_with_cases("fixture,api", cases=".") +def test_sex_contig_default(fixture, api): + assert api.sex_contig == "X" + + +def test_sex_contig_configurable(ag3_sim_fixture): + api = AnophelesGenomeSequenceData( + url=ag3_sim_fixture.url, + public_url=ag3_sim_fixture.url, + config_path=_ag3.CONFIG_PATH, + major_version_number=_ag3.MAJOR_VERSION_NUMBER, + major_version_path=_ag3.MAJOR_VERSION_PATH, + pre=True, + virtual_contigs=_ag3.VIRTUAL_CONTIGS, + sex_contig="Z", + ) + assert api.sex_contig == "Z" + + def test_virtual_contigs(ag3_sim_api): api = ag3_sim_api contigs = api.contigs