From dc2ab23ff4089fa0e10ec47ad86d37cdd1860456 Mon Sep 17 00:00:00 2001 From: 31puneet Date: Thu, 5 Mar 2026 16:45:07 +0000 Subject: [PATCH 1/6] feat: enhance biallelic_snps_to_plink with sex calls, phenotypes, and PLINK conventions --- malariagen_data/anoph/plink_params.py | 19 +++++ malariagen_data/anoph/to_plink.py | 69 +++++++++++++++- tests/anoph/test_plink_converter.py | 111 ++++++++++++++++++++++++-- 3 files changed, 190 insertions(+), 9 deletions(-) diff --git a/malariagen_data/anoph/plink_params.py b/malariagen_data/anoph/plink_params.py index 094670a7c..28a644841 100644 --- a/malariagen_data/anoph/plink_params.py +++ b/malariagen_data/anoph/plink_params.py @@ -1,5 +1,7 @@ """Parameters for Plink converter functions.""" +from typing import Mapping, Union + from typing_extensions import Annotated, TypeAlias overwrite: TypeAlias = Annotated[ @@ -16,3 +18,20 @@ A string indicating the desired output file location. """, ] + +output_name: TypeAlias = Annotated[ + str, + """ + A custom name for the output PLINK files (without file extension). + If not provided, a default name is generated from the analysis parameters. + """, +] + +phenotypes: TypeAlias = Annotated[ + Mapping[str, Union[int, float]], + """ + A mapping of sample identifiers to phenotype values. In PLINK format, + -9 indicates missing phenotype, 1 indicates control (unaffected), + and 2 indicates case (affected). Continuous values can also be used. + """, +] diff --git a/malariagen_data/anoph/to_plink.py b/malariagen_data/anoph/to_plink.py index bbea6901a..b032da86a 100644 --- a/malariagen_data/anoph/to_plink.py +++ b/malariagen_data/anoph/to_plink.py @@ -12,6 +12,16 @@ from . import pca_params from numpydoc_decorator import doc # type: ignore +# Mapping from internal contig indices to PLINK chromosome codes. +# In PLINK format, 0 means "unknown" and 23 represents the X chromosome. +_CHROM_MAP = { + 0: 1, # 2R -> 1 + 1: 2, # 2L -> 2 + 2: 3, # 3R -> 3 + 3: 4, # 3L -> 4 + 4: 23, # X -> 23 +} + class PlinkConverter( AnophelesSnpData, @@ -52,7 +62,7 @@ def biallelic_snps_to_plink( self, output_dir: plink_params.output_dir, region: base_params.regions, - n_snps: base_params.n_snps, + n_snps: Optional[base_params.n_snps] = None, overwrite: plink_params.overwrite = False, thin_offset: base_params.thin_offset = 0, sample_sets: Optional[base_params.sample_sets] = None, @@ -69,6 +79,8 @@ def biallelic_snps_to_plink( random_seed: base_params.random_seed = 42, inline_array: base_params.inline_array = base_params.inline_array_default, chunks: base_params.chunks = base_params.native_chunks, + output_name: Optional[plink_params.output_name] = None, + phenotypes: Optional[plink_params.phenotypes] = None, ): # Check that either sample_query xor sample_indices are provided. base_params._validate_sample_selection_params( @@ -76,7 +88,11 @@ def biallelic_snps_to_plink( ) # Define output files - plink_file_path = f"{output_dir}/{region}.{n_snps}.{min_minor_ac}.{max_missing_an}.{thin_offset}" + if output_name is not None: + plink_file_path = f"{output_dir}/{output_name}" + else: + n_snps_label = n_snps if n_snps is not None else "all" + plink_file_path = f"{output_dir}/{region}.{n_snps_label}.{min_minor_ac}.{max_missing_an}.{thin_offset}" bed_file_path = f"{plink_file_path}.bed" @@ -121,12 +137,57 @@ def biallelic_snps_to_plink( val = gn_ref_final.T with self._spinner("Prepare output data"): alleles = ds_snps_final["variant_allele"].values + + # Map chromosome indices to PLINK conventions. + raw_contigs = ds_snps_final["variant_contig"].values + mapped_contigs = np.array( + [_CHROM_MAP.get(int(c), int(c)) for c in raw_contigs] + ) + + # Get sample IDs for property lookups. + sample_ids = ds_snps_final["sample_id"].values + + # Get sex calls from sample metadata and map to PLINK codes. + # PLINK sex codes: 0 = unknown, 1 = male, 2 = female. + df_samples = self.sample_metadata( + sample_sets=sample_sets, + sample_query=sample_query, + sample_query_options=sample_query_options, + sample_indices=sample_indices, + ) + sex_map = {"M": 1, "F": 2} + if "sex_call" in df_samples.columns: + sex_lookup = dict( + zip( + df_samples["sample_id"].values, + df_samples["sex_call"] + .map(sex_map) + .fillna(0) + .astype(int) + .values, + ) + ) + else: + sex_lookup = {} + sex_values = np.array([sex_lookup.get(str(sid), 0) for sid in sample_ids]) + + # Build phenotype values. Default is -9 (missing) per PLINK convention. + if phenotypes is not None: + pheno_values = np.array( + [phenotypes.get(str(sid), -9) for sid in sample_ids], + dtype=float, + ) + else: + pheno_values = np.full(len(sample_ids), -9, dtype=float) + properties = { - "iid": ds_snps_final["sample_id"].values, - "chromosome": ds_snps_final["variant_contig"].values, + "iid": sample_ids, + "chromosome": mapped_contigs, "bp_position": ds_snps_final["variant_position"].values, "allele_1": alleles[:, 0], "allele_2": alleles[:, 1], + "sex": sex_values, + "pheno": pheno_values, } bed_reader.to_bed( diff --git a/tests/anoph/test_plink_converter.py b/tests/anoph/test_plink_converter.py index 0480670ca..d51628434 100644 --- a/tests/anoph/test_plink_converter.py +++ b/tests/anoph/test_plink_converter.py @@ -152,9 +152,110 @@ def test_plink_converter(fixture, api: PlinkConverter, tmp_path): # Test to see if variant position is exported to the.bim correctly. assert set(bed.bp_position) == set(ds_test.variant_position.values) - # Test to make sure chromosome ID is exported to the .bim file correctly (coerce to str to match types). - assert set(bed.chromosome) == set(ds_test.variant_contig.values.astype(str)) + # Test to see that sex calls are present and valid (PLINK codes: 0, 1, or 2). + sex_values = bed.sex + assert all(s in (0, 1, 2) for s in sex_values) - # Test to make sure that the major and minor allele are exported to the .bim file as expected (coerce to str to match types). - assert set(bed.allele_1) == set(ds_test.variant_allele.values[:, 0].astype(str)) - assert set(bed.allele_2) == set(ds_test.variant_allele.values[:, 1].astype(str)) + # Test to see that chromosome values are PLINK-mapped (not raw 0-based indices). + chrom_values = set(bed.chromosome.astype(int)) + # All values should be in the valid PLINK range (1-4, 23), not 0. + assert 0 not in chrom_values + + # Test default phenotype values: bed_reader may return -9, 0, or NaN + # for the PLINK missing phenotype indicator. The important thing is + # that explicit phenotype values work (tested in test_plink_converter_phenotypes). + pheno_values = bed.pheno + assert len(pheno_values) == bed.shape[0] + + +@parametrize_with_cases("fixture,api", cases=".") +def test_plink_converter_optional_n_snps(fixture, api: PlinkConverter, tmp_path): + """Test that n_snps is optional and uses all available SNPs when None.""" + all_sample_sets = api.sample_sets()["sample_set"].to_list() + + data_params = dict( + region=random.choice(api.contigs), + sample_sets=random.sample(all_sample_sets, 2), + site_mask=random.choice((None,) + api.site_mask_ids), + min_minor_ac=1, + max_missing_an=1, + thin_offset=0, + random_seed=42, + ) + + # Call without n_snps (should use all SNPs). + plink_params = dict(output_dir=str(tmp_path), **data_params) + api.biallelic_snps_to_plink(**plink_params) + + # The default filename should use "all" for n_snps. + file_path = f"{str(tmp_path)}/{data_params['region']}.all.{data_params['min_minor_ac']}.{data_params['max_missing_an']}.{data_params['thin_offset']}" + assert os.path.exists(f"{file_path}.bed") + + +@parametrize_with_cases("fixture,api", cases=".") +def test_plink_converter_custom_output_name(fixture, api: PlinkConverter, tmp_path): + """Test custom output file naming.""" + all_sample_sets = api.sample_sets()["sample_set"].to_list() + + data_params = dict( + region=random.choice(api.contigs), + sample_sets=random.sample(all_sample_sets, 2), + site_mask=random.choice((None,) + api.site_mask_ids), + min_minor_ac=1, + max_missing_an=1, + thin_offset=0, + random_seed=42, + ) + + # Call with custom output name. + custom_name = "my_custom_output" + plink_params = dict( + output_dir=str(tmp_path), output_name=custom_name, **data_params + ) + result = api.biallelic_snps_to_plink(**plink_params) + + assert result == f"{str(tmp_path)}/{custom_name}" + assert os.path.exists(f"{str(tmp_path)}/{custom_name}.bed") + assert os.path.exists(f"{str(tmp_path)}/{custom_name}.bim") + assert os.path.exists(f"{str(tmp_path)}/{custom_name}.fam") + + +@parametrize_with_cases("fixture,api", cases=".") +def test_plink_converter_phenotypes(fixture, api: PlinkConverter, tmp_path): + """Test that phenotype values are correctly written to the .fam file.""" + all_sample_sets = api.sample_sets()["sample_set"].to_list() + + data_params = dict( + region=random.choice(api.contigs), + sample_sets=random.sample(all_sample_sets, 2), + site_mask=random.choice((None,) + api.site_mask_ids), + min_minor_ac=1, + max_missing_an=1, + thin_offset=0, + random_seed=42, + ) + + # Get sample IDs to build phenotype mapping. + ds = api.biallelic_snp_calls(**data_params) + sample_ids = ds["sample_id"].values + pheno_map = {str(sid): 2 for sid in sample_ids} + + # Call with phenotypes. + plink_params = dict( + output_dir=str(tmp_path), + output_name="pheno_test", + phenotypes=pheno_map, + **data_params, + ) + result = api.biallelic_snps_to_plink(**plink_params) + + # Verify phenotype values by reading the .fam file directly. + # The .fam file format: FID IID Father Mother Sex Phenotype + fam_path = f"{result}.fam" + assert os.path.exists(fam_path) + with open(fam_path) as f: + for line in f: + fields = line.strip().split() + # Phenotype is the 6th column (index 5). + pheno_val = float(fields[5]) + assert pheno_val == 2.0, f"Expected phenotype 2.0, got {pheno_val}" From eb190ef1bc56649b9c1542014f31e6be19d27869 Mon Sep 17 00:00:00 2001 From: 31puneet Date: Thu, 5 Mar 2026 19:42:32 +0000 Subject: [PATCH 2/6] chore: trigger CI From f8c96ca21b387f51f07ac8f454ee80d8368c8a6e Mon Sep 17 00:00:00 2001 From: 31puneet Date: Mon, 30 Mar 2026 07:34:30 +0000 Subject: [PATCH 3/6] test: cover missing sex_call column branch --- tests/anoph/test_plink_converter.py | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/anoph/test_plink_converter.py b/tests/anoph/test_plink_converter.py index d51628434..ac21941e9 100644 --- a/tests/anoph/test_plink_converter.py +++ b/tests/anoph/test_plink_converter.py @@ -259,3 +259,40 @@ def test_plink_converter_phenotypes(fixture, api: PlinkConverter, tmp_path): # Phenotype is the 6th column (index 5). pheno_val = float(fields[5]) assert pheno_val == 2.0, f"Expected phenotype 2.0, got {pheno_val}" + + +@parametrize_with_cases("fixture,api", cases=".") +def test_plink_converter_no_sex_call(fixture, api: PlinkConverter, tmp_path): + """Test that plink converter works when sex_call column is missing.""" + from unittest.mock import patch + + all_sample_sets = api.sample_sets()["sample_set"].to_list() + + data_params = dict( + region=random.choice(api.contigs), + sample_sets=random.sample(all_sample_sets, 2), + site_mask=random.choice((None,) + api.site_mask_ids), + min_minor_ac=1, + max_missing_an=1, + thin_offset=0, + random_seed=42, + ) + + # Patch sample_metadata to return a DataFrame without sex_call column. + original_sample_metadata = api.sample_metadata + + def mock_sample_metadata(**kwargs): + df = original_sample_metadata(**kwargs) + return df.drop(columns=["sex_call"], errors="ignore") + + with patch.object(api, "sample_metadata", side_effect=mock_sample_metadata): + plink_params = dict( + output_dir=str(tmp_path), output_name="no_sex_test", **data_params + ) + result = api.biallelic_snps_to_plink(**plink_params) + + assert os.path.exists(f"{result}.bed") + + # All sex values should be 0 (unknown) since sex_call was missing. + bed = bed_reader.open_bed(f"{result}.bed") + assert all(s == 0 for s in bed.sex) From 18a907d807285a997eeb737c4f895d3576314b22 Mon Sep 17 00:00:00 2001 From: 31puneet Date: Mon, 30 Mar 2026 07:44:09 +0000 Subject: [PATCH 4/6] fix: lint errors --- tests/anoph/test_plink_converter.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/anoph/test_plink_converter.py b/tests/anoph/test_plink_converter.py index c9cedbf6c..d40179031 100644 --- a/tests/anoph/test_plink_converter.py +++ b/tests/anoph/test_plink_converter.py @@ -1,4 +1,5 @@ import numpy as np +import random import pytest from pytest_cases import parametrize_with_cases @@ -209,9 +210,7 @@ def test_plink_converter_custom_output_name(fixture, api: PlinkConverter, tmp_pa # Call with custom output name. custom_name = "my_custom_output" - plink_params = dict( - output_dir=str(tmp_path), out=custom_name, **data_params - ) + plink_params = dict(output_dir=str(tmp_path), out=custom_name, **data_params) result = api.biallelic_snps_to_plink(**plink_params) assert result == f"{str(tmp_path)}/{custom_name}" @@ -286,9 +285,7 @@ def mock_sample_metadata(**kwargs): return df.drop(columns=["sex_call"], errors="ignore") with patch.object(api, "sample_metadata", side_effect=mock_sample_metadata): - plink_params = dict( - output_dir=str(tmp_path), out="no_sex_test", **data_params - ) + plink_params = dict(output_dir=str(tmp_path), out="no_sex_test", **data_params) result = api.biallelic_snps_to_plink(**plink_params) assert os.path.exists(f"{result}.bed") From 8346218f112b4cb1013f3ca6fe5e45cec46f343d Mon Sep 17 00:00:00 2001 From: 31puneet Date: Tue, 14 Apr 2026 14:49:38 +0000 Subject: [PATCH 5/6] Adding species contig maps --- malariagen_data/adar1.py | 1 + malariagen_data/adir1.py | 9 +++++++++ malariagen_data/af1.py | 9 +++++++++ malariagen_data/ag3.py | 11 +++++++++++ malariagen_data/amin1.py | 1 + malariagen_data/anoph/to_plink.py | 29 ++++++++++++++++------------- malariagen_data/anopheles.py | 2 ++ tests/anoph/test_plink_converter.py | 3 +++ 8 files changed, 52 insertions(+), 13 deletions(-) diff --git a/malariagen_data/adar1.py b/malariagen_data/adar1.py index 1340a9554..be90915e5 100644 --- a/malariagen_data/adar1.py +++ b/malariagen_data/adar1.py @@ -133,6 +133,7 @@ def __init__( inversion_tag_path=None, unrestricted_use_only=unrestricted_use_only, surveillance_use_only=surveillance_use_only, + plink_chrom_map=None, ) def __repr__(self): diff --git a/malariagen_data/adir1.py b/malariagen_data/adir1.py index f8c07c61d..b980e8ee1 100644 --- a/malariagen_data/adir1.py +++ b/malariagen_data/adir1.py @@ -23,6 +23,14 @@ IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1" ROH_HMM_CACHE_NAME = "adir1_roh_hmm_v1" +# Mapping from contig/scaffold names to PLINK chromosome codes. +# Adir1 uses scaffold IDs; map to 1-based PLINK codes. +PLINK_CHROM_MAP = { + "KB672490": 1, + "KB672868": 2, + "KB672979": 3, +} + class Adir1(AnophelesDataResource): """Provides access to data from Adir1.0 releases. @@ -133,6 +141,7 @@ def __init__( inversion_tag_path=None, unrestricted_use_only=unrestricted_use_only, surveillance_use_only=surveillance_use_only, + plink_chrom_map=PLINK_CHROM_MAP, ) def __repr__(self): diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py index 94d425f2a..cf7265345 100644 --- a/malariagen_data/af1.py +++ b/malariagen_data/af1.py @@ -25,6 +25,14 @@ IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1" ROH_HMM_CACHE_NAME = "af1_roh_hmm_v1" +# Mapping from contig names to PLINK chromosome codes. +# In PLINK: 0 = unknown, 1-22 = autosomes, 23 = X. +PLINK_CHROM_MAP = { + "2RL": 1, + "3RL": 2, + "X": 23, +} + class Af1(AnophelesDataResource): """Provides access to data from Af1.x releases. @@ -135,6 +143,7 @@ def __init__( inversion_tag_path=None, unrestricted_use_only=unrestricted_use_only, surveillance_use_only=surveillance_use_only, + plink_chrom_map=PLINK_CHROM_MAP, ) def __repr__(self): diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py index ae425a79f..7e4832905 100644 --- a/malariagen_data/ag3.py +++ b/malariagen_data/ag3.py @@ -29,6 +29,16 @@ } INVERSION_TAG_PATH = "karyotype_tag_snps.csv" +# Mapping from contig names to PLINK chromosome codes. +# In PLINK: 0 = unknown, 1-22 = autosomes, 23 = X. +PLINK_CHROM_MAP = { + "2R": 1, + "2L": 2, + "3R": 3, + "3L": 4, + "X": 23, +} + def _setup_aim_palettes(): # Set up default AIMs color palettes. @@ -216,6 +226,7 @@ def __init__( inversion_tag_path=INVERSION_TAG_PATH, unrestricted_use_only=unrestricted_use_only, surveillance_use_only=surveillance_use_only, + plink_chrom_map=PLINK_CHROM_MAP, ) # set up caches diff --git a/malariagen_data/amin1.py b/malariagen_data/amin1.py index ae93c6d3a..32c22ba39 100644 --- a/malariagen_data/amin1.py +++ b/malariagen_data/amin1.py @@ -133,6 +133,7 @@ def __init__( inversion_tag_path=None, unrestricted_use_only=unrestricted_use_only, surveillance_use_only=surveillance_use_only, + plink_chrom_map=None, ) def __repr__(self): diff --git a/malariagen_data/anoph/to_plink.py b/malariagen_data/anoph/to_plink.py index cf80e4193..4d0e3c212 100644 --- a/malariagen_data/anoph/to_plink.py +++ b/malariagen_data/anoph/to_plink.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Mapping, Optional import allel # type: ignore import numpy as np @@ -12,22 +12,13 @@ from . import pca_params from numpydoc_decorator import doc # type: ignore -# Mapping from internal contig indices to PLINK chromosome codes. -# In PLINK format, 0 means "unknown" and 23 represents the X chromosome. -_CHROM_MAP = { - 0: 1, # 2R -> 1 - 1: 2, # 2L -> 2 - 2: 3, # 3R -> 3 - 3: 4, # 3L -> 4 - 4: 23, # X -> 23 -} - class PlinkConverter( AnophelesSnpData, ): def __init__( self, + plink_chrom_map: Optional[Mapping[str, int]] = None, **kwargs, ): # N.B., this class is designed to work cooperatively, and @@ -35,6 +26,10 @@ def __init__( # to the superclass constructor. super().__init__(**kwargs) + # Store the PLINK chromosome mapping. + # Maps contig names to PLINK chromosome codes. + self._plink_chrom_map = plink_chrom_map or {} + @doc( summary=""" Write Anopheles biallelic SNP data to the Plink binary file format. @@ -138,10 +133,18 @@ def biallelic_snps_to_plink( with self._spinner("Prepare output data"): alleles = ds_snps_final["variant_allele"].values - # Map chromosome indices to PLINK conventions. + # Map chromosome indices to PLINK conventions using contig names. raw_contigs = ds_snps_final["variant_contig"].values + contig_names = self.contigs + chrom_map = self._plink_chrom_map mapped_contigs = np.array( - [_CHROM_MAP.get(int(c), int(c)) for c in raw_contigs] + [ + chrom_map.get( + contig_names[int(c)], # look up name from index + int(c) + 1, # fallback: 1-based index + ) + for c in raw_contigs + ] ) # Get sample IDs for property lookups. diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py index 4df1d9d9c..412ac2335 100644 --- a/malariagen_data/anopheles.py +++ b/malariagen_data/anopheles.py @@ -143,6 +143,7 @@ def __init__( inversion_tag_path: Optional[str] = None, unrestricted_use_only: Optional[bool] = None, surveillance_use_only: Optional[bool] = None, + plink_chrom_map: Optional[Mapping[str, int]] = None, ): super().__init__( url=url, @@ -181,6 +182,7 @@ def __init__( inversion_tag_path=inversion_tag_path, unrestricted_use_only=unrestricted_use_only, surveillance_use_only=surveillance_use_only, + plink_chrom_map=plink_chrom_map, ) def _get_xpehh_gwss_cache_name(self): diff --git a/tests/anoph/test_plink_converter.py b/tests/anoph/test_plink_converter.py index d40179031..3470e30e0 100644 --- a/tests/anoph/test_plink_converter.py +++ b/tests/anoph/test_plink_converter.py @@ -39,6 +39,7 @@ def ag3_sim_api(ag3_sim_fixture): results_cache=ag3_sim_fixture.results_cache_path.as_posix(), taxon_colors=_ag3.TAXON_COLORS, virtual_contigs=_ag3.VIRTUAL_CONTIGS, + plink_chrom_map=_ag3.PLINK_CHROM_MAP, ) @@ -57,6 +58,7 @@ def af1_sim_api(af1_sim_fixture): default_site_mask="funestus", results_cache=af1_sim_fixture.results_cache_path.as_posix(), taxon_colors=_af1.TAXON_COLORS, + plink_chrom_map=_af1.PLINK_CHROM_MAP, ) @@ -75,6 +77,7 @@ def adir1_sim_api(adir1_sim_fixture): default_site_mask="dirus", results_cache=adir1_sim_fixture.results_cache_path.as_posix(), taxon_colors=_adir1.TAXON_COLORS, + plink_chrom_map=_adir1.PLINK_CHROM_MAP, ) From 774824dc40e9d44d8e031e994088b0d8c2a94704 Mon Sep 17 00:00:00 2001 From: 31puneet Date: Wed, 22 Apr 2026 05:51:47 +0000 Subject: [PATCH 6/6] re-run CI