Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
dc2ab23
feat: enhance biallelic_snps_to_plink with sex calls, phenotypes, and…
31puneet Mar 5, 2026
eb190ef
chore: trigger CI
31puneet Mar 5, 2026
caaf4b5
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Mar 19, 2026
f8c96ca
test: cover missing sex_call column branch
31puneet Mar 30, 2026
dd9877b
fix: resolve merge conflicts
31puneet Mar 30, 2026
18a907d
fix: lint errors
31puneet Mar 30, 2026
22b6e8d
Merge branch 'master' into fix/issue-730-plink-enhancements
jonbrenas Apr 13, 2026
3b994f8
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 13, 2026
10e687a
Merge branch 'master' into fix/issue-730-plink-enhancements
jonbrenas Apr 14, 2026
8346218
Adding species contig maps
31puneet Apr 14, 2026
3b0b338
Merge branch 'master' into fix/issue-730-plink-enhancements
jonbrenas Apr 14, 2026
a0a73b9
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 14, 2026
9f713b1
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 16, 2026
1158a74
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 18, 2026
b07457e
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 20, 2026
af88486
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 22, 2026
774824d
re-run CI
31puneet Apr 22, 2026
6139b41
Merge branch 'fix/issue-730-plink-enhancements' of https://github.com…
31puneet Apr 22, 2026
f3c1b96
Merge branch 'master' into fix/issue-730-plink-enhancements
jonbrenas Apr 27, 2026
dff9ba4
Merge branch 'master' into fix/issue-730-plink-enhancements
31puneet Apr 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions malariagen_data/adar1.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def __init__(
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
plink_chrom_map=None,
)

def __repr__(self):
Expand Down
9 changes: 9 additions & 0 deletions malariagen_data/adir1.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@
IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"
ROH_HMM_CACHE_NAME = "adir1_roh_hmm_v1"

# PLINK chromosome codes for the Adir1 contigs.
# Adir1 uses scaffold IDs, so each scaffold is given a 1-based PLINK code
# that follows its position in the tuple below.
PLINK_CHROM_MAP = {
    scaffold: code
    for code, scaffold in enumerate(("KB672490", "KB672868", "KB672979"), start=1)
}


class Adir1(AnophelesDataResource):
"""Provides access to data from Adir1.0 releases.
Expand Down Expand Up @@ -133,6 +141,7 @@ def __init__(
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
plink_chrom_map=PLINK_CHROM_MAP,
)

def __repr__(self):
Expand Down
9 changes: 9 additions & 0 deletions malariagen_data/af1.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,14 @@
IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"
ROH_HMM_CACHE_NAME = "af1_roh_hmm_v1"

# PLINK chromosome codes for the Af1 contigs.
# PLINK convention: 0 = unknown, 1-22 = autosomes, 23 = X.
# 2RL and 3RL take consecutive codes starting at 1; X takes the dedicated
# code 23.
PLINK_CHROM_MAP = {
    **{contig: code for code, contig in enumerate(("2RL", "3RL"), start=1)},
    "X": 23,
}


class Af1(AnophelesDataResource):
"""Provides access to data from Af1.x releases.
Expand Down Expand Up @@ -135,6 +143,7 @@ def __init__(
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
plink_chrom_map=PLINK_CHROM_MAP,
)

def __repr__(self):
Expand Down
11 changes: 11 additions & 0 deletions malariagen_data/ag3.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@
}
INVERSION_TAG_PATH = "karyotype_tag_snps.csv"

# PLINK chromosome codes for the Ag3 contigs.
# PLINK convention: 0 = unknown, 1-22 = autosomes, 23 = X.
# The two tuples are paired positionally: contig name -> PLINK code.
PLINK_CHROM_MAP = dict(
    zip(
        ("2R", "2L", "3R", "3L", "X"),
        (1, 2, 3, 4, 23),
    )
)


def _setup_aim_palettes():
# Set up default AIMs color palettes.
Expand Down Expand Up @@ -216,6 +226,7 @@ def __init__(
inversion_tag_path=INVERSION_TAG_PATH,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
plink_chrom_map=PLINK_CHROM_MAP,
)

# set up caches
Expand Down
1 change: 1 addition & 0 deletions malariagen_data/amin1.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def __init__(
inversion_tag_path=None,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
plink_chrom_map=None,
)

def __repr__(self):
Expand Down
11 changes: 11 additions & 0 deletions malariagen_data/anoph/plink_params.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Parameters for Plink converter functions."""

from typing import Mapping, Union

from typing_extensions import Annotated, TypeAlias

overwrite: TypeAlias = Annotated[
Expand Down Expand Up @@ -27,3 +29,12 @@
min_minor_ac, max_missing_an, thin_offset).
""",
]

# Type alias for a ``phenotypes`` argument: a mapping keyed by sample
# identifier (str) whose values are numeric, either integer codes or
# continuous (float) measurements. The string carried as ``Annotated``
# metadata describes the accepted values and their PLINK meaning.
phenotypes: TypeAlias = Annotated[
Mapping[str, Union[int, float]],
"""
A mapping of sample identifiers to phenotype values. In PLINK format,
-9 indicates missing phenotype, 1 indicates control (unaffected),
and 2 indicates case (affected). Continuous values can also be used.
""",
]
70 changes: 65 additions & 5 deletions malariagen_data/anoph/to_plink.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Optional
from typing import Mapping, Optional

import allel # type: ignore
import numpy as np
Expand All @@ -18,13 +18,18 @@ class PlinkConverter(
):
def __init__(
    self,
    plink_chrom_map: Optional[Mapping[str, int]] = None,
    **kwargs,
):
    """Initialise the PLINK converter and record its chromosome mapping.

    Parameters
    ----------
    plink_chrom_map : Mapping[str, int], optional
        Mapping from contig names to PLINK chromosome codes. When it is
        ``None`` or empty, an empty dict is stored instead.
    **kwargs
        Remaining keyword arguments, forwarded unchanged to the
        superclass constructor.
    """
    # N.B., this class is designed to work cooperatively, so every
    # keyword argument not consumed here must be handed on to the
    # superclass constructor.
    super().__init__(**kwargs)

    # Keep the contig-name -> PLINK-code table, falling back to an
    # empty dict when no (or an empty) mapping was supplied.
    if plink_chrom_map:
        self._plink_chrom_map = plink_chrom_map
    else:
        self._plink_chrom_map = {}

@doc(
summary="""
Write Anopheles biallelic SNP data to the Plink binary file format.
Expand Down Expand Up @@ -52,7 +57,7 @@ def biallelic_snps_to_plink(
self,
output_dir: plink_params.output_dir,
region: base_params.regions,
n_snps: base_params.n_snps,
n_snps: Optional[base_params.n_snps] = None,
overwrite: plink_params.overwrite = False,
thin_offset: base_params.thin_offset = 0,
sample_sets: Optional[base_params.sample_sets] = None,
Expand All @@ -70,6 +75,7 @@ def biallelic_snps_to_plink(
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.native_chunks,
out: Optional[plink_params.out] = None,
phenotypes: Optional[plink_params.phenotypes] = None,
):
# Check that either sample_query xor sample_indices are provided.
base_params._validate_sample_selection_params(
Expand All @@ -80,7 +86,8 @@ def biallelic_snps_to_plink(
if out is not None:
plink_file_path = f"{output_dir}/{out}"
else:
plink_file_path = f"{output_dir}/{region}.{n_snps}.{min_minor_ac}.{max_missing_an}.{thin_offset}"
n_snps_label = n_snps if n_snps is not None else "all"
plink_file_path = f"{output_dir}/{region}.{n_snps_label}.{min_minor_ac}.{max_missing_an}.{thin_offset}"

bed_file_path = f"{plink_file_path}.bed"

Expand Down Expand Up @@ -125,12 +132,65 @@ def biallelic_snps_to_plink(
val = gn_ref_final.T
with self._spinner("Prepare output data"):
alleles = ds_snps_final["variant_allele"].values

# Map chromosome indices to PLINK conventions using contig names.
raw_contigs = ds_snps_final["variant_contig"].values
contig_names = self.contigs
chrom_map = self._plink_chrom_map
mapped_contigs = np.array(
[
chrom_map.get(
contig_names[int(c)], # look up name from index
int(c) + 1, # fallback: 1-based index
)
for c in raw_contigs
]
)

# Get sample IDs for property lookups.
sample_ids = ds_snps_final["sample_id"].values

# Get sex calls from sample metadata and map to PLINK codes.
# PLINK sex codes: 0 = unknown, 1 = male, 2 = female.
df_samples = self.sample_metadata(
sample_sets=sample_sets,
sample_query=sample_query,
sample_query_options=sample_query_options,
sample_indices=sample_indices,
)
sex_map = {"M": 1, "F": 2}
if "sex_call" in df_samples.columns:
sex_lookup = dict(
zip(
df_samples["sample_id"].values,
df_samples["sex_call"]
.map(sex_map)
.fillna(0)
.astype(int)
.values,
)
)
else:
sex_lookup = {}
sex_values = np.array([sex_lookup.get(str(sid), 0) for sid in sample_ids])

# Build phenotype values. Default is -9 (missing) per PLINK convention.
if phenotypes is not None:
pheno_values = np.array(
[phenotypes.get(str(sid), -9) for sid in sample_ids],
dtype=float,
)
else:
pheno_values = np.full(len(sample_ids), -9, dtype=float)

properties = {
"iid": ds_snps_final["sample_id"].values,
"chromosome": ds_snps_final["variant_contig"].values,
"iid": sample_ids,
"chromosome": mapped_contigs,
"bp_position": ds_snps_final["variant_position"].values,
"allele_1": alleles[:, 0],
"allele_2": alleles[:, 1],
"sex": sex_values,
"pheno": pheno_values,
}

bed_reader.to_bed(
Expand Down
2 changes: 2 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def __init__(
inversion_tag_path: Optional[str] = None,
unrestricted_use_only: Optional[bool] = None,
surveillance_use_only: Optional[bool] = None,
plink_chrom_map: Optional[Mapping[str, int]] = None,
):
super().__init__(
url=url,
Expand Down Expand Up @@ -183,6 +184,7 @@ def __init__(
inversion_tag_path=inversion_tag_path,
unrestricted_use_only=unrestricted_use_only,
surveillance_use_only=surveillance_use_only,
plink_chrom_map=plink_chrom_map,
)

def _get_ihs_gwss_cache_name(self):
Expand Down
Loading
Loading