Skip to content

Commit 4b84290

Browse files
committed
improve LD pruning with parameter validation and additional tests
1 parent c4f39d5 commit 4b84290

2 files changed

Lines changed: 67 additions & 0 deletions

File tree

malariagen_data/anoph/ld.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from typing import Optional
22

3+
import numpy as np
4+
35
import allel # type: ignore
46
import xarray as xr
57
from numpydoc_decorator import doc # type: ignore
@@ -31,6 +33,15 @@ def __init__(
3133
using scikit-allel's `locate_unlinked` function. The resulting dataset
3234
can be used as input to ADMIXTURE workflows or exported to PLINK format.
3335
36+
LD pruning is controlled by three parameters:
37+
38+
- `ld_window_size`: number of SNPs in the sliding window used to
39+
compute pairwise r-squared.
40+
- `ld_window_step`: number of SNPs to advance the window each
41+
iteration.
42+
- `ld_threshold`: maximum r-squared value; SNP pairs above this
43+
are considered linked and one will be removed.
44+
3445
Note that `n_snps` is required to control memory usage. Without
3546
pre-thinning, LD pruning could attempt to materialise millions of
3647
variants and run out of memory.
@@ -68,6 +79,14 @@ def biallelic_snp_calls_ld_pruned(
6879
sample_query=sample_query, sample_indices=sample_indices
6980
)
7081

82+
# Validate LD parameters.
83+
if ld_window_size <= 0:
84+
raise ValueError(f"ld_window_size must be > 0, got {ld_window_size}")
85+
if ld_window_step <= 0:
86+
raise ValueError(f"ld_window_step must be > 0, got {ld_window_step}")
87+
if not (0 < ld_threshold <= 1):
88+
raise ValueError(f"ld_threshold must be in (0, 1], got {ld_threshold}")
89+
7190
# Obtain biallelic SNP calls with thinning applied first.
7291
ds_snps = self.biallelic_snp_calls(
7392
region=region,
@@ -99,6 +118,13 @@ def biallelic_snp_calls_ld_pruned(
99118
threshold=ld_threshold,
100119
)
101120

121+
# Guard against empty result.
122+
if not np.any(loc_unlinked):
123+
raise ValueError(
124+
"LD pruning removed all variants. Consider using a less "
125+
"stringent ld_threshold or providing more variants via n_snps."
126+
)
127+
102128
# Apply the pruning mask.
103129
ds_pruned = _dask_compress_dataset(
104130
ds_snps, indexer=loc_unlinked, dim="variants"

tests/anoph/test_ld.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,44 @@ def test_ld_pruned_plink_compatibility(fixture, api: AnophelesLdAnalysis):
170170
n_samples = ds_pruned.sizes["samples"]
171171
assert ds_pruned["call_genotype"].shape == (n_variants, n_samples, 2)
172172
assert ds_pruned["variant_allele"].shape == (n_variants, 2)
173+
174+
175+
@parametrize_with_cases("fixture,api", cases=".")
176+
def test_ld_pruning_threshold_sensitivity(fixture, api: AnophelesLdAnalysis):
177+
region = random.choice(api.contigs)
178+
site_mask = random.choice(api.site_mask_ids)
179+
ds_full = api.biallelic_snp_calls(
180+
region=region,
181+
site_mask=site_mask,
182+
min_minor_ac=1,
183+
max_missing_an=0,
184+
)
185+
n_available = ds_full.sizes["variants"]
186+
if n_available < 10:
187+
pytest.skip("Not enough variants for LD pruning test")
188+
189+
n_snps = min(n_available, 200)
190+
191+
try:
192+
ds_strict = api.biallelic_snp_calls_ld_pruned(
193+
region=region,
194+
n_snps=n_snps,
195+
site_mask=site_mask,
196+
min_minor_ac=1,
197+
max_missing_an=0,
198+
ld_threshold=0.1,
199+
)
200+
except ValueError:
201+
pytest.skip("Too few variants survive strict LD pruning")
202+
203+
ds_lenient = api.biallelic_snp_calls_ld_pruned(
204+
region=region,
205+
n_snps=n_snps,
206+
site_mask=site_mask,
207+
min_minor_ac=1,
208+
max_missing_an=0,
209+
ld_threshold=0.5,
210+
)
211+
212+
# A stricter threshold should retain fewer or equal variants.
213+
assert ds_strict.sizes["variants"] <= ds_lenient.sizes["variants"]

0 commit comments

Comments
 (0)