Skip to content

Commit 9784221

Browse files
authored
Merge branch 'master' into fix/broken-image-urls
2 parents a60286a + c9bfbc0 commit 9784221

File tree

6 files changed

+601
-5
lines changed

6 files changed

+601
-5
lines changed

malariagen_data/anoph/ld.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
from typing import Optional
2+
3+
import numpy as np
4+
5+
import allel # type: ignore
6+
import xarray as xr
7+
from numpydoc_decorator import doc # type: ignore
8+
9+
from ..util import _check_types, _dask_compress_dataset
10+
from . import base_params, ld_params, pca_params
11+
from .snp_data import AnophelesSnpData
12+
13+
14+
class AnophelesLdAnalysis(
15+
AnophelesSnpData,
16+
):
17+
def __init__(
18+
self,
19+
**kwargs,
20+
):
21+
# N.B., this class is designed to work cooperatively, and
22+
# so it's important that any remaining parameters are passed
23+
# to the superclass constructor.
24+
super().__init__(**kwargs)
25+
26+
@_check_types
27+
@doc(
28+
summary="""
29+
Access biallelic SNP calls after LD pruning.
30+
""",
31+
extended_summary="""
32+
This function obtains biallelic SNP calls, then performs LD pruning
33+
using scikit-allel's `locate_unlinked` function. The resulting dataset
34+
can be used as input to ADMIXTURE workflows or exported to PLINK format.
35+
36+
LD pruning is controlled by three parameters:
37+
38+
- `ld_window_size`: number of SNPs in the sliding window used to
39+
compute pairwise r-squared.
40+
- `ld_window_step`: number of SNPs to advance the window each
41+
iteration.
42+
- `ld_threshold`: maximum r-squared value; SNP pairs above this
43+
are considered linked and one will be removed.
44+
45+
Note that `n_snps` is required to control memory usage. Without
46+
pre-thinning, LD pruning could attempt to materialise millions of
47+
variants and run out of memory.
48+
""",
49+
returns="""
50+
A dataset of LD-pruned biallelic SNP calls with the same structure as
51+
the output of `biallelic_snp_calls`.
52+
""",
53+
)
54+
def biallelic_snp_calls_ld_pruned(
55+
self,
56+
region: base_params.regions,
57+
n_snps: base_params.n_snps,
58+
ld_window_size: ld_params.ld_window_size = ld_params.ld_window_size_default,
59+
ld_window_step: ld_params.ld_window_step = ld_params.ld_window_step_default,
60+
ld_threshold: ld_params.ld_threshold = ld_params.ld_threshold_default,
61+
thin_offset: base_params.thin_offset = 0,
62+
sample_sets: Optional[base_params.sample_sets] = None,
63+
sample_query: Optional[base_params.sample_query] = None,
64+
sample_query_options: Optional[base_params.sample_query_options] = None,
65+
sample_indices: Optional[base_params.sample_indices] = None,
66+
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
67+
min_minor_ac: Optional[
68+
base_params.min_minor_ac
69+
] = pca_params.min_minor_ac_default,
70+
max_missing_an: Optional[
71+
base_params.max_missing_an
72+
] = pca_params.max_missing_an_default,
73+
random_seed: base_params.random_seed = 42,
74+
inline_array: base_params.inline_array = base_params.inline_array_default,
75+
chunks: base_params.chunks = base_params.native_chunks,
76+
) -> xr.Dataset:
77+
# Check that either sample_query xor sample_indices are provided.
78+
base_params._validate_sample_selection_params(
79+
sample_query=sample_query, sample_indices=sample_indices
80+
)
81+
82+
# Validate LD parameters.
83+
if ld_window_size <= 0:
84+
raise ValueError(f"ld_window_size must be > 0, got {ld_window_size}")
85+
if ld_window_step <= 0:
86+
raise ValueError(f"ld_window_step must be > 0, got {ld_window_step}")
87+
if not (0 < ld_threshold <= 1):
88+
raise ValueError(f"ld_threshold must be in (0, 1], got {ld_threshold}")
89+
90+
# Obtain biallelic SNP calls with thinning applied first.
91+
ds_snps = self.biallelic_snp_calls(
92+
region=region,
93+
sample_sets=sample_sets,
94+
sample_query=sample_query,
95+
sample_query_options=sample_query_options,
96+
sample_indices=sample_indices,
97+
site_mask=site_mask,
98+
min_minor_ac=min_minor_ac,
99+
max_missing_an=max_missing_an,
100+
n_snps=n_snps,
101+
thin_offset=thin_offset,
102+
random_seed=random_seed,
103+
inline_array=inline_array,
104+
chunks=chunks,
105+
)
106+
107+
# Compute genotype reference counts.
108+
with self._dask_progress(desc="Computing genotype ref counts"):
109+
gt = ds_snps["call_genotype"].data
110+
gn = allel.GenotypeDaskArray(gt).to_n_ref(fill=-127).compute()
111+
112+
# Perform LD pruning.
113+
with self._spinner(desc="LD pruning"):
114+
loc_unlinked = allel.locate_unlinked(
115+
gn,
116+
size=ld_window_size,
117+
step=ld_window_step,
118+
threshold=ld_threshold,
119+
)
120+
121+
# Guard against empty result.
122+
if not np.any(loc_unlinked):
123+
raise ValueError(
124+
"LD pruning removed all variants. Consider using a less "
125+
"stringent ld_threshold or providing more variants via n_snps."
126+
)
127+
128+
# Apply the pruning mask.
129+
ds_pruned = _dask_compress_dataset(
130+
ds_snps, indexer=loc_unlinked, dim="variants"
131+
)
132+
133+
return ds_pruned

malariagen_data/anoph/ld_params.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
"""Parameters for LD pruning functions."""
2+
3+
from typing_extensions import Annotated, TypeAlias
4+
5+
ld_window_size: TypeAlias = Annotated[
6+
int,
7+
"Window size in number of SNPs for LD pruning.",
8+
]
9+
10+
ld_window_size_default: ld_window_size = 500
11+
12+
ld_window_step: TypeAlias = Annotated[
13+
int,
14+
"Step size in number of SNPs for LD pruning.",
15+
]
16+
17+
ld_window_step_default: ld_window_step = 200
18+
19+
ld_threshold: TypeAlias = Annotated[
20+
float,
21+
"r-squared threshold for LD pruning. SNP pairs with r-squared above "
22+
"this threshold will be considered linked.",
23+
]
24+
25+
ld_threshold_default: ld_threshold = 0.1

malariagen_data/anopheles.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
from .anoph.sample_metadata import AnophelesSampleMetadata
3636
from .anoph.snp_data import AnophelesSnpData
3737
from .anoph.to_plink import PlinkConverter
38+
from .anoph.ld import AnophelesLdAnalysis
3839
from .anoph.to_vcf import SnpVcfExporter
3940
from .anoph.g123 import AnophelesG123Analysis
4041
from .anoph.fst import AnophelesFstAnalysis
@@ -88,6 +89,7 @@ class AnophelesDataResource(
8889
AnophelesDistanceAnalysis,
8990
AnophelesPca,
9091
PlinkConverter,
92+
AnophelesLdAnalysis,
9193
SnpVcfExporter,
9294
AnophelesIgv,
9395
AnophelesKaryotypeAnalysis,

notebooks/ld_pruning.ipynb

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "a1b2c3d4-e5f6-7890-abcd-ef1234567890",
6+
"metadata": {},
7+
"source": [
8+
"# LD-pruned biallelic SNP calls\n",
9+
"\n",
10+
"LD pruning removes redundant SNPs that are in linkage disequilibrium, reducing correlation between variants. This is important for analyses like PCA and ADMIXTURE where independent markers are assumed.\n",
11+
"\n",
12+
"`biallelic_snp_calls_ld_pruned()` wraps `biallelic_snp_calls()` and applies LD pruning via `scikit-allel`'s `locate_unlinked()`. The output retains the same dataset structure and is compatible with existing downstream methods."
13+
]
14+
},
15+
{
16+
"cell_type": "markdown",
17+
"id": "b2c3d4e5-f6a7-8901-bcde-f12345678901",
18+
"metadata": {},
19+
"source": [
20+
"## Setup"
21+
]
22+
},
23+
{
24+
"cell_type": "code",
25+
"execution_count": null,
26+
"id": "c3d4e5f6-a7b8-9012-cdef-123456789012",
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"import malariagen_data"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": null,
36+
"id": "d4e5f6a7-b8c9-0123-defa-234567890123",
37+
"metadata": {},
38+
"outputs": [],
39+
"source": [
40+
"ag3 = malariagen_data.Ag3(\n",
41+
" \"simplecache::gs://vo_agam_release_master_us_central1\",\n",
42+
" simplecache=dict(cache_storage=\"../gcs_cache\"),\n",
43+
" results_cache=\"results_cache\",\n",
44+
")\n",
45+
"ag3"
46+
]
47+
},
48+
{
49+
"cell_type": "markdown",
50+
"id": "e5f6a7b8-c9d0-1234-efab-345678901234",
51+
"metadata": {},
52+
"source": [
53+
"## Basic usage"
54+
]
55+
},
56+
{
57+
"cell_type": "code",
58+
"execution_count": null,
59+
"id": "f6a7b8c9-d0e1-2345-fabc-456789012345",
60+
"metadata": {},
61+
"outputs": [],
62+
"source": [
63+
"ds_pruned = ag3.biallelic_snp_calls_ld_pruned(\n",
64+
" region=\"3L\",\n",
65+
" n_snps=100_000,\n",
66+
" sample_sets=\"AG1000G-BF-A\",\n",
67+
")\n",
68+
"ds_pruned"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": null,
74+
"id": "a7b8c9d0-e1f2-3456-abcd-567890123456",
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"# Inspect dimensions.\n",
79+
"print(f\"variants: {ds_pruned.sizes['variants']}\")\n",
80+
"print(f\"samples: {ds_pruned.sizes['samples']}\")\n",
81+
"print(f\"alleles: {ds_pruned.sizes['alleles']}\")"
82+
]
83+
},
84+
{
85+
"cell_type": "markdown",
86+
"id": "b8c9d0e1-f2a3-4567-bcde-678901234567",
87+
"metadata": {},
88+
"source": [
89+
"## Effect of LD parameters"
90+
]
91+
},
92+
{
93+
"cell_type": "code",
94+
"execution_count": null,
95+
"id": "c9d0e1f2-a3b4-5678-cdef-789012345678",
96+
"metadata": {},
97+
"outputs": [],
98+
"source": [
99+
"# Stricter threshold (0.1, default) removes more correlated SNPs.\n",
100+
"ds_strict = ag3.biallelic_snp_calls_ld_pruned(\n",
101+
" region=\"3L\",\n",
102+
" n_snps=100_000,\n",
103+
" sample_sets=\"AG1000G-BF-A\",\n",
104+
" ld_threshold=0.1,\n",
105+
")\n",
106+
"\n",
107+
"# More lenient threshold retains more SNPs.\n",
108+
"ds_lenient = ag3.biallelic_snp_calls_ld_pruned(\n",
109+
" region=\"3L\",\n",
110+
" n_snps=100_000,\n",
111+
" sample_sets=\"AG1000G-BF-A\",\n",
112+
" ld_threshold=0.5,\n",
113+
")\n",
114+
"\n",
115+
"print(f\"strict (threshold=0.1): {ds_strict.sizes['variants']} variants\")\n",
116+
"print(f\"lenient (threshold=0.5): {ds_lenient.sizes['variants']} variants\")"
117+
]
118+
},
119+
{
120+
"cell_type": "markdown",
121+
"id": "d0e1f2a3-b4c5-6789-defa-890123456789",
122+
"metadata": {},
123+
"source": [
124+
"## Before vs after pruning"
125+
]
126+
},
127+
{
128+
"cell_type": "code",
129+
"execution_count": null,
130+
"id": "e1f2a3b4-c5d6-7890-efab-901234567890",
131+
"metadata": {},
132+
"outputs": [],
133+
"source": [
134+
"# Get the thinned (but not LD-pruned) SNPs for comparison.\n",
135+
"ds_before = ag3.biallelic_snp_calls(\n",
136+
" region=\"3L\",\n",
137+
" n_snps=100_000,\n",
138+
" sample_sets=\"AG1000G-BF-A\",\n",
139+
")\n",
140+
"\n",
141+
"print(f\"before LD pruning: {ds_before.sizes['variants']} variants\")\n",
142+
"print(f\"after LD pruning: {ds_pruned.sizes['variants']} variants\")\n",
143+
"print(f\"removed: {ds_before.sizes['variants'] - ds_pruned.sizes['variants']} variants\")"
144+
]
145+
},
146+
{
147+
"cell_type": "markdown",
148+
"id": "f2a3b4c5-d6e7-8901-fabc-012345678901",
149+
"metadata": {},
150+
"source": [
151+
"## Downstream compatibility\n",
152+
"\n",
153+
"The pruned dataset retains the same structure as `biallelic_snp_calls()` output, so it can be passed directly into existing workflows like PCA."
154+
]
155+
},
156+
{
157+
"cell_type": "code",
158+
"execution_count": null,
159+
"id": "a3b4c5d6-e7f8-9012-abcd-123456789abc",
160+
"metadata": {},
161+
"outputs": [],
162+
"source": [
163+
"# Verify the pruned dataset has all variables expected by downstream methods.\n",
164+
"assert \"call_genotype\" in ds_pruned\n",
165+
"assert \"variant_allele\" in ds_pruned\n",
166+
"assert \"variant_contig\" in ds_pruned.coords\n",
167+
"assert \"variant_position\" in ds_pruned.coords\n",
168+
"assert \"sample_id\" in ds_pruned.coords\n",
169+
"\n",
170+
"# Shape sanity check.\n",
171+
"n_variants = ds_pruned.sizes[\"variants\"]\n",
172+
"n_samples = ds_pruned.sizes[\"samples\"]\n",
173+
"assert ds_pruned[\"call_genotype\"].shape == (n_variants, n_samples, 2)\n",
174+
"assert ds_pruned[\"variant_allele\"].shape == (n_variants, 2)\n",
175+
"\n",
176+
"print(f\"Dataset is valid: {n_variants} variants × {n_samples} samples\")"
177+
]
178+
},
179+
{
180+
"cell_type": "code",
181+
"execution_count": null,
182+
"id": "b4c5d6e7-f8a9-0123-bcde-234567890bcd",
183+
"metadata": {},
184+
"outputs": [],
185+
"source": []
186+
}
187+
],
188+
"metadata": {
189+
"kernelspec": {
190+
"display_name": "Python 3 (ipykernel)",
191+
"language": "python",
192+
"name": "python3"
193+
},
194+
"language_info": {
195+
"codemirror_mode": {
196+
"name": "ipython",
197+
"version": 3
198+
},
199+
"file_extension": ".py",
200+
"mimetype": "text/x-python",
201+
"name": "python",
202+
"nbconvert_exporter": "python",
203+
"pygments_lexer": "ipython3",
204+
"version": "3.10.12"
205+
},
206+
"widgets": {
207+
"application/vnd.jupyter.widget-state+json": {
208+
"state": {},
209+
"version_major": 2,
210+
"version_minor": 0
211+
}
212+
}
213+
},
214+
"nbformat": 4,
215+
"nbformat_minor": 5
216+
}

0 commit comments

Comments
 (0)