|
1 | 1 | from typing import Optional |
2 | 2 |
|
| 3 | +import numpy as np |
| 4 | + |
3 | 5 | import allel # type: ignore |
4 | 6 | import xarray as xr |
5 | 7 | from numpydoc_decorator import doc # type: ignore |
@@ -31,6 +33,15 @@ def __init__( |
31 | 33 | using scikit-allel's `locate_unlinked` function. The resulting dataset |
32 | 34 | can be used as input to ADMIXTURE workflows or exported to PLINK format. |
33 | 35 |
|
| 36 | + LD pruning is controlled by three parameters: |
| 37 | +
|
| 38 | + - `ld_window_size`: number of SNPs in the sliding window used to |
| 39 | + compute pairwise r-squared. |
| 40 | + - `ld_window_step`: number of SNPs to advance the window each |
| 41 | + iteration. |
| 42 | + - `ld_threshold`: maximum r-squared value; SNP pairs above this |
| 43 | + are considered linked and one will be removed. |
| 44 | +
|
34 | 45 | Note that `n_snps` is required to control memory usage. Without |
35 | 46 | pre-thinning, LD pruning could attempt to materialise millions of |
36 | 47 | variants and run out of memory. |
@@ -68,6 +79,14 @@ def biallelic_snp_calls_ld_pruned( |
68 | 79 | sample_query=sample_query, sample_indices=sample_indices |
69 | 80 | ) |
70 | 81 |
|
| 82 | + # Validate LD parameters. |
| 83 | + if ld_window_size <= 0: |
| 84 | + raise ValueError(f"ld_window_size must be > 0, got {ld_window_size}") |
| 85 | + if ld_window_step <= 0: |
| 86 | + raise ValueError(f"ld_window_step must be > 0, got {ld_window_step}") |
| 87 | + if not (0 < ld_threshold <= 1): |
| 88 | + raise ValueError(f"ld_threshold must be in (0, 1], got {ld_threshold}") |
| 89 | + |
71 | 90 | # Obtain biallelic SNP calls with thinning applied first. |
72 | 91 | ds_snps = self.biallelic_snp_calls( |
73 | 92 | region=region, |
@@ -99,6 +118,13 @@ def biallelic_snp_calls_ld_pruned( |
99 | 118 | threshold=ld_threshold, |
100 | 119 | ) |
101 | 120 |
|
| 121 | + # Guard against empty result. |
| 122 | + if not np.any(loc_unlinked): |
| 123 | + raise ValueError( |
| 124 | + "LD pruning removed all variants. Consider using a less " |
| 125 | + "stringent ld_threshold or providing more variants via n_snps." |
| 126 | + ) |
| 127 | + |
102 | 128 | # Apply the pruning mask. |
103 | 129 | ds_pruned = _dask_compress_dataset( |
104 | 130 | ds_snps, indexer=loc_unlinked, dim="variants" |
|
0 commit comments