Skip to content

Commit b8cd642

Browse files
authored
Merge branch 'master' into 618-bad-random-value
2 parents 4b49866 + 5ec5de3 commit b8cd642

17 files changed

Lines changed: 2552 additions & 126 deletions

File tree

docs/source/Af1.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ All the functions below can then be accessed as methods on the ``af1`` object. E
1414

1515
df_samples = af1.sample_metadata()
1616

17-
For more information about the data and terns of use, please see the
17+
For more information about the data and terms of use, please see the
1818
`MalariaGEN Anopheles funestus genomic surveillance project <https://www.malariagen.net/projects/anopheles-funestus-genomic-surveillance-project>`_
1919
home page.
2020

docs/source/Ag3.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ All the functions below can then be accessed as methods on the ``ag3`` object. E
1414

1515
df_samples = ag3.sample_metadata()
1616

17-
For more information about the data and terns of use, please see the
17+
For more information about the data and terms of use, please see the
1818
`MalariaGEN Anopheles gambiae genomic surveillance project <https://www.malariagen.net/anopheles-gambiae-genomic-surveillance-project>`_
1919
home page.
2020

docs/source/Amin1.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ All the functions below can then be accessed as methods on the ``amin1`` object.
1414

1515
df_samples = amin1.sample_metadata()
1616

17-
For more information about the data and terns of use, please see the
17+
For more information about the data and terms of use, please see the
1818
`MalariaGEN Vector Observatory Asia <https://www.malariagen.net/mosquito/vector-observatory-asia>`_
1919
home page.
2020

docs/source/_static/switcher.json

Lines changed: 7 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,20 @@
11
[
2+
{
3+
"name": "13.0.0",
4+
"version": "v13.0.0",
5+
"url": "https:///malariagen.github.io/malariagen-data-python/v13.0.0/",
6+
"preferred": true
7+
},
28
{
39
"name": "12.0.0",
410
"version": "v12.0.0",
5-
"url": "https:///malariagen.github.io/malariagen-data-python/v12.0.0/",
6-
"preferred": true
11+
"url": "https:///malariagen.github.io/malariagen-data-python/v12.0.0/"
712
},
813
{
914
"name": "11.0.0",
1015
"version": "v11.0.0",
1116
"url": "https:///malariagen.github.io/malariagen-data-python/v11.0.0/"
1217
},
13-
{
14-
"name": "10.2.0",
15-
"version": "v10.2.0",
16-
"url": "https:///malariagen.github.io/malariagen-data-python/v10.2.0/"
17-
},
18-
{
19-
"name": "10.1.0",
20-
"version": "v10.1.0",
21-
"url": "https:///malariagen.github.io/malariagen-data-python/v10.1.0/"
22-
},
2318
{
2419
"name": "10.0.0",
2520
"version": "v10.0.0",
@@ -30,46 +25,6 @@
3025
"version": "v9.0.0",
3126
"url": "https:///malariagen.github.io/malariagen-data-python/v9.0.0/"
3227
},
33-
{
34-
"name": "8.8.0",
35-
"version": "v8.8.0",
36-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.8.0/"
37-
},
38-
{
39-
"name": "8.7.0",
40-
"version": "v8.7.0",
41-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.7.0/"
42-
},
43-
{
44-
"name": "8.6.0",
45-
"version": "v8.6.0",
46-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.6.0/"
47-
},
48-
{
49-
"name": "8.5.0",
50-
"version": "v8.5.0",
51-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.5.0/"
52-
},
53-
{
54-
"name": "8.4.0",
55-
"version": "v8.4.0",
56-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.4.0/"
57-
},
58-
{
59-
"name": "8.3.0",
60-
"version": "v8.3.0",
61-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.3.0/"
62-
},
63-
{
64-
"name": "8.2.0",
65-
"version": "v8.2.0",
66-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.2.0/"
67-
},
68-
{
69-
"name": "8.1.0",
70-
"version": "v8.1.0",
71-
"url": "https:///malariagen.github.io/malariagen-data-python/v8.1.0/"
72-
},
7328
{
7429
"name": "8.0.0",
7530
"version": "v8.0.0",
@@ -80,11 +35,6 @@
8035
"version": "v7.15.0",
8136
"url": "https:///malariagen.github.io/malariagen-data-python/v7.15.0/"
8237
},
83-
{
84-
"name": "7.14.1",
85-
"version": "v7.14.1",
86-
"url": "https:///malariagen.github.io/malariagen-data-python/v7.14.1/"
87-
},
8838
{
8939
"name": "7.14.0",
9040
"version": "v7.14.0",

malariagen_data/anoph/base.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
LoggingHelper,
3333
check_colab_location,
3434
check_types,
35+
distributed_client,
3536
get_gcp_region,
3637
hash_params,
3738
init_filesystem,
@@ -174,9 +175,13 @@ def _dask_progress(self, desc=None, leave=False, **kwargs): # pragma: no cover
174175
# Progress doesn't mix well with debug logging.
175176
show_progress = self._show_progress and not self._debug
176177
if show_progress:
177-
return TqdmCallback(
178-
desc=desc, leave=leave, tqdm_class=self._tqdm_class, **kwargs
179-
)
178+
if distributed_client():
179+
# Cannot easily show progress, fall back to spinner.
180+
return self._spinner(desc=desc)
181+
else:
182+
return TqdmCallback(
183+
desc=desc, leave=leave, tqdm_class=self._tqdm_class, **kwargs
184+
)
180185
else:
181186
return nullcontext()
182187

malariagen_data/anoph/base_params.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""General parameters common to many functions in the public API."""
22

3-
from typing import Final, List, Mapping, Optional, Sequence, Tuple, Union, Callable
3+
from typing import Final, List, Mapping, Optional, Sequence, Tuple, Union
44

55
from typing_extensions import Annotated, TypeAlias
66

@@ -9,6 +9,7 @@
99
region_param_type,
1010
single_contig_param_type,
1111
single_region_param_type,
12+
chunks_param_type,
1213
)
1314

1415
contig: TypeAlias = Annotated[
@@ -226,15 +227,22 @@ def validate_sample_selection_params(
226227
inline_array_default: inline_array = True
227228

228229
chunks: TypeAlias = Annotated[
229-
Union[str, Tuple[int, ...], Callable[[Tuple[int, ...]], Tuple[int, ...]]],
230+
chunks_param_type,
230231
"""
231232
If 'auto' let dask decide chunk size. If 'native' use native zarr
232-
chunks. Also, can be a target size, e.g., '200 MiB', or a tuple of
233-
integers.
233+
chunks. If 'ndauto' let dask decide chunk size but only for arrays with
234+
more than one dimension. If 'ndauto0' as 'ndauto' but only vary the first
235+
chunk dimension. If 'ndauto1' as 'ndauto' but only vary the second chunk
236+
dimension. If 'ndauto01' as 'ndauto' but only vary the first and second
237+
chunk dimensions. Also, can be a target size, e.g., '200 MiB', or a tuple of
238+
integers, or a callable which accepts the native chunks as a single argument
239+
and returns a valid dask chunks value.
234240
""",
235241
]
236242

237-
chunks_default: chunks = "native"
243+
# The "ndauto0" value means auto-size chunks for arrays with more than one dimension,
244+
# allowing the first chunk dimension to be varied.
245+
chunks_default: chunks = "ndauto0"
238246

239247
gff_attributes: TypeAlias = Annotated[
240248
Optional[Union[Sequence[str], str]],
@@ -263,19 +271,21 @@ def validate_sample_selection_params(
263271
]
264272

265273
min_minor_ac: TypeAlias = Annotated[
266-
int,
274+
Union[int, float],
267275
"""
268276
The minimum minor allele count. SNPs with a minor allele count
269-
below this value will be excluded.
277+
below this value will be excluded. Can also be a float, which will
278+
be interpreted as a fraction.
270279
""",
271280
]
272281

273282
max_missing_an: TypeAlias = Annotated[
274-
int,
283+
Union[int, float],
275284
"""
276285
The maximum number of missing allele calls to accept. SNPs with
277286
more than this value will be excluded. Set to 0 to require no
278-
missing calls.
287+
missing calls. Can also be a float, which will be interpreted as
288+
a fraction.
279289
""",
280290
]
281291

malariagen_data/anoph/pca.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,13 +71,15 @@ def pca(
7171
cohort_size: Optional[base_params.cohort_size] = None,
7272
min_cohort_size: Optional[base_params.min_cohort_size] = None,
7373
max_cohort_size: Optional[base_params.max_cohort_size] = None,
74+
exclude_samples: Optional[base_params.samples] = None,
75+
fit_exclude_samples: Optional[base_params.samples] = None,
7476
random_seed: base_params.random_seed = 42,
7577
inline_array: base_params.inline_array = base_params.inline_array_default,
7678
chunks: base_params.chunks = base_params.chunks_default,
7779
) -> Tuple[pca_params.df_pca, pca_params.evr]:
7880
# Change this name if you ever change the behaviour of this function, to
7981
# invalidate any previously cached data.
80-
name = "pca_v3"
82+
name = "pca_v4"
8183

8284
# Normalize params for consistent hash value.
8385
(
@@ -104,6 +106,8 @@ def pca(
104106
cohort_size=cohort_size,
105107
min_cohort_size=min_cohort_size,
106108
max_cohort_size=max_cohort_size,
109+
exclude_samples=exclude_samples,
110+
fit_exclude_samples=fit_exclude_samples,
107111
random_seed=random_seed,
108112
)
109113

@@ -119,11 +123,11 @@ def pca(
119123
coords = results["coords"]
120124
evr = results["evr"]
121125
samples = results["samples"]
126+
loc_keep_fit = results["loc_keep_fit"]
122127

123128
# Load sample metadata.
124129
df_samples = self.sample_metadata(
125130
sample_sets=sample_sets,
126-
sample_indices=sample_indices_prepped,
127131
)
128132

129133
# Ensure aligned with genotype data.
@@ -134,6 +138,8 @@ def pca(
134138
{f"PC{i + 1}": coords[:, i] for i in range(coords.shape[1])}
135139
)
136140
df_pca = df_samples.join(df_coords, how="inner")
141+
# Add a column for which samples were included in fitting.
142+
df_pca["pca_fit"] = loc_keep_fit
137143

138144
return df_pca, evr
139145

@@ -153,6 +159,8 @@ def _pca(
153159
cohort_size,
154160
min_cohort_size,
155161
max_cohort_size,
162+
exclude_samples,
163+
fit_exclude_samples,
156164
random_seed,
157165
chunks,
158166
inline_array,
@@ -177,12 +185,39 @@ def _pca(
177185
)
178186

179187
with self._spinner(desc="Compute PCA"):
188+
# Exclude any samples prior to computing PCA.
189+
if exclude_samples is not None:
190+
x = np.array(exclude_samples, dtype="U")
191+
loc_keep = ~np.isin(samples, x)
192+
samples = samples[loc_keep]
193+
gn = gn[:, loc_keep]
194+
195+
# Exclude any samples from fitting only.
196+
if fit_exclude_samples is not None:
197+
xf = np.array(fit_exclude_samples, dtype="U")
198+
loc_keep_fit = ~np.isin(samples, xf)
199+
gn_fit = gn[:, loc_keep_fit]
200+
else:
201+
loc_keep_fit = np.ones(len(samples), dtype=bool)
202+
gn_fit = gn
203+
180204
# Remove any sites where all genotypes are identical.
181-
loc_var = np.any(gn != gn[:, 0, np.newaxis], axis=1)
205+
loc_var = np.any(gn_fit != gn_fit[:, 0, np.newaxis], axis=1)
206+
gn_fit_var = np.compress(loc_var, gn_fit, axis=0)
182207
gn_var = np.compress(loc_var, gn, axis=0)
183208

184209
# Run the PCA.
185-
coords, model = allel.pca(gn_var, n_components=n_components)
210+
if fit_exclude_samples is None:
211+
# Simple fit and transform on the same data.
212+
coords, model = allel.pca(gn_fit_var, n_components=n_components)
213+
214+
else:
215+
# Fit and transform separately.
216+
model = allel.stats.decomposition.GenotypePCA(
217+
n_components=n_components,
218+
)
219+
model.fit(gn_fit_var)
220+
coords = model.transform(gn_var, copy=False)
186221

187222
# Work around sign indeterminacy.
188223
for i in range(coords.shape[1]):
@@ -191,7 +226,10 @@ def _pca(
191226
coords[:, i] = c * -1
192227

193228
results = dict(
194-
samples=samples, coords=coords, evr=model.explained_variance_ratio_
229+
samples=samples,
230+
coords=coords,
231+
evr=model.explained_variance_ratio_,
232+
loc_keep_fit=loc_keep_fit,
195233
)
196234
return results
197235

malariagen_data/anoph/snp_data.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1582,6 +1582,8 @@ def biallelic_snp_calls(
15821582
min_cohort_size=min_cohort_size,
15831583
max_cohort_size=max_cohort_size,
15841584
random_seed=random_seed,
1585+
inline_array=inline_array,
1586+
chunks=chunks,
15851587
)
15861588

15871589
# Locate biallelic SNPs.
@@ -1657,18 +1659,26 @@ def biallelic_snp_calls(
16571659
# Apply conditions.
16581660
if max_missing_an is not None or min_minor_ac is not None:
16591661
loc_out = np.ones(ds_out.sizes["variants"], dtype=bool)
1662+
an = ac_out.sum(axis=1)
16601663

16611664
# Apply missingness condition.
16621665
if max_missing_an is not None:
1663-
an = ac_out.sum(axis=1)
16641666
an_missing = (ds_out.sizes["samples"] * ds_out.sizes["ploidy"]) - an
1665-
loc_missing = an_missing <= max_missing_an
1667+
if isinstance(max_missing_an, float):
1668+
an_missing_frac = an_missing / an
1669+
loc_missing = an_missing_frac <= max_missing_an
1670+
else:
1671+
loc_missing = an_missing <= max_missing_an
16661672
loc_out &= loc_missing
16671673

16681674
# Apply minor allele count condition.
16691675
if min_minor_ac is not None:
16701676
ac_minor = ac_out.min(axis=1)
1671-
loc_minor = ac_minor >= min_minor_ac
1677+
if isinstance(min_minor_ac, float):
1678+
ac_minor_frac = ac_minor / an
1679+
loc_minor = ac_minor_frac >= min_minor_ac
1680+
else:
1681+
loc_minor = ac_minor >= min_minor_ac
16721682
loc_out &= loc_minor
16731683

16741684
ds_out = ds_out.isel(variants=loc_out)

0 commit comments

Comments
 (0)