Skip to content

Commit ae8ba96

Browse files
authored
Merge branch 'master' into GH410_add_sample_query_options
2 parents 074caf0 + 69d0576 commit ae8ba96

11 files changed

Lines changed: 70 additions & 68 deletions

File tree

malariagen_data/ag3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .anopheles import AnophelesDataResource
99

1010
# silence dask performance warnings
11-
dask.config.set(**{"array.slicing.split_large_chunks": False}) # type: ignore
11+
# NOTE: "array.slicing.split_large_chunks" is the name of the Dask option
# itself; it is unrelated to base_params.native_chunks and must not be renamed.
dask.config.set(**{"array.slicing.split_large_chunks": False})  # type: ignore
1212

1313
MAJOR_VERSION_NUMBER = 3
1414
MAJOR_VERSION_PATH = "v3"

malariagen_data/anoph/base_params.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,6 @@ def validate_sample_selection_params(
256256
# amounts of data.
257257
native_chunks: chunks = "native"
258258

259-
# Alternative default chunk size, suitable for functions which need to
260-
# scan a large amount of data.
261-
large_chunks: chunks = "300MiB"
262-
263259
gff_attributes: TypeAlias = Annotated[
264260
Optional[Union[Sequence[str], str]],
265261
"""

malariagen_data/anoph/fst.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def fst_gwss(
115115
] = fst_params.max_cohort_size_default,
116116
random_seed: base_params.random_seed = 42,
117117
inline_array: base_params.inline_array = base_params.inline_array_default,
118-
chunks: base_params.chunks = base_params.large_chunks,
118+
chunks: base_params.chunks = base_params.native_chunks,
119119
clip_min: fst_params.clip_min = 0.0,
120120
) -> Tuple[np.ndarray, np.ndarray]:
121121
# Change this name if you ever change the behaviour of this function, to

malariagen_data/anoph/g123.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def g123_gwss(
166166
] = g123_params.max_cohort_size_default,
167167
random_seed: base_params.random_seed = 42,
168168
inline_array: base_params.inline_array = base_params.inline_array_default,
169-
chunks: base_params.chunks = base_params.large_chunks,
169+
chunks: base_params.chunks = base_params.native_chunks,
170170
) -> Tuple[np.ndarray, np.ndarray]:
171171
# Change this name if you ever change the behaviour of this function, to
172172
# invalidate any previously cached data.
@@ -273,7 +273,7 @@ def g123_calibration(
273273
window_sizes: g123_params.window_sizes = g123_params.window_sizes_default,
274274
random_seed: base_params.random_seed = 42,
275275
inline_array: base_params.inline_array = base_params.inline_array_default,
276-
chunks: base_params.chunks = base_params.large_chunks,
276+
chunks: base_params.chunks = base_params.native_chunks,
277277
) -> Mapping[str, np.ndarray]:
278278
# Change this name if you ever change the behaviour of this function, to
279279
# invalidate any previously cached data.
@@ -334,7 +334,7 @@ def plot_g123_gwss_track(
334334
x_range: Optional[gplt_params.x_range] = None,
335335
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
336336
inline_array: base_params.inline_array = base_params.inline_array_default,
337-
chunks: base_params.chunks = base_params.large_chunks,
337+
chunks: base_params.chunks = base_params.native_chunks,
338338
) -> gplt_params.figure:
339339
# compute G123
340340
x, g123 = self.g123_gwss(
@@ -437,7 +437,7 @@ def plot_g123_gwss(
437437
show: gplt_params.show = True,
438438
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
439439
inline_array: base_params.inline_array = base_params.inline_array_default,
440-
chunks: base_params.chunks = base_params.large_chunks,
440+
chunks: base_params.chunks = base_params.native_chunks,
441441
) -> gplt_params.figure:
442442
# gwss track
443443
fig1 = self.plot_g123_gwss_track(
@@ -512,7 +512,7 @@ def plot_g123_calibration(
512512
title: Optional[gplt_params.title] = None,
513513
show: gplt_params.show = True,
514514
inline_array: base_params.inline_array = base_params.inline_array_default,
515-
chunks: base_params.chunks = base_params.large_chunks,
515+
chunks: base_params.chunks = base_params.native_chunks,
516516
) -> gplt_params.figure:
517517
# get g123 values
518518
calibration_runs = self.g123_calibration(

malariagen_data/anoph/h12.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def h12_calibration(
8888
] = h12_params.max_cohort_size_default,
8989
window_sizes: h12_params.window_sizes = h12_params.window_sizes_default,
9090
random_seed: base_params.random_seed = 42,
91-
chunks: base_params.chunks = base_params.large_chunks,
91+
chunks: base_params.chunks = base_params.native_chunks,
9292
inline_array: base_params.inline_array = base_params.inline_array_default,
9393
) -> Mapping[str, np.ndarray]:
9494
# Change this name if you ever change the behaviour of this function, to
@@ -148,7 +148,7 @@ def plot_h12_calibration(
148148
random_seed: base_params.random_seed = 42,
149149
title: Optional[str] = None,
150150
show: bool = True,
151-
chunks: base_params.chunks = base_params.large_chunks,
151+
chunks: base_params.chunks = base_params.native_chunks,
152152
inline_array: base_params.inline_array = base_params.inline_array_default,
153153
) -> gplt_params.figure:
154154
# Get H12 values.
@@ -295,7 +295,7 @@ def h12_gwss(
295295
base_params.max_cohort_size
296296
] = h12_params.max_cohort_size_default,
297297
random_seed: base_params.random_seed = 42,
298-
chunks: base_params.chunks = base_params.large_chunks,
298+
chunks: base_params.chunks = base_params.native_chunks,
299299
inline_array: base_params.inline_array = base_params.inline_array_default,
300300
) -> Tuple[np.ndarray, np.ndarray]:
301301
# Change this name if you ever change the behaviour of this function, to
@@ -357,7 +357,7 @@ def plot_h12_gwss_track(
357357
show: gplt_params.show = True,
358358
x_range: Optional[gplt_params.x_range] = None,
359359
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
360-
chunks: base_params.chunks = base_params.large_chunks,
360+
chunks: base_params.chunks = base_params.native_chunks,
361361
inline_array: base_params.inline_array = base_params.inline_array_default,
362362
) -> gplt_params.figure:
363363
# Compute H12.
@@ -460,7 +460,7 @@ def plot_h12_gwss(
460460
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
461461
show: gplt_params.show = True,
462462
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
463-
chunks: base_params.chunks = base_params.large_chunks,
463+
chunks: base_params.chunks = base_params.native_chunks,
464464
inline_array: base_params.inline_array = base_params.inline_array_default,
465465
) -> gplt_params.figure:
466466
# Plot GWSS track.

malariagen_data/anoph/h1x.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def h1x_gwss(
112112
base_params.max_cohort_size
113113
] = h12_params.max_cohort_size_default,
114114
random_seed: base_params.random_seed = 42,
115-
chunks: base_params.chunks = base_params.large_chunks,
115+
chunks: base_params.chunks = base_params.native_chunks,
116116
inline_array: base_params.inline_array = base_params.inline_array_default,
117117
) -> Tuple[np.ndarray, np.ndarray]:
118118
# Change this name if you ever change the behaviour of this function, to
@@ -177,7 +177,7 @@ def plot_h1x_gwss_track(
177177
show: gplt_params.show = True,
178178
x_range: Optional[gplt_params.x_range] = None,
179179
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
180-
chunks: base_params.chunks = base_params.large_chunks,
180+
chunks: base_params.chunks = base_params.native_chunks,
181181
inline_array: base_params.inline_array = base_params.inline_array_default,
182182
) -> gplt_params.figure:
183183
# Compute H1X.
@@ -283,7 +283,7 @@ def plot_h1x_gwss(
283283
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
284284
show: gplt_params.show = True,
285285
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
286-
chunks: base_params.chunks = base_params.large_chunks,
286+
chunks: base_params.chunks = base_params.native_chunks,
287287
inline_array: base_params.inline_array = base_params.inline_array_default,
288288
) -> gplt_params.figure:
289289
# Plot GWSS track.

malariagen_data/anoph/pca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def pca(
7676
fit_exclude_samples: Optional[base_params.samples] = None,
7777
random_seed: base_params.random_seed = 42,
7878
inline_array: base_params.inline_array = base_params.inline_array_default,
79-
chunks: base_params.chunks = base_params.large_chunks,
79+
chunks: base_params.chunks = base_params.native_chunks,
8080
) -> Tuple[pca_params.df_pca, pca_params.evr]:
8181
# Change this name if you ever change the behaviour of this function, to
8282
# invalidate any previously cached data.

malariagen_data/anoph/snp_data.py

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
DIM_VARIANT,
1818
CacheMiss,
1919
Region,
20+
apply_allele_mapping,
2021
check_types,
2122
da_compress,
2223
da_concat,
@@ -1667,30 +1668,20 @@ def biallelic_snp_calls(
16671668
variant_position = ds_bi["variant_position"].data
16681669
coords["variant_position"] = ("variants",), variant_position
16691670

1670-
# Prepare allele mapping for dask computations.
1671-
allele_mapping_zarr = zarr.array(allele_mapping)
1672-
allele_mapping_dask = da_from_zarr(
1673-
allele_mapping_zarr, chunks="native", inline_array=True
1674-
)
1675-
16761671
# Store alleles, transformed.
16771672
variant_allele_dask = ds_bi["variant_allele"].data
16781673
variant_allele_out = dask_apply_allele_mapping(
1679-
variant_allele_dask, allele_mapping_dask, max_allele=1
1674+
variant_allele_dask, allele_mapping, max_allele=1
16801675
)
16811676
data_vars["variant_allele"] = ("variants", "alleles"), variant_allele_out
16821677

16831678
# Store allele counts, transformed.
1684-
ac_bi_zarr = zarr.array(ac_bi)
1685-
ac_bi_dask = da_from_zarr(ac_bi_zarr, chunks="native", inline_array=True)
1686-
ac_out = dask_apply_allele_mapping(
1687-
ac_bi_dask, allele_mapping_dask, max_allele=1
1688-
)
1679+
ac_out = apply_allele_mapping(ac_bi, allele_mapping, max_allele=1)
16891680
data_vars["variant_allele_count"] = ("variants", "alleles"), ac_out
16901681

16911682
# Store genotype calls, transformed.
16921683
gt_dask = ds_bi["call_genotype"].data
1693-
gt_out = dask_genotype_array_map_alleles(gt_dask, allele_mapping_dask)
1684+
gt_out = dask_genotype_array_map_alleles(gt_dask, allele_mapping)
16941685
data_vars["call_genotype"] = (
16951686
(
16961687
"variants",
@@ -1705,9 +1696,8 @@ def biallelic_snp_calls(
17051696

17061697
# Apply conditions.
17071698
if max_missing_an is not None or min_minor_ac is not None:
1708-
ac_out_computed = ac_out.compute()
17091699
loc_out = np.ones(ds_out.sizes["variants"], dtype=bool)
1710-
an = ac_out_computed.sum(axis=1)
1700+
an = ac_out.sum(axis=1)
17111701

17121702
# Apply missingness condition.
17131703
if max_missing_an is not None:
@@ -1721,7 +1711,7 @@ def biallelic_snp_calls(
17211711

17221712
# Apply minor allele count condition.
17231713
if min_minor_ac is not None:
1724-
ac_minor = ac_out_computed.min(axis=1)
1714+
ac_minor = ac_out.min(axis=1)
17251715
if isinstance(min_minor_ac, float):
17261716
ac_minor_frac = ac_minor / an
17271717
loc_minor = ac_minor_frac >= min_minor_ac

malariagen_data/anopheles.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,7 +1646,7 @@ def cohort_diversity_stats(
16461646
random_seed: base_params.random_seed = 42,
16471647
n_jack: base_params.n_jack = 200,
16481648
confidence_level: base_params.confidence_level = 0.95,
1649-
chunks: base_params.chunks = base_params.large_chunks,
1649+
chunks: base_params.chunks = base_params.native_chunks,
16501650
inline_array: base_params.inline_array = base_params.inline_array_default,
16511651
) -> pd.Series:
16521652
debug = self._log.debug
@@ -1753,7 +1753,7 @@ def diversity_stats(
17531753
random_seed: base_params.random_seed = 42,
17541754
n_jack: base_params.n_jack = 200,
17551755
confidence_level: base_params.confidence_level = 0.95,
1756-
chunks: base_params.chunks = base_params.large_chunks,
1756+
chunks: base_params.chunks = base_params.native_chunks,
17571757
inline_array: base_params.inline_array = base_params.inline_array_default,
17581758
) -> pd.DataFrame:
17591759
# Normalise cohorts parameter.
@@ -1960,7 +1960,7 @@ def ihs_gwss(
19601960
base_params.max_cohort_size
19611961
] = ihs_params.max_cohort_size_default,
19621962
random_seed: base_params.random_seed = 42,
1963-
chunks: base_params.chunks = base_params.large_chunks,
1963+
chunks: base_params.chunks = base_params.native_chunks,
19641964
inline_array: base_params.inline_array = base_params.inline_array_default,
19651965
) -> Tuple[np.ndarray, np.ndarray]:
19661966
# change this name if you ever change the behaviour of this function, to
@@ -2141,7 +2141,7 @@ def plot_ihs_gwss_track(
21412141
show: gplt_params.show = True,
21422142
x_range: Optional[gplt_params.x_range] = None,
21432143
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
2144-
chunks: base_params.chunks = base_params.large_chunks,
2144+
chunks: base_params.chunks = base_params.native_chunks,
21452145
inline_array: base_params.inline_array = base_params.inline_array_default,
21462146
) -> gplt_params.figure:
21472147
# compute ihs
@@ -2283,7 +2283,7 @@ def plot_xpehh_gwss(
22832283
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
22842284
show: gplt_params.show = True,
22852285
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
2286-
chunks: base_params.chunks = base_params.large_chunks,
2286+
chunks: base_params.chunks = base_params.native_chunks,
22872287
inline_array: base_params.inline_array = base_params.inline_array_default,
22882288
) -> gplt_params.figure:
22892289
# gwss track
@@ -2383,7 +2383,7 @@ def plot_ihs_gwss(
23832383
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
23842384
show: gplt_params.show = True,
23852385
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
2386-
chunks: base_params.chunks = base_params.large_chunks,
2386+
chunks: base_params.chunks = base_params.native_chunks,
23872387
inline_array: base_params.inline_array = base_params.inline_array_default,
23882388
) -> gplt_params.figure:
23892389
# gwss track
@@ -2479,7 +2479,7 @@ def xpehh_gwss(
24792479
base_params.max_cohort_size
24802480
] = xpehh_params.max_cohort_size_default,
24812481
random_seed: base_params.random_seed = 42,
2482-
chunks: base_params.chunks = base_params.large_chunks,
2482+
chunks: base_params.chunks = base_params.native_chunks,
24832483
inline_array: base_params.inline_array = base_params.inline_array_default,
24842484
) -> Tuple[np.ndarray, np.ndarray]:
24852485
# change this name if you ever change the behaviour of this function, to
@@ -2658,7 +2658,7 @@ def plot_xpehh_gwss_track(
26582658
show: gplt_params.show = True,
26592659
x_range: Optional[gplt_params.x_range] = None,
26602660
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
2661-
chunks: base_params.chunks = base_params.large_chunks,
2661+
chunks: base_params.chunks = base_params.native_chunks,
26622662
inline_array: base_params.inline_array = base_params.inline_array_default,
26632663
) -> gplt_params.figure:
26642664
# compute xpehh
@@ -3311,7 +3311,7 @@ def plot_njt(
33113311
max_cohort_size: Optional[base_params.max_cohort_size] = None,
33123312
random_seed: base_params.random_seed = 42,
33133313
inline_array: base_params.inline_array = base_params.inline_array_default,
3314-
chunks: base_params.chunks = base_params.large_chunks,
3314+
chunks: base_params.chunks = base_params.native_chunks,
33153315
) -> plotly_params.figure:
33163316
from biotite.sequence.phylo import neighbor_joining # type: ignore
33173317
from scipy.spatial.distance import squareform # type: ignore

0 commit comments

Comments
 (0)