Skip to content

Commit 4077e9a

Browse files
authored
Merge branch 'master' into 618-bad-random-value
2 parents 2e7de3f + 69d0576 commit 4077e9a

19 files changed

Lines changed: 713 additions & 3687 deletions

malariagen_data/ag3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from .anopheles import AnophelesDataResource
99

1010
# silence dask performance warnings
11-
dask.config.set(**{"array.slicing.split_large_chunks": False}) # type: ignore
11+
# NOTE: the dask option is named "split_large_chunks"; it is unrelated to the
# base_params.large_chunks -> native_chunks rename and must keep its name,
# otherwise the performance warning is not silenced.
dask.config.set(**{"array.slicing.split_large_chunks": False}) # type: ignore
1212

1313
MAJOR_VERSION_NUMBER = 3
1414
MAJOR_VERSION_PATH = "v3"

malariagen_data/anoph/base_params.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,10 +248,6 @@ def validate_sample_selection_params(
248248
# amounts of data.
249249
native_chunks: chunks = "native"
250250

251-
# Alternative default chunk size, suitable for functions which need to
252-
# scan a large amount of data.
253-
large_chunks: chunks = "300MiB"
254-
255251
gff_attributes: TypeAlias = Annotated[
256252
Optional[Union[Sequence[str], str]],
257253
"""

malariagen_data/anoph/fst.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def fst_gwss(
115115
] = fst_params.max_cohort_size_default,
116116
random_seed: base_params.random_seed = 42,
117117
inline_array: base_params.inline_array = base_params.inline_array_default,
118-
chunks: base_params.chunks = base_params.large_chunks,
118+
chunks: base_params.chunks = base_params.native_chunks,
119119
clip_min: fst_params.clip_min = 0.0,
120120
) -> Tuple[np.ndarray, np.ndarray]:
121121
# Change this name if you ever change the behaviour of this function, to

malariagen_data/anoph/g123.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def g123_gwss(
161161
] = g123_params.max_cohort_size_default,
162162
random_seed: base_params.random_seed = 42,
163163
inline_array: base_params.inline_array = base_params.inline_array_default,
164-
chunks: base_params.chunks = base_params.large_chunks,
164+
chunks: base_params.chunks = base_params.native_chunks,
165165
) -> Tuple[np.ndarray, np.ndarray]:
166166
# Change this name if you ever change the behaviour of this function, to
167167
# invalidate any previously cached data.
@@ -264,7 +264,7 @@ def g123_calibration(
264264
window_sizes: g123_params.window_sizes = g123_params.window_sizes_default,
265265
random_seed: base_params.random_seed = 42,
266266
inline_array: base_params.inline_array = base_params.inline_array_default,
267-
chunks: base_params.chunks = base_params.large_chunks,
267+
chunks: base_params.chunks = base_params.native_chunks,
268268
) -> Mapping[str, np.ndarray]:
269269
# Change this name if you ever change the behaviour of this function, to
270270
# invalidate any previously cached data.
@@ -323,7 +323,7 @@ def plot_g123_gwss_track(
323323
x_range: Optional[gplt_params.x_range] = None,
324324
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
325325
inline_array: base_params.inline_array = base_params.inline_array_default,
326-
chunks: base_params.chunks = base_params.large_chunks,
326+
chunks: base_params.chunks = base_params.native_chunks,
327327
) -> gplt_params.figure:
328328
# compute G123
329329
x, g123 = self.g123_gwss(
@@ -424,7 +424,7 @@ def plot_g123_gwss(
424424
show: gplt_params.show = True,
425425
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
426426
inline_array: base_params.inline_array = base_params.inline_array_default,
427-
chunks: base_params.chunks = base_params.large_chunks,
427+
chunks: base_params.chunks = base_params.native_chunks,
428428
) -> gplt_params.figure:
429429
# gwss track
430430
fig1 = self.plot_g123_gwss_track(
@@ -497,7 +497,7 @@ def plot_g123_calibration(
497497
title: Optional[gplt_params.title] = None,
498498
show: gplt_params.show = True,
499499
inline_array: base_params.inline_array = base_params.inline_array_default,
500-
chunks: base_params.chunks = base_params.large_chunks,
500+
chunks: base_params.chunks = base_params.native_chunks,
501501
) -> gplt_params.figure:
502502
# get g123 values
503503
calibration_runs = self.g123_calibration(

malariagen_data/anoph/h12.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def h12_calibration(
8585
] = h12_params.max_cohort_size_default,
8686
window_sizes: h12_params.window_sizes = h12_params.window_sizes_default,
8787
random_seed: base_params.random_seed = 42,
88-
chunks: base_params.chunks = base_params.large_chunks,
88+
chunks: base_params.chunks = base_params.native_chunks,
8989
inline_array: base_params.inline_array = base_params.inline_array_default,
9090
) -> Mapping[str, np.ndarray]:
9191
# Change this name if you ever change the behaviour of this function, to
@@ -143,7 +143,7 @@ def plot_h12_calibration(
143143
random_seed: base_params.random_seed = 42,
144144
title: Optional[str] = None,
145145
show: bool = True,
146-
chunks: base_params.chunks = base_params.large_chunks,
146+
chunks: base_params.chunks = base_params.native_chunks,
147147
inline_array: base_params.inline_array = base_params.inline_array_default,
148148
) -> gplt_params.figure:
149149
# Get H12 values.
@@ -286,7 +286,7 @@ def h12_gwss(
286286
base_params.max_cohort_size
287287
] = h12_params.max_cohort_size_default,
288288
random_seed: base_params.random_seed = 42,
289-
chunks: base_params.chunks = base_params.large_chunks,
289+
chunks: base_params.chunks = base_params.native_chunks,
290290
inline_array: base_params.inline_array = base_params.inline_array_default,
291291
) -> Tuple[np.ndarray, np.ndarray]:
292292
# Change this name if you ever change the behaviour of this function, to
@@ -346,7 +346,7 @@ def plot_h12_gwss_track(
346346
show: gplt_params.show = True,
347347
x_range: Optional[gplt_params.x_range] = None,
348348
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
349-
chunks: base_params.chunks = base_params.large_chunks,
349+
chunks: base_params.chunks = base_params.native_chunks,
350350
inline_array: base_params.inline_array = base_params.inline_array_default,
351351
) -> gplt_params.figure:
352352
# Compute H12.
@@ -447,7 +447,7 @@ def plot_h12_gwss(
447447
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
448448
show: gplt_params.show = True,
449449
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
450-
chunks: base_params.chunks = base_params.large_chunks,
450+
chunks: base_params.chunks = base_params.native_chunks,
451451
inline_array: base_params.inline_array = base_params.inline_array_default,
452452
) -> gplt_params.figure:
453453
# Plot GWSS track.

malariagen_data/anoph/h1x.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ def h1x_gwss(
112112
base_params.max_cohort_size
113113
] = h12_params.max_cohort_size_default,
114114
random_seed: base_params.random_seed = 42,
115-
chunks: base_params.chunks = base_params.large_chunks,
115+
chunks: base_params.chunks = base_params.native_chunks,
116116
inline_array: base_params.inline_array = base_params.inline_array_default,
117117
) -> Tuple[np.ndarray, np.ndarray]:
118118
# Change this name if you ever change the behaviour of this function, to
@@ -177,7 +177,7 @@ def plot_h1x_gwss_track(
177177
show: gplt_params.show = True,
178178
x_range: Optional[gplt_params.x_range] = None,
179179
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
180-
chunks: base_params.chunks = base_params.large_chunks,
180+
chunks: base_params.chunks = base_params.native_chunks,
181181
inline_array: base_params.inline_array = base_params.inline_array_default,
182182
) -> gplt_params.figure:
183183
# Compute H1X.
@@ -283,7 +283,7 @@ def plot_h1x_gwss(
283283
genes_height: gplt_params.genes_height = gplt_params.genes_height_default,
284284
show: gplt_params.show = True,
285285
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
286-
chunks: base_params.chunks = base_params.large_chunks,
286+
chunks: base_params.chunks = base_params.native_chunks,
287287
inline_array: base_params.inline_array = base_params.inline_array_default,
288288
) -> gplt_params.figure:
289289
# Plot GWSS track.

malariagen_data/anoph/map_params.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
from typing_extensions import Annotated, TypeAlias
88

99
center: TypeAlias = Annotated[
10-
Tuple[int, int],
10+
Tuple[Union[int, float], Union[int, float]],
1111
"Location to center the map.",
1212
]
1313

1414
center_default: center = (-2, 20)
1515

16-
zoom: TypeAlias = Annotated[int, "Initial zoom level."]
16+
zoom: TypeAlias = Annotated[Union[int, float], "Initial zoom level."]
1717

1818
zoom_default: zoom = 3
1919

malariagen_data/anoph/pca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def pca(
7575
fit_exclude_samples: Optional[base_params.samples] = None,
7676
random_seed: base_params.random_seed = 42,
7777
inline_array: base_params.inline_array = base_params.inline_array_default,
78-
chunks: base_params.chunks = base_params.large_chunks,
78+
chunks: base_params.chunks = base_params.native_chunks,
7979
) -> Tuple[pca_params.df_pca, pca_params.evr]:
8080
# Change this name if you ever change the behaviour of this function, to
8181
# invalidate any previously cached data.

malariagen_data/anoph/snp_data.py

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
da_compress,
2323
da_concat,
2424
da_from_zarr,
25+
dask_apply_allele_mapping,
2526
dask_compress_dataset,
27+
dask_genotype_array_map_alleles,
2628
init_zarr_store,
2729
locate_region,
2830
parse_multi_region,
@@ -565,6 +567,7 @@ def _snp_variants_for_contig(
565567
ref = da_from_zarr(ref_z, inline_array=inline_array, chunks=chunks)
566568
alt = da_from_zarr(alt_z, inline_array=inline_array, chunks=chunks)
567569
variant_allele = da.concatenate([ref[:, None], alt], axis=1)
570+
variant_allele = variant_allele.rechunk((variant_allele.chunks[0], -1))
568571
data_vars["variant_allele"] = [DIM_VARIANT, DIM_ALLELE], variant_allele
569572

570573
# Set up variant_contig.
@@ -1611,7 +1614,7 @@ def biallelic_snp_calls(
16111614

16121615
with self._spinner("Prepare biallelic SNP calls"):
16131616
# Subset to biallelic sites.
1614-
ds_bi = ds.isel(variants=loc_bi)
1617+
ds_bi = dask_compress_dataset(ds, indexer=loc_bi, dim="variants")
16151618

16161619
# Start building a new dataset.
16171620
coords: Dict[str, Any] = dict()
@@ -1624,33 +1627,30 @@ def biallelic_snp_calls(
16241627
coords["variant_contig"] = ("variants",), ds_bi["variant_contig"].data
16251628

16261629
# Store position.
1627-
coords["variant_position"] = ("variants",), ds_bi["variant_position"].data
1630+
variant_position = ds_bi["variant_position"].data
1631+
coords["variant_position"] = ("variants",), variant_position
16281632

16291633
# Store alleles, transformed.
1630-
variant_allele = ds_bi["variant_allele"].data
1631-
variant_allele = variant_allele.rechunk((variant_allele.chunks[0], -1))
1632-
variant_allele_out = da.map_blocks(
1633-
lambda block: apply_allele_mapping(block, allele_mapping, max_allele=1),
1634-
variant_allele,
1635-
dtype=variant_allele.dtype,
1636-
chunks=(variant_allele.chunks[0], [2]),
1634+
variant_allele_dask = ds_bi["variant_allele"].data
1635+
variant_allele_out = dask_apply_allele_mapping(
1636+
variant_allele_dask, allele_mapping, max_allele=1
16371637
)
16381638
data_vars["variant_allele"] = ("variants", "alleles"), variant_allele_out
16391639

1640-
# Store allele counts, transformed, so we don't have to recompute.
1640+
# Store allele counts, transformed.
16411641
ac_out = apply_allele_mapping(ac_bi, allele_mapping, max_allele=1)
16421642
data_vars["variant_allele_count"] = ("variants", "alleles"), ac_out
16431643

16441644
# Store genotype calls, transformed.
1645-
gt = ds_bi["call_genotype"].data
1646-
gt_out = allel.GenotypeDaskArray(gt).map_alleles(allele_mapping)
1645+
gt_dask = ds_bi["call_genotype"].data
1646+
gt_out = dask_genotype_array_map_alleles(gt_dask, allele_mapping)
16471647
data_vars["call_genotype"] = (
16481648
(
16491649
"variants",
16501650
"samples",
16511651
"ploidy",
16521652
),
1653-
gt_out.values,
1653+
gt_out,
16541654
)
16551655

16561656
# Build dataset.
@@ -1681,12 +1681,13 @@ def biallelic_snp_calls(
16811681
loc_minor = ac_minor >= min_minor_ac
16821682
loc_out &= loc_minor
16831683

1684-
ds_out = ds_out.isel(variants=loc_out)
1684+
# Apply selection from conditions.
1685+
ds_out = dask_compress_dataset(ds_out, indexer=loc_out, dim="variants")
16851686

16861687
# Try to meet target number of SNPs.
16871688
if n_snps is not None:
16881689
if ds_out.sizes["variants"] > (n_snps * 2):
1689-
# Do some thinning.
1690+
# Apply thinning.
16901691
thin_step = ds_out.sizes["variants"] // n_snps
16911692
loc_thin = slice(thin_offset, None, thin_step)
16921693
ds_out = ds_out.isel(variants=loc_thin)

malariagen_data/anoph/snp_frq.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,7 @@ def plot_frequencies_map_markers(
11071107
"""
11081108
marker.popup = ipyleaflet.Popup(
11091109
child=ipywidgets.HTML(popup_html),
1110+
auto_pan=False,
11101111
)
11111112
m.add(marker)
11121113

0 commit comments

Comments
 (0)