Skip to content

Commit edebc2b

Browse files
authored
Merge branch 'master' into 618-bad-random-value
2 parents b8cd642 + 57c6d8a commit edebc2b

88 files changed

Lines changed: 854 additions & 12328 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/legacy_tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ jobs:
5353
uses: actions/cache/restore@v3
5454
with:
5555
path: gcs_cache
56-
key: gcs_cache_tests_20240324
56+
key: gcs_cache_tests_20240922
5757

5858
- name: Run full test suite
5959
run: poetry run pytest --durations=20 --ignore=tests/anoph -v tests
@@ -63,4 +63,4 @@ jobs:
6363
if: always()
6464
with:
6565
path: gcs_cache
66-
key: gcs_cache_tests_20240324
66+
key: gcs_cache_tests_20240922

.github/workflows/notebooks.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ jobs:
4949
uses: actions/cache/restore@v3
5050
with:
5151
path: gcs_cache
52-
key: gcs_cache_notebooks_20240324
52+
key: gcs_cache_notebooks_20240922
5353

5454
- name: Run notebooks
5555
run: poetry run jupyter nbconvert --execute notebooks/*.ipynb --inplace
@@ -59,4 +59,4 @@ jobs:
5959
if: always()
6060
with:
6161
path: gcs_cache
62-
key: gcs_cache_notebooks_20240324
62+
key: gcs_cache_notebooks_20240922

malariagen_data/af1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ class Af1(AnophelesDataResource):
6161
in a directory named "gcs_cache":
6262
6363
>>> af1 = malariagen_data.Af1(
64-
... "simplecache::gs://vo_afun_release",
64+
... "simplecache::gs://vo_afun_release_master_us_central1",
6565
... simplecache=dict(cache_storage="gcs_cache"),
6666
... )
6767

malariagen_data/ag3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class Ag3(AnophelesDataResource):
119119
in a directory named "gcs_cache":
120120
121121
>>> ag3 = malariagen_data.Ag3(
122-
... "simplecache::gs://vo_agam_release",
122+
... "simplecache::gs://vo_agam_release_master_us_central1",
123123
... simplecache=dict(cache_storage="gcs_cache"),
124124
... )
125125

malariagen_data/anoph/base_params.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -229,20 +229,28 @@ def validate_sample_selection_params(
229229
chunks: TypeAlias = Annotated[
230230
chunks_param_type,
231231
"""
232-
If 'auto' let dask decide chunk size. If 'native' use native zarr
233-
chunks. If 'ndauto' let dask decide chunk size but only for arrays with
234-
more than one dimension. If 'ndauto0' as 'ndauto' but only vary the first
235-
chunk dimension. If 'ndauto1' as 'ndauto' but only vary the second chunk
236-
dimension. If 'ndauto01' as 'ndauto' but only vary the first and second
237-
chunk dimensions. Also, can be a target size, e.g., '200 MiB', or a tuple of
238-
integers, or a callable which accepts the native chunks as a single argument
239-
and returns a valid dask chunks value.
232+
Define how input data being read from zarr should be divided into chunks
233+
for a dask computation. If 'native', use underlying zarr chunks. If a string
234+
specifying a target memory size, e.g., '300 MiB', resize chunks in arrays
235+
with more than one dimension to match this size. If 'auto', let dask decide
236+
chunk size. If 'ndauto', let dask decide chunk size but only for arrays with
237+
more than one dimension. If 'ndauto0', as 'ndauto' but only vary the first
238+
chunk dimension. If 'ndauto1', as 'ndauto' but only vary the second chunk
239+
dimension. If 'ndauto01', as 'ndauto' but only vary the first and second
240+
chunk dimensions. Also, can be a tuple of integers, or a callable which
241+
accepts the native chunks as a single argument and returns a valid dask
242+
chunks value.
240243
""",
241244
]
242245

243-
# The "ndauto0" value means auto-size chunks for arrays with more than one dimension,
244-
# allowing the first chunk dimension to be varied.
245-
chunks_default: chunks = "ndauto0"
246+
# Match the native zarr chunk sizes by default. N.B., some functions may
247+
# choose a different default, especially if they need to retrieve larger
248+
# amounts of data.
249+
native_chunks: chunks = "native"
250+
251+
# Alternative default chunk size, suitable for functions which need to
252+
# scan a large amount of data.
253+
large_chunks: chunks = "300MiB"
246254

247255
gff_attributes: TypeAlias = Annotated[
248256
Optional[Union[Sequence[str], str]],

malariagen_data/anoph/cnv_data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def cnv_hmm(
179179
sample_query: Optional[base_params.sample_query] = None,
180180
max_coverage_variance: cnv_params.max_coverage_variance = cnv_params.max_coverage_variance_default,
181181
inline_array: base_params.inline_array = base_params.inline_array_default,
182-
chunks: base_params.chunks = base_params.chunks_default,
182+
chunks: base_params.chunks = base_params.native_chunks,
183183
) -> xr.Dataset:
184184
debug = self._log.debug
185185

@@ -381,7 +381,7 @@ def cnv_coverage_calls(
381381
sample_set: base_params.sample_set,
382382
analysis: cnv_params.coverage_calls_analysis,
383383
inline_array: base_params.inline_array = base_params.inline_array_default,
384-
chunks: base_params.chunks = base_params.chunks_default,
384+
chunks: base_params.chunks = base_params.native_chunks,
385385
) -> xr.Dataset:
386386
debug = self._log.debug
387387

@@ -537,7 +537,7 @@ def cnv_discordant_read_calls(
537537
sample_sets: Optional[base_params.sample_sets] = None,
538538
sample_query: Optional[base_params.sample_query] = None,
539539
inline_array: base_params.inline_array = base_params.inline_array_default,
540-
chunks: base_params.chunks = base_params.chunks_default,
540+
chunks: base_params.chunks = base_params.native_chunks,
541541
) -> xr.Dataset:
542542
debug = self._log.debug
543543

malariagen_data/anoph/dipclust.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ def plot_diplotype_clustering(
7373
color_discrete_map: plotly_params.color_discrete_map = None,
7474
category_orders: plotly_params.category_order = None,
7575
legend_sizing: plotly_params.legend_sizing = "constant",
76+
chunks: base_params.chunks = base_params.native_chunks,
77+
inline_array: base_params.inline_array = base_params.inline_array_default,
7678
) -> Optional[dict]:
7779
import sys
7880

@@ -98,6 +100,8 @@ def plot_diplotype_clustering(
98100
cohort_size=cohort_size,
99101
distance_metric=distance_metric,
100102
random_seed=random_seed,
103+
chunks=chunks,
104+
inline_array=inline_array,
101105
)
102106

103107
# Align sample metadata with genotypes.
@@ -196,6 +200,8 @@ def diplotype_pairwise_distances(
196200
cohort_size: Optional[base_params.cohort_size] = None,
197201
distance_metric: dipclust_params.distance_metric = dipclust_params.distance_metric_default,
198202
random_seed: base_params.random_seed = 42,
203+
chunks: base_params.chunks = base_params.native_chunks,
204+
inline_array: base_params.inline_array = base_params.inline_array_default,
199205
) -> Tuple[np.ndarray, np.ndarray, int]:
200206
# Change this name if you ever change the behaviour of this function, to
201207
# invalidate any previously cached data.
@@ -220,7 +226,9 @@ def diplotype_pairwise_distances(
220226
results = self.results_cache_get(name=name, params=params)
221227

222228
except CacheMiss:
223-
results = self._diplotype_pairwise_distances(**params)
229+
results = self._diplotype_pairwise_distances(
230+
chunks=chunks, inline_array=inline_array, **params
231+
)
224232
self.results_cache_set(name=name, params=params, results=results)
225233

226234
# Unpack results.
@@ -241,6 +249,8 @@ def _diplotype_pairwise_distances(
241249
cohort_size,
242250
distance_metric,
243251
random_seed,
252+
chunks,
253+
inline_array,
244254
):
245255
if distance_metric == "cityblock":
246256
metric = multiallelic_diplotype_mean_cityblock
@@ -256,6 +266,8 @@ def _diplotype_pairwise_distances(
256266
site_class=site_class,
257267
cohort_size=cohort_size,
258268
random_seed=random_seed,
269+
chunks=chunks,
270+
inline_array=inline_array,
259271
)
260272

261273
with self._dask_progress(desc="Load genotypes for distance calculation"):
@@ -302,6 +314,8 @@ def _dipclust_het_bar_trace(
302314
cohort_size: Optional[base_params.cohort_size],
303315
random_seed: base_params.random_seed,
304316
color_continuous_scale: Optional[plotly_params.color_continuous_scale],
317+
chunks: base_params.chunks = base_params.native_chunks,
318+
inline_array: base_params.inline_array = base_params.inline_array_default,
305319
):
306320
ds_snps = self.snp_calls(
307321
region=region,
@@ -310,6 +324,8 @@ def _dipclust_het_bar_trace(
310324
cohort_size=cohort_size,
311325
site_mask=site_mask,
312326
random_seed=random_seed,
327+
chunks=chunks,
328+
inline_array=inline_array,
313329
)
314330

315331
# Strictly speaking we are loading the genotypes for the second time here,
@@ -361,6 +377,8 @@ def _dipclust_cnv_bar_trace(
361377
sample_query: Optional[base_params.sample_query],
362378
max_coverage_variance: Optional[cnv_params.max_coverage_variance],
363379
colorscale: Optional[plotly_params.color_continuous_scale],
380+
chunks: base_params.chunks = base_params.native_chunks,
381+
inline_array: base_params.inline_array = base_params.inline_array_default,
364382
):
365383
try:
366384
# TODO The gene_cnv() method still needs to get migrated to the
@@ -372,6 +390,8 @@ def _dipclust_cnv_bar_trace(
372390
sample_sets=sample_sets,
373391
sample_query=sample_query,
374392
max_coverage_variance=max_coverage_variance,
393+
chunks=chunks,
394+
inline_array=inline_array,
375395
)
376396

377397
except ValueError:
@@ -422,6 +442,8 @@ def _dipclust_snp_trace(
422442
dendro_sample_id_order: np.ndarray,
423443
snp_filter_min_maf: float,
424444
snp_colorscale: Optional[plotly_params.color_continuous_scale],
445+
chunks: base_params.chunks = base_params.native_chunks,
446+
inline_array: base_params.inline_array = base_params.inline_array_default,
425447
):
426448
# load genotype allele counts at SNP variants for each sample
427449
df_snps = self.snp_genotype_allele_counts(
@@ -430,6 +452,8 @@ def _dipclust_snp_trace(
430452
sample_query=sample_query,
431453
sample_sets=sample_sets,
432454
site_mask=site_mask,
455+
chunks=chunks,
456+
inline_array=inline_array,
433457
)
434458
df_snps = df_snps.set_index("label")
435459

@@ -557,6 +581,8 @@ def plot_diplotype_clustering_advanced(
557581
color_discrete_map: plotly_params.color_discrete_map = None,
558582
category_orders: plotly_params.category_order = None,
559583
legend_sizing: plotly_params.legend_sizing = "constant",
584+
chunks: base_params.chunks = base_params.native_chunks,
585+
inline_array: base_params.inline_array = base_params.inline_array_default,
560586
):
561587
if cohort_size and snp_transcript:
562588
cohort_size = None
@@ -592,6 +618,8 @@ def plot_diplotype_clustering_advanced(
592618
category_orders=category_orders,
593619
legend_sizing=legend_sizing,
594620
random_seed=random_seed,
621+
chunks=chunks,
622+
inline_array=inline_array,
595623
)
596624

597625
fig_dendro = res["figure"]
@@ -611,6 +639,8 @@ def plot_diplotype_clustering_advanced(
611639
site_mask=site_mask,
612640
color_continuous_scale=heterozygosity_colorscale,
613641
random_seed=random_seed,
642+
chunks=chunks,
643+
inline_array=inline_array,
614644
)
615645
figures.append(het_trace)
616646
subplot_heights.append(heterozygosity_height)
@@ -623,6 +653,8 @@ def plot_diplotype_clustering_advanced(
623653
sample_query=sample_query,
624654
max_coverage_variance=cnv_max_coverage_variance,
625655
colorscale=cnv_colorscale,
656+
chunks=chunks,
657+
inline_array=inline_array,
626658
)
627659
# N.B., sometimes no CNV data may be available, so check to
628660
# see if the trace is not None.
@@ -640,6 +672,8 @@ def plot_diplotype_clustering_advanced(
640672
dendro_sample_id_order=dendro_sample_id_order,
641673
snp_filter_min_maf=snp_filter_min_maf,
642674
snp_colorscale=snp_colorscale,
675+
chunks=chunks,
676+
inline_array=inline_array,
643677
)
644678

645679
if snp_trace:

malariagen_data/anoph/fst.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ def fst_gwss(
115115
] = fst_params.max_cohort_size_default,
116116
random_seed: base_params.random_seed = 42,
117117
inline_array: base_params.inline_array = base_params.inline_array_default,
118-
chunks: base_params.chunks = base_params.chunks_default,
118+
chunks: base_params.chunks = base_params.large_chunks,
119119
clip_min: fst_params.clip_min = 0.0,
120120
) -> Tuple[np.ndarray, np.ndarray]:
121121
# Change this name if you ever change the behaviour of this function, to

malariagen_data/anoph/g123.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ def g123_gwss(
161161
] = g123_params.max_cohort_size_default,
162162
random_seed: base_params.random_seed = 42,
163163
inline_array: base_params.inline_array = base_params.inline_array_default,
164-
chunks: base_params.chunks = base_params.chunks_default,
164+
chunks: base_params.chunks = base_params.large_chunks,
165165
) -> Tuple[np.ndarray, np.ndarray]:
166166
# Change this name if you ever change the behaviour of this function, to
167167
# invalidate any previously cached data.
@@ -264,7 +264,7 @@ def g123_calibration(
264264
window_sizes: g123_params.window_sizes = g123_params.window_sizes_default,
265265
random_seed: base_params.random_seed = 42,
266266
inline_array: base_params.inline_array = base_params.inline_array_default,
267-
chunks: base_params.chunks = base_params.chunks_default,
267+
chunks: base_params.chunks = base_params.large_chunks,
268268
) -> Mapping[str, np.ndarray]:
269269
# Change this name if you ever change the behaviour of this function, to
270270
# invalidate any previously cached data.
@@ -323,7 +323,7 @@ def plot_g123_gwss_track(
323323
x_range: Optional[gplt_params.x_range] = None,
324324
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
325325
inline_array: base_params.inline_array = base_params.inline_array_default,
326-
chunks: base_params.chunks = base_params.chunks_default,
326+
chunks: base_params.chunks = base_params.large_chunks,
327327
) -> gplt_params.figure:
328328
# compute G123
329329
x, g123 = self.g123_gwss(
@@ -424,7 +424,7 @@ def plot_g123_gwss(
424424
show: gplt_params.show = True,
425425
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
426426
inline_array: base_params.inline_array = base_params.inline_array_default,
427-
chunks: base_params.chunks = base_params.chunks_default,
427+
chunks: base_params.chunks = base_params.large_chunks,
428428
) -> gplt_params.figure:
429429
# gwss track
430430
fig1 = self.plot_g123_gwss_track(
@@ -497,7 +497,7 @@ def plot_g123_calibration(
497497
title: Optional[gplt_params.title] = None,
498498
show: gplt_params.show = True,
499499
inline_array: base_params.inline_array = base_params.inline_array_default,
500-
chunks: base_params.chunks = base_params.chunks_default,
500+
chunks: base_params.chunks = base_params.large_chunks,
501501
) -> gplt_params.figure:
502502
# get g123 values
503503
calibration_runs = self.g123_calibration(

malariagen_data/anoph/genome_sequence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ def genome_sequence(
109109
self,
110110
region: base_params.region,
111111
inline_array: base_params.inline_array = base_params.inline_array_default,
112-
chunks: base_params.chunks = base_params.chunks_default,
112+
chunks: base_params.chunks = base_params.native_chunks,
113113
) -> da.Array:
114114
# Parse the region parameter into a Region object.
115115
resolved_region: Region = parse_single_region(self, region)

0 commit comments

Comments
 (0)