2929import typeguard
3030import xarray as xr
3131import zarr # type: ignore
32+
33+ # zarr >= 2.11.0
34+ from zarr .storage import BaseStore # type: ignore
3235from fsspec .core import url_to_fs # type: ignore
3336from fsspec .mapping import FSMap # type: ignore
3437from numpydoc_decorator .impl import humanize_type # type: ignore
@@ -113,46 +116,40 @@ def unpack_gff3_attributes(df: pd.DataFrame, attributes: Tuple[str, ...]):
113116 return df
114117
115118
class SafeStore(BaseStore):
    """Wrap a zarr store so that missing chunks raise an error rather
    than being silently filled.

    zarr treats a ``KeyError`` from the underlying store as "chunk not
    written" and substitutes the fill value. There should be no missing
    chunks in any of the datasets we host, so a missing key is surfaced
    as ``FileNotFoundError``, which zarr propagates to the caller.

    The wrapper is read-only: mutation methods raise
    ``NotImplementedError``.
    """

    def __init__(self, store):
        # The wrapped store; all access is delegated to it.
        self._store = store

    def __getitem__(self, key):
        try:
            return self._store[key]
        except KeyError as e:
            # Raise a different error to ensure zarr propagates the
            # exception, rather than filling with the fill value.
            raise FileNotFoundError(e)

    def __getattr__(self, attr):
        if attr == "__setstate__":
            # Special method called during unpickling, don't pass through.
            raise AttributeError(attr)
        # Pass through all other attribute access to the wrapped store.
        return getattr(self._store, attr)

    def __iter__(self):
        return iter(self._store)

    def __len__(self):
        return len(self._store)

    def __setitem__(self, key, value):
        # N.B., __setitem__ must accept (key, value); a single-parameter
        # signature would raise TypeError on `store[k] = v` instead of
        # the intended NotImplementedError. This wrapper is read-only.
        raise NotImplementedError

    def __delitem__(self, key):
        # This wrapper is read-only.
        raise NotImplementedError
157154
158155class SiteClass (Enum ):
@@ -269,7 +266,11 @@ def da_from_zarr(
269266 dask_chunks = chunks
270267
271268 kwargs = dict (
272- chunks = dask_chunks , fancy = False , lock = False , inline_array = inline_array
269+ inline_array = inline_array ,
270+ chunks = dask_chunks ,
271+ fancy = True ,
272+ lock = False ,
273+ asarray = True ,
273274 )
274275 try :
275276 d = da .from_array (z , ** kwargs )
@@ -301,14 +302,19 @@ def dask_compress_dataset(ds, indexer, dim):
301302 indexer = ds [indexer ].data
302303
303304 # sanity checks
304- assert isinstance (indexer , da .Array )
305305 assert indexer .ndim == 1
306306 assert indexer .dtype == bool
307307 assert indexer .shape [0 ] == ds .sizes [dim ]
308308
309- # temporarily compute the indexer once, to avoid multiple reads from
310- # the underlying data
311- indexer_computed = indexer .compute ()
309+ if isinstance (indexer , da .Array ):
310+ # temporarily compute the indexer once, to avoid multiple reads from
311+ # the underlying data
312+ indexer_computed = indexer .compute ()
313+ else :
314+ assert isinstance (indexer , np .ndarray )
315+ indexer_computed = indexer
316+ indexer_zarr = zarr .array (indexer_computed )
317+ indexer = da_from_zarr (indexer_zarr , chunks = "native" , inline_array = True )
312318
313319 coords = dict ()
314320 for k in ds .coords :
@@ -353,32 +359,36 @@ def da_compress(
353359):
354360 """Wrapper for dask.array.compress() which computes chunk sizes faster."""
355361
356- # sanity checks
362+ # Sanity checks.
363+ assert indexer .ndim == 1
364+ assert indexer .dtype == bool
357365 assert indexer .shape [0 ] == data .shape [axis ]
358366
359- # useful variables
367+ # Useful variables.
360368 old_chunks = data .chunks
361369 axis_old_chunks = old_chunks [axis ]
362370
363- # load the indexer temporarily for chunk size computations
371+ # Load the indexer temporarily for chunk size computations.
364372 if indexer_computed is None :
365373 indexer_computed = indexer .compute ()
366374
367- # ensure indexer and data are chunked in the same way
375+ # Ensure indexer and data are chunked in the same way.
368376 indexer = indexer .rechunk ((axis_old_chunks ,))
369377
370- # apply the indexing operation
378+ # Apply the indexing operation.
371379 v = da .compress (indexer , data , axis = axis )
372380
373- # need to compute chunks sizes in order to know dimension sizes;
381+ # Need to compute chunks sizes in order to know dimension sizes;
374382 # would normally do v.compute_chunk_sizes() but that is slow for
375- # multidimensional arrays, so hack something more efficient
376-
383+ # multidimensional arrays, so hack something more efficient.
377384 axis_new_chunks_list = []
378385 slice_start = 0
386+ need_rechunk = False
379387 for old_chunk_size in axis_old_chunks :
380388 slice_stop = slice_start + old_chunk_size
381- new_chunk_size = np .sum (indexer_computed [slice_start :slice_stop ])
389+ new_chunk_size = int (np .sum (indexer_computed [slice_start :slice_stop ]))
390+ if new_chunk_size == 0 :
391+ need_rechunk = True
382392 axis_new_chunks_list .append (new_chunk_size )
383393 slice_start = slice_stop
384394 axis_new_chunks = tuple (axis_new_chunks_list )
@@ -387,6 +397,23 @@ def da_compress(
387397 )
388398 v ._chunks = new_chunks
389399
400+ # Deal with empty chunks, they break reductions.
401+ # Possibly related to https://github.com/dask/dask/issues/10327
402+ # and https://github.com/dask/dask/issues/2794
403+ if need_rechunk :
404+ axis_new_chunks_nonzero = tuple ([x for x in axis_new_chunks if x > 0 ])
405+ # Edge case, all chunks empty:
406+ if len (axis_new_chunks_nonzero ) == 0 :
407+ # Not much we can do about this, no data.
408+ axis_new_chunks_nonzero = (0 ,)
409+ new_chunks_nonzero = tuple (
410+ [
411+ axis_new_chunks_nonzero if i == axis else c
412+ for i , c in enumerate (new_chunks )
413+ ]
414+ )
415+ v = v .rechunk (new_chunks_nonzero )
416+
390417 return v
391418
392419
@@ -1461,6 +1488,64 @@ def apply_allele_mapping(x, mapping, max_allele):
14611488 return out
14621489
14631490
def dask_apply_allele_mapping(v, mapping, max_allele):
    """Apply ``apply_allele_mapping`` blockwise over dask arrays.

    ``v`` is a 2D dask array of per-variant values and ``mapping`` is a
    2D dask array aligned with ``v`` on the first axis. The output has
    ``max_allele + 1`` columns.
    """
    # Only dask arrays with matching first dimensions are supported.
    assert isinstance(v, da.Array)
    assert isinstance(mapping, da.Array)
    assert v.ndim == 2 and mapping.ndim == 2
    assert v.shape[0] == mapping.shape[0]

    # Align chunking: a single chunk along the second axis, and identical
    # chunking along the first axis, so blocks pair up one-to-one.
    v = v.rechunk((v.chunks[0], -1))
    mapping = mapping.rechunk((v.chunks[0], -1))

    def _apply(vb, mb):
        # Per-block application with max_allele bound from the closure.
        return apply_allele_mapping(vb, mb, max_allele=max_allele)

    result = da.map_blocks(
        _apply,
        v,
        mapping,
        dtype=v.dtype,
        chunks=(v.chunks[0], [max_allele + 1]),
    )
    return result
1507+
1508+
def genotype_array_map_alleles(gt, mapping):
    """Transform genotype calls via an allele mapping.

    N.B., scikit-allel does not handle empty blocks well, so empty
    blocks are passed through unchanged — there are no alleles to map.
    """
    # Sanity checks on block shapes.
    assert isinstance(gt, np.ndarray)
    assert isinstance(mapping, np.ndarray)
    assert gt.ndim == 3
    assert mapping.ndim == 3
    assert gt.shape[0] == mapping.shape[0]
    assert gt.shape[1] > 0
    assert gt.shape[2] == 2

    if gt.size == 0:
        # Empty block, nothing to map.
        assert gt.shape[0] == 0
        return gt

    # Non-empty block: drop the broadcast (ploidy) axis from the mapping
    # and delegate to scikit-allel.
    assert gt.shape[0] > 0
    m = mapping[:, 0, :]
    return allel.GenotypeArray(gt).map_alleles(m).values
1530+
1531+
def dask_genotype_array_map_alleles(gt, mapping):
    """Apply ``genotype_array_map_alleles`` blockwise over dask arrays.

    ``gt`` is a 3D dask array of genotype calls and ``mapping`` is a 2D
    dask array aligned with ``gt`` on the first (variants) axis.
    """
    # Sanity checks.
    assert isinstance(gt, da.Array)
    assert isinstance(mapping, da.Array)
    assert gt.ndim == 3
    assert mapping.ndim == 2
    assert gt.shape[0] == mapping.shape[0]

    # Chunk the mapping like gt along the variants axis with a single
    # chunk along alleles, then insert a length-1 middle axis so mapping
    # blocks broadcast against genotype blocks.
    mapping_aligned = mapping.rechunk((gt.chunks[0], -1))
    mapping_blocks = mapping_aligned[:, None, :]

    result = da.map_blocks(
        genotype_array_map_alleles,
        gt,
        mapping_blocks,
        chunks=gt.chunks,
        dtype=gt.dtype,
    )
    return result
1547+
1548+
14641549def pandas_apply (f , df , columns ):
14651550 """Optimised alternative to pandas apply."""
14661551 df = df .reset_index (drop = True )
0 commit comments