2222 da_compress ,
2323 da_concat ,
2424 da_from_zarr ,
25+ dask_apply_allele_mapping ,
2526 dask_compress_dataset ,
27+ dask_genotype_array_map_alleles ,
2628 init_zarr_store ,
2729 locate_region ,
2830 parse_multi_region ,
@@ -565,6 +567,7 @@ def _snp_variants_for_contig(
565567 ref = da_from_zarr (ref_z , inline_array = inline_array , chunks = chunks )
566568 alt = da_from_zarr (alt_z , inline_array = inline_array , chunks = chunks )
567569 variant_allele = da .concatenate ([ref [:, None ], alt ], axis = 1 )
570+ variant_allele = variant_allele .rechunk ((variant_allele .chunks [0 ], - 1 ))
568571 data_vars ["variant_allele" ] = [DIM_VARIANT , DIM_ALLELE ], variant_allele
569572
570573 # Set up variant_contig.
@@ -1611,7 +1614,7 @@ def biallelic_snp_calls(
16111614
16121615 with self ._spinner ("Prepare biallelic SNP calls" ):
16131616 # Subset to biallelic sites.
1614- ds_bi = ds . isel ( variants = loc_bi )
1617+ ds_bi = dask_compress_dataset ( ds , indexer = loc_bi , dim = "variants" )
16151618
16161619 # Start building a new dataset.
16171620 coords : Dict [str , Any ] = dict ()
@@ -1624,33 +1627,30 @@ def biallelic_snp_calls(
16241627 coords ["variant_contig" ] = ("variants" ,), ds_bi ["variant_contig" ].data
16251628
16261629 # Store position.
1627- coords ["variant_position" ] = ("variants" ,), ds_bi ["variant_position" ].data
1630+ variant_position = ds_bi ["variant_position" ].data
1631+ coords ["variant_position" ] = ("variants" ,), variant_position
16281632
16291633 # Store alleles, transformed.
1630- variant_allele = ds_bi ["variant_allele" ].data
1631- variant_allele = variant_allele .rechunk ((variant_allele .chunks [0 ], - 1 ))
1632- variant_allele_out = da .map_blocks (
1633- lambda block : apply_allele_mapping (block , allele_mapping , max_allele = 1 ),
1634- variant_allele ,
1635- dtype = variant_allele .dtype ,
1636- chunks = (variant_allele .chunks [0 ], [2 ]),
1634+ variant_allele_dask = ds_bi ["variant_allele" ].data
1635+ variant_allele_out = dask_apply_allele_mapping (
1636+ variant_allele_dask , allele_mapping , max_allele = 1
16371637 )
16381638 data_vars ["variant_allele" ] = ("variants" , "alleles" ), variant_allele_out
16391639
1640- # Store allele counts, transformed, so we don't have to recompute .
1640+ # Store allele counts, transformed.
16411641 ac_out = apply_allele_mapping (ac_bi , allele_mapping , max_allele = 1 )
16421642 data_vars ["variant_allele_count" ] = ("variants" , "alleles" ), ac_out
16431643
16441644 # Store genotype calls, transformed.
1645- gt = ds_bi ["call_genotype" ].data
1646- gt_out = allel . GenotypeDaskArray ( gt ). map_alleles ( allele_mapping )
1645+ gt_dask = ds_bi ["call_genotype" ].data
1646+ gt_out = dask_genotype_array_map_alleles ( gt_dask , allele_mapping )
16471647 data_vars ["call_genotype" ] = (
16481648 (
16491649 "variants" ,
16501650 "samples" ,
16511651 "ploidy" ,
16521652 ),
1653- gt_out . values ,
1653+ gt_out ,
16541654 )
16551655
16561656 # Build dataset.
@@ -1681,12 +1681,13 @@ def biallelic_snp_calls(
16811681 loc_minor = ac_minor >= min_minor_ac
16821682 loc_out &= loc_minor
16831683
1684- ds_out = ds_out .isel (variants = loc_out )
1684+ # Apply selection from conditions.
1685+ ds_out = dask_compress_dataset (ds_out , indexer = loc_out , dim = "variants" )
16851686
16861687 # Try to meet target number of SNPs.
16871688 if n_snps is not None :
16881689 if ds_out .sizes ["variants" ] > (n_snps * 2 ):
1689- # Do some thinning.
1690+ # Apply thinning.
16901691 thin_step = ds_out .sizes ["variants" ] // n_snps
16911692 loc_thin = slice (thin_offset , None , thin_step )
16921693 ds_out = ds_out .isel (variants = loc_thin )
0 commit comments