@@ -1435,6 +1435,7 @@ def _snp_allele_counts(
14351435 )
14361436 gt = ds_snps ["call_genotype" ]
14371437
1438+ # Set up and run allele counts computation.
14381439 gt = allel .GenotypeDaskArray (gt .data )
14391440 ac = gt .count_alleles (max_allele = 3 )
14401441 with self ._dask_progress (desc = "Compute SNP allele counts" ):
@@ -1493,10 +1494,15 @@ def snp_allele_counts(
14931494 # enabling Dataset reconstruction without extra snp_calls().
14941495 name = "snp_allele_counts_v3"
14951496
1497+ # Check that sample_query and sample_indices are mutually exclusive (only one may be provided).
14961498 base_params ._validate_sample_selection_params (
14971499 sample_query = sample_query , sample_indices = sample_indices
14981500 )
14991501
1502+ # Normalize params for a consistent hash value.
1503+
1504+ # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
1505+ # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
15001506 (
15011507 sample_sets_prepped ,
15021508 sample_indices_prepped ,
@@ -2095,6 +2101,8 @@ def biallelic_diplotypes(
20952101 chunks : base_params .chunks = base_params .native_chunks ,
20962102 return_dataset : base_params .return_dataset = False ,
20972103 ) -> Any :
2104+ # Change this name if you ever change the behaviour of this function, to
2105+ # invalidate any previously cached data.
20982106 name = "biallelic_diplotypes_v3"
20992107
21002108 # Check that sample_query and sample_indices are mutually exclusive (only one may be provided).
@@ -2152,6 +2160,7 @@ def biallelic_diplotypes(
21522160 max_missing_an = max_missing_an ,
21532161 )
21542162
2163+ # Try to retrieve results from the cache.
21552164 try :
21562165 results = self .results_cache_get (name = name , params = params )
21572166
@@ -2181,6 +2190,7 @@ def biallelic_diplotypes(
21812190 )
21822191 self .results_cache_set (name = name , params = params , results = results )
21832192
2193+ # Unpack results.
21842194 gn = results ["gn" ]
21852195 samples = results ["samples" ]
21862196
@@ -2244,10 +2254,13 @@ def _biallelic_diplotypes(
22442254 chunks = chunks ,
22452255 )
22462256
2257+ # Load sample IDs.
22472258 samples = ds ["sample_id" ].values .astype ("U" )
22482259 variant_position = ds ["variant_position" ].values
22492260 variant_contig = ds ["variant_contig" ].values
22502261
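2262+ # Compute diplotypes as a per-call allele count; the call below uses `to_n_ref()`, i.e. the number of reference alleles per genotype call.
2263+ # NOTE(review): missing calls are meant to be coded as -127, but no `fill` value is passed here — confirm the intended method (ref vs. alt) and fill value.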
22512264 gt = allel .GenotypeDaskArray (ds ["call_genotype" ].data )
22522265 with self ._dask_progress (desc = "Compute biallelic diplotypes" ):
22532266 gn = gt .to_n_ref ().compute ()
0 commit comments