Skip to content

Commit 870d2be

Browse files
committed
adding comments
1 parent 57c17d4 commit 870d2be

3 files changed

Lines changed: 24 additions & 2 deletions

File tree

malariagen_data/anoph/distance.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ def biallelic_diplotype_pairwise_distances(
115115
chunks: base_params.chunks = base_params.native_chunks,
116116
return_dataset: base_params.return_dataset = False,
117117
) -> Any:
118+
# Change this name if you ever change the behaviour of this function, to
119+
# invalidate any previously cached data.
118120
name = "biallelic_diplotype_pairwise_distances"
119121

120122
base_params._validate_sample_selection_params(
@@ -155,6 +157,7 @@ def biallelic_diplotype_pairwise_distances(
155157
max_missing_an=max_missing_an,
156158
)
157159

160+
# Try to retrieve results from the cache.
158161
try:
159162
results = self.results_cache_get(name=name, params=params)
160163

@@ -164,9 +167,10 @@ def biallelic_diplotype_pairwise_distances(
164167
)
165168
self.results_cache_set(name=name, params=params, results=results)
166169

170+
# Unpack results.
167171
dist: np.ndarray = results["dist"]
168172
samples: np.ndarray = results["samples"]
169-
n_snps_used: int = int(results["n_snps"][()])
173+
n_snps_used: int = int(results["n_snps"][()]) # ensure scalar
170174

171175
if return_dataset:
172176
import xarray as xr

malariagen_data/anoph/hapclust.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,11 @@ def haplotype_pairwise_distances(
225225
inline_array: base_params.inline_array = base_params.inline_array_default,
226226
return_dataset: base_params.return_dataset = False,
227227
) -> Any:
228+
# Change this name if you ever change the behaviour of this function, to
229+
# invalidate any previously cached data.
228230
name = "haplotype_pairwise_distances"
229231

232+
# Normalize params for consistent hash value.
230233
sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
231234
del sample_sets
232235
sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
@@ -244,6 +247,7 @@ def haplotype_pairwise_distances(
244247
random_seed=random_seed,
245248
)
246249

250+
# Try to retrieve results from the cache.
247251
try:
248252
results = self.results_cache_get(name=name, params=params)
249253

@@ -253,9 +257,10 @@ def haplotype_pairwise_distances(
253257
)
254258
self.results_cache_set(name=name, params=params, results=results)
255259

260+
# Unpack results.
256261
dist: np.ndarray = results["dist"]
257262
phased_samples: np.ndarray = results["phased_samples"]
258-
n_snps: int = int(results["n_snps"][()])
263+
n_snps: int = int(results["n_snps"][()]) # ensure scalar
259264

260265
if return_dataset:
261266
import xarray as xr

malariagen_data/anoph/snp_data.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1435,6 +1435,7 @@ def _snp_allele_counts(
14351435
)
14361436
gt = ds_snps["call_genotype"]
14371437

1438+
# Set up and run allele counts computation.
14381439
gt = allel.GenotypeDaskArray(gt.data)
14391440
ac = gt.count_alleles(max_allele=3)
14401441
with self._dask_progress(desc="Compute SNP allele counts"):
@@ -1493,10 +1494,15 @@ def snp_allele_counts(
14931494
# enabling Dataset reconstruction without extra snp_calls().
14941495
name = "snp_allele_counts_v3"
14951496

1497+
# Check that either sample_query or sample_indices is provided, but not both.
14961498
base_params._validate_sample_selection_params(
14971499
sample_query=sample_query, sample_indices=sample_indices
14981500
)
14991501

1502+
# Normalize params for consistent hash value.
1503+
1504+
# Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
1505+
# So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
15001506
(
15011507
sample_sets_prepped,
15021508
sample_indices_prepped,
@@ -2095,6 +2101,8 @@ def biallelic_diplotypes(
20952101
chunks: base_params.chunks = base_params.native_chunks,
20962102
return_dataset: base_params.return_dataset = False,
20972103
) -> Any:
2104+
# Change this name if you ever change the behaviour of this function, to
2105+
# invalidate any previously cached data.
20982106
name = "biallelic_diplotypes_v3"
20992107

21002108
# Check that either sample_query or sample_indices is provided, but not both.
@@ -2152,6 +2160,7 @@ def biallelic_diplotypes(
21522160
max_missing_an=max_missing_an,
21532161
)
21542162

2163+
# Try to retrieve results from the cache.
21552164
try:
21562165
results = self.results_cache_get(name=name, params=params)
21572166

@@ -2181,6 +2190,7 @@ def biallelic_diplotypes(
21812190
)
21822191
self.results_cache_set(name=name, params=params, results=results)
21832192

2193+
# Unpack results.
21842194
gn = results["gn"]
21852195
samples = results["samples"]
21862196

@@ -2244,10 +2254,13 @@ def _biallelic_diplotypes(
22442254
chunks=chunks,
22452255
)
22462256

2257+
# Load sample IDs and variant coordinates (position and contig).
22472258
samples = ds["sample_id"].values.astype("U")
22482259
variant_position = ds["variant_position"].values
22492260
variant_contig = ds["variant_contig"].values
22502261

2262+
# Compute diplotypes as the number of reference alleles per genotype call,
2263+
# with missing calls coded as -127. NOTE(review): `to_n_ref()` below is called without `fill=-127` -- confirm the fill value.
22512264
gt = allel.GenotypeDaskArray(ds["call_genotype"].data)
22522265
with self._dask_progress(desc="Compute biallelic diplotypes"):
22532266
gn = gt.to_n_ref().compute()

0 commit comments

Comments
 (0)