adding comments

31puneet · 31puneet · commit 870d2bec1803 · 2026-04-13T05:42:19.000Z
diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py
@@ -115,6 +115,8 @@ def biallelic_diplotype_pairwise_distances(
         chunks: base_params.chunks = base_params.native_chunks,
         return_dataset: base_params.return_dataset = False,
     ) -> Any:
+        # Change this name if you ever change the behaviour of this function, to
+        # invalidate any previously cached data.
         name = "biallelic_diplotype_pairwise_distances"
 
         base_params._validate_sample_selection_params(
@@ -155,6 +157,7 @@ def biallelic_diplotype_pairwise_distances(
             max_missing_an=max_missing_an,
         )
 
+        # Try to retrieve results from the cache.
         try:
             results = self.results_cache_get(name=name, params=params)
 
@@ -164,9 +167,10 @@ def biallelic_diplotype_pairwise_distances(
             )
             self.results_cache_set(name=name, params=params, results=results)
 
+        # Unpack results.
         dist: np.ndarray = results["dist"]
         samples: np.ndarray = results["samples"]
-        n_snps_used: int = int(results["n_snps"][()])
+        n_snps_used: int = int(results["n_snps"][()])  # ensure scalar
 
         if return_dataset:
             import xarray as xr
diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py
@@ -225,8 +225,11 @@ def haplotype_pairwise_distances(
         inline_array: base_params.inline_array = base_params.inline_array_default,
         return_dataset: base_params.return_dataset = False,
     ) -> Any:
+        # Change this name if you ever change the behaviour of this function, to
+        # invalidate any previously cached data.
         name = "haplotype_pairwise_distances"
 
+        # Normalize params for consistent hash value.
         sample_sets_prepped = self._prep_sample_sets_param(sample_sets=sample_sets)
         del sample_sets
         sample_query_prepped = self._prep_sample_query_param(sample_query=sample_query)
@@ -244,6 +247,7 @@ def haplotype_pairwise_distances(
             random_seed=random_seed,
         )
 
+        # Try to retrieve results from the cache.
         try:
             results = self.results_cache_get(name=name, params=params)
 
@@ -253,9 +257,10 @@ def haplotype_pairwise_distances(
             )
             self.results_cache_set(name=name, params=params, results=results)
 
+        # Unpack results.
         dist: np.ndarray = results["dist"]
         phased_samples: np.ndarray = results["phased_samples"]
-        n_snps: int = int(results["n_snps"][()])
+        n_snps: int = int(results["n_snps"][()])  # ensure scalar
 
         if return_dataset:
             import xarray as xr
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
@@ -1435,6 +1435,7 @@ def _snp_allele_counts(
         )
         gt = ds_snps["call_genotype"]
 
+        # Set up and run allele counts computation.
         gt = allel.GenotypeDaskArray(gt.data)
         ac = gt.count_alleles(max_allele=3)
         with self._dask_progress(desc="Compute SNP allele counts"):
@@ -1493,10 +1494,15 @@ def snp_allele_counts(
         # enabling Dataset reconstruction without extra snp_calls().
         name = "snp_allele_counts_v3"
 
+        # Check that either sample_query xor sample_indices are provided.
         base_params._validate_sample_selection_params(
             sample_query=sample_query, sample_indices=sample_indices
         )
 
+        # Normalize params for consistent hash value.
+
+        # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`,
+        # so neither `sample_query` nor `sample_query_options` should be used beyond this point; use `sample_indices` instead.
         (
             sample_sets_prepped,
             sample_indices_prepped,
@@ -2095,6 +2101,8 @@ def biallelic_diplotypes(
         chunks: base_params.chunks = base_params.native_chunks,
         return_dataset: base_params.return_dataset = False,
     ) -> Any:
+        # Change this name if you ever change the behaviour of this function, to
+        # invalidate any previously cached data.
         name = "biallelic_diplotypes_v3"
 
         # Check that either sample_query xor sample_indices are provided.
@@ -2152,6 +2160,7 @@ def biallelic_diplotypes(
             max_missing_an=max_missing_an,
         )
 
+        # Try to retrieve results from the cache.
         try:
             results = self.results_cache_get(name=name, params=params)
 
@@ -2181,6 +2190,7 @@ def biallelic_diplotypes(
             )
             self.results_cache_set(name=name, params=params, results=results)
 
+        # Unpack results.
         gn = results["gn"]
         samples = results["samples"]
 
@@ -2244,10 +2254,13 @@ def _biallelic_diplotypes(
             chunks=chunks,
         )
 
+        # Load sample IDs.
         samples = ds["sample_id"].values.astype("U")
         variant_position = ds["variant_position"].values
         variant_contig = ds["variant_contig"].values
 
+        # Compute diplotypes as the number of alt alleles per genotype call, with missing
+        # calls coded as -127. NOTE(review): the call below is `to_n_ref()` with no `fill` argument; confirm.
         gt = allel.GenotypeDaskArray(ds["call_genotype"].data)
         with self._dask_progress(desc="Compute biallelic diplotypes"):
             gn = gt.to_n_ref().compute()