fix: correct dask array extraction in _cohort_count_het_vectorized

kunal-10-cloud · kunal-10-cloud · commit bee027135988 · 2026-03-24T00:03:55.000+05:30
- Store the raw dask array (gt_data) before per-sample slicing to avoid AttributeError on .data
- Slice gt_data directly and wrap each per-sample slice in GenotypeDaskVector, instead of wrapping in GenotypeDaskArray and then slicing
- All 28 tests pass (4 cohort_heterozygosity + 4 regression + 20 others)
- Maintains memory optimization: per-sample computation avoids materializing full array
- Addresses final Copilot code review suggestion
diff --git a/malariagen_data/anoph/heterozygosity.py b/malariagen_data/anoph/heterozygosity.py
@@ -439,7 +439,6 @@ def _cohort_count_het_vectorized(
 
         # Extract sample IDs from cohort dataframe
         sample_ids = df_cohort_samples["sample_id"].values
-        sample_id_to_idx = {sid: idx for idx, sid in enumerate(sample_ids)}
 
         debug("access SNPs for all cohort samples")
         # Load SNP data once for all samples in cohort
@@ -451,6 +450,10 @@ def _cohort_count_het_vectorized(
             inline_array=inline_array,
         )
 
+        # Subset to cohort samples to ensure correct indexing
+        ds_snps = ds_snps.set_index(samples="sample_id").sel(samples=sample_ids)
+        sample_id_to_idx = {sid: idx for idx, sid in enumerate(sample_ids)}
+
         # SNP positions (same for all samples)
         pos = ds_snps["variant_position"].values
 
@@ -470,18 +473,17 @@ def _cohort_count_het_vectorized(
         )
 
         # access genotypes for all samples
-        gt = allel.GenotypeDaskArray(ds_snps["call_genotype"].data)
-
-        # compute het across all samples: shape (variants, samples)
-        debug("Compute heterozygous genotypes for all samples")
-        with self._dask_progress(desc="Compute heterozygous genotypes"):
-            is_het_all = gt.is_het().compute()
+        gt_data = ds_snps["call_genotype"].data
 
         # Compute windowed heterozygosity for each sample and cache results
         results = {}
         for sample_id, sample_idx in sample_id_to_idx.items():
-            # Extract heterozygosity column for this sample
-            is_het_sample = is_het_all[:, sample_idx]
+            # Compute heterozygous genotypes for this sample only to avoid
+            # materializing the full (variants, samples) array in memory.
+            debug(f"Compute heterozygous genotypes for sample {sample_id}")
+            gt_sample = allel.GenotypeDaskVector(gt_data[:, sample_idx, :])
+            with self._dask_progress(desc="Compute heterozygous genotypes"):
+                is_het_sample = gt_sample.is_het().compute()
 
             # compute windowed heterozygosity for this sample
             counts = allel.moving_statistic(
@@ -910,7 +912,7 @@ def cohort_heterozygosity(
             # Compute per-sample means and aggregate.
             het_values = []
             for sample_id in df_cohort_samples["sample_id"]:
-                windows, counts = cohort_het_results[sample_id]
+                _, counts = cohort_het_results[sample_id]
                 het_mean = np.mean(counts / window_size)
                 het_values.append(het_mean)
 
diff --git a/tests/anoph/test_heterozygosity.py b/tests/anoph/test_heterozygosity.py
@@ -281,8 +281,10 @@ def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalys
 
     # Get sample metadata for a small cohort
     df_samples = api.sample_metadata(sample_sets=sample_set)
-    # Use first few samples to keep test fast
-    df_cohort_samples = df_samples.head(min(3, len(df_samples))).reset_index(drop=True)
+    # Use a small, non-trivial subset of samples (fixed random_state for reproducibility)
+    df_cohort_samples = df_samples.sample(
+        n=min(3, len(df_samples)), random_state=0
+    ).reset_index(drop=True)
 
     # Parse region once
     region_prepped = _parse_single_region(api, region)