refactor: make cohort_count_het public; update comments and variable names

kunal-10-cloud · kunal-10-cloud · commit 5e5fc44ad977 · 2026-03-24T18:14:13.000+05:30
- Renamed method: _cohort_count_het_vectorized → cohort_count_het (now part of the public API)
- Updated the docstring to describe it as a reusable, cohort-level method
- Clarified the vectorized approach in the docstring for performance context
- Renamed test: test_cohort_count_het_vectorized_regression → test_cohort_count_het_regression
- Renamed test variables: vectorized_results → cohort_results, vectorized_het → cohort_het
- Updated inline comments to reference cohort_count_het() explicitly
- All 28 tests pass, pre-commit checks pass
diff --git a/malariagen_data/anoph/heterozygosity.py b/malariagen_data/anoph/heterozygosity.py
@@ -395,7 +395,7 @@ def _sample_count_het(
 
         return sample_id, sample_set, windows, counts
 
-    def _cohort_count_het_vectorized(
+    def cohort_count_het(
         self,
         region: Region,
         df_cohort_samples: pd.DataFrame,
@@ -405,11 +405,12 @@ def _cohort_count_het_vectorized(
         chunks: base_params.chunks,
         inline_array: base_params.inline_array,
     ):
-        """Vectorized computation of windowed heterozygosity for multiple samples.
+        """Compute windowed heterozygosity counts for multiple samples in a cohort.
 
-        Loads SNP data once for all cohort samples, then computes heterozygosity
-        across all samples efficiently, rather than calling snp_calls() repeatedly
-        for each sample.
+        This method efficiently computes heterozygosity for all samples by loading
+        SNP data once and computing across all samples, rather than calling snp_calls()
+        repeatedly for each sample. This vectorized approach provides substantial
+        performance improvements for large cohorts.
 
         Parameters
         ----------
@@ -896,10 +897,10 @@ def cohort_heterozygosity(
             )
             n_samples = len(df_cohort_samples)
 
-            # Compute heterozygosity for all samples in the cohort using vectorized method.
-            # This loads SNP data once and computes heterozygosity across all samples,
-            # yielding substantial speedup over sequential per-sample processing.
-            cohort_het_results = self._cohort_count_het_vectorized(
+            # Compute heterozygosity for all samples in the cohort using cohort_count_het().
+            # This public method loads SNP data once and computes across all samples,
+            # providing substantial speedup over sequential per-sample processing.
+            cohort_het_results = self.cohort_count_het(
                 region=region_prepped,
                 df_cohort_samples=df_cohort_samples,
                 sample_sets=sample_sets,
diff --git a/tests/anoph/test_heterozygosity.py b/tests/anoph/test_heterozygosity.py
@@ -264,10 +264,10 @@ def test_cohort_heterozygosity(fixture, api: AnophelesHetAnalysis):
 
 
 @parametrize_with_cases("fixture,api", cases=".")
-def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalysis):
-    """Regression test: vectorized method produces identical results to sequential method.
+def test_cohort_count_het_regression(fixture, api: AnophelesHetAnalysis):
+    """Regression test: cohort method produces identical results to sequential method.
 
-    This test verifies that the _cohort_count_het_vectorized() method produces
+    This test verifies that the cohort_count_het() method produces
     numerically identical heterozygosity values as the sequential per-sample approach.
     """
     from malariagen_data.util import _parse_single_region
@@ -290,7 +290,7 @@ def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalys
     region_prepped = _parse_single_region(api, region)
 
     # Method 1: use vectorized method
-    vectorized_results = api._cohort_count_het_vectorized(
+    cohort_results = api.cohort_count_het(
         region=region_prepped,
         df_cohort_samples=df_cohort_samples,
         sample_sets=sample_set,
@@ -315,21 +315,21 @@ def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalys
 
     # Verify both methods produce identical results
     for sample_id in df_cohort_samples["sample_id"]:
-        windows, counts = vectorized_results[sample_id]
+        windows, counts = cohort_results[sample_id]
 
-        # Convert vectorized counts to heterozygosity
-        vectorized_het = counts / window_size
+        # Convert cohort counts to heterozygosity
+        cohort_het = counts / window_size
 
         # Get sequential heterozygosity
         sequential_het = sequential_results[sample_id]
 
         # Check shapes match
         assert (
-            len(vectorized_het) == len(sequential_het)
-        ), f"Shape mismatch for sample {sample_id}: vectorized={len(vectorized_het)}, sequential={len(sequential_het)}"
+            len(cohort_het) == len(sequential_het)
+        ), f"Shape mismatch for sample {sample_id}: cohort={len(cohort_het)}, sequential={len(sequential_het)}"
 
         # Check values are numerically identical (within floating point precision)
-        assert np.allclose(vectorized_het, sequential_het, rtol=1e-10), (
+        assert np.allclose(cohort_het, sequential_het, rtol=1e-10), (
             f"Values differ for sample {sample_id}. "
-            f"Max difference: {np.max(np.abs(vectorized_het - sequential_het))}"
+            f"Max difference: {np.max(np.abs(cohort_het - sequential_het))}"
         )