Skip to content

Commit 5e5fc44

Browse files
docs: update comments and variable names to reflect cohort_count_het as public API
- Updated docstring to emphasize this is a public reusable method
- Clarified vectorized approach in comments for performance context
- Renamed test variables: vectorized_results → cohort_results
- Renamed: vectorized_het → cohort_het for consistency with public API
- Updated inline comments to reference cohort_count_het() explicitly
- All 28 tests pass, pre-commit checks pass
1 parent bee0271 commit 5e5fc44

2 files changed

Lines changed: 21 additions & 20 deletions

File tree

malariagen_data/anoph/heterozygosity.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,7 @@ def _sample_count_het(
395395

396396
return sample_id, sample_set, windows, counts
397397

398-
def _cohort_count_het_vectorized(
398+
def cohort_count_het(
399399
self,
400400
region: Region,
401401
df_cohort_samples: pd.DataFrame,
@@ -405,11 +405,12 @@ def _cohort_count_het_vectorized(
405405
chunks: base_params.chunks,
406406
inline_array: base_params.inline_array,
407407
):
408-
"""Vectorized computation of windowed heterozygosity for multiple samples.
408+
"""Compute windowed heterozygosity counts for multiple samples in a cohort.
409409
410-
Loads SNP data once for all cohort samples, then computes heterozygosity
411-
across all samples efficiently, rather than calling snp_calls() repeatedly
412-
for each sample.
410+
This method efficiently computes heterozygosity for all samples by loading
411+
SNP data once and computing across all samples, rather than calling snp_calls()
412+
repeatedly for each sample. This vectorized approach provides substantial
413+
performance improvements for large cohorts.
413414
414415
Parameters
415416
----------
@@ -896,10 +897,10 @@ def cohort_heterozygosity(
896897
)
897898
n_samples = len(df_cohort_samples)
898899

899-
# Compute heterozygosity for all samples in the cohort using vectorized method.
900-
# This loads SNP data once and computes heterozygosity across all samples,
901-
# yielding substantial speedup over sequential per-sample processing.
902-
cohort_het_results = self._cohort_count_het_vectorized(
900+
# Compute heterozygosity for all samples in the cohort using cohort_count_het().
901+
# This public method loads SNP data once and computes across all samples,
902+
# providing substantial speedup over sequential per-sample processing.
903+
cohort_het_results = self.cohort_count_het(
903904
region=region_prepped,
904905
df_cohort_samples=df_cohort_samples,
905906
sample_sets=sample_sets,

tests/anoph/test_heterozygosity.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,10 @@ def test_cohort_heterozygosity(fixture, api: AnophelesHetAnalysis):
264264

265265

266266
@parametrize_with_cases("fixture,api", cases=".")
267-
def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalysis):
268-
"""Regression test: vectorized method produces identical results to sequential method.
267+
def test_cohort_count_het_regression(fixture, api: AnophelesHetAnalysis):
268+
"""Regression test: cohort method produces identical results to sequential method.
269269
270-
This test verifies that the _cohort_count_het_vectorized() method produces
270+
This test verifies that the cohort_count_het() method produces
271271
numerically identical heterozygosity values as the sequential per-sample approach.
272272
"""
273273
from malariagen_data.util import _parse_single_region
@@ -290,7 +290,7 @@ def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalys
290290
region_prepped = _parse_single_region(api, region)
291291

292292
# Method 1: use vectorized method
293-
vectorized_results = api._cohort_count_het_vectorized(
293+
cohort_results = api.cohort_count_het(
294294
region=region_prepped,
295295
df_cohort_samples=df_cohort_samples,
296296
sample_sets=sample_set,
@@ -315,21 +315,21 @@ def test_cohort_count_het_vectorized_regression(fixture, api: AnophelesHetAnalys
315315

316316
# Verify both methods produce identical results
317317
for sample_id in df_cohort_samples["sample_id"]:
318-
windows, counts = vectorized_results[sample_id]
318+
windows, counts = cohort_results[sample_id]
319319

320-
# Convert vectorized counts to heterozygosity
321-
vectorized_het = counts / window_size
320+
# Convert cohort counts to heterozygosity
321+
cohort_het = counts / window_size
322322

323323
# Get sequential heterozygosity
324324
sequential_het = sequential_results[sample_id]
325325

326326
# Check shapes match
327327
assert (
328-
len(vectorized_het) == len(sequential_het)
329-
), f"Shape mismatch for sample {sample_id}: vectorized={len(vectorized_het)}, sequential={len(sequential_het)}"
328+
len(cohort_het) == len(sequential_het)
329+
), f"Shape mismatch for sample {sample_id}: cohort={len(cohort_het)}, sequential={len(sequential_het)}"
330330

331331
# Check values are numerically identical (within floating point precision)
332-
assert np.allclose(vectorized_het, sequential_het, rtol=1e-10), (
332+
assert np.allclose(cohort_het, sequential_het, rtol=1e-10), (
333333
f"Values differ for sample {sample_id}. "
334-
f"Max difference: {np.max(np.abs(vectorized_het - sequential_het))}"
334+
f"Max difference: {np.max(np.abs(cohort_het - sequential_het))}"
335335
)

0 commit comments

Comments
 (0)