Merge branch 'master' of https://github.com/Yashsingh045/malariagen-data-python into GH1221-snp-data-types

Yashsingh045 · Yashsingh045 · commit c300a3d87597 · 2026-03-27T15:48:50.000+05:30
diff --git a/README.md b/README.md
@@ -49,6 +49,7 @@ To get setup for development, see [this video if you prefer VS Code](https://you
 For detailed setup instructions, see:
 - [Linux setup guide](LINUX_SETUP.md)
 - [macOS setup guide](MACOS_SETUP.md)
+- [Windows setup guide](WINDOWS_SETUP.md)
 - [Google Colab (TPU) setup guide](docs/source/colab_tpu_runtime.rst)
 Detailed instructions can be found in the [Contributors guide](https://github.com/malariagen/malariagen-data-python/blob/master/CONTRIBUTING.md).
 
diff --git a/WINDOWS_SETUP.md b/WINDOWS_SETUP.md
@@ -0,0 +1,90 @@
+# Windows Setup Guide
+
+To get set up for development on Windows, see
+[this video if you prefer VS Code](https://youtu.be/zddl3n1DCFM),
+or [this older video if you prefer PyCharm](https://youtu.be/QniQi-Hoo9A),
+and the instructions below.
+
+## 1. Fork and clone this repo
+```bash
+git clone https://github.com/[username]/malariagen-data-python.git
+cd malariagen-data-python
+```
+
+## 2. Install Python
+
+Download and install Python 3.10 from the official website:
+https://www.python.org/downloads/windows/
+
+During installation, check the box that says Add Python to PATH
+before clicking Install.
+
+Verify the installation worked:
+```bash
+python --version
+```
+
+## 3. Install pipx and poetry
+```bash
+python -m pip install --user pipx
+python -m pipx ensurepath
+pipx install poetry
+```
+
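+After running `ensurepath`, close and reopen PowerShell so the updated PATH takes effect; if `pipx install poetry` failed because `pipx` was not found, run it again in the new window.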
+
+## 4. Create and activate development environment
+```bash
+poetry install
+poetry shell
+```
+
+## 5. Install pre-commit hooks
+```bash
+pipx install pre-commit
+pre-commit install
+```
+
+## 6. Add upstream remote and get latest code
+```bash
+git remote add upstream https://github.com/malariagen/malariagen-data-python
+git pull upstream master
+```
+
+Note: The default branch of this repository is called master, not main.
+
+## 7. Verify everything works
+```bash
+python -c "import malariagen_data; print('Setup successful!')"
+```
+
+## Common Issues on Windows
+
+**poetry not found after install**
+
+Close and reopen PowerShell, then try again.
+
+**git not recognized**
+
+Install Git from https://git-scm.com/download/win
+and restart PowerShell.
+
+**python not recognized**
+
+Reinstall Python and make sure to check
+Add Python to PATH during installation.
+
+**fatal: not a git repository**
+
+Make sure you are inside the malariagen-data-python
+folder before running any git commands.
+```bash
+cd malariagen-data-python
+```
+
+**error: pathspec main did not match**
+
+This repository's default branch is master, so use master instead of main.
+```bash
+git checkout master
+```
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 
 import json
 from contextlib import nullcontext
@@ -496,7 +497,20 @@ def client_location(self) -> str:
         return location
 
     def _surveillance_flags(self, sample_sets: List[str]):
-        raise NotImplementedError("Subclasses must implement `_surveillance_flags`.")
+        """Return surveillance flags for sample sets. Subclasses should override to
+        load real data; this base implementation returns empty data and warns.
+        """
+        warnings.warn(
+            "Surveillance flags not implemented for this resource; returning empty data.",
+            UserWarning,
+            stacklevel=2,
+        )
+        return pd.DataFrame(
+            {
+                "sample_id": pd.Series(dtype="object"),
+                "is_surveillance": pd.Series(dtype="boolean"),
+            }
+        )
 
     def _release_has_unrestricted_data(self, *, release: str):
         """Return `True` if the specified release has any unrestricted data. Otherwise return `False`."""
diff --git a/malariagen_data/anoph/heterozygosity.py b/malariagen_data/anoph/heterozygosity.py
@@ -395,6 +395,108 @@ def _sample_count_het(
 
         return sample_id, sample_set, windows, counts
 
+    def cohort_count_het(
+        self,
+        region: Region,
+        df_cohort_samples: pd.DataFrame,
+        sample_sets: Optional[base_params.sample_sets],
+        window_size: het_params.window_size,
+        site_mask: Optional[base_params.site_mask],
+        chunks: base_params.chunks,
+        inline_array: base_params.inline_array,
+    ):
+        """Compute windowed heterozygosity counts for multiple samples in a cohort.
+
+        This method efficiently computes heterozygosity for all samples by loading
+        SNP data once and computing across all samples, rather than calling snp_calls()
+        repeatedly for each sample. This vectorized approach provides substantial
+        performance improvements for large cohorts.
+
+        Parameters
+        ----------
+        region : Region
+            Genome region to analyze.
+        df_cohort_samples : pd.DataFrame
+            Sample metadata dataframe with at least 'sample_id' column.
+        sample_sets : str, optional
+            Sample set identifier(s).
+        window_size : int
+            Size of sliding windows for heterozygosity computation.
+        site_mask : str, optional
+            Site mask to apply.
+        chunks : str or int, dict
+            Chunk size for dask arrays.
+        inline_array : bool
+            Whether to inline arrays.
+
+        Returns
+        -------
+        dict
+            Mapping from sample_id to (windows, counts) tuple, where:
+            - windows: array of shape (n_windows, 2) with [start, stop] positions
+            - counts: array of shape (n_windows,) with heterozygous site counts per window
+        """
+        debug = self._log.debug
+
+        # Extract sample IDs from cohort dataframe
+        sample_ids = df_cohort_samples["sample_id"].values
+
+        debug("access SNPs for all cohort samples")
+        # Load SNP data once for all samples in cohort
+        ds_snps = self.snp_calls(
+            region=region,
+            sample_sets=sample_sets,
+            site_mask=site_mask,
+            chunks=chunks,
+            inline_array=inline_array,
+        )
+
+        # Subset to cohort samples to ensure correct indexing
+        ds_snps = ds_snps.set_index(samples="sample_id").sel(samples=sample_ids)
+        sample_id_to_idx = {sid: idx for idx, sid in enumerate(sample_ids)}
+
+        # SNP positions (same for all samples)
+        pos = ds_snps["variant_position"].values
+
+        # guard against window_size exceeding available sites
+        if pos.shape[0] < window_size:
+            raise ValueError(
+                f"Not enough sites ({pos.shape[0]}) for window size "
+                f"({window_size}). Please reduce the window size or "
+                f"use different site selection criteria."
+            )
+
+        # Compute window coordinates once (same for all samples)
+        windows = allel.moving_statistic(
+            values=pos,
+            statistic=lambda x: [x[0], x[-1]],
+            size=window_size,
+        )
+
+        # access genotypes for all samples
+        gt_data = ds_snps["call_genotype"].data
+
+        # Compute windowed heterozygosity for each sample and cache results
+        results = {}
+        for sample_id, sample_idx in sample_id_to_idx.items():
+            # Compute heterozygous genotypes for this sample only to avoid
+            # materializing the full (variants, samples) array in memory.
+            debug(f"Compute heterozygous genotypes for sample {sample_id}")
+            gt_sample = allel.GenotypeDaskVector(gt_data[:, sample_idx, :])
+            with self._dask_progress(desc="Compute heterozygous genotypes"):
+                is_het_sample = gt_sample.is_het().compute()
+
+            # compute windowed heterozygosity for this sample
+            counts = allel.moving_statistic(
+                values=is_het_sample,
+                statistic=np.sum,
+                size=window_size,
+            )
+
+            results[sample_id] = (windows, counts)
+
+        return results
+
     @property
     def _roh_hmm_cache_name(self):
         return "roh_hmm_v1"
@@ -816,18 +918,25 @@ def cohort_heterozygosity(
             )
             n_samples = len(df_cohort_samples)
 
-            # Compute heterozygosity for each sample and take the mean.
+            # Compute heterozygosity for all samples in the cohort using cohort_count_het().
+            # This public method loads SNP data once and computes across all samples,
+            # providing substantial speedup over sequential per-sample processing.
+            cohort_het_results = self.cohort_count_het(
+                region=region_prepped,
+                df_cohort_samples=df_cohort_samples,
+                sample_sets=sample_sets,
+                window_size=window_size,
+                site_mask=site_mask,
+                chunks=chunks,
+                inline_array=inline_array,
+            )
+
+            # Compute per-sample means and aggregate.
             het_values = []
             for sample_id in df_cohort_samples["sample_id"]:
-                df_het = self.sample_count_het(
-                    sample=sample_id,
-                    region=region_prepped,
-                    window_size=window_size,
-                    site_mask=site_mask,
-                    chunks=chunks,
-                    inline_array=inline_array,
-                )
-                het_values.append(df_het["heterozygosity"].mean())
+                _, counts = cohort_het_results[sample_id]
+                het_mean = np.mean(counts / window_size)
+                het_values.append(het_mean)
 
             results.append(
                 {
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
@@ -570,6 +570,9 @@ def __eq__(self, other):
             and (self.end == other.end)
         )
 
+    def __repr__(self):
+        return f"Region({self._contig!r}, {self._start!r}, {self._end!r})"
+
     def __str__(self):
         out = self._contig
         if self._start is not None or self._end is not None:
@@ -927,7 +930,20 @@ def _jitter(a, fraction, random_state=np.random):
 
 
 class CacheMiss(Exception):
-    pass
+    """Raised when a requested item is not present in the cache."""
+
+    def __init__(self, key=None):
+        self.key = key
+        if key is not None:
+            message = f"Cache miss for key: {key!r}"
+        else:
+            message = "Cache miss: requested item not found in cache."
+        super().__init__(message)
+
+    def __repr__(self):
+        if self.key is not None:
+            return f"CacheMiss({self.key!r})"
+        return "CacheMiss()"
 
 
 class LoggingHelper:
@@ -1531,12 +1547,10 @@ def _apply_allele_mapping(x, mapping, max_allele):
 
 def _dask_apply_allele_mapping(v, mapping, max_allele):
     if not isinstance(v, da.Array):
-        raise TypeError(
-            f"Expected v to be a dask.array.Array, " f"got {type(v).__name__}"
-        )
+        raise TypeError(f"Expected v to be a dask.array.Array, got {type(v).__name__}")
     if not isinstance(mapping, np.ndarray):
         raise TypeError(
-            f"Expected mapping to be a numpy.ndarray, " f"got {type(mapping).__name__}"
+            f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}"
         )
     assert v.ndim == 2
     assert mapping.ndim == 2
@@ -1558,12 +1572,10 @@ def _genotype_array_map_alleles(gt, mapping):
     # N.B., scikit-allel does not handle empty blocks well, so we
     # include some extra logic to handle that better.
     if not isinstance(gt, np.ndarray):
-        raise TypeError(
-            f"Expected gt to be a numpy.ndarray, " f"got {type(gt).__name__}"
-        )
+        raise TypeError(f"Expected gt to be a numpy.ndarray, got {type(gt).__name__}")
     if not isinstance(mapping, np.ndarray):
         raise TypeError(
-            f"Expected mapping to be a numpy.ndarray, " f"got {type(mapping).__name__}"
+            f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}"
         )
     assert gt.ndim == 3
     assert mapping.ndim == 3
@@ -1585,11 +1597,11 @@ def _genotype_array_map_alleles(gt, mapping):
 def _dask_genotype_array_map_alleles(gt, mapping):
     if not isinstance(gt, da.Array):
         raise TypeError(
-            f"Expected gt to be a dask.array.Array, " f"got {type(gt).__name__}"
+            f"Expected gt to be a dask.array.Array, got {type(gt).__name__}"
         )
     if not isinstance(mapping, np.ndarray):
         raise TypeError(
-            f"Expected mapping to be a numpy.ndarray, " f"got {type(mapping).__name__}"
+            f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}"
         )
     assert gt.ndim == 3
     assert mapping.ndim == 2
diff --git a/tests/anoph/test_base.py b/tests/anoph/test_base.py
@@ -411,3 +411,28 @@ def test_sample_sets_no_terms_of_use(ag3_sim_fixture):
     finally:
         for mp, bp in zip(manifest_paths, backups):
             shutil.move(bp, mp)
+
+
+class TestSurveillanceFlagsBaseFallback:
+    """Tests for issue #1206: base _surveillance_flags graceful fallback."""
+
+    def test_surveillance_flags_base_returns_empty_and_warns(self, ag3_sim_api):
+        """Base implementation returns empty DataFrame with correct schema and warns."""
+        with pytest.warns(UserWarning, match="Surveillance flags not implemented"):
+            df = ag3_sim_api._surveillance_flags(sample_sets=["AG1000G-AO"])
+
+        assert isinstance(df, pd.DataFrame)
+        assert list(df.columns) == ["sample_id", "is_surveillance"]
+        assert df["sample_id"].dtype == object
+        assert pd.api.types.is_bool_dtype(df["is_surveillance"])
+        assert len(df) == 0
+
+    def test_sample_set_has_surveillance_data_returns_false_when_fallback(
+        self, ag3_sim_api
+    ):
+        """_sample_set_has_surveillance_data returns False when base fallback is used."""
+        with pytest.warns(UserWarning, match="Surveillance flags not implemented"):
+            result = ag3_sim_api._sample_set_has_surveillance_data(
+                sample_set="AG1000G-AO"
+            )
+        assert not result
diff --git a/tests/anoph/test_heterozygosity.py b/tests/anoph/test_heterozygosity.py
diff --git a/tests/test_util.py b/tests/test_util.py