Skip to content

Commit c3212b3

Browse files
Authored merge commit: Merge branch 'master' into GH-1054-add-vcf-export
2 parents a33425b + fed6010, commit c3212b3

12 files changed

Lines changed: 288 additions & 36 deletions

File tree

malariagen_data/adar1.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def _repr_html_(self):
181181
<th style="text-align: left">
182182
Data releases available
183183
</th>
184-
<td>{', '.join(self._available_releases)}</td>
184+
<td>{", ".join(self._available_releases)}</td>
185185
</tr>
186186
<tr>
187187
<th style="text-align: left">
@@ -229,7 +229,7 @@ def _repr_html_(self):
229229
<th style="text-align: left">
230230
Relevant data releases
231231
</th>
232-
<td>{', '.join(self.releases)}</td>
232+
<td>{", ".join(self.releases)}</td>
233233
</tr>
234234
</tbody>
235235
</table>

malariagen_data/adir1.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def _repr_html_(self):
181181
<th style="text-align: left">
182182
Data releases available
183183
</th>
184-
<td>{', '.join(self._available_releases)}</td>
184+
<td>{", ".join(self._available_releases)}</td>
185185
</tr>
186186
<tr>
187187
<th style="text-align: left">
@@ -229,7 +229,7 @@ def _repr_html_(self):
229229
<th style="text-align: left">
230230
Relevant data releases
231231
</th>
232-
<td>{', '.join(self.releases)}</td>
232+
<td>{", ".join(self.releases)}</td>
233233
</tr>
234234
</tbody>
235235
</table>

malariagen_data/af1.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def _repr_html_(self):
183183
<th style="text-align: left">
184184
Data releases available
185185
</th>
186-
<td>{', '.join(self._available_releases)}</td>
186+
<td>{", ".join(self._available_releases)}</td>
187187
</tr>
188188
<tr>
189189
<th style="text-align: left">
@@ -231,7 +231,7 @@ def _repr_html_(self):
231231
<th style="text-align: left">
232232
Relevant data releases
233233
</th>
234-
<td>{', '.join(self.releases)}</td>
234+
<td>{", ".join(self.releases)}</td>
235235
</tr>
236236
</tbody>
237237
</table>

malariagen_data/ag3.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def _repr_html_(self):
278278
<th style="text-align: left">
279279
Data releases available
280280
</th>
281-
<td>{', '.join(self._available_releases)}</td>
281+
<td>{", ".join(self._available_releases)}</td>
282282
</tr>
283283
<tr>
284284
<th style="text-align: left">
@@ -332,7 +332,7 @@ def _repr_html_(self):
332332
<th style="text-align: left">
333333
Relevant data releases
334334
</th>
335-
<td>{', '.join(self.releases)}</td>
335+
<td>{", ".join(self.releases)}</td>
336336
</tr>
337337
</tbody>
338338
</table>

malariagen_data/amin1.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ def _repr_html_(self):
181181
<th style="text-align: left">
182182
Data releases available
183183
</th>
184-
<td>{', '.join(self.releases)}</td>
184+
<td>{", ".join(self.releases)}</td>
185185
</tr>
186186
<tr>
187187
<th style="text-align: left">
@@ -229,7 +229,7 @@ def _repr_html_(self):
229229
<th style="text-align: left">
230230
Relevant data releases
231231
</th>
232-
<td>{', '.join(self.releases)}</td>
232+
<td>{", ".join(self.releases)}</td>
233233
</tr>
234234
</tbody>
235235
</table>

malariagen_data/anoph/base.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import warnings
23

34
import json
45
from contextlib import nullcontext
@@ -134,7 +135,7 @@ def __init__(
134135
storage_options = dict()
135136
try:
136137
self._fs, self._base_path = _init_filesystem(self._url, **storage_options)
137-
except Exception as exc: # pragma: no cover
138+
except (OSError, ImportError) as exc: # pragma: no cover
138139
raise IOError(
139140
"An error occurred establishing a connection to the storage system. Please see the nested exception for more details."
140141
) from exc
@@ -143,7 +144,7 @@ def __init__(
143144
try:
144145
with self.open_file(self._config_path) as f:
145146
self._config = json.load(f)
146-
except Exception as exc: # pragma: no cover
147+
except (OSError, json.JSONDecodeError) as exc: # pragma: no cover
147148
if (isinstance(exc, OSError) and "forbidden" in str(exc).lower()) or (
148149
getattr(exc, "status", None) == 403
149150
):
@@ -496,7 +497,20 @@ def client_location(self) -> str:
496497
return location
497498

498499
def _surveillance_flags(self, sample_sets: List[str]):
499-
raise NotImplementedError("Subclasses must implement `_surveillance_flags`.")
500+
"""Return surveillance flags for sample sets. Subclasses should override to
501+
load real data; this base implementation returns empty data and warns.
502+
"""
503+
warnings.warn(
504+
"Surveillance flags not implemented for this resource; returning empty data.",
505+
UserWarning,
506+
stacklevel=2,
507+
)
508+
return pd.DataFrame(
509+
{
510+
"sample_id": pd.Series(dtype="object"),
511+
"is_surveillance": pd.Series(dtype="boolean"),
512+
}
513+
)
500514

501515
def _release_has_unrestricted_data(self, *, release: str):
502516
"""Return `True` if the specified release has any unrestricted data. Otherwise return `False`."""

malariagen_data/anoph/heterozygosity.py

Lines changed: 119 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,108 @@ def _sample_count_het(
395395

396396
return sample_id, sample_set, windows, counts
397397

398+
def cohort_count_het(
399+
self,
400+
region: Region,
401+
df_cohort_samples: pd.DataFrame,
402+
sample_sets: Optional[base_params.sample_sets],
403+
window_size: het_params.window_size,
404+
site_mask: Optional[base_params.site_mask],
405+
chunks: base_params.chunks,
406+
inline_array: base_params.inline_array,
407+
):
408+
"""Compute windowed heterozygosity counts for multiple samples in a cohort.
409+
410+
This method efficiently computes heterozygosity for all samples by loading
411+
SNP data once and computing across all samples, rather than calling snp_calls()
412+
repeatedly for each sample. This vectorized approach provides substantial
413+
performance improvements for large cohorts.
414+
415+
Parameters
416+
----------
417+
region : Region
418+
Genome region to analyze.
419+
df_cohort_samples : pd.DataFrame
420+
Sample metadata dataframe with at least 'sample_id' column.
421+
sample_sets : str, optional
422+
Sample set identifier(s).
423+
window_size : int
424+
Size of sliding windows for heterozygosity computation.
425+
site_mask : str, optional
426+
Site mask to apply.
427+
chunks : str or int, dict
428+
Chunk size for dask arrays.
429+
inline_array : bool
430+
Whether to inline arrays.
431+
432+
Returns
433+
-------
434+
dict
435+
Mapping from sample_id to (windows, counts) tuple, where:
436+
- windows: array of shape (n_windows, 2) with [start, stop] positions
437+
- counts: array of shape (n_windows,) with heterozygous site counts per window
438+
"""
439+
debug = self._log.debug
440+
441+
# Extract sample IDs from cohort dataframe
442+
sample_ids = df_cohort_samples["sample_id"].values
443+
444+
debug("access SNPs for all cohort samples")
445+
# Load SNP data once for all samples in cohort
446+
ds_snps = self.snp_calls(
447+
region=region,
448+
sample_sets=sample_sets,
449+
site_mask=site_mask,
450+
chunks=chunks,
451+
inline_array=inline_array,
452+
)
453+
454+
# Subset to cohort samples to ensure correct indexing
455+
ds_snps = ds_snps.set_index(samples="sample_id").sel(samples=sample_ids)
456+
sample_id_to_idx = {sid: idx for idx, sid in enumerate(sample_ids)}
457+
458+
# SNP positions (same for all samples)
459+
pos = ds_snps["variant_position"].values
460+
461+
# guard against window_size exceeding available sites
462+
if pos.shape[0] < window_size:
463+
raise ValueError(
464+
f"Not enough sites ({pos.shape[0]}) for window size "
465+
f"({window_size}). Please reduce the window size or "
466+
f"use different site selection criteria."
467+
)
468+
469+
# Compute window coordinates once (same for all samples)
470+
windows = allel.moving_statistic(
471+
values=pos,
472+
statistic=lambda x: [x[0], x[-1]],
473+
size=window_size,
474+
)
475+
476+
# access genotypes for all samples
477+
gt_data = ds_snps["call_genotype"].data
478+
479+
# Compute windowed heterozygosity for each sample and cache results
480+
results = {}
481+
for sample_id, sample_idx in sample_id_to_idx.items():
482+
# Compute heterozygous genotypes for this sample only to avoid
483+
# materializing the full (variants, samples) array in memory.
484+
debug(f"Compute heterozygous genotypes for sample {sample_id}")
485+
gt_sample = allel.GenotypeDaskVector(gt_data[:, sample_idx, :])
486+
with self._dask_progress(desc="Compute heterozygous genotypes"):
487+
is_het_sample = gt_sample.is_het().compute()
488+
489+
# compute windowed heterozygosity for this sample
490+
counts = allel.moving_statistic(
491+
values=is_het_sample,
492+
statistic=np.sum,
493+
size=window_size,
494+
)
495+
496+
results[sample_id] = (windows, counts)
497+
498+
return results
499+
398500
@property
399501
def _roh_hmm_cache_name(self):
400502
return "roh_hmm_v1"
@@ -816,18 +918,25 @@ def cohort_heterozygosity(
816918
)
817919
n_samples = len(df_cohort_samples)
818920

819-
# Compute heterozygosity for each sample and take the mean.
921+
# Compute heterozygosity for all samples in the cohort using cohort_count_het().
922+
# This public method loads SNP data once and computes across all samples,
923+
# providing substantial speedup over sequential per-sample processing.
924+
cohort_het_results = self.cohort_count_het(
925+
region=region_prepped,
926+
df_cohort_samples=df_cohort_samples,
927+
sample_sets=sample_sets,
928+
window_size=window_size,
929+
site_mask=site_mask,
930+
chunks=chunks,
931+
inline_array=inline_array,
932+
)
933+
934+
# Compute per-sample means and aggregate.
820935
het_values = []
821936
for sample_id in df_cohort_samples["sample_id"]:
822-
df_het = self.sample_count_het(
823-
sample=sample_id,
824-
region=region_prepped,
825-
window_size=window_size,
826-
site_mask=site_mask,
827-
chunks=chunks,
828-
inline_array=inline_array,
829-
)
830-
het_values.append(df_het["heterozygosity"].mean())
937+
_, counts = cohort_het_results[sample_id]
938+
het_mean = np.mean(counts / window_size)
939+
het_values.append(het_mean)
831940

832941
results.append(
833942
{

malariagen_data/anoph/map_params.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def get_basemap_abbrevs() -> dict:
4242
for key, provider_fn in _basemap_abbrev_candidates.items():
4343
try:
4444
_basemap_abbrevs[key] = provider_fn()
45-
except Exception:
45+
except (ImportError, AttributeError):
4646
warnings.warn(
4747
f"Basemap provider {key!r} is not available and will be skipped.",
4848
stacklevel=2,

malariagen_data/anoph/sample_metadata.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1864,7 +1864,7 @@ def _locate_cohorts(*, cohorts, data, min_cohort_size):
18641864
for coh, query in cohorts.items():
18651865
try:
18661866
loc_coh = data.eval(query).values
1867-
except Exception as e:
1867+
except (KeyError, NameError, SyntaxError, TypeError, AttributeError) as e:
18681868
raise ValueError(
18691869
f"Invalid query for cohort {coh!r}: {query!r}. Error: {e}"
18701870
) from e

malariagen_data/anopheles.py

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from abc import abstractmethod
21
from typing import Any, Dict, Mapping, Optional, Tuple, Sequence
32

43
import allel # type: ignore
@@ -182,15 +181,47 @@ def __init__(
182181
surveillance_use_only=surveillance_use_only,
183182
)
184183

185-
@property
186-
@abstractmethod
187-
def _xpehh_gwss_cache_name(self):
188-
raise NotImplementedError("Must override _xpehh_gwss_cache_name")
184+
def _get_xpehh_gwss_cache_name(self):
185+
"""Safely resolve the xpehh gwss cache name.
189186
190-
@property
191-
@abstractmethod
192-
def _ihs_gwss_cache_name(self):
193-
raise NotImplementedError("Must override _ihs_gwss_cache_name")
187+
Supports class attribute, property, or legacy method override.
188+
Falls back to the default "xpehh_gwss_v1" if resolution fails.
189+
190+
See also: https://github.com/malariagen/malariagen-data-python/issues/1151
191+
"""
192+
try:
193+
name = self._xpehh_gwss_cache_name
194+
# Handle legacy case where _xpehh_gwss_cache_name might be a
195+
# callable method rather than a property or class attribute.
196+
if callable(name):
197+
name = name()
198+
if isinstance(name, str) and len(name) > 0:
199+
return name
200+
except NotImplementedError:
201+
pass
202+
# Fallback to default.
203+
return "xpehh_gwss_v1"
204+
205+
def _get_ihs_gwss_cache_name(self):
206+
"""Safely resolve the ihs gwss cache name.
207+
208+
Supports class attribute, property, or legacy method override.
209+
Falls back to the default "ihs_gwss_v1" if resolution fails.
210+
211+
See also: https://github.com/malariagen/malariagen-data-python/issues/1151
212+
"""
213+
try:
214+
name = self._ihs_gwss_cache_name
215+
# Handle legacy case where _ihs_gwss_cache_name might be a
216+
# callable method rather than a property or class attribute.
217+
if callable(name):
218+
name = name()
219+
if isinstance(name, str) and len(name) > 0:
220+
return name
221+
except NotImplementedError:
222+
pass
223+
# Fallback to default.
224+
return "ihs_gwss_v1"
194225

195226
@staticmethod
196227
def _make_gene_cnv_label(gene_id, gene_name, cnv_type):
@@ -727,7 +758,7 @@ def ihs_gwss(
727758
) -> Tuple[np.ndarray, np.ndarray]:
728759
# change this name if you ever change the behaviour of this function, to
729760
# invalidate any previously cached data
730-
name = self._ihs_gwss_cache_name
761+
name = self._get_ihs_gwss_cache_name()
731762

732763
params = dict(
733764
contig=contig,
@@ -1251,7 +1282,7 @@ def xpehh_gwss(
12511282
) -> Tuple[np.ndarray, np.ndarray]:
12521283
# change this name if you ever change the behaviour of this function, to
12531284
# invalidate any previously cached data
1254-
name = self._xpehh_gwss_cache_name
1285+
name = self._get_xpehh_gwss_cache_name()
12551286

12561287
params = dict(
12571288
contig=contig,

0 commit comments

Comments (0)