Merge branch 'master' into feature/amino-acid-distance-metrics

ZiadXI · web-flow · commit 278943d0a6be · 2026-02-27T16:06:04.000+02:00
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
@@ -74,6 +74,14 @@ def _setup_aim_palettes():
     "gcx4": TAXON_PALETTE[10],
     "unassigned": "black",
 }
+# Colors for aim_species column, matching the AIM palettes.
+AIM_SPECIES_COLORS = {
+    "gambiae": AIM_PALETTES["gamb_vs_colu"][1],
+    "coluzzii": AIM_PALETTES["gamb_vs_colu"][3],
+    "arabiensis": AIM_PALETTES["gambcolu_vs_arab"][3],
+    "gambcolu": AIM_PALETTES["gambcolu_vs_arab"][1],
+    "unassigned": "black",
+}
 
 # Note: These column names will be treated as case-insensitive,
 # because these column names and the column names from the CSV
@@ -197,6 +205,7 @@ def __init__(
             storage_options=storage_options,
             tqdm_class=tqdm_class,
             taxon_colors=TAXON_COLORS,
+            aim_species_colors=AIM_SPECIES_COLORS,
             virtual_contigs=VIRTUAL_CONTIGS,
             gene_names=GENE_NAMES,
             inversion_tag_path=INVERSION_TAG_PATH,
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
@@ -208,6 +208,7 @@ def plot_aim_heatmap(
         show: plotly_params.show = True,
         renderer: plotly_params.renderer = None,
     ) -> plotly_params.figure:
+        aims = self._prep_aims_param(aims=aims)
         # Load AIM calls.
         ds = self.aim_calls(
             aims=aims,
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
@@ -642,9 +642,13 @@ def cnv_discordant_read_calls(
 
                 ly.append(y)
 
-                if len(ly) == 0:
-                    # Bail out, no data for given sample sets and analysis.
-                    raise ValueError("No data found for requested sample sets.")
+            if len(ly) == 0:
+                # Bail out, no data for given sample sets and contig.
+                raise ValueError(
+                    f"No CNV discordant read calls data found for contig {c!r} "
+                    f"in the requested sample sets. This could be because the "
+                    f"sample sets do not have discordant read calls data available."
+                )
 
             x = _simple_xarray_concat(ly, dim=DIM_SAMPLE)
             lx.append(x)
diff --git a/malariagen_data/anoph/fst.py b/malariagen_data/anoph/fst.py
@@ -539,11 +539,10 @@ def plot_pairwise_average_fst(
             if annotation == "standard error":
                 fig_df.loc[cohort1, cohort2] = se
             elif annotation == "Z score":
-                try:
-                    zs = fst / se
-                    fig_df.loc[cohort1, cohort2] = zs
-                except ZeroDivisionError:
+                if se == 0:
                     fig_df.loc[cohort1, cohort2] = np.nan
+                else:
+                    fig_df.loc[cohort1, cohort2] = fst / se
             else:
                 fig_df.loc[cohort1, cohort2] = fst
 
diff --git a/malariagen_data/anoph/g123.py b/malariagen_data/anoph/g123.py
@@ -170,7 +170,7 @@ def g123_gwss(
     ) -> Tuple[np.ndarray, np.ndarray]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
-        name = "g123_gwss_v1"
+        name = "g123_gwss_v2"
 
         valid_sites = self.phasing_analysis_ids + ("all", "segregating")
         if sites not in valid_sites:
@@ -181,7 +181,7 @@ def g123_gwss(
         params = dict(
             contig=contig,
             sites=sites,
-            site_mask=site_mask,
+            site_mask=self._prep_optional_site_mask_param(site_mask=site_mask),
             window_size=window_size,
             sample_sets=self._prep_sample_sets_param(sample_sets=sample_sets),
             # N.B., do not be tempted to convert this sample query into integer
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -33,6 +33,7 @@ def __init__(
         aim_analysis: Optional[str] = None,
         aim_metadata_dtype: Optional[Mapping[str, Any]] = None,
         taxon_colors: Optional[Mapping[str, str]] = None,
+        aim_species_colors: Optional[Mapping[str, str]] = None,
         **kwargs,
     ):
         # N.B., this class is designed to work cooperatively, and
@@ -73,6 +74,8 @@ def __init__(
         # Set up taxon colors.
         self._taxon_colors = taxon_colors
 
+        self._aim_species_colors = aim_species_colors
+
         # Set up extra metadata.
         self._extra_metadata: List = []
 
@@ -1304,6 +1307,11 @@ def _setup_sample_colors_plotly(
             # Special case, default taxon colors and order.
             color_discrete_map = self._taxon_colors
 
+        # Special handling for aim_species colors.
+        if color == "aim_species" and color_discrete_map is None:
+            # Special case, default aim_species colors and order.
+            color_discrete_map = self._aim_species_colors
+
         if isinstance(color, str):
             if "cohort_" + color in data.columns:
                 # Convenience to allow things like "admin1_year" instead of "cohort_admin1_year".
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
@@ -1,3 +1,4 @@
+import warnings
 from functools import lru_cache
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -38,6 +39,11 @@
 from .genome_sequence import AnophelesGenomeSequenceData
 from .sample_metadata import AnophelesSampleMetadata
 
+# Maximum number of entries kept in the per-instance _cache_locate_site_class
+# dict. The natural ceiling is n_contigs × n_site_classes (≈ 45 for Ag3), so
+# 64 gives comfortable headroom without allowing unbounded growth.
+_LOCATE_SITE_CLASS_CACHE_MAXSIZE = 64
+
 
 class AnophelesSnpData(
     AnophelesSampleMetadata, AnophelesGenomeFeaturesData, AnophelesGenomeSequenceData
@@ -68,6 +74,14 @@ def __init__(
         self._cache_site_annotations = None
         self._cache_locate_site_class: Dict = dict()
 
+        # Create the SNP-calls cache as a per-instance lru_cache wrapping the
+        # bound method.  Storing it on the instance (rather than using a
+        # class-level @lru_cache decorator) means:
+        #   1. No class-level cache keeps `self` alive; the instance and its
+        #      cache form a reference cycle, reclaimed by the cyclic GC.
+        #   2. Different instances have independent, non-interfering caches.
+        self._cached_snp_calls = lru_cache(maxsize=2)(self._raw_snp_calls)
+
     @property
     def _site_filters_analysis(self) -> Optional[str]:
         if self._site_filters_analysis_override:
@@ -928,6 +942,13 @@ def _locate_site_class(
 
             self._cache_locate_site_class[cache_key] = loc_ann
 
+            # Evict the oldest entry when the cache exceeds its size limit.
+            # Plain dicts preserve insertion order (Python 3.7+), so the first
+            # key is always the oldest.
+            while len(self._cache_locate_site_class) > _LOCATE_SITE_CLASS_CACHE_MAXSIZE:
+                oldest = next(iter(self._cache_locate_site_class))
+                del self._cache_locate_site_class[oldest]
+
         return loc_ann
 
     def _snp_calls_for_contig(
@@ -1088,16 +1109,7 @@ def snp_calls(
             chunks=chunks,
         )
 
-    # Here we cache to improve performance for functions which
-    # access SNP calls more than once. For example, this currently
-    # happens during access of biallelic SNP calls, because a
-    # first computation of allele counts is required, before
-    # then using that to filter SNP calls.
-    #
-    # We only cache up to 2 items because otherwise we can see
-    # high memory usage.
-    @lru_cache(maxsize=2)
-    def _cached_snp_calls(
+    def _raw_snp_calls(
         self,
         *,
         regions: Tuple[Region, ...],
@@ -1253,6 +1265,12 @@ def _snp_calls(
         if max_cohort_size is not None:
             n_samples = ds.sizes["samples"]
             if n_samples > max_cohort_size:
+                warnings.warn(
+                    f"Cohort downsampled from {n_samples} to {max_cohort_size} "
+                    "samples. Set max_cohort_size=None to disable downsampling.",
+                    UserWarning,
+                    stacklevel=2,
+                )
                 rng = np.random.default_rng(seed=random_seed)
                 loc_downsample = rng.choice(
                     n_samples, size=max_cohort_size, replace=False
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -133,12 +133,13 @@ def __init__(
         gff_default_attributes: Tuple[str, ...],
         tqdm_class,
         storage_options: Mapping,
-        taxon_colors: Optional[Mapping[str, str]],
-        virtual_contigs: Optional[Mapping[str, Sequence[str]]],
-        gene_names: Optional[Mapping[str, str]],
-        inversion_tag_path: Optional[str],
-        unrestricted_use_only: Optional[bool],
-        surveillance_use_only: Optional[bool],
+        taxon_colors: Optional[Mapping[str, str]] = None,
+        aim_species_colors: Optional[Mapping[str, str]] = None,
+        virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
+        gene_names: Optional[Mapping[str, str]] = None,
+        inversion_tag_path: Optional[str] = None,
+        unrestricted_use_only: Optional[bool] = None,
+        surveillance_use_only: Optional[bool] = None,
     ):
         super().__init__(
             url=url,
@@ -171,6 +172,7 @@ def __init__(
             results_cache=results_cache,
             tqdm_class=tqdm_class,
             taxon_colors=taxon_colors,
+            aim_species_colors=aim_species_colors,
             virtual_contigs=virtual_contigs,
             gene_names=gene_names,
             inversion_tag_path=inversion_tag_path,
diff --git a/malariagen_data/mjn.py b/malariagen_data/mjn.py
@@ -281,12 +281,6 @@ def _mjn_graph_edges(
                 # add edge from final intermediate node to node j
                 source = f"anon_{i}_{j}_{sep-2}"
                 target = j
-                graph_node = {
-                    "id": source,
-                    "count": 0,
-                    "width": anon_width,
-                }
-                graph_nodes.append(graph_node)
                 graph_edge = {
                     "id": f"edge_{i}_{j}_{sep-1}",
                     "source": source,
diff --git a/tests/anoph/test_cnv_data.py b/tests/anoph/test_cnv_data.py
@@ -626,7 +626,10 @@ def test_cnv_discordant_read_calls(fixture, api: AnophelesCnvData):
             assert isinstance(d2, xr.DataArray)
 
     # Check with a contig that should not exist
-    with pytest.raises(ValueError):
+    with pytest.raises(
+        ValueError,
+        match="No CNV discordant read calls data found|no CNVs available for contig",
+    ):
         api.cnv_discordant_read_calls(
             contig="foobar", sample_sets=random.choice(all_sample_sets)
         )
diff --git a/tests/anoph/test_dipclust.py b/tests/anoph/test_dipclust.py
@@ -11,6 +11,9 @@ def random_transcripts_contig(*, api, contig, n):
     df_gff = api.genome_features(attributes=["ID", "Parent"])
     df_transcripts = df_gff.query(f"type == 'mRNA' and contig == '{contig}'")
     transcript_ids = df_transcripts["ID"].dropna().to_list()
+    n = min(n, len(transcript_ids))
+    if n == 0:
+        pytest.skip(f"No mRNA transcripts found for contig '{contig}'")
     transcripts = random.sample(transcript_ids, n)
     return transcripts
 
diff --git a/tests/anoph/test_g123.py b/tests/anoph/test_g123.py
@@ -240,8 +240,7 @@ def test_g123_calibration(fixture, api: AnophelesG123Analysis):
 
     # Set up test parameters.
     all_sample_sets = api.sample_sets()["sample_set"].to_list()
-    window_sizes = np.random.randint(100, 500, size=random.randint(2, 5)).tolist()
-    window_sizes = sorted([int(x) for x in window_sizes])
+    window_sizes = sorted(random.sample(range(100, 500), k=random.randint(2, 5)))
     g123_params = dict(
         contig=random.choice(api.contigs),
         sites=random.choice(api.phasing_analysis_ids),
diff --git a/tests/anoph/test_snp_data.py b/tests/anoph/test_snp_data.py
@@ -950,6 +950,63 @@ def test_snp_calls_with_site_class_param(ag3_sim_api: AnophelesSnpData, site_cla
     assert ds2.sizes["variants"] < ds1.sizes["variants"]
 
 
+def test_locate_site_class_cache_is_bounded(ag3_sim_api: AnophelesSnpData):
+    """_cache_locate_site_class must never grow beyond its configured size limit.
+
+    The number of contig / site class combinations requested here may be
+    smaller than the production value of _LOCATE_SITE_CLASS_CACHE_MAXSIZE, in
+    which case eviction would never be triggered and the size check would pass
+    trivially. The limit is therefore lowered for the duration of the test so
+    that the eviction path is actually exercised.
+    """
+    from unittest import mock
+
+    from malariagen_data.anoph import snp_data
+
+    site_classes = [
+        "CDS_DEG_4",
+        "CDS_DEG_2_SIMPLE",
+        "CDS_DEG_0",
+        "INTRON_SHORT",
+        "INTRON_LONG",
+        "INTRON_SPLICE_5PRIME",
+        "INTRON_SPLICE_3PRIME",
+        "UTR_5PRIME",
+        "UTR_3PRIME",
+        "INTERGENIC",
+    ]
+    small_limit = 3
+
+    # Start from an empty cache: the API fixture may be shared with other
+    # tests, and entries that are already cached would be served as hits
+    # without ever reaching the eviction code.
+    ag3_sim_api._cache_locate_site_class.clear()
+
+    with mock.patch.object(snp_data, "_LOCATE_SITE_CLASS_CACHE_MAXSIZE", small_limit):
+        for contig in ag3_sim_api.contigs:
+            for site_class in site_classes:
+                ag3_sim_api.snp_calls(region=contig, site_class=site_class)
+                # The bound must hold after every call, not just at the end.
+                assert len(ag3_sim_api._cache_locate_site_class) <= small_limit
+
+
+def test_snp_calls_cache_is_per_instance(ag3_sim_api: AnophelesSnpData):
+    """Check that _cached_snp_calls is an lru_cache owned by the instance.
+
+    Decorating the method with @lru_cache at class level would make `self`
+    part of every key in one cache shared by the whole class, which keeps
+    stale API instances (and all their subcaches) alive.  Building the cache
+    in __init__ instead gives every object its own cache, which goes away
+    together with the object.
+    """
+    attr = "_cached_snp_calls"
+
+    # Step 1: the wrapper is stored on the instance itself.
+    assert attr in vars(ag3_sim_api), (
+        "_cached_snp_calls should be an instance attribute (per-instance lru_cache), "
+        "not a class-level descriptor"
+    )
+
+    # Step 2: it offers the lru_cache introspection API.
+    for method_name in ("cache_info", "cache_clear"):
+        assert hasattr(ag3_sim_api._cached_snp_calls, method_name)
+
+    # Step 3: requesting the same region twice leaves an entry in the cache
+    # and the repeated request is counted as a hit.
+    for _ in range(2):
+        ag3_sim_api.snp_calls(region="3L")
+    stats = ag3_sim_api._cached_snp_calls.cache_info()
+    assert stats.currsize > 0
+    assert stats.hits >= 1
+
+    # Step 4: nothing of that name is defined on the class, as would be the
+    # case with a class-level @lru_cache decorator.
+    assert attr not in vars(AnophelesSnpData), (
+        "_cached_snp_calls must not be a class-level attribute; "
+        "a class-level @lru_cache would pin `self` in a global cache dict"
+    )
+
+
 @pytest.mark.parametrize("chrom", ["2RL", "3RL"])
 def test_snp_calls_with_virtual_contigs(ag3_sim_api, chrom):
     api = ag3_sim_api