Merge branch 'master' into GH-1054-add-vcf-export

adilraza99 · web-flow · commit b6705e7220d5 · 2026-03-24T10:27:59.000+05:30
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 .idea
 .vscode
 __pycache__
+.mypy_cache
 *.pyc
 dist
 .venv/
diff --git a/malariagen_data/adar1.py b/malariagen_data/adar1.py
@@ -130,7 +130,6 @@ def __init__(
             tqdm_class=tqdm_class,
             taxon_colors=TAXON_COLORS,
             virtual_contigs=None,
-            gene_names=None,
             inversion_tag_path=None,
             unrestricted_use_only=unrestricted_use_only,
             surveillance_use_only=surveillance_use_only,
diff --git a/malariagen_data/adir1.py b/malariagen_data/adir1.py
@@ -130,7 +130,6 @@ def __init__(
             tqdm_class=tqdm_class,
             taxon_colors=TAXON_COLORS,
             virtual_contigs=None,
-            gene_names=None,
             inversion_tag_path=None,
             unrestricted_use_only=unrestricted_use_only,
             surveillance_use_only=surveillance_use_only,
diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
@@ -132,7 +132,6 @@ def __init__(
             tqdm_class=tqdm_class,
             taxon_colors=TAXON_COLORS,
             virtual_contigs=None,
-            gene_names=None,
             inversion_tag_path=None,
             unrestricted_use_only=unrestricted_use_only,
             surveillance_use_only=surveillance_use_only,
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
@@ -212,7 +212,6 @@ def __init__(
             taxon_colors=TAXON_COLORS,
             aim_species_colors=AIM_SPECIES_COLORS,
             virtual_contigs=VIRTUAL_CONTIGS,
-            gene_names=GENE_NAMES,
             inversion_tag_path=INVERSION_TAG_PATH,
             unrestricted_use_only=unrestricted_use_only,
             surveillance_use_only=surveillance_use_only,
diff --git a/malariagen_data/amin1.py b/malariagen_data/amin1.py
@@ -130,7 +130,6 @@ def __init__(
             tqdm_class=tqdm_class,
             taxon_colors=TAXON_COLORS,
             virtual_contigs=None,
-            gene_names=None,
             inversion_tag_path=None,
             unrestricted_use_only=unrestricted_use_only,
             surveillance_use_only=surveillance_use_only,
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
@@ -40,10 +40,24 @@ def __init__(
         # to the superclass constructor.
         super().__init__(**kwargs)
 
-        # Store possible values for the `aims` parameter.
-        # TODO Consider moving this to data resource configuration.
-        self._aim_ids = aim_ids
-        self._aim_palettes = aim_palettes
+        # Read AIM parameters from the JSON config, falling back to
+        # constructor args for backward compatibility.
+        config = self.config
+        _aim_ids = config.get("AIM_IDS", None)
+        if _aim_ids is not None:
+            self._aim_ids: Optional[aim_params.aim_ids] = tuple(_aim_ids)
+        else:
+            self._aim_ids = aim_ids
+
+        _aim_palettes = config.get("AIM_PALETTES", None)
+        if _aim_palettes is not None:
+            # Convert lists to tuples for each palette entry.
+            self._aim_palettes: Optional[aim_params.aim_palettes] = {
+                k: tuple(v)
+                for k, v in _aim_palettes.items()  # type: ignore
+            }
+        else:
+            self._aim_palettes = aim_palettes
 
         # Set up caches.
         self._cache_aim_variants: Dict[str, xr.Dataset] = dict()
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
@@ -189,6 +189,14 @@ def _validate_sample_selection_params(
     "Random seed used for reproducible down-sampling.",
 ]
 
+gene: TypeAlias = Annotated[
+    str,
+    """
+    Gene identifier. Can be either a gene ID or gene name.
+    Gene names are matched case-insensitively.
+    """,
+]
+
 transcript: TypeAlias = Annotated[
     str,
     "Gene transcript identifier.",
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -417,6 +417,14 @@ def plot_frequencies_heatmap(
                 `aa_allele_frequencies_advanced()` or
                 `gene_cnv_frequencies_advanced()`.
             """,
+            taxa="""
+                Taxon or list of taxa to include in the plot. If None,
+                all taxa are shown.
+            """,
+            areas="""
+                Area or list of areas to include in the plot. If None,
+                all areas are shown.
+            """,
             kwargs="Passed through to `px.line()`.",
         ),
         returns="""
diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
@@ -23,9 +23,9 @@ class AnophelesGenomeFeaturesData(AnophelesGenomeSequenceData):
     def __init__(
         self,
         *,
-        gff_gene_type: str,
-        gff_gene_name_attribute: str,
-        gff_default_attributes: Tuple[str, ...],
+        gff_gene_type: Optional[str] = None,
+        gff_gene_name_attribute: Optional[str] = None,
+        gff_default_attributes: Optional[Tuple[str, ...]] = None,
         gene_names: Optional[Mapping[str, str]] = None,
         **kwargs,
     ):
@@ -34,16 +34,30 @@ def __init__(
         # to the superclass constructor.
         super().__init__(**kwargs)
 
-        # TODO Consider moving these parameters to configuration, as they could
-        # change if the GFF ever changed.
-        self._gff_gene_type = gff_gene_type
-        self._gff_gene_name_attribute = gff_gene_name_attribute
-        self._gff_default_attributes = gff_default_attributes
+        # Read GFF parameters from the JSON config, falling back to
+        # constructor args for backward compatibility.
+        config = self.config
+        self._gff_gene_type = config.get("GFF_GENE_TYPE", gff_gene_type or "gene")
+        self._gff_gene_name_attribute = config.get(
+            "GFF_GENE_NAME_ATTRIBUTE", gff_gene_name_attribute or "Name"
+        )
+        _default_attrs = config.get("GFF_DEFAULT_ATTRIBUTES", None)
+        if _default_attrs is not None:
+            self._gff_default_attributes = tuple(_default_attrs)
+        elif gff_default_attributes is not None:
+            self._gff_default_attributes = gff_default_attributes
+        else:
+            self._gff_default_attributes = ("ID", "Parent", "Name", "description")
 
         # Allow manual override of gene names.
-        if gene_names is None:
-            gene_names = dict()
-        self._gene_name_overrides = gene_names
+        # Read from config if available, falling back to constructor arg.
+        _gene_names = config.get("GENE_NAMES", None)
+        if _gene_names is not None:
+            self._gene_name_overrides = _gene_names
+        elif gene_names is not None:
+            self._gene_name_overrides = gene_names
+        else:
+            self._gene_name_overrides = dict()
 
         # Setup caches.
         self._cache_genome_features: Dict[Tuple[str, ...], pd.DataFrame] = dict()
@@ -314,6 +328,140 @@ def plot_transcript(
             bokeh.plotting.show(fig)
         return fig
 
+    @_check_types
+    @doc(
+        summary="Get the canonical transcript for a gene.",
+        returns="""
+            The transcript ID for the canonical transcript of the specified gene.
+            The canonical transcript is the one with the highest number of
+            transcribed base pairs (sum of exon lengths).
+        """,
+    )
+    def canonical_transcript(
+        self,
+        gene: base_params.gene,
+    ) -> str:
+        """
+        Parameters
+        ----------
+        gene : str
+            A gene identifier. Can be either a gene ID or gene name.
+
+        Returns
+        -------
+        str
+            The transcript ID of the canonical transcript.
+
+        Raises
+        ------
+        ValueError
+            If the gene identifier is empty, not found, or ambiguous, or if the gene has no transcripts with exons.
+
+        Examples
+        --------
+        Get the canonical transcript for a gene by ID:
+
+        >>> import malariagen_data
+        >>> ag3 = malariagen_data.ag3(pre=False)
+        >>> canonical = ag3.canonical_transcript("AGAP004707")
+
+        Get the canonical transcript for a gene by name:
+
+        >>> canonical = ag3.canonical_transcript("Pvr")
+        """
+        debug = self._log.debug
+        debug(f"Looking up canonical transcript for gene '{gene}'")
+
+        # Load genome features once with required attributes
+        with self._spinner(desc="Load gene data"):
+            # Load required attributes (ordered for consistency with GFF3)
+            attributes = ("ID", "Parent", self._gff_gene_name_attribute)
+            df_features = self.genome_features(attributes=attributes)
+            debug(f"Loaded {len(df_features)} genome features")
+
+        # Filter for genes
+        df_genes = df_features[df_features["type"] == self._gff_gene_type]
+        name_attr = self._gff_gene_name_attribute
+
+        # Normalize input: strip whitespace
+        gene_normalized = gene.strip()
+
+        # Reject empty identifiers after normalization to avoid ambiguous matches
+        if not gene_normalized:
+            raise ValueError(
+                "Gene identifier is empty after stripping whitespace; please provide a valid gene ID or name."
+            )
+        # Try exact ID match first (case-sensitive)
+        debug(f"Attempting ID match for '{gene_normalized}'")
+        gene_id_match = df_genes[df_genes["ID"].str.strip() == gene_normalized]
+
+        if len(gene_id_match) == 1:
+            gene_id = gene_id_match.iloc[0]["ID"]
+            debug(f"Found ID match: {gene_id}")
+        elif len(gene_id_match) > 1:
+            # This should not happen (IDs should be unique), but fail with a clear error if it does.
+            raise ValueError(
+                f"Multiple features with ID '{gene}' found (data integrity issue)"
+            )
+        else:
+            # Fall back to a name match (case-insensitive, ignoring surrounding whitespace).
+            debug("No ID match, attempting name match")
+            gene_name_match = df_genes[
+                df_genes[name_attr].fillna("").str.strip().str.lower()
+                == gene_normalized.lower()
+            ]
+
+            if len(gene_name_match) == 0:
+                raise ValueError(f"Gene '{gene}' not found (no matching ID or name)")
+            elif len(gene_name_match) > 1:
+                # Suggest which genes matched for better debugging
+                matching_ids = ", ".join(gene_name_match["ID"].values)
+                raise ValueError(
+                    f"Gene name '{gene}' is ambiguous (matches {len(gene_name_match)} genes: {matching_ids}). "
+                    f"Please use a specific gene ID instead."
+                )
+
+            gene_id = gene_name_match.iloc[0]["ID"]
+            debug(f"Found name match: {gene_id}")
+
+        # Get transcripts for the gene
+        debug(f"Finding transcripts for gene '{gene_id}'")
+        df_transcripts = self.genome_feature_children(
+            parent=gene_id, attributes=("ID",)
+        )
+        df_transcripts = df_transcripts[df_transcripts["type"] == "mRNA"]
+
+        if len(df_transcripts) == 0:
+            raise ValueError(f"Gene '{gene}' has no transcripts")
+
+        debug(f"Found {len(df_transcripts)} transcripts for gene {gene_id}")
+
+        # Calculate transcript lengths and find canonical
+        debug("Calculating transcript lengths for each transcript")
+        transcript_lengths = {}
+
+        for transcript_id in df_transcripts["ID"]:
+            # Get all exon children (genome_feature_children handles multi-parent exons)
+            df_exons = self.genome_feature_children(
+                parent=transcript_id, attributes=None
+            )
+            # Filter for exons only (important: exclude other feature types)
+            df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
+
+            if len(df_exons) > 0:
+                # Calculate total transcribed length (1-based inclusive coordinates)
+                exon_lengths = (df_exons["end"] - df_exons["start"] + 1).sum()
+                transcript_lengths[transcript_id] = exon_lengths
+                debug(f"  {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
+        if not transcript_lengths:
+            raise ValueError(f"Gene '{gene}' has no transcripts with exons")
+
+        # Find canonical (maximum length)
+        canonical = max(transcript_lengths, key=lambda tid: transcript_lengths[tid])
+        canonical_length = transcript_lengths[canonical]
+        debug(f"Canonical transcript: {canonical} with {canonical_length} bp")
+        return canonical
+
     @_check_types
     @doc(
         summary="Plot a genes track, using bokeh.",
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py
@@ -138,7 +138,6 @@ def __init__(
         taxon_colors: Optional[Mapping[str, str]] = None,
         aim_species_colors: Optional[Mapping[str, str]] = None,
         virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
-        gene_names: Optional[Mapping[str, str]] = None,
         inversion_tag_path: Optional[str] = None,
         unrestricted_use_only: Optional[bool] = None,
         surveillance_use_only: Optional[bool] = None,
@@ -176,7 +175,6 @@ def __init__(
             taxon_colors=taxon_colors,
             aim_species_colors=aim_species_colors,
             virtual_contigs=virtual_contigs,
-            gene_names=gene_names,
             inversion_tag_path=inversion_tag_path,
             unrestricted_use_only=unrestricted_use_only,
             surveillance_use_only=surveillance_use_only,
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
@@ -361,7 +361,7 @@ def _dask_compress_dataarray(a, indexer, indexer_computed, dim):
 
 
 def _da_compress(
-    indexer: da.Array | np.ndarray,
+    indexer: Union[da.Array, np.ndarray],
     data: da.Array,
     axis: int,
     indexer_computed: Optional[np.ndarray] = None,
diff --git a/malariagen_data/veff.py b/malariagen_data/veff.py
@@ -1,4 +1,5 @@
 import collections
+import functools
 import operator
 
 import pandas as pd
@@ -31,7 +32,7 @@
 
 
 class Annotator(object):
-    def __init__(self, genome, genome_features):
+    def __init__(self, genome, genome_features, genome_cache_maxsize=5):
         """
         An annotator.
 
@@ -41,14 +42,26 @@ def __init__(self, genome, genome_features):
             Reference genome.
         genome_features : pandas dataframe
             Dataframe with genome annotations.
+        genome_cache_maxsize : int or None, optional
+            Maximum number of contig genome sequences to keep in the
+            LRU cache.  Set to ``None`` for an unbounded cache (the
+            previous default behaviour).  Default is 5.
 
         """
 
         # store initialisation parameters
         self._genome = genome
-        self._genome_cache = dict()
         self._genome_features_cache = None
 
+        # Create a per-instance LRU cache for genome sequences.
+        # Defining the cached function inside __init__ ensures each
+        # Annotator instance has its own independent cache.
+        @functools.lru_cache(maxsize=genome_cache_maxsize)
+        def _load_genome_seq(chrom):
+            return self._genome[chrom][:]
+
+        self._load_genome_seq = _load_genome_seq
+
         genome_features = genome_features[
             (genome_features.end - genome_features.start) > 0
         ]
@@ -76,15 +89,15 @@ def get_children(self, feature_id):
 
     def get_ref_seq(self, chrom, start, stop):
         """Accepts 1-based coords."""
-        try:
-            seq = self._genome_cache[chrom]
-        except KeyError:
-            seq = self._genome[chrom][:]
-            self._genome_cache[chrom] = seq
+        seq = self._load_genome_seq(chrom)
         ref_seq = seq[start - 1 : stop]
         ref_seq = ref_seq.tobytes().decode()
         return ref_seq
 
+    def clear_genome_cache(self):
+        """Clear all cached genome sequences to free memory."""
+        self._load_genome_seq.cache_clear()
+
     def get_ref_allele_coords(self, chrom, pos, ref):
         # N.B., use one-based inclusive coordinate system (like GFF3) throughout
         ref_start = pos
diff --git a/tests/anoph/test_genome_features.py b/tests/anoph/test_genome_features.py
diff --git a/tests/test_veff.py b/tests/test_veff.py

-Original file line number
+Diff line change
@@ -1,6 +1,7 @@
 .idea
 .vscode
 __pycache__
+.mypy_cache
 *.pyc
 dist
 .venv/