Merge branch 'master' into fix/veff-genome-cache-memory-leak

jonbrenas · web-flow · commit a5a917cf4482 · 2026-03-23T13:49:44.000Z
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 .idea
 .vscode
 __pycache__
+.mypy_cache
 *.pyc
 dist
 .venv/
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
@@ -189,6 +189,14 @@ def _validate_sample_selection_params(
     "Random seed used for reproducible down-sampling.",
 ]
 
+gene: TypeAlias = Annotated[
+    str,
+    """
+    Gene identifier. Can be either a gene ID or gene name.
+    Gene names are matched case-insensitively.
+    """,
+]
+
 transcript: TypeAlias = Annotated[
     str,
     "Gene transcript identifier.",
diff --git a/malariagen_data/anoph/cnv_frq.py b/malariagen_data/anoph/cnv_frq.py
@@ -90,7 +90,11 @@ def _gene_cnv(
         inline_array,
     ):
         # Sanity check.
-        assert isinstance(region, Region)
+        if not isinstance(region, Region):
+            raise TypeError(
+                f"Expected region to be a Region object, "
+                f"got {type(region).__name__}: {region!r}"
+            )
 
         # Access genes within the region of interest.
         df_genome_features = self.genome_features(region=region)
@@ -260,7 +264,11 @@ def _gene_cnv_frequencies(
         debug = self._log.debug
 
         debug("sanity check - this function is one region at a time")
-        assert isinstance(region, Region)
+        if not isinstance(region, Region):
+            raise TypeError(
+                f"Expected region to be a Region object, "
+                f"got {type(region).__name__}: {region!r}"
+            )
 
         debug("get gene copy number data")
         ds_cnv = self.gene_cnv(
@@ -504,7 +512,11 @@ def _gene_cnv_frequencies_advanced(
         debug = self._log.debug
 
         debug("sanity check - here we deal with one region only")
-        assert isinstance(region, Region)
+        if not isinstance(region, Region):
+            raise TypeError(
+                f"Expected region to be a Region object, "
+                f"got {type(region).__name__}: {region!r}"
+            )
 
         debug("access gene CNV calls")
         ds_cnv = self.gene_cnv(
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
@@ -341,7 +341,11 @@ def plot_frequencies_heatmap(
             for j in range(1, idx_vals.shape[1]):
                 index_col = index_col + ", " + idx_vals[:, j]
         else:
-            assert isinstance(index, str)
+            if not isinstance(index, str):
+                raise TypeError(
+                    f"Expected index to be str or list, "
+                    f"got {type(index).__name__}: {index!r}"
+                )
             index_col = df[index].astype(str)
 
         # Check that index is unique.
@@ -413,6 +417,14 @@ def plot_frequencies_heatmap(
                 `aa_allele_frequencies_advanced()` or
                 `gene_cnv_frequencies_advanced()`.
             """,
+            taxa="""
+                Taxon or list of taxa to include in the plot. If None,
+                all taxa are shown.
+            """,
+            areas="""
+                Area or list of areas to include in the plot. If None,
+                all areas are shown.
+            """,
             kwargs="Passed through to `px.line()`.",
         ),
         returns="""
@@ -588,7 +600,11 @@ def plot_frequencies_map_markers(
             ds_variant = ds.isel(variants=variant)
             variant_label = ds["variant_label"].values[variant]
         else:
-            assert isinstance(variant, str)
+            if not isinstance(variant, str):
+                raise TypeError(
+                    f"Expected variant to be int or str, "
+                    f"got {type(variant).__name__}: {variant!r}"
+                )
             ds_variant = ds.set_index(variants="variant_label").sel(variants=variant)
             variant_label = variant
 
diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
@@ -314,6 +314,140 @@ def plot_transcript(
             bokeh.plotting.show(fig)
         return fig
 
+    @_check_types
+    @doc(
+        summary="Get the canonical transcript for a gene.",
+        returns="""
+            The transcript ID for the canonical transcript of the specified gene.
+            The canonical transcript is the one with the highest number of
+            transcribed base pairs (sum of exon lengths).
+        """,
+    )
+    def canonical_transcript(
+        self,
+        gene: base_params.gene,
+    ) -> str:
+        """
+        Parameters
+        ----------
+        gene : str
+            A gene identifier. Can be either a gene ID or gene name.
+
+        Returns
+        -------
+        str
+            The transcript ID of the canonical transcript.
+
+        Raises
+        ------
+        ValueError
+            If the gene identifier is empty, not found or ambiguous, or if the gene has no transcripts with exons.
+
+        Examples
+        --------
+        Get the canonical transcript for a gene by ID:
+
+        >>> import malariagen_data
+        >>> ag3 = malariagen_data.ag3(pre=False)
+        >>> canonical = ag3.canonical_transcript("AGAP004707")
+
+        Get the canonical transcript for a gene by name:
+
+        >>> canonical = ag3.canonical_transcript("Pvr")
+        """
+        debug = self._log.debug
+        debug(f"Looking up canonical transcript for gene '{gene}'")
+
+        # Load genome features once with required attributes
+        with self._spinner(desc="Load gene data"):
+            # Load required attributes (ordered for consistency with GFF3)
+            attributes = ("ID", "Parent", self._gff_gene_name_attribute)
+            df_features = self.genome_features(attributes=attributes)
+            debug(f"Loaded {len(df_features)} genome features")
+
+        # Filter for genes
+        df_genes = df_features[df_features["type"] == self._gff_gene_type]
+        name_attr = self._gff_gene_name_attribute
+
+        # Normalize input: strip whitespace
+        gene_normalized = gene.strip()
+
+        # Reject empty identifiers after normalization to avoid ambiguous matches
+        if not gene_normalized:
+            raise ValueError(
+                "Gene identifier is empty after stripping whitespace; please provide a valid gene ID or name."
+            )
+        # Try exact ID match first (case-sensitive)
+        debug(f"Attempting ID match for '{gene_normalized}'")
+        gene_id_match = df_genes[df_genes["ID"].str.strip() == gene_normalized]
+
+        if len(gene_id_match) == 1:
+            gene_id = gene_id_match.iloc[0]["ID"]
+            debug(f"Found ID match: {gene_id}")
+        elif len(gene_id_match) > 1:
+            # This should not happen (IDs should be unique), but fail with a clear error if it does.
+            raise ValueError(
+                f"Multiple features with ID '{gene}' found (data integrity issue)"
+            )
+        else:
+            # Fall back to a name match (case-insensitive, ignoring surrounding whitespace).
+            debug("No ID match, attempting name match")
+            gene_name_match = df_genes[
+                df_genes[name_attr].fillna("").str.strip().str.lower()
+                == gene_normalized.lower()
+            ]
+
+            if len(gene_name_match) == 0:
+                raise ValueError(f"Gene '{gene}' not found (no matching ID or name)")
+            elif len(gene_name_match) > 1:
+                # List the IDs of the matching genes so the caller can pick one.
+                matching_ids = ", ".join(gene_name_match["ID"].values)
+                raise ValueError(
+                    f"Gene name '{gene}' is ambiguous (matches {len(gene_name_match)} genes: {matching_ids}). "
+                    f"Please use a specific gene ID instead."
+                )
+
+            gene_id = gene_name_match.iloc[0]["ID"]
+            debug(f"Found name match: {gene_id}")
+
+        # Get transcripts for the gene
+        debug(f"Finding transcripts for gene '{gene_id}'")
+        df_transcripts = self.genome_feature_children(
+            parent=gene_id, attributes=("ID",)
+        )
+        df_transcripts = df_transcripts[df_transcripts["type"] == "mRNA"]
+
+        if len(df_transcripts) == 0:
+            raise ValueError(f"Gene '{gene}' has no transcripts")
+
+        debug(f"Found {len(df_transcripts)} transcripts for gene {gene_id}")
+
+        # Calculate transcript lengths and find canonical
+        debug("Calculating transcript lengths for each transcript")
+        transcript_lengths = {}
+
+        for transcript_id in df_transcripts["ID"]:
+            # Get all exon children (genome_feature_children handles multi-parent exons)
+            df_exons = self.genome_feature_children(
+                parent=transcript_id, attributes=None
+            )
+            # Filter for exons only (important: exclude other feature types)
+            df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
+
+            if len(df_exons) > 0:
+                # Calculate total transcribed length (1-based inclusive coordinates)
+                exon_lengths = (df_exons["end"] - df_exons["start"] + 1).sum()
+                transcript_lengths[transcript_id] = exon_lengths
+                debug(f"  {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
+        if not transcript_lengths:
+            raise ValueError(f"Gene '{gene}' has no transcripts with exons")
+
+        # Find canonical (maximum length)
+        canonical = max(transcript_lengths, key=lambda tid: transcript_lengths[tid])
+        canonical_length = transcript_lengths[canonical]
+        debug(f"Canonical transcript: {canonical} with {canonical_length} bp")
+        return canonical
+
     @_check_types
     @doc(
         summary="Plot a genes track, using bokeh.",
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
@@ -1235,7 +1235,11 @@ def lookup_sample(
         if isinstance(sample, str):
             sample_rec = df_samples.loc[sample]
         else:
-            assert isinstance(sample, int)
+            if not isinstance(sample, int):
+                raise TypeError(
+                    f"Expected sample to be str or int, "
+                    f"got {type(sample).__name__}: {sample!r}"
+                )
             sample_rec = df_samples.iloc[sample]
         return sample_rec
 
@@ -1348,7 +1352,11 @@ def _setup_sample_symbol(
 
         else:
             # Custom grouping using queries.
-            assert isinstance(symbol, Mapping)
+            if not isinstance(symbol, Mapping):
+                raise TypeError(
+                    f"Expected symbol to be str or Mapping, "
+                    f"got {type(symbol).__name__}: {symbol!r}"
+                )
             data["symbol"] = ""
             for key, value in symbol.items():
                 data.loc[data.query(value).index, "symbol"] = key
@@ -1397,7 +1405,11 @@ def _setup_sample_colors_plotly(
 
         else:
             # Custom grouping using queries.
-            assert isinstance(color, Mapping)
+            if not isinstance(color, Mapping):
+                raise TypeError(
+                    f"Expected color to be str or Mapping, "
+                    f"got {type(color).__name__}: {color!r}"
+                )
             data["color"] = ""
             for key, value in color.items():
                 data.loc[data.query(value).index, "color"] = key
@@ -1493,13 +1505,17 @@ def _setup_cohort_queries(
         """Convenience function to normalise the `cohorts` parameter to a
         dictionary mapping cohort labels to sample metadata queries."""
 
-        if isinstance(cohorts, dict):
+        if isinstance(cohorts, Mapping):
             # User has supplied a custom dictionary mapping cohort identifiers
             # to pandas queries.
             cohort_queries = cohorts
 
         else:
-            assert isinstance(cohorts, str)
+            if not isinstance(cohorts, str):
+                raise TypeError(
+                    f"Expected cohorts to be Mapping or str, "
+                    f"got {type(cohorts).__name__}: {cohorts!r}"
+                )
             # User has supplied a column in the sample metadata.
             df_samples = self.sample_metadata(
                 sample_sets=sample_sets,
@@ -1855,7 +1871,11 @@ def _locate_cohorts(*, cohorts, data, min_cohort_size):
             coh_dict[coh] = loc_coh
 
     else:
-        assert isinstance(cohorts, str)
+        if not isinstance(cohorts, str):
+            raise TypeError(
+                f"Expected cohorts to be Mapping or str, "
+                f"got {type(cohorts).__name__}: {cohorts!r}"
+            )
         # User has supplied the name of a sample metadata column.
 
         # Convenience to allow things like "admin1_year" instead of "cohort_admin1_year".
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
diff --git a/tests/anoph/test_genome_features.py b/tests/anoph/test_genome_features.py

-Original file line number
+Diff line change
@@ @@ -1,6 +1,7 @@ @@
 .idea
 .vscode
 __pycache__
 +.mypy_cache
 *.pyc
 dist
 .venv/