Implement issue #794: Add canonical_transcript() method

kunal-10-cloud · kunal-10-cloud · commit 29348059b61b · 2026-03-22T23:57:30.000+05:30
- Add gene TypeAlias to base_params.py for type annotations
- Implement canonical_transcript() method in genome_features.py
  * Finds the transcript with the longest total exon length
  * Supports gene ID and gene name (case-insensitive) lookup
  * Includes comprehensive error handling and debug logging
- Add 11 comprehensive test cases
  * Basic functionality (ID and name lookup)
  * Error handling (non-existent genes, empty strings)
  * Edge cases (whitespace, case sensitivity, single-transcript genes)
  * Algorithm correctness verification
  * Multi-species support (ag3, af1, adir1)

All tests passing: 11/11 new + 24 existing = 35 total
diff --git a/malariagen_data/anoph/base_params.py b/malariagen_data/anoph/base_params.py
@@ -189,6 +189,14 @@ def _validate_sample_selection_params(
     "Random seed used for reproducible down-sampling.",
 ]
 
+gene: TypeAlias = Annotated[  # parameter type used by canonical_transcript()
+    str,
+    """
+    Gene identifier. Can be either a gene ID or gene name.
+    Gene names are matched case-insensitively.
+    """,  # description text carried as Annotated metadata
+]
+
 transcript: TypeAlias = Annotated[
     str,
     "Gene transcript identifier.",
diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
@@ -314,6 +314,136 @@ def plot_transcript(
             bokeh.plotting.show(fig)
         return fig
 
+    @_check_types
+    @doc(
+        summary="Get the canonical transcript for a gene.",
+        returns="""
+            The transcript ID for the canonical transcript of the specified gene.
+            The canonical transcript is the one with the highest number of
+            transcribed base pairs (sum of exon lengths).
+        """,
+    )
+    def canonical_transcript(
+        self,
+        gene: base_params.gene,
+    ) -> str:
+        """
+        Parameters
+        ----------
+        gene : str
+            A gene identifier. Can be either a gene ID or gene name.
+
+        Returns
+        -------
+        str
+            The transcript ID of the canonical transcript.
+
+        Raises
+        ------
+        ValueError
+            If the gene identifier is not found or if the gene has no transcripts.
+
+        Examples
+        --------
+        Get the canonical transcript for a gene by ID:
+
+        >>> import malariagen_data
+        >>> ag3 = malariagen_data.ag3(pre=False)
+        >>> canonical = ag3.canonical_transcript("AGAP004707")
+
+        Get the canonical transcript for a gene by name:
+
+        >>> canonical = ag3.canonical_transcript("Pvr")
+        """
+        debug = self._log.debug
+        debug(f"Looking up canonical transcript for gene '{gene}'")
+
+        # Load genome features once with required attributes
+        with self._spinner(desc="Load gene data"):
+            # Load required attributes (ordered for consistency with GFF3)
+            attributes = ("ID", "Parent", self._gff_gene_name_attribute)
+            df_features = self.genome_features(attributes=attributes)
+            debug(f"Loaded {len(df_features)} genome features")
+
+        # Filter for genes
+        df_genes = df_features[df_features["type"] == self._gff_gene_type]
+        name_attr = self._gff_gene_name_attribute
+
+        # Normalize input: strip whitespace
+        gene_normalized = gene.strip()
+
+        # Try exact ID match first (case-sensitive)
+        debug(f"Attempting ID match for '{gene_normalized}'")
+        gene_id_match = df_genes[df_genes["ID"].str.strip() == gene_normalized]
+
+        if len(gene_id_match) == 1:
+            gene_id = gene_id_match.iloc[0]["ID"]
+            debug(f"Found ID match: {gene_id}")
+        elif len(gene_id_match) > 1:
+            # This should not happen (ID should be unique), but handling gracefully
+            raise ValueError(
+                f"Multiple features with ID '{gene}' found (data integrity issue)"
+            )
+        else:
+            # Trying name match (case-insensitive with whitespace handling)
+            debug("No ID match, attempting name match")
+            gene_name_match = df_genes[
+                df_genes[name_attr].fillna("").str.strip().str.lower()
+                == gene_normalized.lower()
+            ]
+
+            if len(gene_name_match) == 0:
+                raise ValueError(f"Gene '{gene}' not found (no matching ID or name)")
+            elif len(gene_name_match) > 1:
+                # Suggest which genes matched for better debugging
+                matching_ids = ", ".join(gene_name_match["ID"].values)
+                raise ValueError(
+                    f"Gene name '{gene}' is ambiguous (matches {len(gene_name_match)} genes: {matching_ids}). "
+                    f"Please use a specific gene ID instead."
+                )
+
+            gene_id = gene_name_match.iloc[0]["ID"]
+            debug(f"Found name match: {gene_id}")
+
+        # Get transcripts for the gene
+        debug(f"Finding transcripts for gene '{gene_id}'")
+        df_transcripts = self.genome_feature_children(
+            parent=gene_id, attributes=("ID",)
+        )
+        df_transcripts = df_transcripts[df_transcripts["type"] == "mRNA"]
+
+        if len(df_transcripts) == 0:
+            raise ValueError(f"Gene '{gene}' has no transcripts")
+
+        debug(f"Found {len(df_transcripts)} transcripts for gene {gene_id}")
+
+        # Calculate transcript lengths and find canonical
+        debug("Calculating transcript lengths for each transcript")
+        transcript_lengths = {}
+
+        for transcript_id in df_transcripts["ID"]:
+            # Get all exon children (genome_feature_children handles multi-parent exons)
+            df_exons = self.genome_feature_children(
+                parent=transcript_id, attributes=None
+            )
+            # Filter for exons only (important: exclude other feature types)
+            df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
+
+            if len(df_exons) > 0:
+                # Calculate total transcribed length
+                exon_lengths = (df_exons["end"] - df_exons["start"]).sum()
+                transcript_lengths[transcript_id] = exon_lengths
+                debug(f"  {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
+
+        if not transcript_lengths:
+            raise ValueError(f"Gene '{gene}' has no transcripts with exons")
+
+        # Find canonical (maximum length)
+        canonical = max(transcript_lengths, key=transcript_lengths.get)
+        canonical_length = transcript_lengths[canonical]
+        debug(f"Canonical transcript: {canonical} with {canonical_length} bp")
+        return canonical
+
     @_check_types
     @doc(
         summary="Plot a genes track, using bokeh.",
diff --git a/tests/anoph/test_genome_features.py b/tests/anoph/test_genome_features.py
@@ -252,3 +252,131 @@ def test_genome_features_virtual_contigs(ag3_sim_api, chrom):
     assert isinstance(df, pd.DataFrame)
     if len(df) > 0:
         assert df["contig"].unique() == region.split(":")[0]
+
+
+# =============================================================================
+# Tests for canonical_transcript functionality
+# =============================================================================
+
+
+def test_canonical_transcript_by_id(ag3_sim_api):
+    """Test finding canonical transcript by gene ID."""
+    # Get a gene from the fixture
+    genes = ag3_sim_api.genome_features().query(
+        f"type == '{ag3_sim_api._gff_gene_type}'"
+    )
+    assert len(genes) > 0
+
+    gene_id = genes.iloc[0]["ID"]
+    canonical = ag3_sim_api.canonical_transcript(gene_id)
+    assert isinstance(canonical, str)
+    assert len(canonical) > 0
+
+
+def test_canonical_transcript_by_name(ag3_sim_api):
+    """Test finding canonical transcript by gene name."""
+    genes = ag3_sim_api.genome_features().query(
+        f"type == '{ag3_sim_api._gff_gene_type}'"
+    )
+    assert len(genes) > 0
+
+    gene_name = genes.iloc[0]["Name"]
+    canonical = ag3_sim_api.canonical_transcript(gene_name)
+    assert isinstance(canonical, str)
+    assert len(canonical) > 0
+
+
+def test_canonical_transcript_invalid_gene(ag3_sim_api):
+    """Test that ValueError is raised for non-existent gene."""
+    with pytest.raises(ValueError, match="not found"):
+        ag3_sim_api.canonical_transcript("NONEXISTENT_GENE_ID_12345")
+
+
+def test_canonical_transcript_empty_string(ag3_sim_api):
+    """Test that ValueError is raised for empty string."""
+    with pytest.raises(ValueError):
+        ag3_sim_api.canonical_transcript("")
+
+
+def test_canonical_transcript_whitespace_handling(ag3_sim_api):
+    """Test that whitespace around input doesn't break lookup."""
+    genes = ag3_sim_api.genome_features().query(
+        f"type == '{ag3_sim_api._gff_gene_type}'"
+    )
+    if len(genes) > 0:
+        gene_id_padded = f"  {genes.iloc[0]['ID']}  "
+        canonical = ag3_sim_api.canonical_transcript(gene_id_padded)
+        assert isinstance(canonical, str)
+
+
+def test_canonical_transcript_case_insensitive(ag3_sim_api):
+    """Test that gene name matching is case-insensitive."""
+    genes = ag3_sim_api.genome_features().query(
+        f"type == '{ag3_sim_api._gff_gene_type}'"
+    )
+    if len(genes) > 0:
+        gene_name_lower = genes.iloc[0]["Name"].lower()
+        canonical = ag3_sim_api.canonical_transcript(gene_name_lower)
+        assert isinstance(canonical, str)
+
+
+def test_canonical_transcript_single_transcript_gene(ag3_sim_api):
+    """Test that genes with only one transcript return that transcript."""
+    genes = ag3_sim_api.genome_features().query(
+        f"type == '{ag3_sim_api._gff_gene_type}'"
+    )
+    # Find a gene with exactly one transcript if possible
+    for gene_id in genes["ID"]:
+        transcripts = ag3_sim_api.genome_feature_children(parent=gene_id)
+        transcripts = transcripts[transcripts["type"] == "mRNA"]
+        if len(transcripts) == 1:
+            canonical = ag3_sim_api.canonical_transcript(gene_id)
+            assert canonical == transcripts.iloc[0]["ID"]
+            break
+
+
+def test_canonical_transcript_calculation_correctness(ag3_sim_api):
+    """Test that the returned transcript actually has the highest exon length."""
+    genes = ag3_sim_api.genome_features().query(
+        f"type == '{ag3_sim_api._gff_gene_type}'"
+    )
+    if len(genes) == 0:
+        pytest.skip("No genes available in fixture")
+
+    gene_id = genes.iloc[0]["ID"]
+    canonical = ag3_sim_api.canonical_transcript(gene_id)
+
+    # Verify by calculating manually
+    all_transcripts = ag3_sim_api.genome_feature_children(parent=gene_id)
+    all_transcripts = all_transcripts[all_transcripts["type"] == "mRNA"]
+
+    # Calculate lengths for all transcripts
+    max_length = 0
+    max_transcript = None
+    for transcript_id in all_transcripts["ID"]:
+        exons = ag3_sim_api.genome_feature_children(parent=transcript_id)
+        exons = exons[exons["type"] == "exon"]
+        length = (exons["end"] - exons["start"]).sum()
+        if length > max_length:
+            max_length = length
+            max_transcript = transcript_id
+
+    # Verify canonical matches the manually calculated maximum
+    assert canonical == max_transcript
+
+    # Verify canonical has the correct length
+    canonical_exons = ag3_sim_api.genome_feature_children(parent=canonical)
+    canonical_exons = canonical_exons[canonical_exons["type"] == "exon"]
+    canonical_length = (canonical_exons["end"] - canonical_exons["start"]).sum()
+    assert canonical_length == max_length
+
+
+@parametrize_with_cases("fixture,api", cases=".")
+def test_canonical_transcript_all_species(fixture, api: AnophelesGenomeFeaturesData):
+    """Test canonical_transcript works with all species."""
+    genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
+
+    if len(genes) > 0:
+        gene_id = genes.iloc[0]["ID"]
+        canonical = api.canonical_transcript(gene_id)
+        assert isinstance(canonical, str)