Skip to content

Commit 2934805

Browse files
Implement issue #794: Add canonical_transcript() method
- Add gene TypeAlias to base_params.py for type annotations - Implement canonical_transcript() method in genome_features.py * Finds the transcript with the longest total exon length * Supports gene ID and gene name (case-insensitive) lookup * Includes comprehensive error handling and debug logging - Add 11 comprehensive test cases * Basic functionality (ID and name lookup) * Error handling (non-existent genes, empty strings) * Edge cases (whitespace, case sensitivity, single-transcript genes) * Algorithm correctness verification * Multi-species support (ag3, af1, adir1) All tests passing: 11/11 new + 24 existing = 35 total
1 parent 159b91e commit 2934805

3 files changed

Lines changed: 266 additions & 0 deletions

File tree

malariagen_data/anoph/base_params.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,14 @@ def _validate_sample_selection_params(
189189
"Random seed used for reproducible down-sampling.",
190190
]
191191

192+
gene: TypeAlias = Annotated[
193+
str,
194+
"""
195+
Gene identifier. Can be either a gene ID or gene name.
196+
Gene names are matched case-insensitively.
197+
""",
198+
]
199+
192200
transcript: TypeAlias = Annotated[
193201
str,
194202
"Gene transcript identifier.",

malariagen_data/anoph/genome_features.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,136 @@ def plot_transcript(
314314
bokeh.plotting.show(fig)
315315
return fig
316316

317+
@_check_types
318+
@doc(
319+
summary="Get the canonical transcript for a gene.",
320+
returns="""
321+
The transcript ID for the canonical transcript of the specified gene.
322+
The canonical transcript is the one with the highest number of
323+
transcribed base pairs (sum of exon lengths).
324+
""",
325+
)
326+
def canonical_transcript(
327+
self,
328+
gene: base_params.gene,
329+
) -> str:
330+
"""
331+
Parameters
332+
----------
333+
gene : str
334+
A gene identifier. Can be either a gene ID or gene name.
335+
336+
Returns
337+
-------
338+
str
339+
The transcript ID of the canonical transcript.
340+
341+
Raises
342+
------
343+
ValueError
344+
If the gene identifier is not found or if the gene has no transcripts.
345+
346+
Examples
347+
--------
348+
Get the canonical transcript for a gene by ID:
349+
350+
>>> import malariagen_data
351+
>>> ag3 = malariagen_data.ag3(pre=False)
352+
>>> canonical = ag3.canonical_transcript("AGAP004707")
353+
354+
Get the canonical transcript for a gene by name:
355+
356+
>>> canonical = ag3.canonical_transcript("Pvr")
357+
"""
358+
debug = self._log.debug
359+
debug(f"Looking up canonical transcript for gene '{gene}'")
360+
361+
# Load genome features once with required attributes
362+
with self._spinner(desc="Load gene data"):
363+
# Load required attributes (ordered for consistency with GFF3)
364+
attributes = ("ID", "Parent", self._gff_gene_name_attribute)
365+
df_features = self.genome_features(attributes=attributes)
366+
debug(f"Loaded {len(df_features)} genome features")
367+
368+
# Filter for genes
369+
df_genes = df_features[df_features["type"] == self._gff_gene_type]
370+
name_attr = self._gff_gene_name_attribute
371+
372+
# Normalize input: strip whitespace
373+
gene_normalized = gene.strip()
374+
375+
# Try exact ID match first (case-sensitive)
376+
debug(f"Attempting ID match for '{gene_normalized}'")
377+
gene_id_match = df_genes[df_genes["ID"].str.strip() == gene_normalized]
378+
379+
if len(gene_id_match) == 1:
380+
gene_id = gene_id_match.iloc[0]["ID"]
381+
debug(f"Found ID match: {gene_id}")
382+
elif len(gene_id_match) > 1:
383+
# This should not happen (ID should be unique), but handling gracefully
384+
raise ValueError(
385+
f"Multiple features with ID '{gene}' found (data integrity issue)"
386+
)
387+
else:
388+
# Trying name match (case-insensitive with whitespace handling)
389+
debug("No ID match, attempting name match")
390+
gene_name_match = df_genes[
391+
df_genes[name_attr].fillna("").str.strip().str.lower()
392+
== gene_normalized.lower()
393+
]
394+
395+
if len(gene_name_match) == 0:
396+
raise ValueError(f"Gene '{gene}' not found (no matching ID or name)")
397+
elif len(gene_name_match) > 1:
398+
# Suggest which genes matched for better debugging
399+
matching_ids = ", ".join(gene_name_match["ID"].values)
400+
raise ValueError(
401+
f"Gene name '{gene}' is ambiguous (matches {len(gene_name_match)} genes: {matching_ids}). "
402+
f"Please use a specific gene ID instead."
403+
)
404+
405+
gene_id = gene_name_match.iloc[0]["ID"]
406+
debug(f"Found name match: {gene_id}")
407+
408+
# Get transcripts for the gene
409+
debug(f"Finding transcripts for gene '{gene_id}'")
410+
df_transcripts = self.genome_feature_children(
411+
parent=gene_id, attributes=("ID",)
412+
)
413+
df_transcripts = df_transcripts[df_transcripts["type"] == "mRNA"]
414+
415+
if len(df_transcripts) == 0:
416+
raise ValueError(f"Gene '{gene}' has no transcripts")
417+
418+
debug(f"Found {len(df_transcripts)} transcripts for gene {gene_id}")
419+
420+
# Calculate transcript lengths and find canonical
421+
debug("Calculating transcript lengths for each transcript")
422+
transcript_lengths = {}
423+
424+
for transcript_id in df_transcripts["ID"]:
425+
# Get all exon children (genome_feature_children handles multi-parent exons)
426+
df_exons = self.genome_feature_children(
427+
parent=transcript_id, attributes=None
428+
)
429+
# Filter for exons only (important: exclude other feature types)
430+
df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
431+
432+
if len(df_exons) > 0:
433+
# Calculate total transcribed length
434+
exon_lengths = (df_exons["end"] - df_exons["start"]).sum()
435+
transcript_lengths[transcript_id] = exon_lengths
436+
debug(f" {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
437+
438+
if not transcript_lengths:
439+
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
440+
441+
# Find canonical (maximum length)
442+
canonical = max(transcript_lengths, key=transcript_lengths.get)
443+
canonical_length = transcript_lengths[canonical]
444+
debug(f"Canonical transcript: {canonical} with {canonical_length} bp")
445+
return canonical
446+
317447
@_check_types
318448
@doc(
319449
summary="Plot a genes track, using bokeh.",

tests/anoph/test_genome_features.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,3 +252,131 @@ def test_genome_features_virtual_contigs(ag3_sim_api, chrom):
252252
assert isinstance(df, pd.DataFrame)
253253
if len(df) > 0:
254254
assert df["contig"].unique() == region.split(":")[0]
255+
256+
257+
# =============================================================================
258+
# Tests for canonical_transcript functionality
259+
# =============================================================================
260+
261+
262+
def test_canonical_transcript_by_id(ag3_sim_api):
263+
"""Test finding canonical transcript by gene ID."""
264+
# Get a gene from the fixture
265+
genes = ag3_sim_api.genome_features().query(
266+
f"type == '{ag3_sim_api._gff_gene_type}'"
267+
)
268+
assert len(genes) > 0
269+
270+
gene_id = genes.iloc[0]["ID"]
271+
canonical = ag3_sim_api.canonical_transcript(gene_id)
272+
assert isinstance(canonical, str)
273+
assert len(canonical) > 0
274+
275+
276+
def test_canonical_transcript_by_name(ag3_sim_api):
277+
"""Test finding canonical transcript by gene name."""
278+
genes = ag3_sim_api.genome_features().query(
279+
f"type == '{ag3_sim_api._gff_gene_type}'"
280+
)
281+
assert len(genes) > 0
282+
283+
gene_name = genes.iloc[0]["Name"]
284+
canonical = ag3_sim_api.canonical_transcript(gene_name)
285+
assert isinstance(canonical, str)
286+
assert len(canonical) > 0
287+
288+
289+
def test_canonical_transcript_invalid_gene(ag3_sim_api):
290+
"""Test that ValueError is raised for non-existent gene."""
291+
with pytest.raises(ValueError, match="not found"):
292+
ag3_sim_api.canonical_transcript("NONEXISTENT_GENE_ID_12345")
293+
294+
295+
def test_canonical_transcript_empty_string(ag3_sim_api):
296+
"""Test that ValueError is raised for empty string."""
297+
with pytest.raises(ValueError):
298+
ag3_sim_api.canonical_transcript("")
299+
300+
301+
def test_canonical_transcript_whitespace_handling(ag3_sim_api):
302+
"""Test that whitespace around input doesn't break lookup."""
303+
genes = ag3_sim_api.genome_features().query(
304+
f"type == '{ag3_sim_api._gff_gene_type}'"
305+
)
306+
if len(genes) > 0:
307+
gene_id_padded = f" {genes.iloc[0]['ID']} "
308+
canonical = ag3_sim_api.canonical_transcript(gene_id_padded)
309+
assert isinstance(canonical, str)
310+
311+
312+
def test_canonical_transcript_case_insensitive(ag3_sim_api):
313+
"""Test that gene name matching is case-insensitive."""
314+
genes = ag3_sim_api.genome_features().query(
315+
f"type == '{ag3_sim_api._gff_gene_type}'"
316+
)
317+
if len(genes) > 0:
318+
gene_name_lower = genes.iloc[0]["Name"].lower()
319+
canonical = ag3_sim_api.canonical_transcript(gene_name_lower)
320+
assert isinstance(canonical, str)
321+
322+
323+
def test_canonical_transcript_single_transcript_gene(ag3_sim_api):
324+
"""Test that genes with only one transcript return that transcript."""
325+
genes = ag3_sim_api.genome_features().query(
326+
f"type == '{ag3_sim_api._gff_gene_type}'"
327+
)
328+
# Find a gene with exactly one transcript if possible
329+
for gene_id in genes["ID"]:
330+
transcripts = ag3_sim_api.genome_feature_children(parent=gene_id)
331+
transcripts = transcripts[transcripts["type"] == "mRNA"]
332+
if len(transcripts) == 1:
333+
canonical = ag3_sim_api.canonical_transcript(gene_id)
334+
assert canonical == transcripts.iloc[0]["ID"]
335+
break
336+
337+
338+
def test_canonical_transcript_calculation_correctness(ag3_sim_api):
339+
"""Test that the returned transcript actually has the highest exon length."""
340+
genes = ag3_sim_api.genome_features().query(
341+
f"type == '{ag3_sim_api._gff_gene_type}'"
342+
)
343+
if len(genes) == 0:
344+
pytest.skip("No genes available in fixture")
345+
346+
gene_id = genes.iloc[0]["ID"]
347+
canonical = ag3_sim_api.canonical_transcript(gene_id)
348+
349+
# Verify by calculating manually
350+
all_transcripts = ag3_sim_api.genome_feature_children(parent=gene_id)
351+
all_transcripts = all_transcripts[all_transcripts["type"] == "mRNA"]
352+
353+
# Calculate lengths for all transcripts
354+
max_length = 0
355+
max_transcript = None
356+
for transcript_id in all_transcripts["ID"]:
357+
exons = ag3_sim_api.genome_feature_children(parent=transcript_id)
358+
exons = exons[exons["type"] == "exon"]
359+
length = (exons["end"] - exons["start"]).sum()
360+
if length > max_length:
361+
max_length = length
362+
max_transcript = transcript_id
363+
364+
# Verify canonical matches the manually calculated maximum
365+
assert canonical == max_transcript
366+
367+
# Verify canonical has the correct length
368+
canonical_exons = ag3_sim_api.genome_feature_children(parent=canonical)
369+
canonical_exons = canonical_exons[canonical_exons["type"] == "exon"]
370+
canonical_length = (canonical_exons["end"] - canonical_exons["start"]).sum()
371+
assert canonical_length == max_length
372+
373+
374+
@parametrize_with_cases("fixture,api", cases=".")
375+
def test_canonical_transcript_all_species(fixture, api: AnophelesGenomeFeaturesData):
376+
"""Test canonical_transcript works with all species."""
377+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
378+
379+
if len(genes) > 0:
380+
gene_id = genes.iloc[0]["ID"]
381+
canonical = api.canonical_transcript(gene_id)
382+
assert isinstance(canonical, str)

0 commit comments

Comments
 (0)