Skip to content

Commit 2039bbf

Browse files
Merge branch 'master' into GH1151-fix-roh-hmm-cache-name
2 parents ca034a4 + b0db6ed commit 2039bbf

4 files changed

Lines changed: 275 additions & 0 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.idea
22
.vscode
33
__pycache__
4+
.mypy_cache
45
*.pyc
56
dist
67
.venv/

malariagen_data/anoph/base_params.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,14 @@ def _validate_sample_selection_params(
189189
"Random seed used for reproducible down-sampling.",
190190
]
191191

192+
gene: TypeAlias = Annotated[
193+
str,
194+
"""
195+
Gene identifier. Can be either a gene ID or gene name.
196+
Gene names are matched case-insensitively.
197+
""",
198+
]
199+
192200
transcript: TypeAlias = Annotated[
193201
str,
194202
"Gene transcript identifier.",

malariagen_data/anoph/genome_features.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,140 @@ def plot_transcript(
314314
bokeh.plotting.show(fig)
315315
return fig
316316

317+
@_check_types
318+
@doc(
319+
summary="Get the canonical transcript for a gene.",
320+
returns="""
321+
The transcript ID for the canonical transcript of the specified gene.
322+
The canonical transcript is the one with the highest number of
323+
transcribed base pairs (sum of exon lengths).
324+
""",
325+
)
326+
def canonical_transcript(
327+
self,
328+
gene: base_params.gene,
329+
) -> str:
330+
"""
331+
Parameters
332+
----------
333+
gene : str
334+
A gene identifier. Can be either a gene ID or gene name.
335+
336+
Returns
337+
-------
338+
str
339+
The transcript ID of the canonical transcript.
340+
341+
Raises
342+
------
343+
ValueError
344+
If the gene identifier is not found or if the gene has no transcripts.
345+
346+
Examples
347+
--------
348+
Get the canonical transcript for a gene by ID:
349+
350+
>>> import malariagen_data
351+
>>> ag3 = malariagen_data.ag3(pre=False)
352+
>>> canonical = ag3.canonical_transcript("AGAP004707")
353+
354+
Get the canonical transcript for a gene by name:
355+
356+
>>> canonical = ag3.canonical_transcript("Pvr")
357+
"""
358+
debug = self._log.debug
359+
debug(f"Looking up canonical transcript for gene '{gene}'")
360+
361+
# Load genome features once with required attributes
362+
with self._spinner(desc="Load gene data"):
363+
# Load required attributes (ordered for consistency with GFF3)
364+
attributes = ("ID", "Parent", self._gff_gene_name_attribute)
365+
df_features = self.genome_features(attributes=attributes)
366+
debug(f"Loaded {len(df_features)} genome features")
367+
368+
# Filter for genes
369+
df_genes = df_features[df_features["type"] == self._gff_gene_type]
370+
name_attr = self._gff_gene_name_attribute
371+
372+
# Normalize input: strip whitespace
373+
gene_normalized = gene.strip()
374+
375+
# Reject empty identifiers after normalization to avoid ambiguous matches
376+
if not gene_normalized:
377+
raise ValueError(
378+
"Gene identifier is empty after stripping whitespace; please provide a valid gene ID or name."
379+
)
380+
# Try exact ID match first (case-sensitive)
381+
debug(f"Attempting ID match for '{gene_normalized}'")
382+
gene_id_match = df_genes[df_genes["ID"].str.strip() == gene_normalized]
383+
384+
if len(gene_id_match) == 1:
385+
gene_id = gene_id_match.iloc[0]["ID"]
386+
debug(f"Found ID match: {gene_id}")
387+
elif len(gene_id_match) > 1:
388+
# This should not happen (ID should be unique), but handling gracefully
389+
raise ValueError(
390+
f"Multiple features with ID '{gene}' found (data integrity issue)"
391+
)
392+
else:
393+
# Trying name match (case-insensitive with whitespace handling)
394+
debug("No ID match, attempting name match")
395+
gene_name_match = df_genes[
396+
df_genes[name_attr].fillna("").str.strip().str.lower()
397+
== gene_normalized.lower()
398+
]
399+
400+
if len(gene_name_match) == 0:
401+
raise ValueError(f"Gene '{gene}' not found (no matching ID or name)")
402+
elif len(gene_name_match) > 1:
403+
# Suggest which genes matched for better debugging
404+
matching_ids = ", ".join(gene_name_match["ID"].values)
405+
raise ValueError(
406+
f"Gene name '{gene}' is ambiguous (matches {len(gene_name_match)} genes: {matching_ids}). "
407+
f"Please use a specific gene ID instead."
408+
)
409+
410+
gene_id = gene_name_match.iloc[0]["ID"]
411+
debug(f"Found name match: {gene_id}")
412+
413+
# Get transcripts for the gene
414+
debug(f"Finding transcripts for gene '{gene_id}'")
415+
df_transcripts = self.genome_feature_children(
416+
parent=gene_id, attributes=("ID",)
417+
)
418+
df_transcripts = df_transcripts[df_transcripts["type"] == "mRNA"]
419+
420+
if len(df_transcripts) == 0:
421+
raise ValueError(f"Gene '{gene}' has no transcripts")
422+
423+
debug(f"Found {len(df_transcripts)} transcripts for gene {gene_id}")
424+
425+
# Calculate transcript lengths and find canonical
426+
debug("Calculating transcript lengths for each transcript")
427+
transcript_lengths = {}
428+
429+
for transcript_id in df_transcripts["ID"]:
430+
# Get all exon children (genome_feature_children handles multi-parent exons)
431+
df_exons = self.genome_feature_children(
432+
parent=transcript_id, attributes=None
433+
)
434+
# Filter for exons only (important: exclude other feature types)
435+
df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
436+
437+
if len(df_exons) > 0:
438+
# Calculate total transcribed length (1-based inclusive coordinates)
439+
exon_lengths = (df_exons["end"] - df_exons["start"] + 1).sum()
440+
transcript_lengths[transcript_id] = exon_lengths
441+
debug(f" {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
442+
if not transcript_lengths:
443+
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
444+
445+
# Find canonical (maximum length)
446+
canonical = max(transcript_lengths, key=lambda tid: transcript_lengths[tid])
447+
canonical_length = transcript_lengths[canonical]
448+
debug(f"Canonical transcript: {canonical} with {canonical_length} bp")
449+
return canonical
450+
317451
@_check_types
318452
@doc(
319453
summary="Plot a genes track, using bokeh.",

tests/anoph/test_genome_features.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,3 +252,135 @@ def test_genome_features_virtual_contigs(ag3_sim_api, chrom):
252252
assert isinstance(df, pd.DataFrame)
253253
if len(df) > 0:
254254
assert df["contig"].unique() == region.split(":")[0]
255+
256+
257+
# =============================================================================
258+
# Tests for canonical_transcript functionality
259+
# =============================================================================
260+
261+
262+
@parametrize_with_cases("fixture,api", cases=".")
263+
def test_canonical_transcript_by_id(fixture, api: AnophelesGenomeFeaturesData):
264+
"""Test finding canonical transcript by gene ID."""
265+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
266+
if len(genes) == 0:
267+
pytest.skip("No genes available in fixture")
268+
269+
gene_id = genes.iloc[0]["ID"]
270+
canonical = api.canonical_transcript(gene_id)
271+
assert isinstance(canonical, str)
272+
assert len(canonical) > 0
273+
274+
275+
@parametrize_with_cases("fixture,api", cases=".")
276+
def test_canonical_transcript_by_name(fixture, api: AnophelesGenomeFeaturesData):
277+
"""Test finding canonical transcript by gene name."""
278+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
279+
if len(genes) == 0:
280+
pytest.skip("No genes available in fixture")
281+
282+
gene_name = genes.iloc[0][api._gff_gene_name_attribute]
283+
canonical = api.canonical_transcript(gene_name)
284+
assert isinstance(canonical, str)
285+
assert len(canonical) > 0
286+
287+
288+
@parametrize_with_cases("fixture,api", cases=".")
289+
def test_canonical_transcript_invalid_gene(fixture, api: AnophelesGenomeFeaturesData):
290+
"""Test that ValueError is raised for non-existent gene."""
291+
with pytest.raises(ValueError, match="not found"):
292+
api.canonical_transcript("NONEXISTENT_GENE_ID_12345")
293+
294+
295+
@parametrize_with_cases("fixture,api", cases=".")
296+
def test_canonical_transcript_empty_string(fixture, api: AnophelesGenomeFeaturesData):
297+
"""Test that ValueError is raised for empty string."""
298+
with pytest.raises(ValueError):
299+
api.canonical_transcript("")
300+
301+
302+
@parametrize_with_cases("fixture,api", cases=".")
303+
def test_canonical_transcript_whitespace_handling(
304+
fixture, api: AnophelesGenomeFeaturesData
305+
):
306+
"""Test that whitespace handling is preserved during lookup."""
307+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
308+
if len(genes) == 0:
309+
pytest.skip("No genes available in fixture")
310+
311+
gene_id = genes.iloc[0]["ID"]
312+
canonical = api.canonical_transcript(gene_id)
313+
assert isinstance(canonical, str)
314+
315+
316+
@parametrize_with_cases("fixture,api", cases=".")
317+
def test_canonical_transcript_case_insensitive(
318+
fixture, api: AnophelesGenomeFeaturesData
319+
):
320+
"""Test that gene name matching is case-insensitive."""
321+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
322+
if len(genes) == 0:
323+
pytest.skip("No genes available in fixture")
324+
325+
gene_name = genes.iloc[0][api._gff_gene_name_attribute]
326+
gene_name_lower = gene_name.lower()
327+
canonical = api.canonical_transcript(gene_name_lower)
328+
assert isinstance(canonical, str)
329+
330+
331+
@parametrize_with_cases("fixture,api", cases=".")
332+
def test_canonical_transcript_single_transcript_gene(
333+
fixture, api: AnophelesGenomeFeaturesData
334+
):
335+
"""Test that genes with only one transcript return that transcript."""
336+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
337+
if len(genes) == 0:
338+
pytest.skip("No genes available in fixture")
339+
340+
# Find a gene with exactly one transcript
341+
for gene_id in genes["ID"]:
342+
transcripts = api.genome_feature_children(parent=gene_id)
343+
transcripts = transcripts[transcripts["type"] == "mRNA"]
344+
if len(transcripts) == 1:
345+
canonical = api.canonical_transcript(gene_id)
346+
assert canonical == transcripts.iloc[0]["ID"]
347+
return
348+
349+
pytest.skip("No gene with exactly one transcript available in fixture")
350+
351+
352+
@parametrize_with_cases("fixture,api", cases=".")
353+
def test_canonical_transcript_calculation_correctness(
354+
fixture, api: AnophelesGenomeFeaturesData
355+
):
356+
"""Test that the returned transcript has the highest exon length."""
357+
genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
358+
if len(genes) == 0:
359+
pytest.skip("No genes available in fixture")
360+
361+
gene_id = genes.iloc[0]["ID"]
362+
canonical = api.canonical_transcript(gene_id)
363+
364+
# Verify by calculating manually
365+
all_transcripts = api.genome_feature_children(parent=gene_id)
366+
all_transcripts = all_transcripts[all_transcripts["type"] == "mRNA"]
367+
368+
# Calculate lengths for all transcripts
369+
max_length = 0
370+
max_transcript = None
371+
for transcript_id in all_transcripts["ID"]:
372+
exons = api.genome_feature_children(parent=transcript_id)
373+
exons = exons[exons["type"] == "exon"]
374+
length = (exons["end"] - exons["start"] + 1).sum()
375+
if length > max_length:
376+
max_length = length
377+
max_transcript = transcript_id
378+
379+
# Verify canonical matches the manually calculated maximum
380+
assert canonical == max_transcript
381+
382+
# Verify canonical has the correct length
383+
canonical_exons = api.genome_feature_children(parent=canonical)
384+
canonical_exons = canonical_exons[canonical_exons["type"] == "exon"]
385+
canonical_length = (canonical_exons["end"] - canonical_exons["start"] + 1).sum()
386+
assert canonical_length == max_length

0 commit comments

Comments
 (0)