@@ -314,6 +314,140 @@ def plot_transcript(
314314 bokeh .plotting .show (fig )
315315 return fig
316316
317+ @_check_types
318+ @doc (
319+ summary = "Get the canonical transcript for a gene." ,
320+ returns = """
321+ The transcript ID for the canonical transcript of the specified gene.
322+ The canonical transcript is the one with the highest number of
323+ transcribed base pairs (sum of exon lengths).
324+ """ ,
325+ )
326+ def canonical_transcript (
327+ self ,
328+ gene : base_params .gene ,
329+ ) -> str :
330+ """
331+ Parameters
332+ ----------
333+ gene : str
334+ A gene identifier. Can be either a gene ID or gene name.
335+
336+ Returns
337+ -------
338+ str
339+ The transcript ID of the canonical transcript.
340+
341+ Raises
342+ ------
343+ ValueError
344+ If the gene identifier is not found or if the gene has no transcripts.
345+
346+ Examples
347+ --------
348+ Get the canonical transcript for a gene by ID:
349+
350+ >>> import malariagen_data
351+ >>> ag3 = malariagen_data.ag3(pre=False)
352+ >>> canonical = ag3.canonical_transcript("AGAP004707")
353+
354+ Get the canonical transcript for a gene by name:
355+
356+ >>> canonical = ag3.canonical_transcript("Pvr")
357+ """
358+ debug = self ._log .debug
359+ debug (f"Looking up canonical transcript for gene '{ gene } '" )
360+
361+ # Load genome features once with required attributes
362+ with self ._spinner (desc = "Load gene data" ):
363+ # Load required attributes (ordered for consistency with GFF3)
364+ attributes = ("ID" , "Parent" , self ._gff_gene_name_attribute )
365+ df_features = self .genome_features (attributes = attributes )
366+ debug (f"Loaded { len (df_features )} genome features" )
367+
368+ # Filter for genes
369+ df_genes = df_features [df_features ["type" ] == self ._gff_gene_type ]
370+ name_attr = self ._gff_gene_name_attribute
371+
372+ # Normalize input: strip whitespace
373+ gene_normalized = gene .strip ()
374+
375+ # Reject empty identifiers after normalization to avoid ambiguous matches
376+ if not gene_normalized :
377+ raise ValueError (
378+ "Gene identifier is empty after stripping whitespace; please provide a valid gene ID or name."
379+ )
380+ # Try exact ID match first (case-sensitive)
381+ debug (f"Attempting ID match for '{ gene_normalized } '" )
382+ gene_id_match = df_genes [df_genes ["ID" ].str .strip () == gene_normalized ]
383+
384+ if len (gene_id_match ) == 1 :
385+ gene_id = gene_id_match .iloc [0 ]["ID" ]
386+ debug (f"Found ID match: { gene_id } " )
387+ elif len (gene_id_match ) > 1 :
388+ # This should not happen (ID should be unique), but handling gracefully
389+ raise ValueError (
390+ f"Multiple features with ID '{ gene } ' found (data integrity issue)"
391+ )
392+ else :
393+ # Trying name match (case-insensitive with whitespace handling)
394+ debug ("No ID match, attempting name match" )
395+ gene_name_match = df_genes [
396+ df_genes [name_attr ].fillna ("" ).str .strip ().str .lower ()
397+ == gene_normalized .lower ()
398+ ]
399+
400+ if len (gene_name_match ) == 0 :
401+ raise ValueError (f"Gene '{ gene } ' not found (no matching ID or name)" )
402+ elif len (gene_name_match ) > 1 :
403+ # Suggest which genes matched for better debugging
404+ matching_ids = ", " .join (gene_name_match ["ID" ].values )
405+ raise ValueError (
406+ f"Gene name '{ gene } ' is ambiguous (matches { len (gene_name_match )} genes: { matching_ids } ). "
407+ f"Please use a specific gene ID instead."
408+ )
409+
410+ gene_id = gene_name_match .iloc [0 ]["ID" ]
411+ debug (f"Found name match: { gene_id } " )
412+
413+ # Get transcripts for the gene
414+ debug (f"Finding transcripts for gene '{ gene_id } '" )
415+ df_transcripts = self .genome_feature_children (
416+ parent = gene_id , attributes = ("ID" ,)
417+ )
418+ df_transcripts = df_transcripts [df_transcripts ["type" ] == "mRNA" ]
419+
420+ if len (df_transcripts ) == 0 :
421+ raise ValueError (f"Gene '{ gene } ' has no transcripts" )
422+
423+ debug (f"Found { len (df_transcripts )} transcripts for gene { gene_id } " )
424+
425+ # Calculate transcript lengths and find canonical
426+ debug ("Calculating transcript lengths for each transcript" )
427+ transcript_lengths = {}
428+
429+ for transcript_id in df_transcripts ["ID" ]:
430+ # Get all exon children (genome_feature_children handles multi-parent exons)
431+ df_exons = self .genome_feature_children (
432+ parent = transcript_id , attributes = None
433+ )
434+ # Filter for exons only (important: exclude other feature types)
435+ df_exons = df_exons [df_exons ["type" ] == "exon" ].sort_values ("start" )
436+
437+ if len (df_exons ) > 0 :
438+ # Calculate total transcribed length (1-based inclusive coordinates)
439+ exon_lengths = (df_exons ["end" ] - df_exons ["start" ] + 1 ).sum ()
440+ transcript_lengths [transcript_id ] = exon_lengths
441+ debug (f" { transcript_id } : { len (df_exons )} exons, { exon_lengths } bp" )
442+ if not transcript_lengths :
443+ raise ValueError (f"Gene '{ gene } ' has no transcripts with exons" )
444+
445+ # Find canonical (maximum length)
446+ canonical = max (transcript_lengths , key = lambda tid : transcript_lengths [tid ])
447+ canonical_length = transcript_lengths [canonical ]
448+ debug (f"Canonical transcript: { canonical } with { canonical_length } bp" )
449+ return canonical
450+
317451 @_check_types
318452 @doc (
319453 summary = "Plot a genes track, using bokeh." ,
0 commit comments