@@ -424,22 +424,30 @@ def canonical_transcript(
424424
425425 # Calculate transcript lengths and find canonical
426426 debug ("Calculating transcript lengths for each transcript" )
427- transcript_lengths = {}
427+ transcript_ids = df_transcripts [ "ID" ]. tolist ()
428428
429- for transcript_id in df_transcripts ["ID" ]:
430- # Get all exon children (genome_feature_children handles multi-parent exons)
431- df_exons = self .genome_feature_children (
432- parent = transcript_id , attributes = None
433- )
434- # Filter for exons only (important: exclude other feature types)
435- df_exons = df_exons [df_exons ["type" ] == "exon" ].sort_values ("start" )
429+ # Get all exon children for all transcripts in a single pass
430+ df_children = self .genome_feature_children (parent = transcript_ids , attributes = None )
431+ if df_children is None or len (df_children ) == 0 :
432+ raise ValueError (f"Gene '{ gene } ' has no transcripts with exons" )
436433
437- if len (df_exons ) > 0 :
438- # Calculate total transcribed length (1-based inclusive coordinates)
439- exon_lengths = (df_exons ["end" ] - df_exons ["start" ] + 1 ).sum ()
440- transcript_lengths [transcript_id ] = exon_lengths
441- debug (f" { transcript_id } : { len (df_exons )} exons, { exon_lengths } bp" )
434+ # Filter for exons only (important: exclude other feature types)
435+ df_exons = df_children [df_children ["type" ] == "exon" ].copy ()
436+ if len (df_exons ) == 0 :
437+ raise ValueError (f"Gene '{ gene } ' has no transcripts with exons" )
438+
439+ # Calculate exon lengths (1-based inclusive coordinates: end - start + 1) and sum per transcript (Parent)
440+ df_exons = df_exons .sort_values ("start" )
441+ exon_lengths_series = (df_exons ["end" ] - df_exons ["start" ] + 1 )
442+ df_exons = df_exons .assign (_exon_length = exon_lengths_series )
443+ exon_length_per_transcript = df_exons .groupby ("Parent" )["_exon_length" ].sum ()
442444
445+ # Build transcript_lengths dict and emit debug for each transcript
446+ transcript_lengths = {}
447+ for transcript_id , exon_length in exon_length_per_transcript .items ():
448+ transcript_lengths [transcript_id ] = exon_length
449+ n_exons = (df_exons ["Parent" ] == transcript_id ).sum ()
450+ debug (f" { transcript_id } : { n_exons } exons, { exon_length } bp" )
443451 if not transcript_lengths :
444452 raise ValueError (f"Gene '{ gene } ' has no transcripts with exons" )
445453
0 commit comments