Skip to content

Commit 66607b7

Browse files
Update malariagen_data/anoph/genome_features.py
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent e3488eb commit 66607b7

1 file changed

Lines changed: 21 additions & 13 deletions

File tree

malariagen_data/anoph/genome_features.py

Lines changed: 21 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -424,22 +424,30 @@ def canonical_transcript(
424424

425425
# Calculate transcript lengths and find canonical
426426
debug("Calculating transcript lengths for each transcript")
427-
transcript_lengths = {}
427+
transcript_ids = df_transcripts["ID"].tolist()
428428

429-
for transcript_id in df_transcripts["ID"]:
430-
# Get all exon children (genome_feature_children handles multi-parent exons)
431-
df_exons = self.genome_feature_children(
432-
parent=transcript_id, attributes=None
433-
)
434-
# Filter for exons only (important: exclude other feature types)
435-
df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
429+
# Get all exon children for all transcripts in a single pass
430+
df_children = self.genome_feature_children(parent=transcript_ids, attributes=None)
431+
if df_children is None or len(df_children) == 0:
432+
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
436433

437-
if len(df_exons) > 0:
438-
# Calculate total transcribed length (1-based inclusive coordinates)
439-
exon_lengths = (df_exons["end"] - df_exons["start"] + 1).sum()
440-
transcript_lengths[transcript_id] = exon_lengths
441-
debug(f" {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
434+
# Filter for exons only (important: exclude other feature types)
435+
df_exons = df_children[df_children["type"] == "exon"].copy()
436+
if len(df_exons) == 0:
437+
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
438+
439+
# Calculate exon lengths and sum per transcript (Parent)
440+
df_exons = df_exons.sort_values("start")
441+
exon_lengths_series = (df_exons["end"] - df_exons["start"])
442+
df_exons = df_exons.assign(_exon_length=exon_lengths_series)
443+
exon_length_per_transcript = df_exons.groupby("Parent")["_exon_length"].sum()
442444

445+
# Build transcript_lengths dict and emit debug for each transcript
446+
transcript_lengths = {}
447+
for transcript_id, exon_length in exon_length_per_transcript.items():
448+
transcript_lengths[transcript_id] = exon_length
449+
n_exons = (df_exons["Parent"] == transcript_id).sum()
450+
debug(f" {transcript_id}: {n_exons} exons, {exon_length} bp")
443451
if not transcript_lengths:
444452
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
445453

0 commit comments

Comments
 (0)