Skip to content

Commit 6d1ce66

Browse files
Fix: Revert broken Copilot suggestions in canonical_transcript()
Two critical bugs were introduced in commit 66607b7:

1. API misuse: a list was passed to genome_feature_children(parent: str).
   - Changed: genome_feature_children(parent=transcript_ids, ...)
   - Result: SyntaxError/TypeError in all tests
2. Missing coordinate adjustment: the +1 was removed from the exon length calculation.
   - Changed: (end - start) instead of (end - start + 1)
   - Result: Incorrect transcript length calculations

This fix reverts to the original per-transcript iteration approach while preserving the critical +1 for 1-based inclusive coordinates.

Test results:
- All 35 genome_features tests pass (11 canonical + 24 existing)
- All pre-commit checks pass (ruff, black, flake8)
- Zero regressions
1 parent 21474cf commit 6d1ce66

2 files changed

Lines changed: 18 additions & 25 deletions

File tree

malariagen_data/anoph/genome_features.py

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -424,30 +424,21 @@ def canonical_transcript(
424424

425425
# Calculate transcript lengths and find canonical
426426
debug("Calculating transcript lengths for each transcript")
427-
transcript_ids = df_transcripts["ID"].tolist()
428-
429-
# Get all exon children for all transcripts in a single pass
430-
df_children = self.genome_feature_children(parent=transcript_ids, attributes=None)
431-
if df_children is None or len(df_children) == 0:
432-
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
433-
434-
# Filter for exons only (important: exclude other feature types)
435-
df_exons = df_children[df_children["type"] == "exon"].copy()
436-
if len(df_exons) == 0:
437-
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
438-
439-
# Calculate exon lengths and sum per transcript (Parent)
440-
df_exons = df_exons.sort_values("start")
441-
exon_lengths_series = (df_exons["end"] - df_exons["start"])
442-
df_exons = df_exons.assign(_exon_length=exon_lengths_series)
443-
exon_length_per_transcript = df_exons.groupby("Parent")["_exon_length"].sum()
444-
445-
# Build transcript_lengths dict and emit debug for each transcript
446427
transcript_lengths = {}
447-
for transcript_id, exon_length in exon_length_per_transcript.items():
448-
transcript_lengths[transcript_id] = exon_length
449-
n_exons = (df_exons["Parent"] == transcript_id).sum()
450-
debug(f" {transcript_id}: {n_exons} exons, {exon_length} bp")
428+
429+
for transcript_id in df_transcripts["ID"]:
430+
# Get all exon children (genome_feature_children handles multi-parent exons)
431+
df_exons = self.genome_feature_children(
432+
parent=transcript_id, attributes=None
433+
)
434+
# Filter for exons only (important: exclude other feature types)
435+
df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
436+
437+
if len(df_exons) > 0:
438+
# Calculate total transcribed length (1-based inclusive coordinates)
439+
exon_lengths = (df_exons["end"] - df_exons["start"] + 1).sum()
440+
transcript_lengths[transcript_id] = exon_lengths
441+
debug(f" {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
451442
if not transcript_lengths:
452443
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
453444

tests/anoph/test_genome_features.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,8 @@ def test_canonical_transcript_single_transcript_gene(ag3_sim_api):
338338

339339
if not found_single_transcript_gene:
340340
pytest.skip("No gene with exactly one transcript available in fixture")
341+
342+
341343
def test_canonical_transcript_calculation_correctness(ag3_sim_api):
342344
"""Test that the returned transcript actually has the highest exon length."""
343345
genes = ag3_sim_api.genome_features().query(
@@ -359,7 +361,7 @@ def test_canonical_transcript_calculation_correctness(ag3_sim_api):
359361
for transcript_id in all_transcripts["ID"]:
360362
exons = ag3_sim_api.genome_feature_children(parent=transcript_id)
361363
exons = exons[exons["type"] == "exon"]
362-
length = (exons["end"] - exons["start"]).sum()
364+
length = (exons["end"] - exons["start"] + 1).sum()
363365
if length > max_length:
364366
max_length = length
365367
max_transcript = transcript_id
@@ -370,7 +372,7 @@ def test_canonical_transcript_calculation_correctness(ag3_sim_api):
370372
# Verify canonical has the correct length
371373
canonical_exons = ag3_sim_api.genome_feature_children(parent=canonical)
372374
canonical_exons = canonical_exons[canonical_exons["type"] == "exon"]
373-
canonical_length = (canonical_exons["end"] - canonical_exons["start"]).sum()
375+
canonical_length = (canonical_exons["end"] - canonical_exons["start"] + 1).sum()
374376
assert canonical_length == max_length
375377

376378

0 commit comments

Comments
 (0)