Skip to content

Commit 63c3623

Browse files
committed
fix: handle single-child transcripts in veff.Annotator.get_children (#840)
When a transcript has only one child (e.g. a single-exon non-coding transcript), pandas DataFrame.loc returns a Series instead of a DataFrame. This caused a TypeError when downstream code called .sort_values('start') on the result.

Changes:
- Annotator.get_children() now always returns a DataFrame by converting a Series result to a single-row DataFrame via .to_frame().T
- Annotator.get_effects() now raises an informative ValueError when a transcript has no CDS or UTR children (non-coding), instead of failing with a confusing error downstream

Closes #840
1 parent bef737a commit 63c3623

1 file changed

Lines changed: 21 additions & 1 deletion

File tree

malariagen_data/veff.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import collections
22
import operator
33

4+
import pandas as pd
45
from Bio.Seq import Seq # type: ignore
56

67
VariantEffect = collections.namedtuple(
@@ -62,7 +63,15 @@ def get_feature(self, feature_id):
6263
return self._idx_feature_id.loc[feature_id]
6364

6465
def get_children(self, feature_id):
65-
return self._idx_parent_id.loc[feature_id]
66+
result = self._idx_parent_id.loc[feature_id]
67+
# When there is only one child, pandas .loc returns a Series
68+
# instead of a DataFrame. Ensure we always return a DataFrame
69+
# so downstream code (e.g. .sort_values, column filtering) works.
70+
if isinstance(result, pd.Series):
71+
result = result.to_frame().T
72+
# Preserve the index name from the parent DataFrame.
73+
result.index.name = self._idx_parent_id.index.name
74+
return result
6675

6776
def get_ref_seq(self, chrom, start, stop):
6877
"""Accepts 1-based coords."""
@@ -104,6 +113,17 @@ def get_effects(self, transcript, variants, progress=None):
104113
utr3 = list(children[children.type == "three_prime_UTR"].itertuples())
105114
introns = [(x.end + 1, y.start - 1) for x, y in zip(exons[:-1], exons[1:])]
106115

116+
# Guard: raise an informative error if the transcript has no CDS
117+
# or UTR children, as variant effect annotation is not meaningful
118+
# for non-coding transcripts.
119+
if len(cdss) == 0 and len(utr5) == 0 and len(utr3) == 0:
120+
raise ValueError(
121+
f"Transcript {transcript!r} has no CDS or UTR children. "
122+
f"Variant effect annotation is only supported for "
123+
f"protein-coding transcripts. This may indicate "
124+
f"incomplete or incorrect genome annotations."
125+
)
126+
107127
effect_values = []
108128
impact_values = []
109129
ref_codon_values = []

0 commit comments

Comments
 (0)