Skip to content

Commit 79c38ce

Browse files
authored
Merge pull request #1147 from khushthecoder/GH1064-fix-transient-aa-allele-frequencies-test
Fix #1064: Return empty DataFrame instead of raising ValueError in aa_allele_frequencies
2 parents 3014709 + 4058606 commit 79c38ce

2 files changed

Lines changed: 94 additions & 28 deletions

File tree

malariagen_data/anoph/snp_frq.py

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -345,11 +345,55 @@ def aa_allele_frequencies(
345345
# We just want aa change.
346346
df_ns_snps = df_snps.query(AA_CHANGE_QUERY).copy()
347347

348-
# Early check for no matching SNPs.
349-
if len(df_ns_snps) == 0: # pragma: no cover
350-
raise ValueError(
351-
"No amino acid change SNPs found for the given transcript and site mask."
348+
# Handle case where no amino acid change SNPs are found.
349+
# N.B., this can legitimately happen for some transcript/site_mask/query
350+
# combinations. Return a well-formed empty DataFrame rather than raising,
351+
# to avoid transient test failures and to allow downstream code to handle
352+
# the empty result gracefully. See also:
353+
# https://github.com/malariagen/malariagen-data-python/issues/1064
354+
if len(df_ns_snps) == 0:
355+
warnings.warn(
356+
"No amino acid change SNPs found for the given transcript "
357+
"and site mask. Returning an empty DataFrame.",
358+
stacklevel=2,
359+
)
360+
# Build an empty DataFrame with the expected schema.
361+
freq_cols = [col for col in df_snps.columns if col.startswith("frq_")]
362+
count_cols = [col for col in df_snps.columns if col.startswith("count_")]
363+
nobs_cols = [col for col in df_snps.columns if col.startswith("nobs_")]
364+
keep_cols = [
365+
"contig",
366+
"transcript",
367+
"aa_pos",
368+
"ref_allele",
369+
"ref_aa",
370+
"alt_aa",
371+
"effect",
372+
"impact",
373+
"grantham_score",
374+
"sneath_score",
375+
]
376+
all_cols = (
377+
["aa_change"]
378+
+ freq_cols
379+
+ ["max_af"]
380+
+ keep_cols
381+
+ ["alt_allele", "label", "position"]
352382
)
383+
if include_counts:
384+
all_cols = all_cols + count_cols + nobs_cols
385+
df_empty = pd.DataFrame(columns=all_cols)
386+
df_empty.set_index(["aa_change", "contig", "position"], inplace=True)
387+
388+
# Add metadata.
389+
gene_name = self._transcript_to_parent_name(transcript)
390+
title = transcript
391+
if gene_name:
392+
title += f" ({gene_name})"
393+
title += " SNP frequencies"
394+
df_empty.attrs["title"] = title
395+
396+
return df_empty
353397

354398
# N.B., we need to worry about the possibility of the
355399
# same aa change due to SNPs at different positions. We cannot
@@ -375,7 +419,7 @@ def np_sum(g):
375419
for c in nobs_cols:
376420
agg[c] = "first"
377421

378-
keep_cols = (
422+
keep_cols = [
379423
"contig",
380424
"transcript",
381425
"aa_pos",
@@ -386,7 +430,7 @@ def np_sum(g):
386430
"impact",
387431
"grantham_score",
388432
"sneath_score",
389-
)
433+
]
390434
for c in keep_cols:
391435
agg[c] = "first"
392436
agg["alt_allele"] = lambda v: "{" + ",".join(v) + "}" if len(v) > 1 else v

tests/anoph/test_snp_frq.py

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -403,14 +403,21 @@ def test_allele_frequencies_with_str_cohorts(
403403
# Run the function under test.
404404
df_aa = api.aa_allele_frequencies(**params)
405405

406-
check_plot_frequencies_heatmap(api, df_aa)
407-
408-
# Standard checks.
409-
check_aa_allele_frequencies(
410-
df=df_aa,
411-
cohort_labels=cohort_labels,
412-
transcript=transcript,
413-
)
406+
# Handle the case where no amino acid change SNPs are found.
407+
# In this case, aa_allele_frequencies returns an empty DataFrame
408+
# instead of raising (see issue #1064).
409+
if len(df_aa) > 0:
410+
check_plot_frequencies_heatmap(api, df_aa)
411+
412+
# Standard checks.
413+
check_aa_allele_frequencies(
414+
df=df_aa,
415+
cohort_labels=cohort_labels,
416+
transcript=transcript,
417+
)
418+
else:
419+
assert isinstance(df_aa, pd.DataFrame)
420+
assert df_aa.index.names == ["aa_change", "contig", "position"]
414421

415422

416423
@pytest.mark.parametrize("min_cohort_size", [0, 10, 100])
@@ -531,14 +538,22 @@ def test_allele_frequencies_with_str_cohorts_and_sample_query(
531538
# Run the function under test.
532539
df_aa = api.aa_allele_frequencies(**params)
533540

534-
check_plot_frequencies_heatmap(api, df_aa)
541+
# Handle the case where no amino acid change SNPs are found.
542+
# In this case, aa_allele_frequencies returns an empty DataFrame
543+
# instead of raising (see issue #1064).
544+
if len(df_aa) > 0:
545+
check_plot_frequencies_heatmap(api, df_aa)
535546

536-
# Standard checks.
537-
check_aa_allele_frequencies(
538-
df=df_aa,
539-
cohort_labels=cohort_labels,
540-
transcript=transcript,
541-
)
547+
# Standard checks.
548+
check_aa_allele_frequencies(
549+
df=df_aa,
550+
cohort_labels=cohort_labels,
551+
transcript=transcript,
552+
)
553+
else:
554+
# Verify the empty DataFrame has the expected structure.
555+
assert isinstance(df_aa, pd.DataFrame)
556+
assert df_aa.index.names == ["aa_change", "contig", "position"]
542557

543558

544559
@parametrize_with_cases(
@@ -604,14 +619,21 @@ def test_allele_frequencies_with_str_cohorts_and_sample_query_options(
604619
# Run the function under test.
605620
df_aa = api.aa_allele_frequencies(**params)
606621

607-
check_plot_frequencies_heatmap(api, df_aa)
622+
# Handle the case where no amino acid change SNPs are found.
623+
# In this case, aa_allele_frequencies returns an empty DataFrame
624+
# instead of raising (see issue #1064).
625+
if len(df_aa) > 0:
626+
check_plot_frequencies_heatmap(api, df_aa)
608627

609-
# Standard checks.
610-
check_aa_allele_frequencies(
611-
df=df_aa,
612-
cohort_labels=cohort_labels,
613-
transcript=transcript,
614-
)
628+
# Standard checks.
629+
check_aa_allele_frequencies(
630+
df=df_aa,
631+
cohort_labels=cohort_labels,
632+
transcript=transcript,
633+
)
634+
else:
635+
assert isinstance(df_aa, pd.DataFrame)
636+
assert df_aa.index.names == ["aa_change", "contig", "position"]
615637

616638

617639
@parametrize_with_cases("fixture,api", cases=".")

0 commit comments

Comments
 (0)