Skip to content

Commit 4338780

Browse files
authored
Merge branch 'master' into GH1049-add-ld-pruning-support
2 parents 8e6fab2 + db8e6df commit 4338780

14 files changed

Lines changed: 459 additions & 296 deletions

File tree

.github/actions/setup-python/action.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@ runs:
1919
shell: bash
2020
run: |
2121
poetry env use ${{ inputs.python-version }}
22-
poetry install --extras dev
22+
poetry install --with dev,test,docs

CONTRIBUTING.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ This package provides Python tools for accessing and analyzing genomic data from
1212

1313
You'll need:
1414

15-
- [pipx](https://python-poetry.org/) for installing Python tools
15+
- [pipx](https://pipx.pypa.io/) for installing Python tools
1616
- [git](https://git-scm.com/) for version control
1717

1818
Both of these can be installed using your distribution's package manager or [Homebrew](https://brew.sh/) on Mac.
@@ -52,9 +52,13 @@ Both of these can be installed using your distribution's package manager or [Hom
5252

5353
```bash
5454
poetry env use 3.12
55-
poetry install --extras dev
55+
poetry install --with dev,test,docs
5656
```
5757

58+
This installs the runtime dependencies along with the `dev`, `test`, and `docs`
59+
[dependency groups](https://python-poetry.org/docs/managing-dependencies/#dependency-groups).
60+
If you only need to run tests, `poetry install --with test` is sufficient.
61+
5862
**Recommended**: Use `poetry run` to run commands inside the virtual environment:
5963

6064
```bash

malariagen_data/anoph/base_params.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,10 @@
6969
str,
7070
"""
7171
A pandas query string to be evaluated against the sample metadata, to
72-
select samples to be included in the returned data.
72+
select samples to be included in the returned data. E.g.,
73+
"country == 'Uganda'". If the query returns zero results, a warning
74+
will be emitted with fuzzy-match suggestions for possible typos or
75+
case mismatches.
7376
""",
7477
]
7578

malariagen_data/anoph/dipclust.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12
from typing import Optional, Tuple
23

34
import allel # type: ignore
@@ -540,8 +541,9 @@ def _insert_dipclust_snp_trace(
540541
figures.append(snp_trace)
541542
subplot_heights.append(snp_row_height * n_snps_transcript)
542543
else:
543-
print(
544-
f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot."
544+
warnings.warn(
545+
f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot.",
546+
stacklevel=2,
545547
)
546548
return figures, subplot_heights, n_snps_transcript
547549

@@ -607,8 +609,9 @@ def plot_diplotype_clustering_advanced(
607609
cnv_colorscale = cnv_params.colorscale_default
608610
if cohort_size and snp_transcript:
609611
cohort_size = None
610-
print(
611-
"Cohort size is not supported with amino acid heatmap. Overriding cohort size to None."
612+
warnings.warn(
613+
"Cohort size is not supported with amino acid heatmap. Overriding cohort size to None.",
614+
stacklevel=2,
612615
)
613616

614617
res = self.plot_diplotype_clustering(

malariagen_data/anoph/frq_base.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ def _prep_samples_for_cohort_grouping(
2929
# Users can explicitly override with True/False.
3030
filter_unassigned = taxon_by == "taxon"
3131

32+
# Validate taxon_by.
33+
if taxon_by not in df_samples.columns:
34+
raise ValueError(
35+
f"Invalid value for `taxon_by`: {taxon_by!r}. "
36+
f"Must be the name of an existing column in the sample metadata."
37+
)
38+
3239
if filter_unassigned:
3340
# Remove samples with "intermediate" or "unassigned" taxon values,
3441
# as we only want cohorts with clean taxon calls.
@@ -78,6 +85,13 @@ def _prep_samples_for_cohort_grouping(
7885
# Apply the matching period_by function to create a new "period" column.
7986
df_samples["period"] = df_samples.apply(period_by_func, axis="columns")
8087

88+
# Validate area_by.
89+
if area_by not in df_samples.columns:
90+
raise ValueError(
91+
f"Invalid value for `area_by`: {area_by!r}. "
92+
f"Must be the name of an existing column in the sample metadata."
93+
)
94+
8195
# Copy the specified area_by column to a new "area" column.
8296
df_samples["area"] = df_samples[area_by]
8397

malariagen_data/anoph/hapclust.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import warnings
12
from typing import Optional, Tuple
23

34
import allel # type: ignore
@@ -402,8 +403,9 @@ def plot_haplotype_clustering_advanced(
402403

403404
if cohort_size and snp_transcript:
404405
cohort_size = None
405-
print(
406-
"Cohort size is not supported with amino acid heatmap. Overriding cohort size to None."
406+
warnings.warn(
407+
"Cohort size is not supported with amino acid heatmap. Overriding cohort size to None.",
408+
stacklevel=2,
407409
)
408410

409411
res = self.plot_haplotype_clustering(
@@ -709,8 +711,9 @@ def _insert_hapclust_snp_trace(
709711
figures.append(snp_trace)
710712
subplot_heights.append(snp_row_height * df_haps.shape[0])
711713
else:
712-
print(
713-
f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot."
714+
warnings.warn(
715+
f"No SNPs were found below {snp_filter_min_maf} allele frequency. Omitting SNP genotype plot.",
716+
stacklevel=2,
714717
)
715718
return figures, subplot_heights, n_snps_transcript
716719

malariagen_data/anoph/sample_metadata.py

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
import difflib
12
import io
23
import json
4+
import re
35
from itertools import cycle
46
from typing import (
57
Any,
@@ -705,6 +707,17 @@ def clear_extra_metadata(self):
705707
@doc(
706708
summary="Access sample metadata for one or more sample sets.",
707709
returns="A dataframe of sample metadata, one row per sample.",
710+
notes="""
711+
Some samples in the dataset are lab crosses — mosquitoes bred in
712+
the laboratory that have no real collection date. These samples
713+
use ``year=-1`` and ``month=-1`` as sentinel values. They may
714+
cause unexpected results in date-based analyses (e.g.,
715+
``pd.to_datetime`` will fail on negative year values).
716+
717+
To exclude lab cross samples, use::
718+
719+
df = api.sample_metadata(sample_query="year >= 0")
720+
""",
708721
)
709722
def sample_metadata(
710723
self,
@@ -784,12 +797,65 @@ def sample_metadata(
784797
if prepared_sample_query is not None:
785798
# Assume a pandas query string.
786799
sample_query_options = sample_query_options or {}
800+
801+
# Save a reference to the pre-query DataFrame so we can detect
802+
# zero-result queries and provide a helpful warning.
803+
df_before_query = df_samples
804+
787805
# Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
788806
df_samples = df_samples.query(
789807
prepared_sample_query, **sample_query_options, engine="python"
790808
)
791809
df_samples = df_samples.reset_index(drop=True)
792810

811+
# Warn if query returned zero results on a non-empty dataset.
812+
# Provide fuzzy-match suggestions so users can spot typos,
813+
# case mismatches, or partial-value issues.
814+
if len(df_samples) == 0 and len(df_before_query) > 0:
815+
hint_lines = [
816+
f"sample_metadata() returned 0 samples for query: {prepared_sample_query!r}.",
817+
]
818+
819+
# Extract column == 'value' pairs from the query.
820+
col_val_pairs = re.findall(
821+
r"\b(\w+)\s*==\s*['\"]([^'\"]+)['\"]",
822+
prepared_sample_query,
823+
)
824+
825+
for col_name, queried_val in col_val_pairs:
826+
# If the column name is not recognised, suggest
827+
# close column names.
828+
if col_name not in df_before_query.columns:
829+
close_cols = difflib.get_close_matches(
830+
col_name,
831+
df_before_query.columns.tolist(),
832+
n=3,
833+
cutoff=0.6,
834+
)
835+
if close_cols:
836+
hint_lines.append(
837+
f"Column {col_name!r} not found. "
838+
f"Did you mean: {close_cols}?"
839+
)
840+
continue
841+
842+
# For string columns, suggest close values.
843+
if df_before_query[col_name].dtype == object:
844+
valid_vals = (
845+
df_before_query[col_name].dropna().unique().tolist()
846+
)
847+
close_vals = difflib.get_close_matches(
848+
queried_val, valid_vals, n=5, cutoff=0.6
849+
)
850+
if close_vals:
851+
hint_lines.append(
852+
f"Value {queried_val!r} not found in "
853+
f"column {col_name!r}. "
854+
f"Did you mean: {close_vals}?"
855+
)
856+
857+
warnings.warn("\n".join(hint_lines), UserWarning, stacklevel=2)
858+
793859
# Apply the sample_indices, if there are any.
794860
# Note: this might need to apply to the result of an internal sample_query, e.g. `is_surveillance == True`.
795861
if sample_indices is not None:
@@ -1468,8 +1534,9 @@ def _setup_cohort_queries(
14681534
if min_cohort_size is not None:
14691535
cohort_size = min_cohort_size
14701536
if cohort_size is not None and n_samples < cohort_size:
1471-
print(
1472-
f"Cohort ({cohort_label}) has insufficient samples ({n_samples}) for requested cohort size ({cohort_size}), dropping."
1537+
warnings.warn(
1538+
f"Cohort ({cohort_label}) has insufficient samples ({n_samples}) for requested cohort size ({cohort_size}), dropping.",
1539+
stacklevel=2,
14731540
)
14741541
else:
14751542
cohort_queries_checked[cohort_label] = cohort_query
@@ -1773,7 +1840,12 @@ def _locate_cohorts(*, cohorts, data, min_cohort_size):
17731840
# to pandas queries.
17741841

17751842
for coh, query in cohorts.items():
1776-
loc_coh = data.eval(query).values
1843+
try:
1844+
loc_coh = data.eval(query).values
1845+
except Exception as e:
1846+
raise ValueError(
1847+
f"Invalid query for cohort {coh!r}: {query!r}. Error: {e}"
1848+
) from e
17771849
coh_dict[coh] = loc_coh
17781850

17791851
else:

malariagen_data/anoph/snp_frq.py

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -345,11 +345,55 @@ def aa_allele_frequencies(
345345
# We just want aa change.
346346
df_ns_snps = df_snps.query(AA_CHANGE_QUERY).copy()
347347

348-
# Early check for no matching SNPs.
349-
if len(df_ns_snps) == 0: # pragma: no cover
350-
raise ValueError(
351-
"No amino acid change SNPs found for the given transcript and site mask."
348+
# Handle case where no amino acid change SNPs are found.
349+
# N.B., this can legitimately happen for some transcript/site_mask/query
350+
# combinations. Return a well-formed empty DataFrame rather than raising,
351+
# to avoid transient test failures and to allow downstream code to handle
352+
# the empty result gracefully. See also:
353+
# https://github.com/malariagen/malariagen-data-python/issues/1064
354+
if len(df_ns_snps) == 0:
355+
warnings.warn(
356+
"No amino acid change SNPs found for the given transcript "
357+
"and site mask. Returning an empty DataFrame.",
358+
stacklevel=2,
359+
)
360+
# Build an empty DataFrame with the expected schema.
361+
freq_cols = [col for col in df_snps.columns if col.startswith("frq_")]
362+
count_cols = [col for col in df_snps.columns if col.startswith("count_")]
363+
nobs_cols = [col for col in df_snps.columns if col.startswith("nobs_")]
364+
keep_cols = [
365+
"contig",
366+
"transcript",
367+
"aa_pos",
368+
"ref_allele",
369+
"ref_aa",
370+
"alt_aa",
371+
"effect",
372+
"impact",
373+
"grantham_score",
374+
"sneath_score",
375+
]
376+
all_cols = (
377+
["aa_change"]
378+
+ freq_cols
379+
+ ["max_af"]
380+
+ keep_cols
381+
+ ["alt_allele", "label", "position"]
352382
)
383+
if include_counts:
384+
all_cols = all_cols + count_cols + nobs_cols
385+
df_empty = pd.DataFrame(columns=all_cols)
386+
df_empty.set_index(["aa_change", "contig", "position"], inplace=True)
387+
388+
# Add metadata.
389+
gene_name = self._transcript_to_parent_name(transcript)
390+
title = transcript
391+
if gene_name:
392+
title += f" ({gene_name})"
393+
title += " SNP frequencies"
394+
df_empty.attrs["title"] = title
395+
396+
return df_empty
353397

354398
# N.B., we need to worry about the possibility of the
355399
# same aa change due to SNPs at different positions. We cannot
@@ -375,7 +419,7 @@ def np_sum(g):
375419
for c in nobs_cols:
376420
agg[c] = "first"
377421

378-
keep_cols = (
422+
keep_cols = [
379423
"contig",
380424
"transcript",
381425
"aa_pos",
@@ -386,7 +430,7 @@ def np_sum(g):
386430
"impact",
387431
"grantham_score",
388432
"sneath_score",
389-
)
433+
]
390434
for c in keep_cols:
391435
agg[c] = "first"
392436
agg["alt_allele"] = lambda v: "{" + ",".join(v) + "}" if len(v) > 1 else v

malariagen_data/veff.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,10 +356,15 @@ def _get_within_cds_effect(ann, base_effect, cds, cdss):
356356
effect = base_effect._replace(effect="STOP_GAINED", impact="HIGH")
357357

358358
else:
359-
# TODO NON_SYNONYMOUS_START and NON_SYNONYMOUS_STOP
360-
361359
# variant causes a codon that produces a different amino acid
362360
# e.g.: Tgg/Cgg, W/R
361+
# N.B. NON_SYNONYMOUS_START and NON_SYNONYMOUS_STOP from the SnpEff
362+
# taxonomy do not require separate handling here. Any start codon
363+
# mutation that changes the amino acid is already classified as
364+
# START_LOST (when ref_aa == "M"). Any stop codon mutation that
365+
# changes the amino acid is already classified as STOP_LOST or
366+
# STOP_GAINED. There is no reachable case that falls through to here
367+
# with ref or alt at a start or stop codon position.
363368
effect = base_effect._replace(
364369
effect="NON_SYNONYMOUS_CODING", impact="MODERATE"
365370
)

0 commit comments

Comments
 (0)