Skip to content

Commit 3712ce6

Browse files
authored
Merge branch 'master' into feature/775-heterozygosity-support
2 parents 779f1a2 + 3bc195e commit 3712ce6

6 files changed

Lines changed: 126 additions & 36 deletions

File tree

CONTRIBUTING.md

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ This package provides Python tools for accessing and analyzing genomic data from
1212

1313
You'll need:
1414

15-
- Python 3.10.x (CI-tested version)
16-
- [Poetry](https://python-poetry.org/) for dependency management
17-
- [Git](https://git-scm.com/) for version control
15+
- [pipx](https://pipx.pypa.io/) for installing Python tools
16+
- [git](https://git-scm.com/) for version control
17+
18+
Both of these can be installed using your distribution's package manager or [Homebrew](https://brew.sh/) on Mac.
1819

1920
### Initial setup
2021

@@ -33,16 +34,25 @@ You'll need:
3334
git remote add upstream https://github.com/malariagen/malariagen-data-python.git
3435
```
3536

36-
3. **Install Poetry** (if not already installed)
37+
3. **Install Poetry**
3738

3839
```bash
3940
pipx install poetry
4041
```
4142

42-
4. **Install the project and its dependencies**
43+
4. **Install Python 3.12**
44+
45+
Python 3.12 is the version tested by the CI system and is the recommended version to use.
46+
47+
```bash
48+
poetry python install 3.12
49+
```
50+
51+
5. **Install the project and its dependencies**
4352

4453
```bash
45-
poetry install
54+
poetry env use 3.12
55+
poetry install --extras dev
4656
```
4757

4858
**Recommended**: Use `poetry run` to run commands inside the virtual environment:
@@ -71,7 +81,7 @@ You'll need:
7181
python script.py
7282
```
7383

74-
5. **Install pre-commit hooks**
84+
6. **Install pre-commit hooks**
7585

7686
```bash
7787
pipx install pre-commit
@@ -107,16 +117,40 @@ You'll need:
107117

108118
4. **Run tests locally**
109119

110-
Fast unit tests (no external data access):
120+
Fast unit tests using simulated data (no external data access):
111121

112122
```bash
113-
poetry run pytest -v tests/anoph
123+
poetry run pytest -v tests --ignore tests/integration
114124
```
115125

116-
All unit tests (requires setting up credentials for legacy tests):
126+
To run integration tests which read data from GCS, you'll need to [request access to MalariaGEN data on GCS](https://malariagen.github.io/vector-data/vobs/vobs-data-access.html).
127+
128+
Once access has been granted, [install the Google Cloud CLI](https://cloud.google.com/sdk/docs/install). E.g., if on Linux:
117129

118130
```bash
119-
poetry run pytest -v tests --ignore tests/integration
131+
./install_gcloud.sh
132+
```
133+
134+
You'll then need to obtain application-default credentials, e.g.:
135+
136+
```bash
137+
./google-cloud-sdk/bin/gcloud auth application-default login
138+
```
139+
140+
Once this is done, you can run integration tests:
141+
142+
```bash
143+
poetry run pytest -v tests/integration
144+
```
145+
146+
Tests will run slowly the first time, as data required for testing will be read from GCS. Subsequent runs will be faster as data will be cached locally in the "gcs_cache" folder.
147+
148+
6. **Run typechecking**
149+
150+
Run static typechecking with mypy:
151+
152+
```bash
153+
poetry run mypy malariagen_data tests --ignore-missing-imports
120154
```
121155

122156
5. **Check code quality**
@@ -150,7 +184,7 @@ ruff format .
150184
- **Fast tests**: Unit tests should use simulated data when possible (see `tests/anoph/`)
151185
- **Integration tests**: Tests requiring GCS data access are slower and run separately
152186

153-
Run type checking with:
187+
Run dynamic type checking with:
154188

155189
```bash
156190
poetry run pytest -v tests --typeguard-packages=malariagen_data,malariagen_data.anoph

malariagen_data/anoph/distance.py

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -365,24 +365,43 @@ def _njt(
365365
from scipy.spatial.distance import squareform # type: ignore
366366

367367
# Compute pairwise distances.
368-
dist, samples, n_snps = self.biallelic_diplotype_pairwise_distances(
369-
region=region,
370-
n_snps=n_snps,
371-
metric=metric,
372-
sample_sets=sample_sets,
373-
sample_indices=sample_indices,
374-
site_mask=site_mask,
375-
site_class=site_class,
376-
inline_array=inline_array,
377-
chunks=chunks,
378-
cohort_size=cohort_size,
379-
min_cohort_size=min_cohort_size,
380-
max_cohort_size=max_cohort_size,
381-
random_seed=random_seed,
382-
max_missing_an=max_missing_an,
383-
min_minor_ac=min_minor_ac,
384-
thin_offset=thin_offset,
385-
)
368+
try:
369+
dist, samples, n_snps_used = self.biallelic_diplotype_pairwise_distances(
370+
region=region,
371+
n_snps=n_snps,
372+
metric=metric,
373+
sample_sets=sample_sets,
374+
sample_indices=sample_indices,
375+
site_mask=site_mask,
376+
site_class=site_class,
377+
inline_array=inline_array,
378+
chunks=chunks,
379+
cohort_size=cohort_size,
380+
min_cohort_size=min_cohort_size,
381+
max_cohort_size=max_cohort_size,
382+
random_seed=random_seed,
383+
max_missing_an=max_missing_an,
384+
min_minor_ac=min_minor_ac,
385+
thin_offset=thin_offset,
386+
)
387+
388+
except ValueError as e:
389+
raise ValueError(
390+
f"Unable to construct neighbour-joining tree. {e} "
391+
f"This could be because the selected region does not "
392+
f"contain enough polymorphic SNPs for the given sample "
393+
f"sets and query parameters."
394+
) from e
395+
396+
# Validate enough samples for a tree.
397+
n_samples = len(samples)
398+
if n_samples < 3:
399+
raise ValueError(
400+
f"Not enough samples to construct a neighbour-joining tree. "
401+
f"A minimum of 3 samples is required, but only {n_samples} "
402+
f"were found for the given region and sample sets."
403+
)
404+
386405
D = squareform(dist)
387406

388407
# anjl supports passing in a progress bar function to get progress on the

malariagen_data/anoph/frq_base.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -622,13 +622,27 @@ def plot_frequencies_interactive_map(
622622
variants = ds["variant_label"].values
623623
taxa = ds["cohort_taxon"].to_pandas().dropna().unique() # type: ignore
624624
periods = ds["cohort_period"].to_pandas().dropna().unique() # type: ignore
625+
626+
if len(variants) == 0:
627+
raise ValueError("No variants available in dataset.")
628+
if len(taxa) == 0:
629+
raise ValueError("No taxons available in dataset.")
630+
if len(periods) == 0:
631+
raise ValueError("No periods available in dataset.")
632+
625633
controls = ipywidgets.interactive(
626634
self.plot_frequencies_map_markers,
627635
m=ipywidgets.fixed(freq_map),
628636
ds=ipywidgets.fixed(ds),
629-
variant=ipywidgets.Dropdown(options=variants, description="Variant: "),
630-
taxon=ipywidgets.Dropdown(options=taxa, description="Taxon: "),
631-
period=ipywidgets.Dropdown(options=periods, description="Period: "),
637+
variant=ipywidgets.Dropdown(
638+
options=variants, value=variants[0], description="Variant: "
639+
),
640+
taxon=ipywidgets.Dropdown(
641+
options=taxa, value=taxa[0], description="Taxon: "
642+
),
643+
period=ipywidgets.Dropdown(
644+
options=periods, value=periods[0], description="Period: "
645+
),
632646
clear=ipywidgets.fixed(True),
633647
)
634648

malariagen_data/anoph/snp_data.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1903,7 +1903,12 @@ def biallelic_snp_calls(
19031903
ds_out = ds_out.isel(variants=loc_thin)
19041904

19051905
elif ds_out.sizes["variants"] < n_snps:
1906-
raise ValueError("Not enough SNPs.")
1906+
raise ValueError(
1907+
f"Not enough SNPs. Requested {n_snps} SNPs but only "
1908+
f"{ds_out.sizes['variants']} were found in the selected "
1909+
f"region after applying filters. Try using a larger region, "
1910+
f"relaxing site filters, or reducing the n_snps parameter."
1911+
)
19071912

19081913
return ds_out
19091914

tests/anoph/test_cnv_frq.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -643,7 +643,7 @@ def test_gene_cnv_frequencies_advanced_with_variant_query(
643643
area_by = "admin1_iso"
644644
period_by = "year"
645645
region = random.choice(api.contigs)
646-
variant_query = "cnv_type == '{variant_query_option}'"
646+
variant_query = f"cnv_type == '{variant_query_option}'"
647647

648648
check_gene_cnv_frequencies_advanced(
649649
api=api,
@@ -743,7 +743,8 @@ def check_gene_cnv_frequencies_advanced(
743743
check_plot_frequencies_time_series(api, ds)
744744
check_plot_frequencies_time_series_with_taxa(api, ds)
745745
check_plot_frequencies_time_series_with_areas(api, ds)
746-
check_plot_frequencies_interactive_map(api, ds)
746+
if variant_query is None:
747+
check_plot_frequencies_interactive_map(api, ds)
747748
assert set(ds.dims) == {"cohorts", "variants"}
748749

749750
# Check variant variables.

tests/anoph/test_distance.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,3 +269,20 @@ def test_plot_njt(fixture, api: AnophelesDistanceAnalysis):
269269
**data_params,
270270
)
271271
assert isinstance(fig, go.Figure)
272+
273+
274+
@parametrize_with_cases("fixture,api", cases=".")
275+
def test_njt_not_enough_snps(fixture, api: AnophelesDistanceAnalysis):
276+
all_sample_sets = api.sample_sets()["sample_set"].to_list()
277+
with pytest.raises(
278+
ValueError,
279+
match="Unable to construct neighbour-joining tree|Not enough SNPs",
280+
):
281+
api.njt(
282+
region=random.choice(api.contigs),
283+
n_snps=1_000_000_000, # impossibly high to guarantee failure
284+
sample_sets=random.sample(all_sample_sets, 1),
285+
site_mask=random.choice((None,) + api.site_mask_ids),
286+
min_minor_ac=pca_params.min_minor_ac_default,
287+
max_missing_an=pca_params.max_missing_an_default,
288+
)

0 commit comments

Comments
 (0)