Skip to content

Commit ef17757

Browse files
authored
Merge branch 'master' into GH-1054-add-vcf-export
2 parents b6705e7 + 10b360b commit ef17757

12 files changed

Lines changed: 313 additions & 25 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ To get setup for development, see [this video if you prefer VS Code](https://you
4949
For detailed setup instructions, see:
5050
- [Linux setup guide](LINUX_SETUP.md)
5151
- [macOS setup guide](MACOS_SETUP.md)
52+
- [Windows setup guide](WINDOWS_SETUP.md)
5253
- [Google Colab (TPU) setup guide](docs/source/colab_tpu_runtime.rst)
5354
Detailed instructions can be found in the [Contributors guide](https://github.com/malariagen/malariagen-data-python/blob/master/CONTRIBUTING.md).
5455

WINDOWS_SETUP.md

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Windows Setup Guide
2+
3+
To get set up for development on Windows, see
4+
[this video if you prefer VS Code](https://youtu.be/zddl3n1DCFM),
5+
or [this older video if you prefer PyCharm](https://youtu.be/QniQi-Hoo9A),
6+
and the instructions below.
7+
8+
## 1. Fork and clone this repo
9+
```bash
10+
git clone https://github.com/[username]/malariagen-data-python.git
11+
cd malariagen-data-python
12+
```
13+
14+
## 2. Install Python
15+
16+
Download and install Python 3.10 from the official website:
17+
https://www.python.org/downloads/windows/
18+
19+
During installation, check the box that says Add Python to PATH
20+
before clicking Install.
21+
22+
Verify the installation worked:
23+
```bash
24+
python --version
25+
```
26+
27+
## 3. Install pipx and poetry
28+
```bash
29+
python -m pip install --user pipx
30+
python -m pipx ensurepath
31+
pipx install poetry
32+
```
33+
34+
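After running `python -m pipx ensurepath`, close and reopen PowerShell so that the `pipx` and `poetry` commands are found on your PATH. If `pipx install poetry` failed because `pipx` was not recognized, run it again in the new PowerShell window before continuing.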
35+
36+
## 4. Create and activate development environment
37+
```bash
38+
poetry install
39+
poetry shell
40+
```
41+
42+
## 5. Install pre-commit hooks
43+
```bash
44+
pipx install pre-commit
45+
pre-commit install
46+
```
47+
48+
## 6. Add upstream remote and get latest code
49+
```bash
50+
git remote add upstream https://github.com/malariagen/malariagen-data-python
51+
git pull upstream master
52+
```
53+
54+
Note: The default branch of this repository is called `master`, not `main`. This is a property of the repository, not of Windows.
55+
56+
## 7. Verify everything works
57+
```bash
58+
python -c "import malariagen_data; print('Setup successful!')"
59+
```
60+
61+
## Common Issues on Windows
62+
63+
**poetry not found after install**
64+
65+
Close and reopen PowerShell, then try again.
66+
67+
**git not recognized**
68+
69+
Install Git from https://git-scm.com/download/win
70+
and restart PowerShell.
71+
72+
**python not recognized**
73+
74+
Reinstall Python and make sure to check
75+
Add Python to PATH during installation.
76+
77+
**fatal: not a git repository**
78+
79+
Make sure you are inside the malariagen-data-python
80+
folder before running any git commands.
81+
```bash
82+
cd malariagen-data-python
83+
```
84+
85+
**error: pathspec main did not match**
86+
87+
This repository's default branch is `master`, not `main`, so check out `master` instead.
88+
```bash
89+
git checkout master
90+
```

malariagen_data/ag3.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,7 @@ def __init__(
212212
taxon_colors=TAXON_COLORS,
213213
aim_species_colors=AIM_SPECIES_COLORS,
214214
virtual_contigs=VIRTUAL_CONTIGS,
215+
gene_names=GENE_NAMES,
215216
inversion_tag_path=INVERSION_TAG_PATH,
216217
unrestricted_use_only=unrestricted_use_only,
217218
surveillance_use_only=surveillance_use_only,

malariagen_data/anoph/heterozygosity.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,27 @@ def _sample_count_het(
399399
def _roh_hmm_cache_name(self):
400400
return "roh_hmm_v1"
401401

402+
def _get_roh_hmm_cache_name(self):
    """Resolve the ROH HMM cache name, falling back to a default.

    ``self._roh_hmm_cache_name`` may be a plain attribute, a property, or
    a legacy zero-argument method. A callable value is invoked to obtain
    the name.

    Returns
    -------
    str
        The resolved name when it is a non-empty string, otherwise the
        default ``"roh_hmm_v1"``. The default is also returned when the
        attribute access or the legacy method call raises
        ``NotImplementedError``.

    See also: https://github.com/malariagen/malariagen-data-python/issues/1151
    """
    default_name = "roh_hmm_v1"

    # Only the lookup and the optional legacy call can raise
    # NotImplementedError, so keep the try body limited to those.
    try:
        candidate = self._roh_hmm_cache_name
        if callable(candidate):
            candidate = candidate()
    except NotImplementedError:
        return default_name

    # Reject non-string and empty values rather than using them as a
    # cache name.
    if isinstance(candidate, str) and candidate:
        return candidate
    return default_name
422+
402423
@_check_types
403424
@doc(
404425
summary="Infer runs of homozygosity for a single sample over a genome region.",
@@ -420,7 +441,7 @@ def roh_hmm(
420441

421442
resolved_region: Region = _parse_single_region(self, region)
422443

423-
name = self._roh_hmm_cache_name
444+
name = self._get_roh_hmm_cache_name()
424445

425446
params = dict(
426447
sample=sample,

malariagen_data/anoph/pca.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,9 @@ def plot_pca_coords(
399399

400400
# Apply jitter if desired - helps spread out points when tightly clustered.
401401
if jitter_frac:
402-
np.random.seed(random_seed)
403-
data[x] = _jitter(data[x], jitter_frac)
404-
data[y] = _jitter(data[y], jitter_frac)
402+
rng = np.random.default_rng(seed=random_seed)
403+
data[x] = _jitter(data[x], jitter_frac, random_state=rng)
404+
data[y] = _jitter(data[y], jitter_frac, random_state=rng)
405405

406406
# Convenience variables.
407407
# Prevent lint error (mypy): Unsupported operand types for + ("Series[Any]" and "str")
@@ -503,10 +503,10 @@ def plot_pca_coords_3d(
503503

504504
# Apply jitter if desired - helps spread out points when tightly clustered.
505505
if jitter_frac:
506-
np.random.seed(random_seed)
507-
data[x] = _jitter(data[x], jitter_frac)
508-
data[y] = _jitter(data[y], jitter_frac)
509-
data[z] = _jitter(data[z], jitter_frac)
506+
rng = np.random.default_rng(seed=random_seed)
507+
data[x] = _jitter(data[x], jitter_frac, random_state=rng)
508+
data[y] = _jitter(data[y], jitter_frac, random_state=rng)
509+
data[z] = _jitter(data[z], jitter_frac, random_state=rng)
510510

511511
# Convenience variables.
512512
# Prevent lint error (mypy): Unsupported operand types for + ("Series[Any]" and "str")

malariagen_data/anopheles.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def __init__(
138138
taxon_colors: Optional[Mapping[str, str]] = None,
139139
aim_species_colors: Optional[Mapping[str, str]] = None,
140140
virtual_contigs: Optional[Mapping[str, Sequence[str]]] = None,
141+
gene_names: Optional[Mapping[str, str]] = None,
141142
inversion_tag_path: Optional[str] = None,
142143
unrestricted_use_only: Optional[bool] = None,
143144
surveillance_use_only: Optional[bool] = None,
@@ -175,6 +176,7 @@ def __init__(
175176
taxon_colors=taxon_colors,
176177
aim_species_colors=aim_species_colors,
177178
virtual_contigs=virtual_contigs,
179+
gene_names=gene_names,
178180
inversion_tag_path=inversion_tag_path,
179181
unrestricted_use_only=unrestricted_use_only,
180182
surveillance_use_only=surveillance_use_only,

malariagen_data/util.py

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,9 @@ def __eq__(self, other):
570570
and (self.end == other.end)
571571
)
572572

573+
def __repr__(self):
574+
return f"Region({self._contig!r}, {self._start!r}, {self._end!r})"
575+
573576
def __str__(self):
574577
out = self._contig
575578
if self._start is not None or self._end is not None:
@@ -897,14 +900,50 @@ def _hash_params(params):
897900
return h, s
898901

899902

900-
def _jitter(a, fraction):
901-
"""Jitter data in `a` using the fraction `f`."""
903+
def _jitter(a, fraction, random_state=np.random):
    """Jitter data by adding uniform noise scaled by the data range.

    Parameters
    ----------
    a : numpy.ndarray or pandas.Series
        Input data to jitter. Must provide ``max()``, ``min()``, ``size``,
        ``shape`` and ``copy()``.
    fraction : float
        Controls the amplitude of the jitter relative to the data range.
        Each value is shifted by at most ``fraction * (a.max() - a.min())``.
    random_state : numpy.random.Generator, module or None, optional
        Random number generator to use. Accepts a ``numpy.random.Generator``
        (from ``np.random.default_rng()``) or the ``numpy.random`` module.
        ``None`` is treated the same as the ``numpy.random`` module.
        Defaults to ``np.random`` (global RNG) for backward compatibility.

    Returns
    -------
    array-like
        Jittered copy of the input data with the same shape. The input is
        not modified.

    Notes
    -----
    Prefer passing a local ``np.random.default_rng(seed=...)`` to avoid
    mutating global RNG state and to ensure reproducibility.

    """
    if random_state is None:
        random_state = np.random

    # max()/min() of a zero-size NumPy array raise ValueError; there is
    # nothing to jitter, so hand back an unmodified copy.
    if a.size == 0:
        return a.copy()

    r = a.max() - a.min()
    return a + fraction * random_state.uniform(-r, r, a.shape)
904930

905931

906932
class CacheMiss(Exception):
    """Raised when a requested item is not present in the cache.

    Parameters
    ----------
    key : object, optional
        The cache key that was looked up. Stored on the instance as
        ``key`` and included in the exception message when not ``None``.
    """

    def __init__(self, key=None):
        self.key = key
        if key is not None:
            message = f"Cache miss for key: {key!r}"
        else:
            message = "Cache miss: requested item not found in cache."
        super().__init__(message)

    def __reduce__(self):
        # The default BaseException.__reduce__ rebuilds the exception as
        # ``cls(*self.args)``, i.e. from the formatted message, which would
        # turn the message into the key after copy/pickle. Rebuild from the
        # original key instead and carry over any extra instance state.
        return (type(self), (self.key,), dict(self.__dict__))

    def __repr__(self):
        if self.key is not None:
            return f"CacheMiss({self.key!r})"
        return "CacheMiss()"
908947

909948

910949
class LoggingHelper:
@@ -1508,12 +1547,10 @@ def _apply_allele_mapping(x, mapping, max_allele):
15081547

15091548
def _dask_apply_allele_mapping(v, mapping, max_allele):
15101549
if not isinstance(v, da.Array):
1511-
raise TypeError(
1512-
f"Expected v to be a dask.array.Array, " f"got {type(v).__name__}"
1513-
)
1550+
raise TypeError(f"Expected v to be a dask.array.Array, got {type(v).__name__}")
15141551
if not isinstance(mapping, np.ndarray):
15151552
raise TypeError(
1516-
f"Expected mapping to be a numpy.ndarray, " f"got {type(mapping).__name__}"
1553+
f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}"
15171554
)
15181555
assert v.ndim == 2
15191556
assert mapping.ndim == 2
@@ -1535,12 +1572,10 @@ def _genotype_array_map_alleles(gt, mapping):
15351572
# N.B., scikit-allel does not handle empty blocks well, so we
15361573
# include some extra logic to handle that better.
15371574
if not isinstance(gt, np.ndarray):
1538-
raise TypeError(
1539-
f"Expected gt to be a numpy.ndarray, " f"got {type(gt).__name__}"
1540-
)
1575+
raise TypeError(f"Expected gt to be a numpy.ndarray, got {type(gt).__name__}")
15411576
if not isinstance(mapping, np.ndarray):
15421577
raise TypeError(
1543-
f"Expected mapping to be a numpy.ndarray, " f"got {type(mapping).__name__}"
1578+
f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}"
15441579
)
15451580
assert gt.ndim == 3
15461581
assert mapping.ndim == 3
@@ -1562,11 +1597,11 @@ def _genotype_array_map_alleles(gt, mapping):
15621597
def _dask_genotype_array_map_alleles(gt, mapping):
15631598
if not isinstance(gt, da.Array):
15641599
raise TypeError(
1565-
f"Expected gt to be a dask.array.Array, " f"got {type(gt).__name__}"
1600+
f"Expected gt to be a dask.array.Array, got {type(gt).__name__}"
15661601
)
15671602
if not isinstance(mapping, np.ndarray):
15681603
raise TypeError(
1569-
f"Expected mapping to be a numpy.ndarray, " f"got {type(mapping).__name__}"
1604+
f"Expected mapping to be a numpy.ndarray, got {type(mapping).__name__}"
15701605
)
15711606
assert gt.ndim == 3
15721607
assert mapping.ndim == 2

tests/anoph/test_fst.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -346,8 +346,16 @@ def test_pairwise_average_fst_with_sample_query(fixture, api: AnophelesFstAnalys
346346
n_jack=random.randint(10, 200),
347347
)
348348

349-
# Run checks.
350-
check_pairwise_average_fst(api=api, fst_params=fst_params)
349+
# Run checks - skip if random parameter selection results in insufficient cohorts.
350+
try:
351+
check_pairwise_average_fst(api=api, fst_params=fst_params)
352+
except ValueError as e:
353+
if "No cohorts remain" in str(e):
354+
pytest.skip(
355+
f"Skipping: random parameter selection produced insufficient "
356+
f"cohorts for taxon={taxon!r}: {e}"
357+
)
358+
raise
351359

352360

353361
@parametrize_with_cases("fixture,api", cases=".")

tests/anoph/test_hap_frq.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,12 @@ def test_hap_frequencies_with_str_cohorts(
187187
return
188188

189189
# Run the function under test.
190-
df_hap = api.haplotypes_frequencies(**params)
190+
try:
191+
df_hap = api.haplotypes_frequencies(**params)
192+
except ValueError as e:
193+
if "No SNPs available for the given region" in str(e):
194+
pytest.skip("Random region contained no SNPs")
195+
raise
191196

192197
check_plot_frequencies_heatmap(api, df_hap)
193198

tests/anoph/test_heterozygosity.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,19 @@ def test_roh_hmm(fixture, api: AnophelesHetAnalysis):
185185
assert col in df_roh.columns
186186

187187

188+
@parametrize_with_cases("fixture,api", cases=".")
def test_roh_hmm_cache_name_resolution(fixture, api: AnophelesHetAnalysis):
    """Regression test for GH#1151.

    For every API fixture case, the ROH HMM cache name resolver must
    return a non-empty string.
    """
    resolved = api._get_roh_hmm_cache_name()

    # Check the type first so a wrong type gives a clear failure message.
    assert isinstance(resolved, str), f"Expected str, got {type(resolved)}"
    assert len(resolved) > 0, "Cache name must be non-empty"
199+
200+
188201
@parametrize_with_cases("fixture,api", cases=".")
189202
def test_plot_roh(fixture, api: AnophelesHetAnalysis):
190203
# Set up test parameters.

0 commit comments

Comments
 (0)