Skip to content

Commit 9dafe2e

Browse files
authored
Merge branch 'master' into GH995-fix-sample-query-options-forwarding-in-pairwise-average-fst
2 parents d3811ed + c6a63ee commit 9dafe2e

30 files changed

Lines changed: 745 additions & 178 deletions

.codecov.yml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
coverage:
2+
status:
3+
project:
4+
default:
5+
target: auto
6+
patch:
7+
default:
8+
target: 80%
9+
threshold: 0%

.github/workflows/tests.yml

Lines changed: 24 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,10 @@ jobs:
1212
fail-fast: true
1313
matrix:
1414
python-version: ["3.10", "3.11", "3.12"]
15+
numpy-spec:
16+
# Keep this aligned with pyproject.toml: numpy = ">=2.0.2,<2.1"
17+
- "==2.0.2" # locked baseline
18+
- ">=2.0.2,<2.1" # latest allowed in declared range
1519
runs-on: ubuntu-latest
1620

1721
steps:
@@ -23,8 +27,26 @@ jobs:
2327
with:
2428
python-version: ${{ matrix.python-version }}
2529

26-
- name: Verify NumPy version
27-
run: poetry run python -c "import numpy; print('NumPy version:', numpy.__version__)"
30+
- name: Install matrix NumPy version
31+
run: poetry run pip install --upgrade --no-deps "numpy${{ matrix.numpy-spec }}"
32+
33+
- name: Verify NumPy version and spec
34+
env:
35+
NUMPY_SPEC: ${{ matrix.numpy-spec }}
36+
run: |
37+
poetry run python - <<'PY'
38+
import os
39+
import numpy
40+
from packaging.specifiers import SpecifierSet
41+
42+
spec = SpecifierSet(os.environ["NUMPY_SPEC"])
43+
version = numpy.__version__
44+
if version not in spec:
45+
raise RuntimeError(
46+
f"NumPy version {version} does not satisfy matrix spec {spec}"
47+
)
48+
print("NumPy version:", version, "| spec:", spec)
49+
PY
2850
2951
- name: Run unit tests
3052
run: poetry run pytest -v tests --ignore tests/integration --typeguard-packages=malariagen_data,malariagen_data.anoph

malariagen_data/adir1.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919
"dirus": TAXON_PALETTE[0],
2020
}
2121

22+
XPEHH_GWSS_CACHE_NAME = "adir1_xpehh_gwss_v1"
23+
IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"
24+
ROH_HMM_CACHE_NAME = "adir1_roh_hmm_v1"
25+
2226

2327
class Adir1(AnophelesDataResource):
2428
"""Provides access to data from Adir1.0 releases.
@@ -71,6 +75,10 @@ class Adir1(AnophelesDataResource):
7175
7276
"""
7377

78+
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
79+
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
80+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
81+
7482
def __init__(
7583
self,
7684
url=None,

malariagen_data/af1.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,10 @@
2121
"funestus": TAXON_PALETTE[0],
2222
}
2323

24+
XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1"
25+
IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"
26+
ROH_HMM_CACHE_NAME = "af1_roh_hmm_v1"
27+
2428

2529
class Af1(AnophelesDataResource):
2630
"""Provides access to data from Af1.x releases.
@@ -75,6 +79,7 @@ class Af1(AnophelesDataResource):
7579

7680
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
7781
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
82+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
7883

7984
def __init__(
8085
self,

malariagen_data/ag3.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,10 @@ def _setup_aim_palettes():
9595
"aim_species": "object",
9696
}
9797

98+
XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
99+
IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
100+
ROH_HMM_CACHE_NAME = "ag3_roh_hmm_v1"
101+
98102

99103
class Ag3(AnophelesDataResource):
100104
"""Provides access to data from Ag3.x releases.
@@ -153,6 +157,7 @@ class Ag3(AnophelesDataResource):
153157

154158
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
155159
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
160+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
156161

157162
def __init__(
158163
self,

malariagen_data/amin1.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919
"dirus": TAXON_PALETTE[0],
2020
}
2121

22+
XPEHH_GWSS_CACHE_NAME = "amin1_xpehh_gwss_v1"
23+
IHS_GWSS_CACHE_NAME = "amin1_ihs_gwss_v1"
24+
ROH_HMM_CACHE_NAME = "amin1_roh_hmm_v1"
25+
2226

2327
class Amin1(AnophelesDataResource):
2428
"""Provides access to data from Amin1.0 releases.
@@ -71,8 +75,9 @@ class Amin1(AnophelesDataResource):
7175
7276
"""
7377

74-
# _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
75-
# _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
78+
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
79+
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
80+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
7681

7782
def __init__(
7883
self,

malariagen_data/anoph/base.py

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -562,6 +562,13 @@ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
562562
release_manifest_df = self._read_sample_sets_manifest(
563563
single_release=sample_set_release
564564
)
565+
566+
if "unrestricted_use" not in release_manifest_df.columns:
567+
raise ValueError(
568+
f"Column 'unrestricted_use' missing from manifest for sample set '{sample_set}'. "
569+
"This indicates a data integrity issue in the release manifest."
570+
)
571+
565572
sample_set_records_srs = release_manifest_df.loc[
566573
release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
567574
]
@@ -824,12 +831,19 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
824831
def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
825832
if self._cache_sample_set_to_terms_of_use_info is None:
826833
df_sample_sets = self._available_sample_sets().set_index("sample_set")
834+
expected_cols = [
835+
"terms_of_use_expiry_date",
836+
"terms_of_use_url",
837+
"unrestricted_use",
838+
]
839+
missing_cols = [c for c in expected_cols if c not in df_sample_sets.columns]
840+
if missing_cols:
841+
raise ValueError(
842+
f"Terms-of-use columns missing from manifest: {missing_cols}. "
843+
"This indicates a data integrity issue in the release manifest."
844+
)
827845
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
828-
[
829-
"terms_of_use_expiry_date",
830-
"terms_of_use_url",
831-
"unrestricted_use",
832-
]
846+
expected_cols
833847
].to_dict(orient="index")
834848
try:
835849
return self._cache_sample_set_to_terms_of_use_info[sample_set]

malariagen_data/anoph/cnv_data.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -296,7 +296,7 @@ def open_cnv_coverage_calls(
296296
marker = path + "/.zmetadata"
297297
if not self._fs.exists(marker):
298298
raise ValueError(
299-
f"CNV coverage calls analysis f{analysis!r} not implemented for sample set {sample_set!r}"
299+
f"CNV coverage calls analysis {analysis!r} not implemented for sample set {sample_set!r}"
300300
)
301301
store = _init_zarr_store(fs=self._fs, path=path)
302302
root = zarr.open_consolidated(store=store)

malariagen_data/anoph/cnv_frq.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -446,6 +446,7 @@ def gene_cnv_frequencies_advanced(
446446
chunks: base_params.chunks = base_params.native_chunks,
447447
inline_array: base_params.inline_array = base_params.inline_array_default,
448448
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
449+
filter_unassigned: Optional[frq_params.filter_unassigned] = None,
449450
) -> xr.Dataset:
450451
regions: List[Region] = _parse_multi_region(self, region)
451452
del region
@@ -468,6 +469,7 @@ def gene_cnv_frequencies_advanced(
468469
chunks=chunks,
469470
inline_array=inline_array,
470471
taxon_by=taxon_by,
472+
filter_unassigned=filter_unassigned,
471473
)
472474
for r in regions
473475
],
@@ -497,6 +499,7 @@ def _gene_cnv_frequencies_advanced(
497499
chunks,
498500
inline_array,
499501
taxon_by,
502+
filter_unassigned,
500503
):
501504
debug = self._log.debug
502505

@@ -527,6 +530,7 @@ def _gene_cnv_frequencies_advanced(
527530
area_by=area_by,
528531
period_by=period_by,
529532
taxon_by=taxon_by,
533+
filter_unassigned=filter_unassigned,
530534
)
531535

532536
debug("group samples to make cohorts")

malariagen_data/anoph/distance.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,9 @@ def _biallelic_diplotype_pairwise_distances(
217217
n_snps = gn.shape[0]
218218

219219
# Prepare data for pairwise distance calculation.
220+
# Mask missing calls (-127) before computing distances.
221+
gn = gn.astype(float)
222+
gn[gn == -127] = np.nan
220223
X = np.ascontiguousarray(gn.T)
221224

222225
# Look up distance function.

0 commit comments

Comments
 (0)