Skip to content

Commit f5e5723

Browse files
Jon Brenas
authored and committed
Merge branch 'master' into GH837-adar1
2 parents 9669c80 + 0c9aa64 commit f5e5723

28 files changed

Lines changed: 708 additions & 155 deletions

.codecov.yml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
coverage:
2+
status:
3+
project:
4+
default:
5+
target: auto
6+
patch:
7+
default:
8+
target: 80%
9+
threshold: 0%

malariagen_data/adir1.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919
"dirus": TAXON_PALETTE[0],
2020
}
2121

22+
XPEHH_GWSS_CACHE_NAME = "adir1_xpehh_gwss_v1"
23+
IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"
24+
ROH_HMM_CACHE_NAME = "adir1_roh_hmm_v1"
25+
2226

2327
class Adir1(AnophelesDataResource):
2428
"""Provides access to data from Adir1.0 releases.
@@ -71,6 +75,10 @@ class Adir1(AnophelesDataResource):
7175
7276
"""
7377

78+
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
79+
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
80+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
81+
7482
def __init__(
7583
self,
7684
url=None,

malariagen_data/af1.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,10 @@
2121
"funestus": TAXON_PALETTE[0],
2222
}
2323

24+
XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1"
25+
IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"
26+
ROH_HMM_CACHE_NAME = "af1_roh_hmm_v1"
27+
2428

2529
class Af1(AnophelesDataResource):
2630
"""Provides access to data from Af1.x releases.
@@ -75,6 +79,7 @@ class Af1(AnophelesDataResource):
7579

7680
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
7781
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
82+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
7883

7984
def __init__(
8085
self,

malariagen_data/ag3.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,10 @@ def _setup_aim_palettes():
9595
"aim_species": "object",
9696
}
9797

98+
XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
99+
IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
100+
ROH_HMM_CACHE_NAME = "ag3_roh_hmm_v1"
101+
98102

99103
class Ag3(AnophelesDataResource):
100104
"""Provides access to data from Ag3.x releases.
@@ -153,6 +157,7 @@ class Ag3(AnophelesDataResource):
153157

154158
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
155159
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
160+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
156161

157162
def __init__(
158163
self,

malariagen_data/amin1.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919
"dirus": TAXON_PALETTE[0],
2020
}
2121

22+
XPEHH_GWSS_CACHE_NAME = "amin1_xpehh_gwss_v1"
23+
IHS_GWSS_CACHE_NAME = "amin1_ihs_gwss_v1"
24+
ROH_HMM_CACHE_NAME = "amin1_roh_hmm_v1"
25+
2226

2327
class Amin1(AnophelesDataResource):
2428
"""Provides access to data from Amin1.0 releases.
@@ -71,8 +75,9 @@ class Amin1(AnophelesDataResource):
7175
7276
"""
7377

74-
# _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
75-
# _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
78+
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
79+
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
80+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
7681

7782
def __init__(
7883
self,

malariagen_data/anoph/base.py

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -562,6 +562,13 @@ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
562562
release_manifest_df = self._read_sample_sets_manifest(
563563
single_release=sample_set_release
564564
)
565+
566+
if "unrestricted_use" not in release_manifest_df.columns:
567+
raise ValueError(
568+
f"Column 'unrestricted_use' missing from manifest for sample set '{sample_set}'. "
569+
"This indicates a data integrity issue in the release manifest."
570+
)
571+
565572
sample_set_records_srs = release_manifest_df.loc[
566573
release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
567574
]
@@ -824,12 +831,19 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
824831
def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
825832
if self._cache_sample_set_to_terms_of_use_info is None:
826833
df_sample_sets = self._available_sample_sets().set_index("sample_set")
834+
expected_cols = [
835+
"terms_of_use_expiry_date",
836+
"terms_of_use_url",
837+
"unrestricted_use",
838+
]
839+
missing_cols = [c for c in expected_cols if c not in df_sample_sets.columns]
840+
if missing_cols:
841+
raise ValueError(
842+
f"Terms-of-use columns missing from manifest: {missing_cols}. "
843+
"This indicates a data integrity issue in the release manifest."
844+
)
827845
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
828-
[
829-
"terms_of_use_expiry_date",
830-
"terms_of_use_url",
831-
"unrestricted_use",
832-
]
846+
expected_cols
833847
].to_dict(orient="index")
834848
try:
835849
return self._cache_sample_set_to_terms_of_use_info[sample_set]

malariagen_data/anoph/cnv_data.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -296,7 +296,7 @@ def open_cnv_coverage_calls(
296296
marker = path + "/.zmetadata"
297297
if not self._fs.exists(marker):
298298
raise ValueError(
299-
f"CNV coverage calls analysis f{analysis!r} not implemented for sample set {sample_set!r}"
299+
f"CNV coverage calls analysis {analysis!r} not implemented for sample set {sample_set!r}"
300300
)
301301
store = _init_zarr_store(fs=self._fs, path=path)
302302
root = zarr.open_consolidated(store=store)

malariagen_data/anoph/cnv_frq.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -446,6 +446,7 @@ def gene_cnv_frequencies_advanced(
446446
chunks: base_params.chunks = base_params.native_chunks,
447447
inline_array: base_params.inline_array = base_params.inline_array_default,
448448
taxon_by: frq_params.taxon_by = frq_params.taxon_by_default,
449+
filter_unassigned: Optional[frq_params.filter_unassigned] = None,
449450
) -> xr.Dataset:
450451
regions: List[Region] = _parse_multi_region(self, region)
451452
del region
@@ -468,6 +469,7 @@ def gene_cnv_frequencies_advanced(
468469
chunks=chunks,
469470
inline_array=inline_array,
470471
taxon_by=taxon_by,
472+
filter_unassigned=filter_unassigned,
471473
)
472474
for r in regions
473475
],
@@ -497,6 +499,7 @@ def _gene_cnv_frequencies_advanced(
497499
chunks,
498500
inline_array,
499501
taxon_by,
502+
filter_unassigned,
500503
):
501504
debug = self._log.debug
502505

@@ -527,6 +530,7 @@ def _gene_cnv_frequencies_advanced(
527530
area_by=area_by,
528531
period_by=period_by,
529532
taxon_by=taxon_by,
533+
filter_unassigned=filter_unassigned,
530534
)
531535

532536
debug("group samples to make cohorts")

malariagen_data/anoph/distance.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,9 @@ def _biallelic_diplotype_pairwise_distances(
217217
n_snps = gn.shape[0]
218218

219219
# Prepare data for pairwise distance calculation.
220+
# Mask missing calls (-127) before computing distances.
221+
gn = gn.astype(float)
222+
gn[gn == -127] = np.nan
220223
X = np.ascontiguousarray(gn.T)
221224

222225
# Look up distance function.

malariagen_data/anoph/frq_base.py

Lines changed: 21 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -16,20 +16,30 @@
1616
from .base import AnophelesBase
1717

1818

19-
def _prep_samples_for_cohort_grouping(*, df_samples, area_by, period_by, taxon_by):
19+
def _prep_samples_for_cohort_grouping(
20+
*, df_samples, area_by, period_by, taxon_by, filter_unassigned=None
21+
):
2022
# Take a copy, as we will modify the dataframe.
2123
df_samples = df_samples.copy()
2224

23-
# Fix "intermediate" or "unassigned" taxon values - we only want to build
24-
# cohorts with clean taxon calls, so we set other values to None.
25-
loc_intermediate_taxon = (
26-
df_samples[taxon_by].str.startswith("intermediate").fillna(False)
27-
)
28-
df_samples.loc[loc_intermediate_taxon, taxon_by] = None
29-
loc_unassigned_taxon = (
30-
df_samples[taxon_by].str.startswith("unassigned").fillna(False)
31-
)
32-
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
25+
# Determine whether to filter "intermediate"/"unassigned" taxon values.
26+
# See: https://github.com/malariagen/malariagen-data-python/issues/806
27+
if filter_unassigned is None:
28+
# Auto-apply filtering only when using the default "taxon" column.
29+
# Users can explicitly override with True/False.
30+
filter_unassigned = taxon_by == "taxon"
31+
32+
if filter_unassigned:
33+
# Remove samples with "intermediate" or "unassigned" taxon values,
34+
# as we only want cohorts with clean taxon calls.
35+
loc_intermediate_taxon = (
36+
df_samples[taxon_by].str.startswith("intermediate").fillna(False)
37+
)
38+
df_samples.loc[loc_intermediate_taxon, taxon_by] = None
39+
loc_unassigned_taxon = (
40+
df_samples[taxon_by].str.startswith("unassigned").fillna(False)
41+
)
42+
df_samples.loc[loc_unassigned_taxon, taxon_by] = None
3343

3444
# Add period column.
3545

0 commit comments

Comments
 (0)