Skip to content

Commit 44e3855

Browse files
authored
Merge branch 'master' into GH985-fix-stray-f-in-error-messages
2 parents a14a956 + 5999f49 commit 44e3855

19 files changed

Lines changed: 546 additions & 136 deletions

.codecov.yml

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,9 @@
1+
coverage:
2+
status:
3+
project:
4+
default:
5+
target: auto
6+
patch:
7+
default:
8+
target: 80%
9+
threshold: 0%

malariagen_data/adir1.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919
"dirus": TAXON_PALETTE[0],
2020
}
2121

22+
XPEHH_GWSS_CACHE_NAME = "adir1_xpehh_gwss_v1"
23+
IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"
24+
ROH_HMM_CACHE_NAME = "adir1_roh_hmm_v1"
25+
2226

2327
class Adir1(AnophelesDataResource):
2428
"""Provides access to data from Adir1.0 releases.
@@ -71,6 +75,10 @@ class Adir1(AnophelesDataResource):
7175
7276
"""
7377

78+
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
79+
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
80+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
81+
7482
def __init__(
7583
self,
7684
url=None,

malariagen_data/af1.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,10 @@
2121
"funestus": TAXON_PALETTE[0],
2222
}
2323

24+
XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1"
25+
IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"
26+
ROH_HMM_CACHE_NAME = "af1_roh_hmm_v1"
27+
2428

2529
class Af1(AnophelesDataResource):
2630
"""Provides access to data from Af1.x releases.
@@ -75,6 +79,7 @@ class Af1(AnophelesDataResource):
7579

7680
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
7781
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
82+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
7883

7984
def __init__(
8085
self,

malariagen_data/ag3.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,10 @@ def _setup_aim_palettes():
9595
"aim_species": "object",
9696
}
9797

98+
XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
99+
IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
100+
ROH_HMM_CACHE_NAME = "ag3_roh_hmm_v1"
101+
98102

99103
class Ag3(AnophelesDataResource):
100104
"""Provides access to data from Ag3.x releases.
@@ -153,6 +157,7 @@ class Ag3(AnophelesDataResource):
153157

154158
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
155159
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
160+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
156161

157162
def __init__(
158163
self,

malariagen_data/amin1.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,10 @@
1919
"dirus": TAXON_PALETTE[0],
2020
}
2121

22+
XPEHH_GWSS_CACHE_NAME = "amin1_xpehh_gwss_v1"
23+
IHS_GWSS_CACHE_NAME = "amin1_ihs_gwss_v1"
24+
ROH_HMM_CACHE_NAME = "amin1_roh_hmm_v1"
25+
2226

2327
class Amin1(AnophelesDataResource):
2428
"""Provides access to data from Amin1.0 releases.
@@ -71,8 +75,9 @@ class Amin1(AnophelesDataResource):
7175
7276
"""
7377

74-
# _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
75-
# _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
78+
_xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
79+
_ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
80+
_roh_hmm_cache_name = ROH_HMM_CACHE_NAME
7681

7782
def __init__(
7883
self,

malariagen_data/anoph/base.py

Lines changed: 19 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -562,6 +562,13 @@ def _sample_set_has_unrestricted_use(self, *, sample_set: str):
562562
release_manifest_df = self._read_sample_sets_manifest(
563563
single_release=sample_set_release
564564
)
565+
566+
if "unrestricted_use" not in release_manifest_df.columns:
567+
raise ValueError(
568+
f"Column 'unrestricted_use' missing from manifest for sample set '{sample_set}'. "
569+
"This indicates a data integrity issue in the release manifest."
570+
)
571+
565572
sample_set_records_srs = release_manifest_df.loc[
566573
release_manifest_df["sample_set"] == sample_set, "unrestricted_use"
567574
]
@@ -824,12 +831,19 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
824831
def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
825832
if self._cache_sample_set_to_terms_of_use_info is None:
826833
df_sample_sets = self._available_sample_sets().set_index("sample_set")
834+
expected_cols = [
835+
"terms_of_use_expiry_date",
836+
"terms_of_use_url",
837+
"unrestricted_use",
838+
]
839+
missing_cols = [c for c in expected_cols if c not in df_sample_sets.columns]
840+
if missing_cols:
841+
raise ValueError(
842+
f"Terms-of-use columns missing from manifest: {missing_cols}. "
843+
"This indicates a data integrity issue in the release manifest."
844+
)
827845
self._cache_sample_set_to_terms_of_use_info = df_sample_sets[
828-
[
829-
"terms_of_use_expiry_date",
830-
"terms_of_use_url",
831-
"unrestricted_use",
832-
]
846+
expected_cols
833847
].to_dict(orient="index")
834848
try:
835849
return self._cache_sample_set_to_terms_of_use_info[sample_set]

malariagen_data/anoph/distance.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,9 @@ def _biallelic_diplotype_pairwise_distances(
217217
n_snps = gn.shape[0]
218218

219219
# Prepare data for pairwise distance calculation.
220+
# Mask missing calls (-127) before computing distances.
221+
gn = gn.astype(float)
222+
gn[gn == -127] = np.nan
220223
X = np.ascontiguousarray(gn.T)
221224

222225
# Look up distance function.

malariagen_data/anoph/g123.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,13 @@ def _g123_gwss(
133133
chunks=chunks,
134134
)
135135

136+
if gt.shape[0] < window_size:
137+
raise ValueError(
138+
f"Not enough sites ({gt.shape[0]}) for window size "
139+
f"({window_size}). Please reduce the window size or "
140+
f"use different site selection criteria."
141+
)
142+
136143
with self._spinner("Compute G123"):
137144
g123 = allel.moving_statistic(gt, statistic=_garud_g123, size=window_size)
138145
x = allel.moving_statistic(pos, statistic=np.mean, size=window_size)
@@ -240,6 +247,12 @@ def _g123_calibration(
240247

241248
calibration_runs: Dict[str, np.ndarray] = dict()
242249
for window_size in self._progress(window_sizes, desc="Compute G123"):
250+
if gt.shape[0] < window_size:
251+
raise ValueError(
252+
f"Not enough sites ({gt.shape[0]}) for window size "
253+
f"({window_size}). Please reduce the window size or "
254+
f"use different site selection criteria."
255+
)
243256
g123 = allel.moving_statistic(gt, statistic=_garud_g123, size=window_size)
244257
calibration_runs[str(window_size)] = g123
245258

malariagen_data/anoph/h12.py

Lines changed: 15 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -558,6 +558,8 @@ def plot_h12_gwss_multi_overlay_track(
558558
show: gplt_params.show = True,
559559
x_range: Optional[gplt_params.x_range] = None,
560560
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
561+
chunks: base_params.chunks = base_params.native_chunks,
562+
inline_array: base_params.inline_array = base_params.inline_array_default,
561563
) -> gplt_params.optional_figure:
562564
cohort_queries = self._setup_cohort_queries(
563565
cohorts=cohorts,
@@ -585,8 +587,11 @@ def plot_h12_gwss_multi_overlay_track(
585587
min_cohort_size=min_cohort_size,
586588
max_cohort_size=max_cohort_size,
587589
sample_query=cohort_query,
590+
sample_query_options=sample_query_options,
588591
sample_sets=sample_sets,
589592
random_seed=random_seed,
593+
chunks=chunks,
594+
inline_array=inline_array,
590595
)
591596

592597
# Determine X axis range.
@@ -625,7 +630,7 @@ def plot_h12_gwss_multi_overlay_track(
625630
)
626631

627632
# Plot H12.
628-
for i, (cohort_label, (x, h12, contig)) in enumerate(res.items()):
633+
for i, (cohort_label, (x, h12, contig_idx)) in enumerate(res.items()):
629634
fig.scatter(
630635
x=x,
631636
y=h12,
@@ -679,6 +684,8 @@ def plot_h12_gwss_multi_overlay(
679684
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
680685
gene_labels: Optional[gplt_params.gene_labels] = None,
681686
gene_labelset: Optional[gplt_params.gene_labelset] = None,
687+
chunks: base_params.chunks = base_params.native_chunks,
688+
inline_array: base_params.inline_array = base_params.inline_array_default,
682689
) -> gplt_params.optional_figure:
683690
# Plot GWSS track.
684691
fig1 = self.plot_h12_gwss_multi_overlay_track(
@@ -700,6 +707,8 @@ def plot_h12_gwss_multi_overlay(
700707
height=track_height,
701708
show=False,
702709
output_backend=output_backend,
710+
chunks=chunks,
711+
inline_array=inline_array,
703712
)
704713

705714
fig1.xaxis.visible = False
@@ -764,6 +773,8 @@ def plot_h12_gwss_multi_panel(
764773
output_backend: gplt_params.output_backend = gplt_params.output_backend_default,
765774
gene_labels: Optional[gplt_params.gene_labels] = None,
766775
gene_labelset: Optional[gplt_params.gene_labelset] = None,
776+
chunks: base_params.chunks = base_params.native_chunks,
777+
inline_array: base_params.inline_array = base_params.inline_array_default,
767778
) -> gplt_params.optional_figure:
768779
cohort_queries = self._setup_cohort_queries(
769780
cohorts=cohorts,
@@ -789,6 +800,7 @@ def plot_h12_gwss_multi_panel(
789800
window_size=window_size[cohort_label],
790801
sample_sets=sample_sets,
791802
sample_query=cohort_query,
803+
sample_query_options=sample_query_options,
792804
cohort_size=cohort_size,
793805
min_cohort_size=min_cohort_size,
794806
max_cohort_size=max_cohort_size,
@@ -799,6 +811,8 @@ def plot_h12_gwss_multi_panel(
799811
height=track_height,
800812
show=False,
801813
output_backend=output_backend,
814+
chunks=chunks,
815+
inline_array=inline_array,
802816
)
803817
if i > 0:
804818
track = self.plot_h12_gwss_track(x_range=figs[0].x_range, **params)

malariagen_data/anoph/pca.py

Lines changed: 57 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,14 @@ def __init__(
4444
`random_seed`.
4545
4646
""",
47+
parameters=dict(
48+
imputation_method="""
49+
Method to use for imputing missing genotype calls. Options are
50+
'most_common' (replace missing calls with the most common genotype at each site,
51+
the default), 'mean' (replace missing calls with the
52+
mean value at each site), or 'zero' (replace missing calls with zero).
53+
""",
54+
),
4755
returns=("df_pca", "evr"),
4856
notes="""
4957
This computation may take some time to run, depending on your computing
@@ -69,6 +77,7 @@ def pca(
6977
max_missing_an: Optional[
7078
base_params.max_missing_an
7179
] = pca_params.max_missing_an_default,
80+
imputation_method: pca_params.imputation_method = pca_params.imputation_method_default,
7281
cohort_size: Optional[base_params.cohort_size] = None,
7382
min_cohort_size: Optional[base_params.min_cohort_size] = None,
7483
max_cohort_size: Optional[base_params.max_cohort_size] = None,
@@ -80,7 +89,7 @@ def pca(
8089
) -> Tuple[pca_params.df_pca, pca_params.evr]:
8190
# Change this name if you ever change the behaviour of this function, to
8291
# invalidate any previously cached data.
83-
name = "pca_v5"
92+
name = "pca_v8"
8493

8594
# Check that either sample_query xor sample_indices are provided.
8695
base_params._validate_sample_selection_params(
@@ -121,6 +130,7 @@ def pca(
121130
site_class=site_class,
122131
min_minor_ac=min_minor_ac,
123132
max_missing_an=max_missing_an,
133+
imputation_method=imputation_method,
124134
n_components=n_components,
125135
cohort_size=cohort_size,
126136
min_cohort_size=min_cohort_size,
@@ -152,7 +162,7 @@ def pca(
152162
# df_pca.index = df_pca.index.astype(str)
153163

154164
# Name the DataFrame's columns as PC1, PC2, etc.
155-
df_pca.columns = pd.Index([f"PC{i+1}" for i in range(coords.shape[1])])
165+
df_pca.columns = pd.Index([f"PC{i + 1}" for i in range(coords.shape[1])])
156166

157167
# Load the sample metadata.
158168
df_samples = self.sample_metadata(
@@ -185,6 +195,7 @@ def _pca(
185195
site_class,
186196
min_minor_ac,
187197
max_missing_an,
198+
imputation_method="most_common",
188199
n_components,
189200
cohort_size,
190201
min_cohort_size,
@@ -231,6 +242,50 @@ def _pca(
231242
loc_keep_fit = np.ones(len(samples), dtype=bool)
232243
gn_fit = gn
233244

245+
# Impute missing calls (-127) using the chosen imputation method.
246+
if max_missing_an is not None and max_missing_an > 0:
247+
gn_fit = gn_fit.astype(float)
248+
gn = gn.astype(float)
249+
for arr in [gn_fit, gn]:
250+
missing_mask = arr == -127
251+
252+
if imputation_method == "most_common":
253+
# For each site, find the most common non-missing value.
254+
site_modes = []
255+
for row in arr:
256+
non_missing = row[row != -127]
257+
if len(non_missing) == 0:
258+
site_modes.append(0)
259+
else:
260+
values, counts = np.unique(
261+
non_missing, return_counts=True
262+
)
263+
site_modes.append(values[np.argmax(counts)])
264+
site_modes = np.array(site_modes)
265+
fill_values = np.take(site_modes, np.where(missing_mask)[0])
266+
elif imputation_method == "mean":
267+
site_means = np.where(
268+
np.all(missing_mask, axis=1, keepdims=True),
269+
0,
270+
np.nanmean(
271+
np.where(missing_mask, np.nan, arr),
272+
axis=1,
273+
keepdims=True,
274+
),
275+
)
276+
fill_values = np.take(
277+
site_means.flatten(), np.where(missing_mask)[0]
278+
)
279+
elif imputation_method == "zero":
280+
fill_values = 0
281+
else:
282+
raise ValueError(
283+
f"Unknown imputation_method: {imputation_method!r}. "
284+
"Choose from 'most_common', 'mean' or 'zero'."
285+
)
286+
287+
arr[missing_mask] = fill_values
288+
234289
# Remove any sites where all genotypes are identical.
235290
loc_var = np.any(gn_fit != gn_fit[:, 0, np.newaxis], axis=1)
236291
gn_fit_var = np.compress(loc_var, gn_fit, axis=0)

0 commit comments

Comments
 (0)