Skip to content

Commit ab9ee71

Browse files
committed
refactor: standardise biallelic diplotypes to count ref allele and handle missing calls
1 parent af991c9 commit ab9ee71

3 files changed

Lines changed: 25 additions & 1 deletion

File tree

malariagen_data/anoph/distance.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,9 @@ def _biallelic_diplotype_pairwise_distances(
217217
n_snps = gn.shape[0]
218218

219219
# Prepare data for pairwise distance calculation.
220+
# Mask missing calls (-127) before computing distances.
221+
gn = gn.astype(float)
222+
gn[gn == -127] = np.nan
220223
X = np.ascontiguousarray(gn.T)
221224

222225
# Look up distance function.

malariagen_data/anoph/pca.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ def pca(
152152
# df_pca.index = df_pca.index.astype(str)
153153

154154
# Name the DataFrame's columns as PC1, PC2, etc.
155-
df_pca.columns = pd.Index([f"PC{i+1}" for i in range(coords.shape[1])])
155+
df_pca.columns = pd.Index([f"PC{i + 1}" for i in range(coords.shape[1])])
156156

157157
# Load the sample metadata.
158158
df_samples = self.sample_metadata(
@@ -231,6 +231,23 @@ def _pca(
231231
loc_keep_fit = np.ones(len(samples), dtype=bool)
232232
gn_fit = gn
233233

234+
# Impute missing calls (-127) with the mean value at each site.
235+
if max_missing_an is not None and max_missing_an > 0:
236+
gn_fit = gn_fit.astype(float)
237+
gn = gn.astype(float)
238+
for arr in [gn_fit, gn]:
239+
missing_mask = arr == -127
240+
site_means = np.where(
241+
np.all(missing_mask, axis=1, keepdims=True),
242+
0,
243+
np.nanmean(
244+
np.where(missing_mask, np.nan, arr), axis=1, keepdims=True
245+
),
246+
)
247+
arr[missing_mask] = np.take(
248+
site_means.flatten(), np.where(missing_mask)[0]
249+
)
250+
234251
# Remove any sites where all genotypes are identical.
235252
loc_var = np.any(gn_fit != gn_fit[:, 0, np.newaxis], axis=1)
236253
gn_fit_var = np.compress(loc_var, gn_fit, axis=0)

malariagen_data/anoph/snp_data.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2037,8 +2037,12 @@ def _biallelic_diplotypes(
20372037
samples = ds["sample_id"].values.astype("U")
20382038

20392039
# Compute diplotypes as the number of alt alleles per genotype call.
2040+
# with missing calls coded as -127.
20402041
gt = allel.GenotypeDaskArray(ds["call_genotype"].data)
20412042
with self._dask_progress(desc="Compute biallelic diplotypes"):
20422043
gn = gt.to_n_alt().compute()
2044+
# Code missing calls as -127.
2045+
missing = np.all(ds["call_genotype"].values == -1, axis=2)
2046+
gn[missing] = -127
20432047

20442048
return dict(samples=samples, gn=gn)

0 commit comments

Comments
 (0)