Skip to content

Commit 27ac08c

Browse files
authored
Merge pull request #975 from Tanisha127/standardise-biallelic-diplotypes-471
refactor: standardise biallelic diplotypes and handling of missing calls
2 parents 51bd38f + 8e84a28 commit 27ac08c

5 files changed

Lines changed: 78 additions & 7 deletions

File tree

malariagen_data/anoph/distance.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,9 @@ def _biallelic_diplotype_pairwise_distances(
217217
n_snps = gn.shape[0]
218218

219219
# Prepare data for pairwise distance calculation.
220+
# Mask missing calls (-127) before computing distances.
221+
gn = gn.astype(float)
222+
gn[gn == -127] = np.nan
220223
X = np.ascontiguousarray(gn.T)
221224

222225
# Look up distance function.

malariagen_data/anoph/pca.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,14 @@ def __init__(
4444
`random_seed`.
4545
4646
""",
47+
parameters=dict(
48+
imputation_method="""
49+
Method to use for imputing missing genotype calls. Options are
50+
'most_common' (replace missing calls with the most common genotype at each site,
51+
the default), 'mean' (replace missing calls with the
52+
mean value at each site), or 'zero' (replace missing calls with zero).
53+
""",
54+
),
4755
returns=("df_pca", "evr"),
4856
notes="""
4957
This computation may take some time to run, depending on your computing
@@ -69,6 +77,7 @@ def pca(
6977
max_missing_an: Optional[
7078
base_params.max_missing_an
7179
] = pca_params.max_missing_an_default,
80+
imputation_method: pca_params.imputation_method = pca_params.imputation_method_default,
7281
cohort_size: Optional[base_params.cohort_size] = None,
7382
min_cohort_size: Optional[base_params.min_cohort_size] = None,
7483
max_cohort_size: Optional[base_params.max_cohort_size] = None,
@@ -80,7 +89,7 @@ def pca(
8089
) -> Tuple[pca_params.df_pca, pca_params.evr]:
8190
# Change this name if you ever change the behaviour of this function, to
8291
# invalidate any previously cached data.
83-
name = "pca_v5"
92+
name = "pca_v8"
8493

8594
# Check that either sample_query xor sample_indices are provided.
8695
base_params._validate_sample_selection_params(
@@ -121,6 +130,7 @@ def pca(
121130
site_class=site_class,
122131
min_minor_ac=min_minor_ac,
123132
max_missing_an=max_missing_an,
133+
imputation_method=imputation_method,
124134
n_components=n_components,
125135
cohort_size=cohort_size,
126136
min_cohort_size=min_cohort_size,
@@ -152,7 +162,7 @@ def pca(
152162
# df_pca.index = df_pca.index.astype(str)
153163

154164
# Name the DataFrame's columns as PC1, PC2, etc.
155-
df_pca.columns = pd.Index([f"PC{i+1}" for i in range(coords.shape[1])])
165+
df_pca.columns = pd.Index([f"PC{i + 1}" for i in range(coords.shape[1])])
156166

157167
# Load the sample metadata.
158168
df_samples = self.sample_metadata(
@@ -185,6 +195,7 @@ def _pca(
185195
site_class,
186196
min_minor_ac,
187197
max_missing_an,
198+
# NOTE(review): this inserted parameter has a default ("most_common") but is
# followed by parameters without defaults (n_components, cohort_size, ...).
# That is a SyntaxError in Python unless those trailing parameters are
# keyword-only (a `*` earlier in the signature, not visible in this hunk) —
# please verify against the full `_pca` definition.
imputation_method="most_common",
188199
n_components,
189200
cohort_size,
190201
min_cohort_size,
@@ -231,6 +242,50 @@ def _pca(
231242
loc_keep_fit = np.ones(len(samples), dtype=bool)
232243
gn_fit = gn
233244

245+
# Impute missing calls (-127) using the chosen imputation method.
# NOTE(review): imputation only runs when max_missing_an > 0 (see the guard
# below); otherwise any -127 codes remain in gn — confirm that is intended.
246+
if max_missing_an is not None and max_missing_an > 0:
247+
gn_fit = gn_fit.astype(float)
248+
gn = gn.astype(float)
249+
for arr in [gn_fit, gn]:
250+
missing_mask = arr == -127
251+
252+
if imputation_method == "most_common":
253+
# For each site, find the most common non-missing value.
254+
site_modes = []
255+
for row in arr:
256+
non_missing = row[row != -127]
257+
if len(non_missing) == 0:
258+
site_modes.append(0)
259+
else:
260+
values, counts = np.unique(
261+
non_missing, return_counts=True
262+
)
263+
site_modes.append(values[np.argmax(counts)])
264+
site_modes = np.array(site_modes)
265+
fill_values = np.take(site_modes, np.where(missing_mask)[0])
266+
elif imputation_method == "mean":
267+
site_means = np.where(
268+
np.all(missing_mask, axis=1, keepdims=True),
269+
0,
270+
np.nanmean(
271+
np.where(missing_mask, np.nan, arr),
272+
axis=1,
273+
keepdims=True,
274+
),
275+
)
276+
fill_values = np.take(
277+
site_means.flatten(), np.where(missing_mask)[0]
278+
)
279+
elif imputation_method == "zero":
280+
fill_values = 0
281+
else:
282+
raise ValueError(
283+
f"Unknown imputation_method: {imputation_method!r}. "
284+
"Choose from 'most_common', 'mean' or 'zero'."
285+
)
286+
287+
arr[missing_mask] = fill_values
288+
234289
# Remove any sites where all genotypes are identical.
235290
loc_var = np.any(gn_fit != gn_fit[:, 0, np.newaxis], axis=1)
236291
gn_fit_var = np.compress(loc_var, gn_fit, axis=0)

malariagen_data/anoph/pca_params.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,3 +86,12 @@
8686
min_minor_ac_default: base_params.min_minor_ac = 2
8787

8888
max_missing_an_default: base_params.max_missing_an = 0
89+
90+
imputation_method: TypeAlias = Annotated[
91+
str,
92+
"Method to use for imputing missing genotype calls when max_missing_an > 0. "
93+
"Options are 'most_common' (replace missing calls with the most common genotype "
94+
"at each site), 'mean' (replace with the site mean), or 'zero' (replace with zero).",
95+
]
96+
97+
imputation_method_default: imputation_method = "most_common"

malariagen_data/anoph/snp_data.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1939,7 +1939,7 @@ def biallelic_diplotypes(
19391939
) -> Tuple[np.ndarray, np.ndarray]:
19401940
# Change this name if you ever change the behaviour of this function, to
19411941
# invalidate any previously cached data.
1942-
name = "biallelic_diplotypes"
1942+
name = "biallelic_diplotypes_v2"
19431943

19441944
# Check that either sample_query xor sample_indices are provided.
19451945
base_params._validate_sample_selection_params(
@@ -2046,8 +2046,12 @@ def _biallelic_diplotypes(
20462046
samples = ds["sample_id"].values.astype("U")
20472047

20482048
# Compute diplotypes as the number of ref alleles per genotype call,
2049+
# with missing calls coded as -127.
20492050
gt = allel.GenotypeDaskArray(ds["call_genotype"].data)
20502051
with self._dask_progress(desc="Compute biallelic diplotypes"):
2051-
gn = gt.to_n_alt().compute()
2052+
gn = gt.to_n_ref().compute()
2053+
# Code missing calls as -127.
2054+
missing = np.all(ds["call_genotype"].values == -1, axis=2)
2055+
gn[missing] = -127
20522056

20532057
return dict(samples=samples, gn=gn)

tests/anoph/test_snp_data.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,10 +1388,10 @@ def check_biallelic_snp_calls_and_diplotypes(
13881388
assert gn.ndim == 2
13891389
assert gn.shape[0] == ds.sizes["variants"]
13901390
assert gn.shape[1] == ds.sizes["samples"]
1391-
assert np.all(gn >= 0)
1392-
assert np.all(gn <= 2)
1391+
assert np.all((gn >= 0) | (gn == -127))
1392+
assert np.all((gn <= 2) | (gn == -127))
13931393
ac = ds["variant_allele_count"].values
1394-
assert np.all(np.sum(gn, axis=1) == ac[:, 1])
1394+
assert np.all(np.sum(np.where(gn == -127, 0, gn), axis=1) == ac[:, 0])
13951395
assert samples.ndim == 1
13961396
assert samples.shape[0] == gn.shape[1]
13971397
assert samples.tolist() == expected_samples

0 commit comments

Comments
 (0)