Merge branch 'master' into GH998-fix-max-missing-an-fraction-denominator

rehanxt5 · web-flow · commit e5c7d3df99df · 2026-03-02T14:44:51.000+05:30
diff --git a/malariagen_data/adir1.py b/malariagen_data/adir1.py
@@ -19,6 +19,10 @@
     "dirus": TAXON_PALETTE[0],
 }
 
+XPEHH_GWSS_CACHE_NAME = "adir1_xpehh_gwss_v1"
+IHS_GWSS_CACHE_NAME = "adir1_ihs_gwss_v1"
+ROH_HMM_CACHE_NAME = "adir1_roh_hmm_v1"
+
 
 class Adir1(AnophelesDataResource):
     """Provides access to data from Adir1.0 releases.
@@ -71,6 +75,10 @@ class Adir1(AnophelesDataResource):
 
     """
 
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
+    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _roh_hmm_cache_name = ROH_HMM_CACHE_NAME
+
     def __init__(
         self,
         url=None,
diff --git a/malariagen_data/af1.py b/malariagen_data/af1.py
@@ -21,6 +21,10 @@
     "funestus": TAXON_PALETTE[0],
 }
 
+XPEHH_GWSS_CACHE_NAME = "af1_xpehh_gwss_v1"
+IHS_GWSS_CACHE_NAME = "af1_ihs_gwss_v1"
+ROH_HMM_CACHE_NAME = "af1_roh_hmm_v1"
+
 
 class Af1(AnophelesDataResource):
     """Provides access to data from Af1.x releases.
@@ -75,6 +79,7 @@ class Af1(AnophelesDataResource):
 
     _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _roh_hmm_cache_name = ROH_HMM_CACHE_NAME
 
     def __init__(
         self,
diff --git a/malariagen_data/ag3.py b/malariagen_data/ag3.py
@@ -95,6 +95,10 @@ def _setup_aim_palettes():
     "aim_species": "object",
 }
 
+XPEHH_GWSS_CACHE_NAME = "ag3_xpehh_gwss_v1"
+IHS_GWSS_CACHE_NAME = "ag3_ihs_gwss_v1"
+ROH_HMM_CACHE_NAME = "ag3_roh_hmm_v1"
+
 
 class Ag3(AnophelesDataResource):
     """Provides access to data from Ag3.x releases.
@@ -153,6 +157,7 @@ class Ag3(AnophelesDataResource):
 
     _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
     _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _roh_hmm_cache_name = ROH_HMM_CACHE_NAME
 
     def __init__(
         self,
diff --git a/malariagen_data/amin1.py b/malariagen_data/amin1.py
@@ -19,6 +19,10 @@
     "dirus": TAXON_PALETTE[0],
 }
 
+XPEHH_GWSS_CACHE_NAME = "amin1_xpehh_gwss_v1"
+IHS_GWSS_CACHE_NAME = "amin1_ihs_gwss_v1"
+ROH_HMM_CACHE_NAME = "amin1_roh_hmm_v1"
+
 
 class Amin1(AnophelesDataResource):
     """Provides access to data from Amin1.0 releases.
@@ -71,8 +75,9 @@ class Amin1(AnophelesDataResource):
 
     """
 
-    #    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
-    #    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _xpehh_gwss_cache_name = XPEHH_GWSS_CACHE_NAME
+    _ihs_gwss_cache_name = IHS_GWSS_CACHE_NAME
+    _roh_hmm_cache_name = ROH_HMM_CACHE_NAME
 
     def __init__(
         self,
diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py
@@ -217,6 +217,9 @@ def _biallelic_diplotype_pairwise_distances(
         n_snps = gn.shape[0]
 
         # Prepare data for pairwise distance calculation.
+        # Replace missing calls (-127) with NaN before computing distances.
+        gn = gn.astype(float)
+        gn[gn == -127] = np.nan
         X = np.ascontiguousarray(gn.T)
 
         # Look up distance function.
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
@@ -44,6 +44,14 @@ def __init__(
             `random_seed`.
 
         """,
+        parameters=dict(
+            imputation_method="""
+                Method to use for imputing missing genotype calls. Options are
+                'most_common' (replace missing calls with the most common genotype at each site,
+                the default), 'mean' (replace missing calls with the
+                mean value at each site), or 'zero' (replace missing calls with zero).
+            """,
+        ),
         returns=("df_pca", "evr"),
         notes="""
             This computation may take some time to run, depending on your computing
@@ -69,6 +77,7 @@ def pca(
         max_missing_an: Optional[
             base_params.max_missing_an
         ] = pca_params.max_missing_an_default,
+        imputation_method: pca_params.imputation_method = pca_params.imputation_method_default,
         cohort_size: Optional[base_params.cohort_size] = None,
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
@@ -80,7 +89,7 @@ def pca(
     ) -> Tuple[pca_params.df_pca, pca_params.evr]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
-        name = "pca_v5"
+        name = "pca_v8"
 
         # Check that either sample_query xor sample_indices are provided.
         base_params._validate_sample_selection_params(
@@ -121,6 +130,7 @@ def pca(
             site_class=site_class,
             min_minor_ac=min_minor_ac,
             max_missing_an=max_missing_an,
+            imputation_method=imputation_method,
             n_components=n_components,
             cohort_size=cohort_size,
             min_cohort_size=min_cohort_size,
@@ -152,7 +162,7 @@ def pca(
         # df_pca.index = df_pca.index.astype(str)
 
         # Name the DataFrame's columns as PC1, PC2, etc.
-        df_pca.columns = pd.Index([f"PC{i+1}" for i in range(coords.shape[1])])
+        df_pca.columns = pd.Index([f"PC{i + 1}" for i in range(coords.shape[1])])
 
         # Load the sample metadata.
         df_samples = self.sample_metadata(
@@ -185,6 +195,7 @@ def _pca(
         site_class,
         min_minor_ac,
         max_missing_an,
+        imputation_method="most_common",
         n_components,
         cohort_size,
         min_cohort_size,
@@ -231,6 +242,50 @@ def _pca(
                 loc_keep_fit = np.ones(len(samples), dtype=bool)
                 gn_fit = gn
 
+            # Impute missing calls (-127) using the chosen imputation method.
+            if max_missing_an is not None and max_missing_an > 0:
+                gn_fit = gn_fit.astype(float)
+                gn = gn.astype(float)
+                for arr in [gn_fit, gn]:
+                    missing_mask = arr == -127
+
+                    if imputation_method == "most_common":
+                        # For each site, find the most common non-missing value.
+                        site_modes = []
+                        for row in arr:
+                            non_missing = row[row != -127]
+                            if len(non_missing) == 0:
+                                site_modes.append(0)
+                            else:
+                                values, counts = np.unique(
+                                    non_missing, return_counts=True
+                                )
+                                site_modes.append(values[np.argmax(counts)])
+                        site_modes = np.array(site_modes)
+                        fill_values = np.take(site_modes, np.where(missing_mask)[0])
+                    elif imputation_method == "mean":
+                        site_means = np.where(
+                            np.all(missing_mask, axis=1, keepdims=True),
+                            0,
+                            np.nanmean(
+                                np.where(missing_mask, np.nan, arr),
+                                axis=1,
+                                keepdims=True,
+                            ),
+                        )
+                        fill_values = np.take(
+                            site_means.flatten(), np.where(missing_mask)[0]
+                        )
+                    elif imputation_method == "zero":
+                        fill_values = 0
+                    else:
+                        raise ValueError(
+                            f"Unknown imputation_method: {imputation_method!r}. "
+                            "Choose from 'most_common', 'mean' or 'zero'."
+                        )
+
+                    arr[missing_mask] = fill_values
+
             # Remove any sites where all genotypes are identical.
             loc_var = np.any(gn_fit != gn_fit[:, 0, np.newaxis], axis=1)
             gn_fit_var = np.compress(loc_var, gn_fit, axis=0)
diff --git a/malariagen_data/anoph/pca_params.py b/malariagen_data/anoph/pca_params.py
@@ -86,3 +86,12 @@
 min_minor_ac_default: base_params.min_minor_ac = 2
 
 max_missing_an_default: base_params.max_missing_an = 0
+
+imputation_method: TypeAlias = Annotated[
+    str,
+    "Method to use for imputing missing genotype calls when max_missing_an > 0. "
+    "Options are 'most_common' (replace missing calls with the most common genotype "
+    "at each site), 'mean' (replace with the site mean), or 'zero' (replace with zero).",
+]
+
+imputation_method_default: imputation_method = "most_common"
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
@@ -1940,7 +1940,7 @@ def biallelic_diplotypes(
     ) -> Tuple[np.ndarray, np.ndarray]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
-        name = "biallelic_diplotypes"
+        name = "biallelic_diplotypes_v2"
 
         # Check that either sample_query xor sample_indices are provided.
         base_params._validate_sample_selection_params(
@@ -2047,8 +2047,12 @@ def _biallelic_diplotypes(
         samples = ds["sample_id"].values.astype("U")
 
         # Compute diplotypes as the number of alt alleles per genotype call.
+        # Missing calls are coded as -127. NOTE(review): to_n_ref() counts ref, not alt.
         gt = allel.GenotypeDaskArray(ds["call_genotype"].data)
         with self._dask_progress(desc="Compute biallelic diplotypes"):
-            gn = gt.to_n_alt().compute()
+            gn = gt.to_n_ref().compute()
+        # Code calls where all allele values are -1 (fully missing) as -127.
+        missing = np.all(ds["call_genotype"].values == -1, axis=2)
+        gn[missing] = -127
 
         return dict(samples=samples, gn=gn)
diff --git a/malariagen_data/util.py b/malariagen_data/util.py
@@ -430,6 +430,7 @@ def _init_filesystem(url, **kwargs):
     """Initialise a fsspec filesystem from a given base URL and parameters."""
 
     storage_options = None  # To prevent using before assignment (Pylint).
+    simplecache_options = kwargs.pop("simplecache", None)
 
     # Special case Google Cloud Storage, authenticate the user.
     if "gs://" in url or "gcs://" in url:
@@ -487,6 +488,9 @@ def _init_filesystem(url, **kwargs):
         # Some other kind of URL, pass through kwargs as-is.
         storage_options = kwargs
 
+    if simplecache_options is not None:
+        storage_options["simplecache"] = simplecache_options
+
     # Process the URL using fsspec.
     fs, path = url_to_fs(url, **storage_options)
 
diff --git a/tests/anoph/test_snp_data.py b/tests/anoph/test_snp_data.py
@@ -1388,10 +1388,10 @@ def check_biallelic_snp_calls_and_diplotypes(
     assert gn.ndim == 2
     assert gn.shape[0] == ds.sizes["variants"]
     assert gn.shape[1] == ds.sizes["samples"]
-    assert np.all(gn >= 0)
-    assert np.all(gn <= 2)
+    assert np.all((gn >= 0) | (gn == -127))
+    assert np.all((gn <= 2) | (gn == -127))
     ac = ds["variant_allele_count"].values
-    assert np.all(np.sum(gn, axis=1) == ac[:, 1])
+    assert np.all(np.sum(np.where(gn == -127, 0, gn), axis=1) == ac[:, 0])
     assert samples.ndim == 1
     assert samples.shape[0] == gn.shape[1]
     assert samples.tolist() == expected_samples