malariagen
diff --git a/‎malariagen_data/anoph/base_params.py‎
Lines changed: 9 additions & 0 deletions b/‎malariagen_data/anoph/base_params.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎malariagen_data/anoph/distance.py‎
Lines changed: 29 additions & 6 deletions b/‎malariagen_data/anoph/distance.py‎
Lines changed: 29 additions & 6 deletions
diff --git a/‎malariagen_data/anoph/fst.py‎
Lines changed: 46 additions & 2 deletions b/‎malariagen_data/anoph/fst.py‎
Lines changed: 46 additions & 2 deletions
diff --git a/‎malariagen_data/anoph/fst_params.py‎
Lines changed: 16 additions & 0 deletions b/‎malariagen_data/anoph/fst_params.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎malariagen_data/anoph/hapclust.py‎
Lines changed: 29 additions & 8 deletions b/‎malariagen_data/anoph/hapclust.py‎
Lines changed: 29 additions & 8 deletions
diff --git a/‎malariagen_data/anoph/pca.py‎
Lines changed: 4 additions & 1 deletion b/‎malariagen_data/anoph/pca.py‎
Lines changed: 4 additions & 1 deletion
@@ -326,3 +326,12 @@ def _validate_sample_selection_params(
     to select SNPs to be included
     """,
 ]
+
+return_dataset: TypeAlias = Annotated[
+    bool,
+    """
+    If True, return an xarray Dataset containing computed results as
+    additional data variables. If False (default), return the legacy
+    format (numpy array or tuple) for backward compatibility.
+    """,
+]
@@ -1,5 +1,5 @@
 # Standard library imports.
-from typing import Optional, Tuple
+from typing import Any, Optional, Tuple
 import math
 
 # Third-party library imports.
@@ -86,7 +86,12 @@ def __init__(self, **kwargs):
         summary="""
             Compute pairwise distances between samples using biallelic SNP genotypes.
         """,
-        returns=("dist", "samples", "n_snps_used"),
+        returns="""
+            If `return_dataset` is False (default), return a tuple
+            `(dist, samples, n_snps_used)`. If `return_dataset` is True,
+            return an xarray Dataset with the square `dist` matrix as a data
+            variable, `sample_id` as a coordinate, and `n_snps_used` in `attrs`.
+        """,
     )
     def biallelic_diplotype_pairwise_distances(
         self,
@@ -108,9 +113,8 @@ def biallelic_diplotype_pairwise_distances(
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.native_chunks,
-    ) -> Tuple[
-        distance_params.dist, distance_params.samples, distance_params.n_snps_used
-    ]:
+        return_dataset: base_params.return_dataset = False,
+    ) -> Any:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
         name = "biallelic_diplotype_pairwise_distances"
@@ -173,6 +177,22 @@ def biallelic_diplotype_pairwise_distances(
         samples: np.ndarray = results["samples"]
         n_snps_used: int = int(results["n_snps"][()])  # ensure scalar
 
+        if return_dataset:
+            import xarray as xr
+            from scipy.spatial.distance import squareform
+
+            dist_square = squareform(dist)
+            ds = xr.Dataset(
+                data_vars={
+                    "dist": (("sample_x", "sample_y"), dist_square),
+                },
+                coords={
+                    "sample_id": ("sample_x", samples),
+                },
+                attrs={"n_snps_used": n_snps_used},
+            )
+            return ds
+
         return dist, samples, n_snps_used
 
     def _biallelic_diplotype_pairwise_distances(
@@ -195,7 +215,7 @@ def _biallelic_diplotype_pairwise_distances(
         max_missing_an,
     ):
         # Compute diplotypes.
-        gn, samples = self.biallelic_diplotypes(
+        ds = self.biallelic_diplotypes(
             region=region,
             sample_sets=sample_sets,
             sample_indices=sample_indices,
@@ -211,7 +231,10 @@ def _biallelic_diplotype_pairwise_distances(
             min_minor_ac=min_minor_ac,
             n_snps=n_snps,
             thin_offset=thin_offset,
+            return_dataset=True,
         )
+        gn = ds["call_diplotype"].values
+        samples = ds["sample_id"].values.astype("U")
 
         # Record number of SNPs used.
         n_snps = gn.shape[0]
 
@@ -1,3 +1,4 @@
+import warnings
 from typing import Tuple, Optional
 
 import numpy as np
@@ -43,6 +44,8 @@ def _fst_gwss(
         inline_array,
         chunks,
         clip_min,
+        min_snps_threshold,
+        window_adjustment_factor,
     ):
         # Compute allele counts.
         ac1 = self.snp_allele_counts(
@@ -81,6 +84,24 @@ def _fst_gwss(
                 chunks=chunks,
             ).compute()
 
+        n_snps = len(pos)
+        if n_snps < min_snps_threshold:
+            raise ValueError(
+                f"Too few SNP sites ({n_snps}) available for Fst GWSS. "
+                f"At least {min_snps_threshold} sites are required. "
+                "Try a larger genomic region or different site selection criteria."
+            )
+        if window_size >= n_snps:
+            adjusted_window_size = max(1, n_snps // window_adjustment_factor)
+            warnings.warn(
+                f"window_size ({window_size}) is >= the number of SNP sites "
+                f"available ({n_snps}); automatically adjusting window_size to "
+                f"{adjusted_window_size} (= {n_snps} // {window_adjustment_factor}).",
+                UserWarning,
+                stacklevel=2,
+            )
+            window_size = adjusted_window_size
+
         with self._spinner(desc="Compute Fst"):
             with np.errstate(divide="ignore", invalid="ignore"):
                 fst = allel.moving_hudson_fst(ac1, ac2, size=window_size)
@@ -96,8 +117,23 @@ def _fst_gwss(
     @doc(
         summary="""
             Run a Fst genome-wide scan to investigate genetic differentiation
-            between two cohorts.
+            between two cohorts. If window_size is >= the number of available
+            SNP sites, a UserWarning is issued and window_size is automatically
+            adjusted to number_of_snps // window_adjustment_factor. A ValueError
+            is raised if the number of available SNP sites is below
+            min_snps_threshold.
         """,
+        parameters=dict(
+            min_snps_threshold="""
+                Minimum number of SNP sites required. If fewer sites are
+                available a ValueError is raised.
+            """,
+            window_adjustment_factor="""
+                If window_size is >= the number of available SNP sites,
+                window_size is automatically set to
+                number_of_snps // window_adjustment_factor.
+            """,
+        ),
         returns=dict(
             x="An array containing the window centre point genomic positions",
             fst="An array with Fst statistic values for each window.",
@@ -123,6 +159,8 @@ def fst_gwss(
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.native_chunks,
         clip_min: fst_params.clip_min = 0.0,
+        min_snps_threshold: fst_params.min_snps_threshold = 1000,
+        window_adjustment_factor: fst_params.window_adjustment_factor = 10,
     ) -> Tuple[np.ndarray, np.ndarray]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
@@ -147,7 +185,13 @@ def fst_gwss(
             results = self.results_cache_get(name=name, params=params)
 
         except CacheMiss:
-            results = self._fst_gwss(**params, inline_array=inline_array, chunks=chunks)
+            results = self._fst_gwss(
+                **params,
+                inline_array=inline_array,
+                chunks=chunks,
+                min_snps_threshold=min_snps_threshold,
+                window_adjustment_factor=window_adjustment_factor,
+            )
             self.results_cache_set(name=name, params=params, results=results)
 
         x = results["x"]
 
@@ -34,6 +34,22 @@
     """,
 ]
 
+min_snps_threshold: TypeAlias = Annotated[
+    int,
+    """
+    Minimum number of SNP sites required for the Fst GWSS computation. If
+    fewer sites are available, a ValueError is raised.
+    """,
+]
+
+window_adjustment_factor: TypeAlias = Annotated[
+    int,
+    """
+    If window_size is >= the number of available SNP sites, the window_size
+    is automatically adjusted to number_of_snps // window_adjustment_factor.
+    """,
+]
+
 annotation: TypeAlias = Annotated[
     Optional[Literal["standard error", "Z score", "lower triangle"]],
     """
 
@@ -1,5 +1,5 @@
 import warnings
-from typing import Optional, Tuple
+from typing import Any, Optional
 
 import allel  # type: ignore
 import numpy as np
@@ -204,11 +204,12 @@ def plot_haplotype_clustering(
         summary="""
             Compute pairwise distances between haplotypes.
         """,
-        returns=dict(
-            dist="Pairwise distance.",
-            phased_samples="Sample identifiers for haplotypes.",
-            n_snps="Number of SNPs used.",
-        ),
+        returns="""
+            If `return_dataset` is False (default), return a tuple
+            `(dist, phased_samples, n_snps)`. If `return_dataset` is True,
+            return an xarray Dataset with the square `dist` matrix as a data
+            variable, per-haplotype `sample_id` labels, and `n_snps` in `attrs`.
+        """,
     )
     def haplotype_pairwise_distances(
         self,
@@ -222,7 +223,8 @@ def haplotype_pairwise_distances(
         random_seed: base_params.random_seed = 42,
         chunks: base_params.chunks = base_params.native_chunks,
         inline_array: base_params.inline_array = base_params.inline_array_default,
-    ) -> Tuple[np.ndarray, np.ndarray, int]:
+        return_dataset: base_params.return_dataset = False,
+    ) -> Any:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
         name = "haplotype_pairwise_distances"
@@ -255,11 +257,30 @@ def haplotype_pairwise_distances(
             )
             self.results_cache_set(name=name, params=params, results=results)
 
-        # Unpack results")
+        # Unpack results.
         dist: np.ndarray = results["dist"]
         phased_samples: np.ndarray = results["phased_samples"]
         n_snps: int = int(results["n_snps"][()])  # ensure scalar
 
+        if return_dataset:
+            import xarray as xr
+            from scipy.spatial.distance import squareform
+
+            dist_square = squareform(dist)
+            # Each phased sample contributes 2 haplotypes; create
+            # haplotype-level labels to match the distance matrix.
+            hap_labels = np.repeat(phased_samples, 2)
+            ds = xr.Dataset(
+                data_vars={
+                    "dist": (("sample_x", "sample_y"), dist_square),
+                },
+                coords={
+                    "sample_id": ("sample_x", hap_labels),
+                },
+                attrs={"n_snps": n_snps},
+            )
+            return ds
+
         return dist, phased_samples, n_snps
 
     def _haplotype_pairwise_distances(
 
@@ -207,7 +207,7 @@ def _pca(
         inline_array,
     ):
         # Load diplotypes.
-        gn, samples = self.biallelic_diplotypes(
+        ds_diplotypes = self.biallelic_diplotypes(
             region=region,
             n_snps=n_snps,
             thin_offset=thin_offset,
@@ -223,7 +223,10 @@ def _pca(
             random_seed=random_seed,
             chunks=chunks,
             inline_array=inline_array,
+            return_dataset=True,
         )
+        gn = ds_diplotypes["call_diplotype"].values
+        samples = ds_diplotypes["sample_id"].values.astype("U")
 
         with self._spinner(desc="Compute PCA"):
             # Exclude any samples prior to computing PCA.