malariagen
diff --git a/‎docs/source/Af1.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/Af1.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/Ag3.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/Ag3.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/Amin1.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/Amin1.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/source/_static/switcher.json‎
Lines changed: 7 additions & 57 deletions b/‎docs/source/_static/switcher.json‎
Lines changed: 7 additions & 57 deletions
diff --git a/‎malariagen_data/anoph/base.py‎
Lines changed: 8 additions & 3 deletions b/‎malariagen_data/anoph/base.py‎
Lines changed: 8 additions & 3 deletions
diff --git a/‎malariagen_data/anoph/base_params.py‎
Lines changed: 19 additions & 9 deletions b/‎malariagen_data/anoph/base_params.py‎
Lines changed: 19 additions & 9 deletions
diff --git a/‎malariagen_data/anoph/pca.py‎
Lines changed: 43 additions & 5 deletions b/‎malariagen_data/anoph/pca.py‎
Lines changed: 43 additions & 5 deletions
diff --git a/‎malariagen_data/anoph/snp_data.py‎
Lines changed: 13 additions & 3 deletions b/‎malariagen_data/anoph/snp_data.py‎
Lines changed: 13 additions & 3 deletions
@@ -14,7 +14,7 @@ All the functions below can then be accessed as methods on the ``af1`` object. E
 
     df_samples = af1.sample_metadata()
 
-For more information about the data and terns of use, please see the
+For more information about the data and terms of use, please see the
 `MalariaGEN Anopheles funestus genomic surveillance project <https://www.malariagen.net/projects/anopheles-funestus-genomic-surveillance-project>`_
 home page.
 
 
@@ -14,7 +14,7 @@ All the functions below can then be accessed as methods on the ``ag3`` object. E
 
     df_samples = ag3.sample_metadata()
 
-For more information about the data and terns of use, please see the
+For more information about the data and terms of use, please see the
 `MalariaGEN Anopheles gambiae genomic surveillance project <https://www.malariagen.net/anopheles-gambiae-genomic-surveillance-project>`_
 home page.
 
 
@@ -14,7 +14,7 @@ All the functions below can then be accessed as methods on the ``amin1`` object.
 
     df_samples = amin1.sample_metadata()
 
-For more information about the data and terns of use, please see the
+For more information about the data and terms of use, please see the
 `MalariaGEN Vector Observatory Asia <https://www.malariagen.net/mosquito/vector-observatory-asia>`_
 home page.
 
 
@@ -1,25 +1,20 @@
 [
+    {
+        "name": "13.0.0",
+        "version": "v13.0.0",
+        "url": "https:///malariagen.github.io/malariagen-data-python/v13.0.0/",
+        "preferred": true
+    },
     {
         "name": "12.0.0",
         "version": "v12.0.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v12.0.0/",
-        "preferred": true
+        "url": "https:///malariagen.github.io/malariagen-data-python/v12.0.0/"
     },
     {
         "name": "11.0.0",
         "version": "v11.0.0",
         "url": "https:///malariagen.github.io/malariagen-data-python/v11.0.0/"
     },
-    {
-        "name": "10.2.0",
-        "version": "v10.2.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v10.2.0/"
-    },
-    {
-        "name": "10.1.0",
-        "version": "v10.1.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v10.1.0/"
-    },
     {
         "name": "10.0.0",
         "version": "v10.0.0",
@@ -30,46 +25,6 @@
         "version": "v9.0.0",
         "url": "https:///malariagen.github.io/malariagen-data-python/v9.0.0/"
     },
-    {
-        "name": "8.8.0",
-        "version": "v8.8.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.8.0/"
-    },
-    {
-        "name": "8.7.0",
-        "version": "v8.7.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.7.0/"
-    },
-    {
-        "name": "8.6.0",
-        "version": "v8.6.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.6.0/"
-    },
-    {
-        "name": "8.5.0",
-        "version": "v8.5.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.5.0/"
-    },
-    {
-        "name": "8.4.0",
-        "version": "v8.4.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.4.0/"
-    },
-    {
-        "name": "8.3.0",
-        "version": "v8.3.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.3.0/"
-    },
-    {
-        "name": "8.2.0",
-        "version": "v8.2.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.2.0/"
-    },
-    {
-        "name": "8.1.0",
-        "version": "v8.1.0",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v8.1.0/"
-    },
     {
         "name": "8.0.0",
         "version": "v8.0.0",
@@ -80,11 +35,6 @@
         "version": "v7.15.0",
         "url": "https:///malariagen.github.io/malariagen-data-python/v7.15.0/"
     },
-    {
-        "name": "7.14.1",
-        "version": "v7.14.1",
-        "url": "https:///malariagen.github.io/malariagen-data-python/v7.14.1/"
-    },
     {
         "name": "7.14.0",
         "version": "v7.14.0",
 
@@ -32,6 +32,7 @@
     LoggingHelper,
     check_colab_location,
     check_types,
+    distributed_client,
     get_gcp_region,
     hash_params,
     init_filesystem,
@@ -174,9 +175,13 @@ def _dask_progress(self, desc=None, leave=False, **kwargs):  # pragma: no cover
         # Progress doesn't mix well with debug logging.
         show_progress = self._show_progress and not self._debug
         if show_progress:
-            return TqdmCallback(
-                desc=desc, leave=leave, tqdm_class=self._tqdm_class, **kwargs
-            )
+            if distributed_client():
+                # Cannot easily show progress, fall back to spinner.
+                return self._spinner(desc=desc)
+            else:
+                return TqdmCallback(
+                    desc=desc, leave=leave, tqdm_class=self._tqdm_class, **kwargs
+                )
         else:
             return nullcontext()
 
 
@@ -1,6 +1,6 @@
 """General parameters common to many functions in the public API."""
 
-from typing import Final, List, Mapping, Optional, Sequence, Tuple, Union, Callable
+from typing import Final, List, Mapping, Optional, Sequence, Tuple, Union
 
 from typing_extensions import Annotated, TypeAlias
 
@@ -9,6 +9,7 @@
     region_param_type,
     single_contig_param_type,
     single_region_param_type,
+    chunks_param_type,
 )
 
 contig: TypeAlias = Annotated[
@@ -226,15 +227,22 @@ def validate_sample_selection_params(
 inline_array_default: inline_array = True
 
 chunks: TypeAlias = Annotated[
-    Union[str, Tuple[int, ...], Callable[[Tuple[int, ...]], Tuple[int, ...]]],
+    chunks_param_type,
     """
     If 'auto' let dask decide chunk size. If 'native' use native zarr
-    chunks. Also, can be a target size, e.g., '200 MiB', or a tuple of
-    integers.
+    chunks. If 'ndauto' let dask decide chunk size but only for arrays with
+    more than one dimension. If 'ndauto0' as 'ndauto' but only vary the first
+    chunk dimension. If 'ndauto1' as 'ndauto' but only vary the second chunk
+    dimension. If 'ndauto01' as 'ndauto' but only vary the first and second
+    chunk dimensions. Also, can be a target size, e.g., '200 MiB', or a tuple of
+    integers, or a callable which accepts the native chunks as a single argument
+    and returns a valid dask chunks value.
     """,
 ]
 
-chunks_default: chunks = "native"
+# The "ndauto0" value means auto-size chunks for arrays with more than one dimension,
+# allowing the first chunk dimension to be varied.
+chunks_default: chunks = "ndauto0"
 
 gff_attributes: TypeAlias = Annotated[
     Optional[Union[Sequence[str], str]],
@@ -263,19 +271,21 @@ def validate_sample_selection_params(
 ]
 
 min_minor_ac: TypeAlias = Annotated[
-    int,
+    Union[int, float],
     """
     The minimum minor allele count. SNPs with a minor allele count
-    below this value will be excluded.
+    below this value will be excluded. Can also be a float, which will
+    be interpreted as a fraction of the called alleles at each SNP.
     """,
 ]
 
 max_missing_an: TypeAlias = Annotated[
-    int,
+    Union[int, float],
     """
     The maximum number of missing allele calls to accept. SNPs with
     more than this value will be excluded. Set to 0 to require no
-    missing calls.
+    missing calls. Can also be a float, which will be interpreted as
+    a fraction relative to the number of called alleles at each SNP.
     """,
 ]
 
 
@@ -71,13 +71,15 @@ def pca(
         cohort_size: Optional[base_params.cohort_size] = None,
         min_cohort_size: Optional[base_params.min_cohort_size] = None,
         max_cohort_size: Optional[base_params.max_cohort_size] = None,
+        exclude_samples: Optional[base_params.samples] = None,
+        fit_exclude_samples: Optional[base_params.samples] = None,
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.chunks_default,
     ) -> Tuple[pca_params.df_pca, pca_params.evr]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
-        name = "pca_v3"
+        name = "pca_v4"
 
         # Normalize params for consistent hash value.
         (
@@ -104,6 +106,8 @@ def pca(
             cohort_size=cohort_size,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
+            exclude_samples=exclude_samples,
+            fit_exclude_samples=fit_exclude_samples,
             random_seed=random_seed,
         )
 
@@ -119,11 +123,11 @@ def pca(
         coords = results["coords"]
         evr = results["evr"]
         samples = results["samples"]
+        loc_keep_fit = results["loc_keep_fit"]
 
         # Load sample metadata.
         df_samples = self.sample_metadata(
             sample_sets=sample_sets,
-            sample_indices=sample_indices_prepped,
         )
 
         # Ensure aligned with genotype data.
@@ -134,6 +138,8 @@ def pca(
             {f"PC{i + 1}": coords[:, i] for i in range(coords.shape[1])}
         )
         df_pca = df_samples.join(df_coords, how="inner")
+        # Add a column for which samples were included in fitting.
+        df_pca["pca_fit"] = loc_keep_fit
 
         return df_pca, evr
 
@@ -153,6 +159,8 @@ def _pca(
         cohort_size,
         min_cohort_size,
         max_cohort_size,
+        exclude_samples,
+        fit_exclude_samples,
         random_seed,
         chunks,
         inline_array,
@@ -177,12 +185,39 @@ def _pca(
         )
 
         with self._spinner(desc="Compute PCA"):
+            # Exclude any samples prior to computing PCA.
+            if exclude_samples is not None:
+                x = np.array(exclude_samples, dtype="U")
+                loc_keep = ~np.isin(samples, x)
+                samples = samples[loc_keep]
+                gn = gn[:, loc_keep]
+
+            # Exclude any samples from fitting only.
+            if fit_exclude_samples is not None:
+                xf = np.array(fit_exclude_samples, dtype="U")
+                loc_keep_fit = ~np.isin(samples, xf)
+                gn_fit = gn[:, loc_keep_fit]
+            else:
+                loc_keep_fit = np.ones(len(samples), dtype=bool)
+                gn_fit = gn
+
             # Remove any sites where all genotypes are identical.
-            loc_var = np.any(gn != gn[:, 0, np.newaxis], axis=1)
+            loc_var = np.any(gn_fit != gn_fit[:, 0, np.newaxis], axis=1)
+            gn_fit_var = np.compress(loc_var, gn_fit, axis=0)
             gn_var = np.compress(loc_var, gn, axis=0)
 
             # Run the PCA.
-            coords, model = allel.pca(gn_var, n_components=n_components)
+            if fit_exclude_samples is None:
+                # Simple fit and transform on the same data.
+                coords, model = allel.pca(gn_fit_var, n_components=n_components)
+
+            else:
+                # Fit and transform separately.
+                model = allel.stats.decomposition.GenotypePCA(
+                    n_components=n_components,
+                )
+                model.fit(gn_fit_var)
+                coords = model.transform(gn_var, copy=False)
 
             # Work around sign indeterminacy.
             for i in range(coords.shape[1]):
@@ -191,7 +226,10 @@ def _pca(
                     coords[:, i] = c * -1
 
         results = dict(
-            samples=samples, coords=coords, evr=model.explained_variance_ratio_
+            samples=samples,
+            coords=coords,
+            evr=model.explained_variance_ratio_,
+            loc_keep_fit=loc_keep_fit,
         )
         return results
 
 
@@ -1582,6 +1582,8 @@ def biallelic_snp_calls(
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
             random_seed=random_seed,
+            inline_array=inline_array,
+            chunks=chunks,
         )
 
         # Locate biallelic SNPs.
@@ -1657,18 +1659,26 @@ def biallelic_snp_calls(
             # Apply conditions.
             if max_missing_an is not None or min_minor_ac is not None:
                 loc_out = np.ones(ds_out.sizes["variants"], dtype=bool)
+                an = ac_out.sum(axis=1)
 
                 # Apply missingness condition.
                 if max_missing_an is not None:
-                    an = ac_out.sum(axis=1)
                     an_missing = (ds_out.sizes["samples"] * ds_out.sizes["ploidy"]) - an
-                    loc_missing = an_missing <= max_missing_an
+                    if isinstance(max_missing_an, float):
+                        an_missing_frac = an_missing / an
+                        loc_missing = an_missing_frac <= max_missing_an
+                    else:
+                        loc_missing = an_missing <= max_missing_an
                     loc_out &= loc_missing
 
                 # Apply minor allele count condition.
                 if min_minor_ac is not None:
                     ac_minor = ac_out.min(axis=1)
-                    loc_minor = ac_minor >= min_minor_ac
+                    if isinstance(min_minor_ac, float):
+                        ac_minor_frac = ac_minor / an
+                        loc_minor = ac_minor_frac >= min_minor_ac
+                    else:
+                        loc_minor = ac_minor >= min_minor_ac
                     loc_out &= loc_minor
 
                 ds_out = ds_out.isel(variants=loc_out)