Merge branch 'master' into fix/issue-1280-vcf-performance

jonbrenas · web-flow · commit ad8b35dfd7bd · 2026-04-17T15:05:08.000+01:00
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
@@ -42,7 +42,6 @@ def __init__(
             The following additional parameters were also added in version 8.0.0:
             `site_class`, `cohort_size`, `min_cohort_size`, `max_cohort_size`,
             `random_seed`.
-
         """,
         parameters=dict(
             imputation_method="""
@@ -69,6 +68,10 @@ def pca(
         sample_query: Optional[base_params.sample_query] = None,
         sample_query_options: Optional[base_params.sample_query_options] = None,
         sample_indices: Optional[base_params.sample_indices] = None,
+        cohorts: Optional[base_params.cohorts] = None,
+        cohort_size: Optional[base_params.cohort_size] = None,
+        min_cohort_size: Optional[base_params.min_cohort_size] = None,
+        max_cohort_size: Optional[base_params.max_cohort_size] = None,
         site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
         site_class: Optional[base_params.site_class] = None,
         min_minor_ac: Optional[
@@ -78,9 +81,6 @@ def pca(
             base_params.max_missing_an
         ] = pca_params.max_missing_an_default,
         imputation_method: pca_params.imputation_method = pca_params.imputation_method_default,
-        cohort_size: Optional[base_params.cohort_size] = None,
-        min_cohort_size: Optional[base_params.min_cohort_size] = None,
-        max_cohort_size: Optional[base_params.max_cohort_size] = None,
         exclude_samples: Optional[base_params.samples] = None,
         fit_exclude_samples: Optional[base_params.samples] = None,
         random_seed: base_params.random_seed = 42,
@@ -98,8 +98,44 @@ def pca(
 
         ## Normalize params for consistent hash value.
 
-        # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
-        # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
+        # Handle cohort downsampling.
+        if cohorts is not None:
+            if max_cohort_size is None:
+                raise ValueError(
+                    "`max_cohort_size` is required when `cohorts` is provided."
+                )
+            if sample_indices is not None:
+                raise ValueError(
+                    "Cannot use `sample_indices` with `cohorts` and `max_cohort_size`."
+                )
+            if cohort_size is not None or min_cohort_size is not None:
+                raise ValueError(
+                    "Cannot use `cohort_size` or `min_cohort_size` with `cohorts`."
+                )
+            df_samples = self.sample_metadata(
+                sample_sets=sample_sets,
+                sample_query=sample_query,
+                sample_query_options=sample_query_options,
+            )
+            # N.B., we are going to overwrite the sample_indices parameter here.
+            groups = df_samples.groupby(cohorts, sort=False)
+            ix = []
+            for _, group in groups:
+                if len(group) > max_cohort_size:
+                    ix.extend(
+                        group.sample(
+                            n=max_cohort_size, random_state=random_seed, replace=False
+                        ).index
+                    )
+                else:
+                    ix.extend(group.index)
+            sample_indices = ix
+            # From this point onwards, the sample_query is no longer needed, because
+            # the sample selection is defined by the sample_indices.
+            sample_query = None
+            sample_query_options = None
+
+        # Normalize params for consistent hash value.
         (
             prepared_sample_sets,
             prepared_sample_indices,
@@ -132,6 +168,7 @@ def pca(
             max_missing_an=max_missing_an,
             imputation_method=imputation_method,
             n_components=n_components,
+            cohorts=cohorts,
             cohort_size=cohort_size,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
@@ -149,10 +186,10 @@ def pca(
             self.results_cache_set(name=name, params=params, results=results)
 
         # Unpack results.
-        coords = results["coords"]
-        evr = results["evr"]
-        samples = results["samples"]
-        loc_keep_fit = results["loc_keep_fit"]
+        coords = np.array(results["coords"])
+        evr = np.array(results["evr"])
+        samples = np.array(results["samples"])
+        loc_keep_fit = np.array(results["loc_keep_fit"])
 
         # Create a new DataFrame containing the PCA coords data.
         df_pca = pd.DataFrame(coords, index=samples)
@@ -205,6 +242,7 @@ def _pca(
         random_seed,
         chunks,
         inline_array,
+        **kwargs,
     ):
         # Load diplotypes.
         ds_diplotypes = self.biallelic_diplotypes(
diff --git a/notebooks/plot_pca.ipynb b/notebooks/plot_pca.ipynb
@@ -620,10 +620,38 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "f1e8c954",
+   "metadata": {},
+   "source": [
+    "## PCA with cohort downsampling"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4a484f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_pca_cohorts, evr_cohorts = ag3.pca(\n",
+    "    region=\"3L:15,000,000-16,000,000\",\n",
+    "    sample_sets=\"3.0\",\n",
+    "    n_snps=10_000,\n",
+    "    cohorts=\"country\",\n",
+    "    max_cohort_size=20,\n",
+    ")\n",
+    "ag3.plot_pca_coords(\n",
+    "    df_pca_cohorts,\n",
+    "    color=\"country\",\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "33d788a2-f256-4930-b1e5-b4f31e681a36",
+   "id": "abb2ee83",
    "metadata": {},
    "outputs": [],
    "source": []
diff --git a/tests/anoph/test_pca.py b/tests/anoph/test_pca.py
@@ -340,6 +340,85 @@ def test_pca_fit_exclude_samples(fixture, api: AnophelesPca):
     )
 
 
+@parametrize_with_cases("fixture,api", cases=".")
+def test_pca_cohort_downsampling(fixture, api: AnophelesPca):
+    # Cohort downsampling must cap each cohort and reject bad parameter combos.
+    # Parameters for selecting input data.
+    all_sample_sets = api.sample_sets()["sample_set"].to_list()
+    sample_sets = np.random.choice(all_sample_sets, size=2, replace=False).tolist()
+    data_params = dict(
+        region=str(np.random.choice(api.contigs)),
+        sample_sets=sample_sets,
+        site_mask=np.random.choice(list(api.site_mask_ids) + [None]),
+    )
+
+    # Test cohort downsampling.
+    cohort_col = "country"
+    max_cohort_size = 10
+    random_seed = 42
+
+    # Try to run the PCA with cohort downsampling.
+    try:
+        pca_df, pca_evr = api.pca(
+            n_snps=100,  # Use a small number to avoid "Not enough SNPs" errors
+            n_components=2,
+            cohorts=cohort_col,
+            max_cohort_size=max_cohort_size,
+            random_seed=random_seed,
+            **data_params,
+        )
+    except ValueError as e:
+        if "Not enough SNPs" in str(e):
+            pytest.skip("Not enough SNPs available after downsampling to run test.")
+        else:
+            raise
+
+    # Check types.
+    assert isinstance(pca_df, pd.DataFrame)
+    assert isinstance(pca_evr, np.ndarray)
+
+    # Check basic structure.
+    assert len(pca_df) > 0
+    assert "PC1" in pca_df.columns
+    assert "PC2" in pca_df.columns
+    assert "pca_fit" in pca_df.columns
+    assert pca_df["pca_fit"].all()
+    assert pca_evr.shape == (2,)
+
+    # Check cohort counts: no cohort may exceed the requested cap.
+    final_cohort_counts = pca_df[cohort_col].value_counts()
+    assert (final_cohort_counts <= max_cohort_size).all()
+
+    # Test bad parameter combinations; `match` ties each rejection to its own
+    # message so an unrelated ValueError (e.g. "Not enough SNPs") cannot pass.
+    with pytest.raises(ValueError, match="`max_cohort_size` is required"):
+        api.pca(
+            n_snps=100,
+            n_components=2,
+            cohorts=cohort_col,
+            # max_cohort_size is missing
+            **data_params,
+        )
+    with pytest.raises(ValueError, match="Cannot use `sample_indices`"):
+        api.pca(
+            n_snps=100,
+            n_components=2,
+            cohorts=cohort_col,
+            max_cohort_size=max_cohort_size,
+            sample_indices=[0, 1, 2],
+            **data_params,
+        )
+    with pytest.raises(ValueError, match="Cannot use `cohort_size`"):
+        api.pca(
+            n_snps=100,
+            n_components=2,
+            cohorts=cohort_col,
+            max_cohort_size=max_cohort_size,
+            cohort_size=10,
+            **data_params,
+        )
+
+
 # --- _jitter() determinism unit tests ---