malariagen
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
Lines changed: 5 additions & 4 deletions
diff --git a/malariagen_data/anoph/cnv_frq.py b/malariagen_data/anoph/cnv_frq.py
Lines changed: 2 additions & 0 deletions
diff --git a/malariagen_data/anoph/frq_base.py b/malariagen_data/anoph/frq_base.py
Lines changed: 4 additions & 2 deletions
diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
Lines changed: 9 additions & 7 deletions
diff --git a/malariagen_data/anoph/hap_data.py b/malariagen_data/anoph/hap_data.py
Lines changed: 4 additions & 1 deletion
diff --git a/malariagen_data/anoph/hapclust.py b/malariagen_data/anoph/hapclust.py
Lines changed: 2 additions & 0 deletions
diff --git a/malariagen_data/anoph/karyotype.py b/malariagen_data/anoph/karyotype.py
Lines changed: 1 addition & 1 deletion
diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py
Lines changed: 48 additions & 10 deletions
@@ -28,6 +28,8 @@
 from numpydoc_decorator import doc  # type: ignore
 from tqdm.auto import tqdm as tqdm_auto  # type: ignore
 from tqdm.dask import TqdmCallback  # type: ignore
+
+from .safe_query import validate_query
 from yaspin import yaspin  # type: ignore
 import xarray as xr
 
@@ -980,10 +982,9 @@ def _filter_sample_dataset(
 
         # Determine which samples match the sample query.
         if sample_query != "":
-            # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
-            loc_samples = df_samples.eval(
-                sample_query, **sample_query_options, engine="python"
-            )
+            # Validate the query to prevent arbitrary code execution (GH-1292).
+            validate_query(sample_query)
+            loc_samples = df_samples.eval(sample_query, **sample_query_options)
         else:
             loc_samples = pd.Series(True, index=df_samples.index)
 
 
@@ -15,6 +15,7 @@
     _build_cohorts_from_sample_grouping,
     _add_frequency_ci,
 )
+from .safe_query import validate_query
 from ..util import (
     _check_types,
     _pandas_apply,
@@ -671,6 +672,7 @@ def _gene_cnv_frequencies_advanced(
 
         debug("apply variant query")
         if variant_query is not None:
+            validate_query(variant_query)
             loc_variants = df_variants.eval(variant_query).values
             # Convert boolean mask to integer indices for NumPy 2.x compatibility
             variant_indices = np.where(loc_variants)[0]
 
@@ -147,8 +147,10 @@ def _build_cohorts_from_sample_grouping(
         period_str = df_cohorts["period"].astype(str)
         df_cohorts["label"] = area_str + "_" + taxon_clean + "_" + period_str
 
-    # Apply minimum cohort size.
-    df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)
+    # Apply minimum cohort size using safe boolean indexing.
+    df_cohorts = df_cohorts.loc[df_cohorts["size"] >= min_cohort_size].reset_index(
+        drop=True
+    )
 
     # Early check for no cohorts.
     if len(df_cohorts) == 0:
 
@@ -117,8 +117,8 @@ def _genome_features_for_contig(self, *, contig: str, attributes: Tuple[str, ...
                 )
             df = self._genome_features(attributes=attributes)
 
-            # Apply contig query.
-            df = df.query(f"contig == '{contig}'")
+            # Apply contig filter using safe boolean indexing.
+            df = df.loc[df["contig"] == contig]
             return df
 
     def _prep_gff_attributes(
@@ -162,9 +162,9 @@ def genome_features(
                         contig=r.contig, attributes=attributes_normed
                     )
                     if r.end is not None:
-                        df_part = df_part.query(f"start <= {r.end}")
+                        df_part = df_part.loc[df_part["start"] <= r.end]
                     if r.start is not None:
-                        df_part = df_part.query(f"end >= {r.start}")
+                        df_part = df_part.loc[df_part["end"] >= r.start]
                     parts.append(df_part)
                 df = pd.concat(parts, axis=0)
                 return df.sort_values(["contig", "start"]).reset_index(drop=True).copy()
@@ -192,8 +192,8 @@ def genome_feature_children(
         df_gf["Parent"] = df_gf["Parent"].str.split(",")
         df_gf = df_gf.explode(column="Parent", ignore_index=True)
 
-        # Query to find children of the requested parent.
-        df_children = df_gf.query(f"Parent == '{parent}'")
+        # Filter to find children of the requested parent using safe indexing.
+        df_children = df_gf.loc[df_gf["Parent"] == parent]
 
         return df_children.copy()
 
@@ -670,7 +670,9 @@ def plot_genes(
     def _plot_genes_setup_data(self, *, region):
         attributes = [a for a in self._gff_default_attributes if a != "Parent"]
         df_genome_features = self.genome_features(region=region, attributes=attributes)
-        data = df_genome_features.query(f"type == '{self._gff_gene_type}'").copy()
+        data = df_genome_features.loc[
+            df_genome_features["type"] == self._gff_gene_type
+        ].copy()
         tooltips = [(a.capitalize(), f"@{a}") for a in attributes]
         tooltips += [("Location", "@contig:@start{,}-@end{,}")]
         return data, tooltips
 
@@ -6,6 +6,8 @@
 import zarr  # type: ignore
 from numpydoc_decorator import doc  # type: ignore
 
+from .safe_query import validate_query
+
 from ..util import (
     DIM_ALLELE,
     DIM_PLOIDY,
@@ -418,7 +420,8 @@ def haplotypes(
                 df_samples.set_index("sample_id").loc[phased_samples].reset_index()
             )
 
-            # Apply the query.
+            # Validate the query to prevent arbitrary code execution (GH-1292).
+            validate_query(sample_query_prepped)
             sample_query_options = sample_query_options or {}
             loc_samples = df_samples_phased.eval(
                 sample_query_prepped, **sample_query_options
 
@@ -8,6 +8,7 @@
 
 from ..util import CacheMiss, _check_types, _pdist_abs_hamming, _pandas_apply
 from ..plotly_dendrogram import _plot_dendrogram, concat_clustering_subplots
+from .safe_query import validate_query
 from . import (
     base_params,
     plotly_params,
@@ -630,6 +631,7 @@ def transcript_haplotypes(
         """
 
         # Get SNP genotype allele counts for the transcript, applying snp_query
+        validate_query(snp_query)
         df_eff = (
             self.snp_effects(
                 transcript=transcript,
 
@@ -62,7 +62,7 @@ def load_inversion_tags(self, inversion: inversion_param) -> pd.DataFrame:
         else:
             with importlib.resources.path(resources, self._inversion_tag_path) as path:
                 df_tag_snps = pd.read_csv(path, sep=",")
-            return df_tag_snps.query(f"inversion == '{inversion}'").reset_index()
+            return df_tag_snps.loc[df_tag_snps["inversion"] == inversion].reset_index()
 
     @_check_types
     @doc(
 
@@ -42,7 +42,6 @@ def __init__(
             The following additional parameters were also added in version 8.0.0:
             `site_class`, `cohort_size`, `min_cohort_size`, `max_cohort_size`,
             `random_seed`.
-
         """,
         parameters=dict(
             imputation_method="""
@@ -69,6 +68,10 @@ def pca(
         sample_query: Optional[base_params.sample_query] = None,
         sample_query_options: Optional[base_params.sample_query_options] = None,
         sample_indices: Optional[base_params.sample_indices] = None,
+        cohorts: Optional[base_params.cohorts] = None,
+        cohort_size: Optional[base_params.cohort_size] = None,
+        min_cohort_size: Optional[base_params.min_cohort_size] = None,
+        max_cohort_size: Optional[base_params.max_cohort_size] = None,
         site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
         site_class: Optional[base_params.site_class] = None,
         min_minor_ac: Optional[
@@ -78,9 +81,6 @@ def pca(
             base_params.max_missing_an
         ] = pca_params.max_missing_an_default,
         imputation_method: pca_params.imputation_method = pca_params.imputation_method_default,
-        cohort_size: Optional[base_params.cohort_size] = None,
-        min_cohort_size: Optional[base_params.min_cohort_size] = None,
-        max_cohort_size: Optional[base_params.max_cohort_size] = None,
         exclude_samples: Optional[base_params.samples] = None,
         fit_exclude_samples: Optional[base_params.samples] = None,
         random_seed: base_params.random_seed = 42,
@@ -98,8 +98,44 @@ def pca(
 
         ## Normalize params for consistent hash value.
 
-        # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
-        # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
+        # Handle cohort downsampling.
+        if cohorts is not None:
+            if max_cohort_size is None:
+                raise ValueError(
+                    "`max_cohort_size` is required when `cohorts` is provided."
+                )
+            if sample_indices is not None:
+                raise ValueError(
+                    "Cannot use `sample_indices` with `cohorts` and `max_cohort_size`."
+                )
+            if cohort_size is not None or min_cohort_size is not None:
+                raise ValueError(
+                    "Cannot use `cohort_size` or `min_cohort_size` with `cohorts`."
+                )
+            df_samples = self.sample_metadata(
+                sample_sets=sample_sets,
+                sample_query=sample_query,
+                sample_query_options=sample_query_options,
+            )
+            # N.B., we are going to overwrite the sample_indices parameter here.
+            groups = df_samples.groupby(cohorts, sort=False)
+            ix = []
+            for _, group in groups:
+                if len(group) > max_cohort_size:
+                    ix.extend(
+                        group.sample(
+                            n=max_cohort_size, random_state=random_seed, replace=False
+                        ).index
+                    )
+                else:
+                    ix.extend(group.index)
+            sample_indices = ix
+            # From this point onwards, the sample_query is no longer needed, because
+            # the sample selection is defined by the sample_indices.
+            sample_query = None
+            sample_query_options = None
+
+        # Normalize params for consistent hash value.
         (
             prepared_sample_sets,
             prepared_sample_indices,
@@ -132,6 +168,7 @@ def pca(
             max_missing_an=max_missing_an,
             imputation_method=imputation_method,
             n_components=n_components,
+            cohorts=cohorts,
             cohort_size=cohort_size,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
@@ -149,10 +186,10 @@ def pca(
             self.results_cache_set(name=name, params=params, results=results)
 
         # Unpack results.
-        coords = results["coords"]
-        evr = results["evr"]
-        samples = results["samples"]
-        loc_keep_fit = results["loc_keep_fit"]
+        coords = np.array(results["coords"])
+        evr = np.array(results["evr"])
+        samples = np.array(results["samples"])
+        loc_keep_fit = np.array(results["loc_keep_fit"])
 
         # Create a new DataFrame containing the PCA coords data.
         df_pca = pd.DataFrame(coords, index=samples)
@@ -205,6 +242,7 @@ def _pca(
         random_seed,
         chunks,
         inline_array,
+        **kwargs,
     ):
         # Load diplotypes.
         ds_diplotypes = self.biallelic_diplotypes(