malariagen
diff --git a/‎malariagen_data/anoph/base.py‎
Lines changed: 5 additions & 4 deletions b/‎malariagen_data/anoph/base.py‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎malariagen_data/anoph/cnv_frq.py‎
Lines changed: 2 additions & 0 deletions b/‎malariagen_data/anoph/cnv_frq.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎malariagen_data/anoph/frq_base.py‎
Lines changed: 4 additions & 2 deletions b/‎malariagen_data/anoph/frq_base.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎malariagen_data/anoph/genome_features.py‎
Lines changed: 9 additions & 7 deletions b/‎malariagen_data/anoph/genome_features.py‎
Lines changed: 9 additions & 7 deletions
diff --git a/‎malariagen_data/anoph/hap_data.py‎
Lines changed: 4 additions & 1 deletion b/‎malariagen_data/anoph/hap_data.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎malariagen_data/anoph/hapclust.py‎
Lines changed: 2 additions & 0 deletions b/‎malariagen_data/anoph/hapclust.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎malariagen_data/anoph/pca.py‎
Lines changed: 48 additions & 10 deletions b/‎malariagen_data/anoph/pca.py‎
Lines changed: 48 additions & 10 deletions
diff --git a/‎malariagen_data/anoph/safe_query.py‎
Lines changed: 157 additions & 0 deletions b/‎malariagen_data/anoph/safe_query.py‎
Lines changed: 157 additions & 0 deletions
@@ -28,6 +28,8 @@
 from numpydoc_decorator import doc  # type: ignore
 from tqdm.auto import tqdm as tqdm_auto  # type: ignore
 from tqdm.dask import TqdmCallback  # type: ignore
 from yaspin import yaspin  # type: ignore
 import xarray as xr
+
+from .safe_query import validate_query
 
@@ -980,10 +982,12 @@ def _filter_sample_dataset(
 
         # Determine which samples match the sample query.
         if sample_query != "":
+            # Validate the query to prevent arbitrary code execution (GH-1292).
+            validate_query(sample_query)
             # Use the python engine in order to support extension array dtypes, e.g. Float64, Int64, boolean.
             loc_samples = df_samples.eval(
                 sample_query, **sample_query_options, engine="python"
             )
         else:
             loc_samples = pd.Series(True, index=df_samples.index)
 
 
@@ -15,6 +15,7 @@
     _build_cohorts_from_sample_grouping,
     _add_frequency_ci,
 )
+from .safe_query import validate_query
 from ..util import (
     _check_types,
     _pandas_apply,
@@ -671,6 +672,7 @@ def _gene_cnv_frequencies_advanced(
 
         debug("apply variant query")
         if variant_query is not None:
+            validate_query(variant_query)
             loc_variants = df_variants.eval(variant_query).values
             # Convert boolean mask to integer indices for NumPy 2.x compatibility
             variant_indices = np.where(loc_variants)[0]
 
@@ -147,8 +147,10 @@ def _build_cohorts_from_sample_grouping(
         period_str = df_cohorts["period"].astype(str)
         df_cohorts["label"] = area_str + "_" + taxon_clean + "_" + period_str
 
-    # Apply minimum cohort size.
-    df_cohorts = df_cohorts.query(f"size >= {min_cohort_size}").reset_index(drop=True)
+    # Apply minimum cohort size using safe boolean indexing.
+    df_cohorts = df_cohorts.loc[df_cohorts["size"] >= min_cohort_size].reset_index(
+        drop=True
+    )
 
     # Early check for no cohorts.
     if len(df_cohorts) == 0:
 
@@ -117,8 +117,8 @@ def _genome_features_for_contig(self, *, contig: str, attributes: Tuple[str, ...
                 )
             df = self._genome_features(attributes=attributes)
 
-            # Apply contig query.
-            df = df.query(f"contig == '{contig}'")
+            # Apply contig filter using safe boolean indexing.
+            df = df.loc[df["contig"] == contig]
             return df
 
     def _prep_gff_attributes(
@@ -162,9 +162,9 @@ def genome_features(
                         contig=r.contig, attributes=attributes_normed
                     )
                     if r.end is not None:
-                        df_part = df_part.query(f"start <= {r.end}")
+                        df_part = df_part.loc[df_part["start"] <= r.end]
                     if r.start is not None:
-                        df_part = df_part.query(f"end >= {r.start}")
+                        df_part = df_part.loc[df_part["end"] >= r.start]
                     parts.append(df_part)
                 df = pd.concat(parts, axis=0)
                 return df.sort_values(["contig", "start"]).reset_index(drop=True).copy()
@@ -192,8 +192,8 @@ def genome_feature_children(
         df_gf["Parent"] = df_gf["Parent"].str.split(",")
         df_gf = df_gf.explode(column="Parent", ignore_index=True)
 
-        # Query to find children of the requested parent.
-        df_children = df_gf.query(f"Parent == '{parent}'")
+        # Filter to find children of the requested parent using safe indexing.
+        df_children = df_gf.loc[df_gf["Parent"] == parent]
 
         return df_children.copy()
 
@@ -670,7 +670,9 @@ def plot_genes(
     def _plot_genes_setup_data(self, *, region):
         attributes = [a for a in self._gff_default_attributes if a != "Parent"]
         df_genome_features = self.genome_features(region=region, attributes=attributes)
-        data = df_genome_features.query(f"type == '{self._gff_gene_type}'").copy()
+        data = df_genome_features.loc[
+            df_genome_features["type"] == self._gff_gene_type
+        ].copy()
         tooltips = [(a.capitalize(), f"@{a}") for a in attributes]
         tooltips += [("Location", "@contig:@start{,}-@end{,}")]
         return data, tooltips
 
@@ -6,6 +6,8 @@
 import zarr  # type: ignore
 from numpydoc_decorator import doc  # type: ignore
 
+from .safe_query import validate_query
+
 from ..util import (
     DIM_ALLELE,
     DIM_PLOIDY,
@@ -418,7 +420,8 @@ def haplotypes(
                 df_samples.set_index("sample_id").loc[phased_samples].reset_index()
             )
 
-            # Apply the query.
+            # Validate the query to prevent arbitrary code execution (GH-1292).
+            validate_query(sample_query_prepped)
             sample_query_options = sample_query_options or {}
             loc_samples = df_samples_phased.eval(
                 sample_query_prepped, **sample_query_options
 
@@ -8,6 +8,7 @@
 
 from ..util import CacheMiss, _check_types, _pdist_abs_hamming, _pandas_apply
 from ..plotly_dendrogram import _plot_dendrogram, concat_clustering_subplots
+from .safe_query import validate_query
 from . import (
     base_params,
     plotly_params,
@@ -623,6 +624,7 @@ def transcript_haplotypes(
         """
 
         # Get SNP genotype allele counts for the transcript, applying snp_query
+        validate_query(snp_query)
         df_eff = (
             self.snp_effects(
                 transcript=transcript,
 
@@ -42,7 +42,6 @@ def __init__(
             The following additional parameters were also added in version 8.0.0:
             `site_class`, `cohort_size`, `min_cohort_size`, `max_cohort_size`,
             `random_seed`.
-
         """,
         parameters=dict(
             imputation_method="""
@@ -69,6 +68,10 @@ def pca(
         sample_query: Optional[base_params.sample_query] = None,
         sample_query_options: Optional[base_params.sample_query_options] = None,
         sample_indices: Optional[base_params.sample_indices] = None,
+        cohorts: Optional[base_params.cohorts] = None,
+        cohort_size: Optional[base_params.cohort_size] = None,
+        min_cohort_size: Optional[base_params.min_cohort_size] = None,
+        max_cohort_size: Optional[base_params.max_cohort_size] = None,
         site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
         site_class: Optional[base_params.site_class] = None,
         min_minor_ac: Optional[
@@ -78,9 +81,6 @@ def pca(
             base_params.max_missing_an
         ] = pca_params.max_missing_an_default,
         imputation_method: pca_params.imputation_method = pca_params.imputation_method_default,
-        cohort_size: Optional[base_params.cohort_size] = None,
-        min_cohort_size: Optional[base_params.min_cohort_size] = None,
-        max_cohort_size: Optional[base_params.max_cohort_size] = None,
         exclude_samples: Optional[base_params.samples] = None,
         fit_exclude_samples: Optional[base_params.samples] = None,
         random_seed: base_params.random_seed = 42,
@@ -98,8 +98,44 @@ def pca(
 
         ## Normalize params for consistent hash value.
 
-        # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
-        # So `sample_query` and `sample_query_options` should not be used beyond this point. (`sample_indices` should be used instead.)
+        # Handle cohort downsampling.
+        if cohorts is not None:
+            if max_cohort_size is None:
+                raise ValueError(
+                    "`max_cohort_size` is required when `cohorts` is provided."
+                )
+            if sample_indices is not None:
+                raise ValueError(
+                    "Cannot use `sample_indices` with `cohorts` and `max_cohort_size`."
+                )
+            if cohort_size is not None or min_cohort_size is not None:
+                raise ValueError(
+                    "Cannot use `cohort_size` or `min_cohort_size` with `cohorts`."
+                )
+            df_samples = self.sample_metadata(
+                sample_sets=sample_sets,
+                sample_query=sample_query,
+                sample_query_options=sample_query_options,
+            )
+            # N.B., we are going to overwrite the sample_indices parameter here.
+            groups = df_samples.groupby(cohorts, sort=False)
+            ix = []
+            for _, group in groups:
+                if len(group) > max_cohort_size:
+                    ix.extend(
+                        group.sample(
+                            n=max_cohort_size, random_state=random_seed, replace=False
+                        ).index
+                    )
+                else:
+                    ix.extend(group.index)
+            sample_indices = ix
+            # From this point onwards, the sample_query is no longer needed, because
+            # the sample selection is defined by the sample_indices.
+            sample_query = None
+            sample_query_options = None
+
+        # Note: `_prep_sample_selection_cache_params` converts `sample_query` and `sample_query_options` into `sample_indices`.
         (
             prepared_sample_sets,
             prepared_sample_indices,
@@ -132,6 +168,7 @@ def pca(
             max_missing_an=max_missing_an,
             imputation_method=imputation_method,
             n_components=n_components,
+            cohorts=cohorts,
             cohort_size=cohort_size,
             min_cohort_size=min_cohort_size,
             max_cohort_size=max_cohort_size,
@@ -149,10 +186,10 @@ def pca(
             self.results_cache_set(name=name, params=params, results=results)
 
         # Unpack results.
-        coords = results["coords"]
-        evr = results["evr"]
-        samples = results["samples"]
-        loc_keep_fit = results["loc_keep_fit"]
+        coords = np.array(results["coords"])
+        evr = np.array(results["evr"])
+        samples = np.array(results["samples"])
+        loc_keep_fit = np.array(results["loc_keep_fit"])
 
         # Create a new DataFrame containing the PCA coords data.
         df_pca = pd.DataFrame(coords, index=samples)
@@ -205,6 +242,7 @@ def _pca(
         random_seed,
         chunks,
         inline_array,
+        **kwargs,
     ):
         # Load diplotypes.
         ds_diplotypes = self.biallelic_diplotypes(
 
@@ -0,0 +1,174 @@
+"""Safe query validation for pandas eval/query expressions.
+
+This module provides AST-based validation of query strings to prevent
+arbitrary code execution via pandas DataFrame.eval() and DataFrame.query().
+
+Only a restricted subset of Python expressions is allowed:
+- Boolean operators: and, or, not (plus ``&`` and ``|``, which pandas treats
+  as ``and`` and ``or``)
+- Comparison operators: ==, !=, <, <=, >, >=, in, not in, is, is not
+- Arithmetic operators: +, -, *, /, //, %, **
+- Unary operators: +, -, ~, not
+- Constants: strings, numbers, booleans, None
+- Names: must match an allowlist of known column names (if provided)
+- Tuple and list literals (e.g., on the right-hand side of ``in``)
+- Parenthesized expressions
+- pandas ``@variable`` references
+
+Forbidden constructs include:
+- Function calls (e.g., __import__('os'))
+- Attribute access (e.g., os.system)
+- Subscript/indexing (e.g., x[0])
+- Comprehensions, lambdas, f-strings, starred expressions
+- Any identifier containing double underscores (__)
+"""
+
+import ast
+import re
+from typing import Optional, Set
+
+# Pattern matching pandas @variable references in query strings.
+# These are not valid Python but are a pandas feature for referencing
+# local/global variables via the `local_dict` or `global_dict` kwargs.
+_AT_VAR_PATTERN = re.compile(r"@([A-Za-z_][A-Za-z0-9_]*)")
+
+
+# AST node types that are safe in query expressions. Anything not listed
+# here (calls, attribute access, subscripts, lambdas, ...) is rejected.
+_SAFE_NODE_TYPES = (
+    ast.Expression,
+    ast.BoolOp,
+    ast.BinOp,
+    ast.UnaryOp,
+    ast.Compare,
+    ast.And,
+    ast.Or,
+    ast.Not,
+    ast.Add,
+    ast.Sub,
+    ast.Mult,
+    ast.Div,
+    ast.FloorDiv,
+    ast.Mod,
+    ast.Pow,
+    # pandas rewrites ``&`` and ``|`` to ``and`` and ``or`` before evaluating,
+    # but they appear as bitwise operators when the raw query is parsed here.
+    ast.BitAnd,
+    ast.BitOr,
+    ast.USub,
+    ast.UAdd,
+    ast.Invert,
+    ast.Eq,
+    ast.NotEq,
+    ast.Lt,
+    ast.LtE,
+    ast.Gt,
+    ast.GtE,
+    ast.In,
+    ast.NotIn,
+    ast.Is,
+    ast.IsNot,
+    ast.Constant,
+    ast.Name,
+    ast.Load,
+    ast.Tuple,
+    ast.List,
+)
+
+
+class UnsafeQueryError(ValueError):
+    """Raised when a query string contains unsafe constructs."""
+
+
+def _validate_node(node: ast.AST, allowed_names: Optional[Set[str]] = None) -> None:
+    """Validate that an AST node and all of its descendants are safe.
+
+    The tree is traversed iteratively with ``ast.walk`` rather than by
+    recursion, so long operator chains (which parse into deeply nested
+    trees) cannot exhaust the interpreter's recursion limit.
+
+    Parameters
+    ----------
+    node : ast.AST
+        The root AST node to validate.
+    allowed_names : set of str, optional
+        If provided, restrict identifier names to this set.
+
+    Raises
+    ------
+    UnsafeQueryError
+        If the node or any of its descendants is an unsafe construct.
+    """
+    for current in ast.walk(node):
+        if not isinstance(current, _SAFE_NODE_TYPES):
+            raise UnsafeQueryError(
+                f"Unsafe expression: {type(current).__name__} nodes are not allowed "
+                f"in query strings. Only comparisons, boolean logic, and constants "
+                f"are permitted."
+            )
+
+        if not isinstance(current, ast.Name):
+            continue
+
+        name = current.id
+        # Block dunder identifiers.
+        if "__" in name:
+            raise UnsafeQueryError(
+                f"Unsafe expression: identifier '{name}' contains double "
+                f"underscores and is not allowed in query strings."
+            )
+        # Check against allowlist if provided, still allowing the common
+        # literals that pandas recognizes.
+        if (
+            allowed_names is not None
+            and name not in allowed_names
+            and name not in {"True", "False", "None"}
+        ):
+            raise UnsafeQueryError(
+                f"Unknown column name '{name}' in query string. "
+                f"Allowed column names: {sorted(allowed_names)}"
+            )
+
+
+def validate_query(query: str, allowed_names: Optional[Set[str]] = None) -> None:
+    """Validate that a query string is safe for use with pandas eval/query.
+
+    Parameters
+    ----------
+    query : str
+        The query string to validate.
+    allowed_names : set of str, optional
+        If provided, restrict identifier names to this set of known column
+        names. If None, any identifier (except those containing ``__``) is
+        allowed. ``@variable`` references are checked as ``_at_variable``
+        identifiers, so they are subject to this restriction as well.
+
+    Raises
+    ------
+    UnsafeQueryError
+        If the query is not a non-empty string, cannot be parsed as a
+        Python expression, or contains unsafe constructs such as function
+        calls, attribute access, or dunder identifiers.
+    """
+    if not isinstance(query, str):
+        raise UnsafeQueryError(f"Query must be a string, got {type(query).__name__}.")
+
+    query = query.strip()
+    if not query:
+        raise UnsafeQueryError("Query string must not be empty.")
+
+    # Replace pandas @variable references with plain identifiers so the
+    # expression can be parsed as valid Python.  The replaced names are
+    # prefixed with ``_at_`` to avoid collisions with real column names
+    # while remaining dunder-free.
+    query_for_parse = _AT_VAR_PATTERN.sub(r"_at_\1", query)
+
+    try:
+        tree = ast.parse(query_for_parse, mode="eval")
+    except (SyntaxError, ValueError, RecursionError) as e:
+        # Besides SyntaxError, ast.parse may raise ValueError (e.g. null bytes
+        # on some Python versions) or RecursionError (very deeply nested
+        # input); report all of them as a rejected query.
+        raise UnsafeQueryError(f"Query string is not a valid expression: {e}") from e
+
+    _validate_node(tree, allowed_names)