|
| 1 | +"""Safe query validation for pandas eval/query expressions. |
| 2 | +
|
| 3 | +This module provides AST-based validation of query strings to prevent |
| 4 | +arbitrary code execution via pandas DataFrame.eval() and DataFrame.query(). |
| 5 | +
|
| 6 | +Only a restricted subset of Python expressions is allowed: |
| 7 | +- Boolean operators: and, or, not |
| 8 | +- Comparison operators: ==, !=, <, <=, >, >=, in, not in, is, is not |
| 9 | +- Arithmetic operators: +, -, *, /, //, %, ** |
| 10 | +- Unary operators: +, -, ~, not |
| 11 | +- Constants: strings, numbers, booleans, None |
| 12 | +- Names: must match an allowlist of known column names (if provided) |
| 13 | +- Parenthesized expressions, and tuple/list literals (e.g. for ``in`` tests) |
| 14 | +
|
| 15 | +Forbidden constructs include: |
| 16 | +- Function calls (e.g., __import__('os')) |
| 17 | +- Attribute access (e.g., os.system) |
| 18 | +- Subscript/indexing (e.g., x[0]) |
| 19 | +- Comprehensions, lambdas, f-strings, starred expressions |
| 20 | +- Any identifier containing double underscores (__) |
| 21 | +""" |
| 22 | + |
| 23 | +import ast |
| 24 | +import re |
| 25 | +from typing import Optional, Set |
| 26 | + |
# Pattern matching pandas ``@variable`` references in query strings.
# ``@name`` is not valid Python; it is pandas syntax for referencing
# local/global variables supplied via the ``local_dict`` / ``global_dict``
# kwargs, so it has to be rewritten before the query can be parsed.
#
# The ``(?<!\w)`` lookbehind only treats ``@`` as a variable marker when it
# does not directly follow an identifier character.  Without it, ``x@y``
# (Python's matrix-multiplication operator, which is not an allowed
# operator) would be rewritten into the single innocuous name ``x_at_y``
# and the ``@`` operator would never reach the AST allowlist check.
# Group 1 still captures the bare variable name.
_AT_VAR_PATTERN = re.compile(r"(?<!\w)@([A-Za-z_][A-Za-z0-9_]*)")
| 31 | + |
| 32 | + |
# Allowlist of AST node classes that a query expression may contain.
# Any node whose type is not listed here is rejected during validation.
_SAFE_NODE_TYPES = (
    # Root node and compound expression nodes.
    ast.Expression, ast.BoolOp, ast.BinOp, ast.UnaryOp, ast.Compare,
    # Logical operators (and / or / not).
    ast.And, ast.Or, ast.Not,
    # Binary arithmetic operators (+ - * / // % **).
    ast.Add, ast.Sub, ast.Mult, ast.Div, ast.FloorDiv, ast.Mod, ast.Pow,
    # Unary arithmetic / bitwise operators (- + ~).
    ast.USub, ast.UAdd, ast.Invert,
    # Comparison operators.
    ast.Eq, ast.NotEq, ast.Lt, ast.LtE, ast.Gt, ast.GtE,
    ast.In, ast.NotIn, ast.Is, ast.IsNot,
    # Leaves, the load context, and literal containers.
    ast.Constant, ast.Name, ast.Load, ast.Tuple, ast.List,
)
| 69 | + |
| 70 | + |
class UnsafeQueryError(ValueError):
    """Signal that a query string contains constructs deemed unsafe.

    Subclasses ``ValueError`` so callers that already handle invalid
    values also handle rejected queries.
    """
| 75 | + |
| 76 | + |
def _validate_node(node: ast.AST, allowed_names: Optional[Set[str]] = None) -> None:
    """Validate that an AST (sub)tree contains only safe constructs.

    The tree is walked depth-first in pre-order using an explicit stack
    instead of recursion, so a very deeply nested expression cannot exhaust
    the interpreter's recursion limit and surface as ``RecursionError``
    rather than as a validation result.

    Parameters
    ----------
    node : ast.AST
        The root of the AST (sub)tree to validate.
    allowed_names : set of str, optional
        If provided, restrict identifier names to this set.

    Raises
    ------
    UnsafeQueryError
        If the node or any of its descendants is of a type outside
        ``_SAFE_NODE_TYPES``, is an identifier containing ``__``, or is an
        identifier missing from ``allowed_names`` (when that is given).
    """
    pending = [node]
    while pending:
        current = pending.pop()

        if not isinstance(current, _SAFE_NODE_TYPES):
            raise UnsafeQueryError(
                f"Unsafe expression: {type(current).__name__} nodes are not allowed "
                "in query strings. Only comparisons, boolean logic, and constants "
                "are permitted."
            )

        if isinstance(current, ast.Name):
            name = current.id
            # Block dunder identifiers regardless of any allowlist.
            if "__" in name:
                raise UnsafeQueryError(
                    f"Unsafe expression: identifier '{name}' contains double "
                    f"underscores and is not allowed in query strings."
                )
            # Enforce the allowlist when one is supplied; the literal names
            # True/False/None are always tolerated.
            if (
                allowed_names is not None
                and name not in allowed_names
                and name not in {"True", "False", "None"}
            ):
                raise UnsafeQueryError(
                    f"Unknown column name '{name}' in query string. "
                    f"Allowed column names: {sorted(allowed_names)}"
                )

        # Push children in reverse so they are popped in field order; this
        # keeps the visit order (and therefore which violation is reported
        # first) the same as a recursive pre-order walk.
        pending.extend(reversed(list(ast.iter_child_nodes(current))))
| 119 | + |
| 120 | + |
def validate_query(query: str, allowed_names: Optional[Set[str]] = None) -> None:
    """Validate that a query string is safe for use with pandas eval/query.

    The function fails closed: every reason the query cannot be parsed or
    checked is reported as ``UnsafeQueryError``.

    Parameters
    ----------
    query : str
        The query string to validate.
    allowed_names : set of str, optional
        If provided, restrict identifier names to this set of known column
        names. If None, any identifier (except those containing ``__``) is
        allowed.

    Raises
    ------
    UnsafeQueryError
        If ``query`` is not a string, is empty or whitespace-only, cannot
        be parsed as a single Python expression, is too deeply nested to
        check, or contains unsafe constructs such as function calls,
        attribute access, or dunder identifiers.
    """
    if not isinstance(query, str):
        raise UnsafeQueryError(f"Query must be a string, got {type(query).__name__}.")

    query = query.strip()
    if not query:
        raise UnsafeQueryError("Query string must not be empty.")

    # Replace pandas @variable references with plain identifiers so the
    # expression can be parsed as valid Python. The replaced names are
    # prefixed with ``_at_`` to avoid collisions with real column names
    # while remaining dunder-free.
    query_for_parse = _AT_VAR_PATTERN.sub(r"_at_\1", query)

    try:
        tree = ast.parse(query_for_parse, mode="eval")
    except (SyntaxError, ValueError, RecursionError, MemoryError) as e:
        # Besides SyntaxError, the parser can raise ValueError (e.g. NUL
        # bytes on older interpreters) and RecursionError / MemoryError for
        # pathologically nested input. All of them mean "cannot be proven
        # safe", so report them uniformly.
        raise UnsafeQueryError(f"Query string is not a valid expression: {e}") from e

    try:
        _validate_node(tree, allowed_names)
    except RecursionError as e:
        # A tree too deep to walk cannot be proven safe; reject it rather
        # than leaking an unrelated exception type to the caller.
        raise UnsafeQueryError(
            "Query string is too deeply nested to validate."
        ) from e
0 commit comments