Skip to content

Commit 8b412de

Browse files
petern48alamb
andauthored
fix: Optimize !~ '.*' case to col IS NULL AND Boolean(NULL) instead of Eq "" (#20702)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #20701 ## Rationale for this change <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> A pre-existing optimization rule for the `!~ .*` (regexp not match) case rewrote the plan to `Eq ""`, which would return empty strings as part of the result. This is incorrect and doesn't match the output without the optimization rule. Instead, this PR rewrites the plan to simply `col IS NULL AND Boolean(NULL)` or, in other words, "NULL if col is NULL else false." I've confirmed this behavior matches the result of running queries manually with the optimization rule turned off. ## Are these changes tested? <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> Fixed expected output in tests. Added new tests for nulls ## Are there any user-facing changes? <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> Yes, a minor bug fix. When querying `s !~ .*`, empty strings will no longer be included in the result which is consistent with the behavior without the optimization rule. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 385d9db commit 8b412de

3 files changed

Lines changed: 68 additions & 19 deletions

File tree

datafusion/optimizer/src/simplify_expressions/regex.rs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use datafusion_common::tree_node::Transformed;
19-
use datafusion_common::{DataFusionError, Result};
19+
use datafusion_common::{DataFusionError, Result, ScalarValue};
2020
use datafusion_expr::{BinaryExpr, Expr, Like, Operator, lit};
2121
use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
2222

@@ -39,7 +39,7 @@ const ANY_CHAR_REGEX_PATTERN: &str = ".*";
3939
/// - partial anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
4040
/// - combinations (alternatives) of the above, will be concatenated with `OR` or `AND`
4141
/// - `EQ .*` to NotNull
42-
/// - `NE .*` means IS EMPTY
42+
/// - `NE .*` to col IS NULL AND Boolean(NULL) (false for any string, or NULL if col is NULL)
4343
///
4444
/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`.
4545
pub fn simplify_regex_expr(
@@ -68,12 +68,11 @@ pub fn simplify_regex_expr(
6868
// Handle the special case for ".*" pattern
6969
if pattern == ANY_CHAR_REGEX_PATTERN {
7070
let new_expr = if mode.not {
71-
// not empty
72-
let empty_lit = Box::new(string_scalar.to_expr(""));
71+
let null_bool = lit(ScalarValue::Boolean(None));
7372
Expr::BinaryExpr(BinaryExpr {
74-
left,
75-
op: Operator::Eq,
76-
right: empty_lit,
73+
left: Box::new(left.is_null()),
74+
op: Operator::And,
75+
right: Box::new(null_bool),
7776
})
7877
} else {
7978
// not null

datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ mod tests {
155155
use arrow::datatypes::{DataType, Field, Schema};
156156
use chrono::{DateTime, Utc};
157157

158+
use datafusion_common::ScalarValue;
158159
use datafusion_expr::logical_plan::builder::table_scan_with_filters;
159160
use datafusion_expr::logical_plan::table_scan;
160161
use datafusion_expr::*;
@@ -870,7 +871,7 @@ mod tests {
870871
]);
871872
let table_scan = table_scan(Some("test"), &schema, None)?.build()?;
872873

873-
// Test `= ".*"` transforms to true (except for empty strings)
874+
// Test `~ ".*"` transforms to true for any non-NULL string
874875
let plan = LogicalPlanBuilder::from(table_scan.clone())
875876
.filter(binary_expr(col("a"), Operator::RegexMatch, lit(".*")))?
876877
.build()?;
@@ -883,22 +884,22 @@ mod tests {
883884
"
884885
)?;
885886

886-
// Test `!= ".*"` transforms to checking if the column is empty
887+
// Test `!~ ".*"` preserves NULL semantics while remaining false for non-NULL strings
887888
let plan = LogicalPlanBuilder::from(table_scan.clone())
888889
.filter(binary_expr(col("a"), Operator::RegexNotMatch, lit(".*")))?
889890
.build()?;
890891

891892
assert_optimized_plan_equal!(
892893
plan,
893-
@ r#"
894-
Filter: test.a = Utf8("")
894+
@ r"
895+
Filter: test.a IS NULL AND Boolean(NULL)
895896
TableScan: test
896-
"#
897+
"
897898
)?;
898899

899900
// Test case-insensitive versions
900901

901-
// Test `=~ ".*"` (case-insensitive) transforms to true (except for empty strings)
902+
// Test `~* ".*"` transforms to true for any non-NULL string
902903
let plan = LogicalPlanBuilder::from(table_scan.clone())
903904
.filter(binary_expr(col("b"), Operator::RegexIMatch, lit(".*")))?
904905
.build()?;
@@ -911,17 +912,51 @@ mod tests {
911912
"
912913
)?;
913914

914-
// Test `!~ ".*"` (case-insensitive) transforms to checking if the column is empty
915+
// Test NULL `!~ ".*"` transforms to Boolean(NULL)
916+
let plan = LogicalPlanBuilder::from(table_scan.clone())
917+
.filter(binary_expr(
918+
lit(ScalarValue::Utf8(None)),
919+
Operator::RegexNotMatch,
920+
lit(".*"),
921+
))?
922+
.build()?;
923+
924+
assert_optimized_plan_equal!(
925+
plan,
926+
@ r"
927+
Filter: Boolean(NULL)
928+
TableScan: test
929+
"
930+
)?;
931+
932+
// Test `!~* ".*"` preserves NULL semantics while remaining false for non-NULL strings
915933
let plan = LogicalPlanBuilder::from(table_scan.clone())
916934
.filter(binary_expr(col("a"), Operator::RegexNotIMatch, lit(".*")))?
917935
.build()?;
918936

919937
assert_optimized_plan_equal!(
920938
plan,
921-
@ r#"
922-
Filter: test.a = Utf8("")
939+
@ r"
940+
Filter: test.a IS NULL AND Boolean(NULL)
923941
TableScan: test
924-
"#
942+
"
943+
)?;
944+
945+
// Test NULL `!~* ".*"` transforms to Boolean(NULL)
946+
let plan = LogicalPlanBuilder::from(table_scan.clone())
947+
.filter(binary_expr(
948+
lit(ScalarValue::Utf8(None)),
949+
Operator::RegexNotIMatch,
950+
lit(".*"),
951+
))?
952+
.build()?;
953+
954+
assert_optimized_plan_equal!(
955+
plan,
956+
@ r"
957+
Filter: Boolean(NULL)
958+
TableScan: test
959+
"
925960
)
926961
}
927962

datafusion/sqllogictest/test_files/simplify_expr.slt

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,27 @@ query TT
4444
explain select b from t where b !~ '.*'
4545
----
4646
logical_plan
47-
01)Filter: t.b = Utf8View("")
47+
01)Filter: t.b IS NULL AND Boolean(NULL)
4848
02)--TableScan: t projection=[b]
4949
physical_plan
50-
01)FilterExec: b@0 =
50+
01)FilterExec: b@0 IS NULL AND NULL
5151
02)--DataSourceExec: partitions=1, partition_sizes=[1]
5252

53+
query TB
54+
WITH vals(id, col) AS (
55+
VALUES
56+
(1, 'foo'::text),
57+
(2, ''::text),
58+
(3, NULL::text)
59+
)
60+
SELECT col, col !~ '.*'
61+
FROM vals
62+
ORDER BY id
63+
----
64+
foo false
65+
(empty) false
66+
NULL NULL
67+
5368
query T
5469
select b from t where b ~ '.*'
5570
----

0 commit comments

Comments
 (0)