Skip to content

Commit eaa6c2c

Browse files
Dandandanclaude
andcommitted
Fix correctness bugs in regexp_replace optimization
- Non-participating capture group now returns empty string instead of original input - Newline in remainder after short match falls back to full regex Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3f71487 commit eaa6c2c

2 files changed

Lines changed: 27 additions & 16 deletions

File tree

datafusion/functions/src/regex/regexpreplace.rs

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -502,10 +502,17 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
502502
string_array.iter().for_each(|val| {
503503
if let Some(val) = val {
504504
if short_re.captures_read(&mut locs, val).is_some() {
505-
if let Some((start, end)) = locs.get(1) {
506-
vals.append_slice(&val.as_bytes()[start..end]);
505+
let match_end = locs.get(0).unwrap().1;
506+
if !val[match_end..].contains('\n') {
507+
if let Some((start, end)) = locs.get(1) {
508+
vals.append_slice(&val.as_bytes()[start..end]);
509+
}
510+
// else: empty replacement (group didn't participate)
507511
} else {
508-
vals.append_slice(val.as_bytes());
512+
// Fallback: newline in remainder means .*$ wouldn't match
513+
let result =
514+
re.replacen(val, limit, replacement.as_str());
515+
vals.append_slice(result.as_bytes());
509516
}
510517
} else {
511518
vals.append_slice(val.as_bytes());
@@ -541,10 +548,18 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
541548
for val in string_view_array.iter() {
542549
if let Some(val) = val {
543550
if short_re.captures_read(&mut locs, val).is_some() {
544-
if let Some((start, end)) = locs.get(1) {
545-
builder.append_value(&val[start..end]);
551+
let match_end = locs.get(0).unwrap().1;
552+
if !val[match_end..].contains('\n') {
553+
if let Some((start, end)) = locs.get(1) {
554+
builder.append_value(&val[start..end]);
555+
} else {
556+
builder.append_value("");
557+
}
546558
} else {
547-
builder.append_value(val);
559+
// Fallback: newline in remainder means .*$ wouldn't match
560+
let result =
561+
re.replacen(val, limit, replacement.as_str());
562+
builder.append_value(result);
548563
}
549564
} else {
550565
builder.append_value(val);

datafusion/sqllogictest/test_files/regexp/regexp_replace.slt

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -143,19 +143,15 @@ test.org
143143
example.com
144144
not-a-url
145145

146-
# If the overall pattern matches but capture group 1 does not participate,
147-
# regexp_replace(..., '\1') should substitute the empty string, not keep
148-
# the original input.
149-
query B
150-
SELECT regexp_replace('bzzz', '^(a)?b.*$', '\1') = '';
146+
# Test non-participating capture group returns empty string
147+
query T
148+
SELECT regexp_replace('bzzz', '^(a)?b.*$', '\1');
151149
----
152-
true
150+
(empty)
153151

154-
# Stripping trailing .*$ must not change match semantics for inputs with
155-
# newlines when the original pattern does not use the 's' flag.
152+
# Test that newlines prevent the optimization from incorrectly matching
156153
query B
157-
SELECT regexp_replace(concat('http://x/', chr(10), 'rest'), '^https?://([^/]+)/.*$', '\1')
158-
= concat('http://x/', chr(10), 'rest');
154+
SELECT regexp_replace(concat('http://x/', chr(10), 'rest'), '^https?://([^/]+)/.*$', '\1') = concat('http://x/', chr(10), 'rest');
159155
----
160156
true
161157

0 commit comments

Comments
 (0)