Skip to content

Commit 62046ab

Browse files
Dandandan and claude committed
Optimize regexp_replace by stripping trailing .* from anchored patterns
For anchored patterns like `^prefix(capture)/.*$`, use regex-syntax HIR analysis to build a shorter regex without the trailing `.*`. Uses captures() + expand() on the shorter regex instead of replacen(), since the replacement replaces the entire string (original was ^...$) and we only need correct capture group positions. For ClickBench Q28's `^https?://(?:www\.)?([^/]+)/.*$`, the effective regex becomes `^https?://(?:www\.)?([^/]+)/` — the backtracker stops at the first `/` after the domain instead of scanning the full URL. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c17c87c commit 62046ab

3 files changed

Lines changed: 134 additions & 13 deletions

File tree

datafusion/functions/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ memchr = { workspace = true }
8686
num-traits = { workspace = true }
8787
rand = { workspace = true }
8888
regex = { workspace = true, optional = true }
89+
regex-syntax = "0.8.9"
8990
sha2 = { workspace = true, optional = true }
9091
unicode-segmentation = { version = "^1.13.2", optional = true }
9192
uuid = { workspace = true, features = ["v4"], optional = true }

datafusion/functions/src/regex/regexpreplace.rs

Lines changed: 118 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ use datafusion_expr::{
4141
};
4242
use datafusion_macros::user_doc;
4343
use regex::Regex;
44+
use regex_syntax::hir::{Hir, HirKind, Look};
4445
use std::collections::HashMap;
4546
use std::sync::{Arc, LazyLock};
4647

@@ -199,6 +200,63 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
199200
.into_owned()
200201
}
201202

203+
/// Count capture groups in a HIR tree.
204+
fn count_capture_groups(hir: &Hir) -> usize {
205+
match hir.kind() {
206+
HirKind::Capture(cap) => 1 + count_capture_groups(&cap.sub),
207+
HirKind::Concat(subs) | HirKind::Alternation(subs) => {
208+
subs.iter().map(count_capture_groups).sum()
209+
}
210+
HirKind::Repetition(rep) => count_capture_groups(&rep.sub),
211+
_ => 0,
212+
}
213+
}
214+
215+
/// For anchored patterns (`^...$`), try to build a shorter regex by
216+
/// stripping trailing `.*` via HIR analysis. This reduces backtracker
217+
/// work since it doesn't need to scan through the rest of the string.
218+
///
219+
/// Only strips `.*` (greedy, min=0) which matches any suffix — this
220+
/// guarantees that if the extract regex matches, the full pattern would too.
221+
fn try_build_extract_regex(pattern: &str) -> Option<Regex> {
222+
let hir = regex_syntax::Parser::new().parse(pattern).ok()?;
223+
let HirKind::Concat(parts) = hir.kind() else {
224+
return None;
225+
};
226+
227+
if parts.len() < 3
228+
|| !matches!(parts.first()?.kind(), HirKind::Look(Look::Start))
229+
|| !matches!(parts.last()?.kind(), HirKind::Look(Look::End))
230+
{
231+
return None;
232+
}
233+
234+
let before_end = &parts[parts.len() - 2];
235+
let is_dot_star = matches!(before_end.kind(), HirKind::Repetition(rep)
236+
if rep.min == 0
237+
&& rep.max.is_none()
238+
&& rep.greedy
239+
&& matches!(rep.sub.kind(), HirKind::Class(_))
240+
);
241+
if !is_dot_star {
242+
return None;
243+
}
244+
245+
// Keep ^ and inner parts, drop the .* and $
246+
let trimmed_parts: Vec<Hir> = parts[..parts.len() - 2].to_vec();
247+
if trimmed_parts
248+
.iter()
249+
.map(count_capture_groups)
250+
.sum::<usize>()
251+
== 0
252+
{
253+
return None;
254+
}
255+
256+
let trimmed_hir = Hir::concat(trimmed_parts);
257+
Regex::new(&trimmed_hir.to_string()).ok()
258+
}
259+
202260
/// Replaces substring(s) matching a PCRE-like regular expression.
203261
///
204262
/// The full list of supported features and syntax can be found at
@@ -457,6 +515,16 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
457515
// with rust ones.
458516
let replacement = regex_replace_posix_groups(replacement);
459517

518+
// For anchored patterns with trailing .*, build a shorter regex that
519+
// reduces backtracker work by not scanning the rest of the string.
520+
// We can't use this with replacen() (it would change match boundaries),
521+
// so we use captures() + expand() instead.
522+
let extract_re = if limit == 1 {
523+
try_build_extract_regex(&pattern)
524+
} else {
525+
None
526+
};
527+
460528
let string_array_type = args[0].data_type();
461529
match string_array_type {
462530
DataType::Utf8 | DataType::LargeUtf8 => {
@@ -473,13 +541,33 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
473541
let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
474542
new_offsets.append(T::zero());
475543

476-
string_array.iter().for_each(|val| {
477-
if let Some(val) = val {
478-
let result = re.replacen(val, limit, replacement.as_str());
479-
vals.append_slice(result.as_bytes());
480-
}
481-
new_offsets.append(T::from_usize(vals.len()).unwrap());
482-
});
544+
if let Some(ref extract_re) = extract_re {
545+
// Use shorter regex for capture extraction only.
546+
// Since the original pattern was ^...$, the replacement
547+
// replaces the entire string — we just need correct
548+
// capture group positions, which the shorter regex provides.
549+
let mut result = String::new();
550+
string_array.iter().for_each(|val| {
551+
if let Some(val) = val {
552+
if let Some(caps) = extract_re.captures(val) {
553+
result.clear();
554+
caps.expand(replacement.as_str(), &mut result);
555+
vals.append_slice(result.as_bytes());
556+
} else {
557+
vals.append_slice(val.as_bytes());
558+
}
559+
}
560+
new_offsets.append(T::from_usize(vals.len()).unwrap());
561+
});
562+
} else {
563+
string_array.iter().for_each(|val| {
564+
if let Some(val) = val {
565+
let result = re.replacen(val, limit, replacement.as_str());
566+
vals.append_slice(result.as_bytes());
567+
}
568+
new_offsets.append(T::from_usize(vals.len()).unwrap());
569+
});
570+
}
483571

484572
let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
485573
.len(string_array.len())
@@ -494,12 +582,29 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
494582

495583
let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
496584

497-
for val in string_view_array.iter() {
498-
if let Some(val) = val {
499-
let result = re.replacen(val, limit, replacement.as_str());
500-
builder.append_value(result);
501-
} else {
502-
builder.append_null();
585+
if let Some(ref extract_re) = extract_re {
586+
let mut result = String::new();
587+
for val in string_view_array.iter() {
588+
if let Some(val) = val {
589+
if let Some(caps) = extract_re.captures(val) {
590+
result.clear();
591+
caps.expand(replacement.as_str(), &mut result);
592+
builder.append_value(&result);
593+
} else {
594+
builder.append_value(val);
595+
}
596+
} else {
597+
builder.append_null();
598+
}
599+
}
600+
} else {
601+
for val in string_view_array.iter() {
602+
if let Some(val) = val {
603+
let result = re.replacen(val, limit, replacement.as_str());
604+
builder.append_value(result);
605+
} else {
606+
builder.append_null();
607+
}
503608
}
504609
}
505610

datafusion/sqllogictest/test_files/regexp/regexp_replace.slt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,18 @@ from (values ('a'), ('b')) as tbl(col);
127127
----
128128
NULL NULL NULL
129129
NULL NULL NULL
130+
131+
# Extract domain from URL using anchored pattern with trailing .*
132+
# This tests that the full URL suffix is replaced, not just the matched prefix
133+
query T
134+
SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES
135+
('https://www.example.com/path/to/page?q=1'),
136+
('http://test.org/foo/bar'),
137+
('https://example.com/'),
138+
('not-a-url')
139+
) AS t(url);
140+
----
141+
example.com
142+
test.org
143+
example.com
144+
not-a-url

0 commit comments

Comments
 (0)