Skip to content

Commit 1b123ad

Browse files
Dandandan and claude committed
Simplify regexp_replace optimization: drop regex-syntax, use captures_read
Replace HIR-based regex splitting with a simple string check: strip the trailing `.*$` from anchored patterns and use `captures_read` with pre-allocated `CaptureLocations` for direct extraction. This eliminates the regex-syntax dependency, the `expand()` call, and the `String` allocation.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 91dc88a commit 1b123ad

2 files changed

Lines changed: 32 additions & 112 deletions

File tree

datafusion/functions/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,6 @@ memchr = { workspace = true }
8686
num-traits = { workspace = true }
8787
rand = { workspace = true }
8888
regex = { workspace = true, optional = true }
89-
regex-syntax = "0.8.9"
9089
sha2 = { workspace = true, optional = true }
9190
unicode-segmentation = { version = "^1.13.2", optional = true }
9291
uuid = { workspace = true, features = ["v4"], optional = true }

datafusion/functions/src/regex/regexpreplace.rs

Lines changed: 32 additions & 111 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ use datafusion_expr::{
4141
};
4242
use datafusion_macros::user_doc;
4343
use regex::Regex;
44-
use regex_syntax::hir::{Hir, HirKind, Look};
4544
use std::collections::HashMap;
4645
use std::sync::{Arc, LazyLock};
4746

@@ -200,91 +199,20 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
200199
.into_owned()
201200
}
202201

203-
/// Parts of an anchored `^prefix(capture)suffix.*$` pattern, split so
204-
/// the hot loop can use two `find()` calls (no capture groups).
205-
struct ExtractParts {
206-
prefix_re: Regex,
207-
content_re: Regex,
208-
suffix_literal: Vec<u8>,
209-
}
210-
211-
/// For anchored patterns like `^prefix(capture)suffix.*$` where the
212-
/// replacement is `\1`, split into a prefix regex and a capture-content
213-
/// regex (without capture groups). The hot loop uses `find()` on each
214-
/// instead of `captures()` + `expand()`, avoiding capture-group overhead.
215-
///
216-
/// Requirements:
217-
/// - Pattern is `^...$` with trailing `.*` before `$`
218-
/// - Exactly one capture group, at the top-level concat
219-
/// - Replacement is exactly `\1` (converted to `${1}`)
220-
/// - Any suffix between the capture and `.*` must be literal
221-
fn try_build_extract_parts(pattern: &str, replacement: &str) -> Option<ExtractParts> {
222-
if replacement != "${1}" {
223-
return None;
224-
}
225-
226-
let hir = regex_syntax::Parser::new().parse(pattern).ok()?;
227-
let HirKind::Concat(parts) = hir.kind() else {
228-
return None;
229-
};
230-
231-
if parts.len() < 3
232-
|| !matches!(parts.first()?.kind(), HirKind::Look(Look::Start))
233-
|| !matches!(parts.last()?.kind(), HirKind::Look(Look::End))
234-
{
235-
return None;
236-
}
237-
238-
let before_end = &parts[parts.len() - 2];
239-
let is_dot_star = matches!(before_end.kind(), HirKind::Repetition(rep)
240-
if rep.min == 0
241-
&& rep.max.is_none()
242-
&& rep.greedy
243-
&& matches!(rep.sub.kind(), HirKind::Class(_))
244-
);
245-
if !is_dot_star {
202+
/// For anchored patterns like `^...(capture)....*$` where the replacement
203+
/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
204+
/// `captures_read` with `CaptureLocations` for direct extraction — no
205+
/// `expand()`, no `String` allocation.
206+
fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option<Regex> {
207+
if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") {
246208
return None;
247209
}
248-
249-
// inner parts: between ^ and .*$
250-
let inner_end = parts.len() - 2;
251-
252-
// Find the sole capture group (must be a direct child of the concat)
253-
let capture_positions: Vec<usize> = (1..inner_end)
254-
.filter(|&i| matches!(parts[i].kind(), HirKind::Capture(_)))
255-
.collect();
256-
if capture_positions.len() != 1 {
210+
let short = &pattern[..pattern.len() - 3];
211+
let re = Regex::new(short).ok()?;
212+
if re.captures_len() != 2 {
257213
return None;
258214
}
259-
let capture_idx = capture_positions[0];
260-
261-
let HirKind::Capture(cap) = parts[capture_idx].kind() else {
262-
return None;
263-
};
264-
265-
// Suffix between capture and .*$ must be all literals
266-
let mut suffix_literal = Vec::new();
267-
for part in &parts[capture_idx + 1..inner_end] {
268-
match part.kind() {
269-
HirKind::Literal(lit) => suffix_literal.extend_from_slice(&lit.0),
270-
_ => return None,
271-
}
272-
}
273-
274-
// Build prefix regex: ^ + parts before capture
275-
let prefix_hir = Hir::concat(parts[..capture_idx].to_vec());
276-
let prefix_re = Regex::new(&prefix_hir.to_string()).ok()?;
277-
278-
// Build content regex: ^(capture inner) — anchored so it matches
279-
// right where the prefix ended
280-
let content_hir = Hir::concat(vec![Hir::look(Look::Start), (*cap.sub).clone()]);
281-
let content_re = Regex::new(&content_hir.to_string()).ok()?;
282-
283-
Some(ExtractParts {
284-
prefix_re,
285-
content_re,
286-
suffix_literal,
287-
})
215+
Some(re)
288216
}
289217

290218
/// Replaces substring(s) matching a PCRE-like regular expression.
@@ -545,10 +473,10 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
545473
// with rust ones.
546474
let replacement = regex_replace_posix_groups(replacement);
547475

548-
// For anchored patterns like ^prefix(capture)suffix.*$, split into
549-
// prefix + content regexes (no capture groups) for faster matching.
550-
let extract_parts = if limit == 1 {
551-
try_build_extract_parts(&pattern, &replacement)
476+
// For anchored patterns like ^...(capture)....*$, build a shorter
477+
// regex and use captures_read for direct extraction.
478+
let short_re = if limit == 1 {
479+
try_build_short_extract_regex(&pattern, &replacement)
552480
} else {
553481
None
554482
};
@@ -569,23 +497,19 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
569497
let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
570498
new_offsets.append(T::zero());
571499

572-
if let Some(ref parts) = extract_parts {
573-
// Fast path: two find() calls, no capture groups.
500+
if let Some(ref short_re) = short_re {
501+
let mut locs = short_re.capture_locations();
574502
string_array.iter().for_each(|val| {
575503
if let Some(val) = val {
576-
let extracted = parts.prefix_re.find(val).and_then(|pm| {
577-
let rest = &val[pm.end()..];
578-
let cm = parts.content_re.find(rest)?;
579-
let after = &rest.as_bytes()[cm.end()..];
580-
if parts.suffix_literal.is_empty()
581-
|| after.starts_with(&parts.suffix_literal)
582-
{
583-
Some(cm.as_str())
504+
if short_re.captures_read(&mut locs, val).is_some() {
505+
if let Some((start, end)) = locs.get(1) {
506+
vals.append_slice(&val.as_bytes()[start..end]);
584507
} else {
585-
None
508+
vals.append_slice(val.as_bytes());
586509
}
587-
});
588-
vals.append_slice(extracted.unwrap_or(val).as_bytes());
510+
} else {
511+
vals.append_slice(val.as_bytes());
512+
}
589513
}
590514
new_offsets.append(T::from_usize(vals.len()).unwrap());
591515
});
@@ -612,22 +536,19 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
612536

613537
let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
614538

615-
if let Some(ref parts) = extract_parts {
539+
if let Some(ref short_re) = short_re {
540+
let mut locs = short_re.capture_locations();
616541
for val in string_view_array.iter() {
617542
if let Some(val) = val {
618-
let extracted = parts.prefix_re.find(val).and_then(|pm| {
619-
let rest = &val[pm.end()..];
620-
let cm = parts.content_re.find(rest)?;
621-
let after = &rest.as_bytes()[cm.end()..];
622-
if parts.suffix_literal.is_empty()
623-
|| after.starts_with(&parts.suffix_literal)
624-
{
625-
Some(cm.as_str())
543+
if short_re.captures_read(&mut locs, val).is_some() {
544+
if let Some((start, end)) = locs.get(1) {
545+
builder.append_value(&val[start..end]);
626546
} else {
627-
None
547+
builder.append_value(val);
628548
}
629-
});
630-
builder.append_value(extracted.unwrap_or(val));
549+
} else {
550+
builder.append_value(val);
551+
}
631552
} else {
632553
builder.append_null();
633554
}

0 commit comments

Comments
 (0)