@@ -41,7 +41,6 @@ use datafusion_expr::{
4141} ;
4242use datafusion_macros:: user_doc;
4343use regex:: Regex ;
44- use regex_syntax:: hir:: { Hir , HirKind , Look } ;
4544use std:: collections:: HashMap ;
4645use std:: sync:: { Arc , LazyLock } ;
4746
@@ -200,91 +199,20 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
200199 . into_owned ( )
201200}
202201
203- /// Parts of an anchored `^prefix(capture)suffix.*$` pattern, split so
204- /// the hot loop can use two `find()` calls (no capture groups).
205- struct ExtractParts {
206- prefix_re : Regex ,
207- content_re : Regex ,
208- suffix_literal : Vec < u8 > ,
209- }
210-
211- /// For anchored patterns like `^prefix(capture)suffix.*$` where the
212- /// replacement is `\1`, split into a prefix regex and a capture-content
213- /// regex (without capture groups). The hot loop uses `find()` on each
214- /// instead of `captures()` + `expand()`, avoiding capture-group overhead.
215- ///
216- /// Requirements:
217- /// - Pattern is `^...$` with trailing `.*` before `$`
218- /// - Exactly one capture group, at the top-level concat
219- /// - Replacement is exactly `\1` (converted to `${1}`)
220- /// - Any suffix between the capture and `.*` must be literal
221- fn try_build_extract_parts ( pattern : & str , replacement : & str ) -> Option < ExtractParts > {
222- if replacement != "${1}" {
223- return None ;
224- }
225-
226- let hir = regex_syntax:: Parser :: new ( ) . parse ( pattern) . ok ( ) ?;
227- let HirKind :: Concat ( parts) = hir. kind ( ) else {
228- return None ;
229- } ;
230-
231- if parts. len ( ) < 3
232- || !matches ! ( parts. first( ) ?. kind( ) , HirKind :: Look ( Look :: Start ) )
233- || !matches ! ( parts. last( ) ?. kind( ) , HirKind :: Look ( Look :: End ) )
234- {
235- return None ;
236- }
237-
238- let before_end = & parts[ parts. len ( ) - 2 ] ;
239- let is_dot_star = matches ! ( before_end. kind( ) , HirKind :: Repetition ( rep)
240- if rep. min == 0
241- && rep. max. is_none( )
242- && rep. greedy
243- && matches!( rep. sub. kind( ) , HirKind :: Class ( _) )
244- ) ;
245- if !is_dot_star {
202+ /// For anchored patterns like `^...(capture)....*$` where the replacement
203+ /// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
204+ /// `captures_read` with `CaptureLocations` for direct extraction — no
205+ /// `expand()`, no `String` allocation.
206+ fn try_build_short_extract_regex ( pattern : & str , replacement : & str ) -> Option < Regex > {
207+ if replacement != "${1}" || !pattern. starts_with ( '^' ) || !pattern. ends_with ( ".*$" ) {
246208 return None ;
247209 }
248-
249- // inner parts: between ^ and .*$
250- let inner_end = parts. len ( ) - 2 ;
251-
252- // Find the sole capture group (must be a direct child of the concat)
253- let capture_positions: Vec < usize > = ( 1 ..inner_end)
254- . filter ( |& i| matches ! ( parts[ i] . kind( ) , HirKind :: Capture ( _) ) )
255- . collect ( ) ;
256- if capture_positions. len ( ) != 1 {
210+ let short = & pattern[ ..pattern. len ( ) - 3 ] ;
211+ let re = Regex :: new ( short) . ok ( ) ?;
212+ if re. captures_len ( ) != 2 {
257213 return None ;
258214 }
259- let capture_idx = capture_positions[ 0 ] ;
260-
261- let HirKind :: Capture ( cap) = parts[ capture_idx] . kind ( ) else {
262- return None ;
263- } ;
264-
265- // Suffix between capture and .*$ must be all literals
266- let mut suffix_literal = Vec :: new ( ) ;
267- for part in & parts[ capture_idx + 1 ..inner_end] {
268- match part. kind ( ) {
269- HirKind :: Literal ( lit) => suffix_literal. extend_from_slice ( & lit. 0 ) ,
270- _ => return None ,
271- }
272- }
273-
274- // Build prefix regex: ^ + parts before capture
275- let prefix_hir = Hir :: concat ( parts[ ..capture_idx] . to_vec ( ) ) ;
276- let prefix_re = Regex :: new ( & prefix_hir. to_string ( ) ) . ok ( ) ?;
277-
278- // Build content regex: ^(capture inner) — anchored so it matches
279- // right where the prefix ended
280- let content_hir = Hir :: concat ( vec ! [ Hir :: look( Look :: Start ) , ( * cap. sub) . clone( ) ] ) ;
281- let content_re = Regex :: new ( & content_hir. to_string ( ) ) . ok ( ) ?;
282-
283- Some ( ExtractParts {
284- prefix_re,
285- content_re,
286- suffix_literal,
287- } )
215+ Some ( re)
288216}
289217
290218/// Replaces substring(s) matching a PCRE-like regular expression.
@@ -545,10 +473,10 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
545473 // with rust ones.
546474 let replacement = regex_replace_posix_groups ( replacement) ;
547475
548- // For anchored patterns like ^prefix (capture)suffix. *$, split into
549- // prefix + content regexes (no capture groups) for faster matching .
550- let extract_parts = if limit == 1 {
551- try_build_extract_parts ( & pattern, & replacement)
476+ // For anchored patterns like ^... (capture).... *$, build a shorter
477+ // regex and use captures_read for direct extraction .
478+ let short_re = if limit == 1 {
479+ try_build_short_extract_regex ( & pattern, & replacement)
552480 } else {
553481 None
554482 } ;
@@ -569,23 +497,19 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
569497 let mut new_offsets = BufferBuilder :: < T > :: new ( string_array. len ( ) + 1 ) ;
570498 new_offsets. append ( T :: zero ( ) ) ;
571499
572- if let Some ( ref parts ) = extract_parts {
573- // Fast path: two find() calls, no capture groups.
500+ if let Some ( ref short_re ) = short_re {
501+ let mut locs = short_re . capture_locations ( ) ;
574502 string_array. iter ( ) . for_each ( |val| {
575503 if let Some ( val) = val {
576- let extracted = parts. prefix_re . find ( val) . and_then ( |pm| {
577- let rest = & val[ pm. end ( ) ..] ;
578- let cm = parts. content_re . find ( rest) ?;
579- let after = & rest. as_bytes ( ) [ cm. end ( ) ..] ;
580- if parts. suffix_literal . is_empty ( )
581- || after. starts_with ( & parts. suffix_literal )
582- {
583- Some ( cm. as_str ( ) )
504+ if short_re. captures_read ( & mut locs, val) . is_some ( ) {
505+ if let Some ( ( start, end) ) = locs. get ( 1 ) {
506+ vals. append_slice ( & val. as_bytes ( ) [ start..end] ) ;
584507 } else {
585- None
508+ vals . append_slice ( val . as_bytes ( ) ) ;
586509 }
587- } ) ;
588- vals. append_slice ( extracted. unwrap_or ( val) . as_bytes ( ) ) ;
510+ } else {
511+ vals. append_slice ( val. as_bytes ( ) ) ;
512+ }
589513 }
590514 new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
591515 } ) ;
@@ -612,22 +536,19 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
612536
613537 let mut builder = StringViewBuilder :: with_capacity ( string_view_array. len ( ) ) ;
614538
615- if let Some ( ref parts) = extract_parts {
539+ if let Some ( ref short_re) = short_re {
540+ let mut locs = short_re. capture_locations ( ) ;
616541 for val in string_view_array. iter ( ) {
617542 if let Some ( val) = val {
618- let extracted = parts. prefix_re . find ( val) . and_then ( |pm| {
619- let rest = & val[ pm. end ( ) ..] ;
620- let cm = parts. content_re . find ( rest) ?;
621- let after = & rest. as_bytes ( ) [ cm. end ( ) ..] ;
622- if parts. suffix_literal . is_empty ( )
623- || after. starts_with ( & parts. suffix_literal )
624- {
625- Some ( cm. as_str ( ) )
543+ if short_re. captures_read ( & mut locs, val) . is_some ( ) {
544+ if let Some ( ( start, end) ) = locs. get ( 1 ) {
545+ builder. append_value ( & val[ start..end] ) ;
626546 } else {
627- None
547+ builder . append_value ( val ) ;
628548 }
629- } ) ;
630- builder. append_value ( extracted. unwrap_or ( val) ) ;
549+ } else {
550+ builder. append_value ( val) ;
551+ }
631552 } else {
632553 builder. append_null ( ) ;
633554 }
0 commit comments