@@ -41,6 +41,7 @@ use datafusion_expr::{
4141} ;
4242use datafusion_macros:: user_doc;
4343use regex:: Regex ;
44+ use regex_syntax:: hir:: { Hir , HirKind , Look } ;
4445use std:: collections:: HashMap ;
4546use std:: sync:: { Arc , LazyLock } ;
4647
@@ -199,6 +200,63 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
199200 . into_owned ( )
200201}
201202
203+ /// Count capture groups in a HIR tree.
204+ fn count_capture_groups ( hir : & Hir ) -> usize {
205+ match hir. kind ( ) {
206+ HirKind :: Capture ( cap) => 1 + count_capture_groups ( & cap. sub ) ,
207+ HirKind :: Concat ( subs) | HirKind :: Alternation ( subs) => {
208+ subs. iter ( ) . map ( count_capture_groups) . sum ( )
209+ }
210+ HirKind :: Repetition ( rep) => count_capture_groups ( & rep. sub ) ,
211+ _ => 0 ,
212+ }
213+ }
214+
215+ /// For anchored patterns (`^...$`), try to build a shorter regex by
216+ /// stripping trailing `.*` via HIR analysis. This reduces backtracker
217+ /// work since it doesn't need to scan through the rest of the string.
218+ ///
219+ /// Only strips `.*` (greedy, min=0) which matches any suffix — this
220+ /// guarantees that if the extract regex matches, the full pattern would too.
221+ fn try_build_extract_regex ( pattern : & str ) -> Option < Regex > {
222+ let hir = regex_syntax:: Parser :: new ( ) . parse ( pattern) . ok ( ) ?;
223+ let HirKind :: Concat ( parts) = hir. kind ( ) else {
224+ return None ;
225+ } ;
226+
227+ if parts. len ( ) < 3
228+ || !matches ! ( parts. first( ) ?. kind( ) , HirKind :: Look ( Look :: Start ) )
229+ || !matches ! ( parts. last( ) ?. kind( ) , HirKind :: Look ( Look :: End ) )
230+ {
231+ return None ;
232+ }
233+
234+ let before_end = & parts[ parts. len ( ) - 2 ] ;
235+ let is_dot_star = matches ! ( before_end. kind( ) , HirKind :: Repetition ( rep)
236+ if rep. min == 0
237+ && rep. max. is_none( )
238+ && rep. greedy
239+ && matches!( rep. sub. kind( ) , HirKind :: Class ( _) )
240+ ) ;
241+ if !is_dot_star {
242+ return None ;
243+ }
244+
245+ // Keep ^ and inner parts, drop the .* and $
246+ let trimmed_parts: Vec < Hir > = parts[ ..parts. len ( ) - 2 ] . to_vec ( ) ;
247+ if trimmed_parts
248+ . iter ( )
249+ . map ( count_capture_groups)
250+ . sum :: < usize > ( )
251+ == 0
252+ {
253+ return None ;
254+ }
255+
256+ let trimmed_hir = Hir :: concat ( trimmed_parts) ;
257+ Regex :: new ( & trimmed_hir. to_string ( ) ) . ok ( )
258+ }
259+
202260/// Replaces substring(s) matching a PCRE-like regular expression.
203261///
204262/// The full list of supported features and syntax can be found at
@@ -457,6 +515,16 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
457515 // with rust ones.
458516 let replacement = regex_replace_posix_groups ( replacement) ;
459517
518+ // For anchored patterns with trailing .*, build a shorter regex that
519+ // reduces backtracker work by not scanning the rest of the string.
520+ // We can't use this with replacen() (it would change match boundaries),
521+ // so we use captures() + Captures::expand() instead.
522+ let extract_re = if limit == 1 {
523+ try_build_extract_regex ( & pattern)
524+ } else {
525+ None
526+ } ;
527+
460528 let string_array_type = args[ 0 ] . data_type ( ) ;
461529 match string_array_type {
462530 DataType :: Utf8 | DataType :: LargeUtf8 => {
@@ -473,13 +541,33 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
473541 let mut new_offsets = BufferBuilder :: < T > :: new ( string_array. len ( ) + 1 ) ;
474542 new_offsets. append ( T :: zero ( ) ) ;
475543
476- string_array. iter ( ) . for_each ( |val| {
477- if let Some ( val) = val {
478- let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
479- vals. append_slice ( result. as_bytes ( ) ) ;
480- }
481- new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
482- } ) ;
544+ if let Some ( ref extract_re) = extract_re {
545+ // Use shorter regex for capture extraction only.
546+ // Since the original pattern was ^...$, the replacement
547+ // replaces the entire string — we just need correct
548+ // capture group positions, which the shorter regex provides.
549+ let mut result = String :: new ( ) ;
550+ string_array. iter ( ) . for_each ( |val| {
551+ if let Some ( val) = val {
552+ if let Some ( caps) = extract_re. captures ( val) {
553+ result. clear ( ) ;
554+ caps. expand ( replacement. as_str ( ) , & mut result) ;
555+ vals. append_slice ( result. as_bytes ( ) ) ;
556+ } else {
557+ vals. append_slice ( val. as_bytes ( ) ) ;
558+ }
559+ }
560+ new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
561+ } ) ;
562+ } else {
563+ string_array. iter ( ) . for_each ( |val| {
564+ if let Some ( val) = val {
565+ let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
566+ vals. append_slice ( result. as_bytes ( ) ) ;
567+ }
568+ new_offsets. append ( T :: from_usize ( vals. len ( ) ) . unwrap ( ) ) ;
569+ } ) ;
570+ }
483571
484572 let data = ArrayDataBuilder :: new ( GenericStringArray :: < T > :: DATA_TYPE )
485573 . len ( string_array. len ( ) )
@@ -494,12 +582,29 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
494582
495583 let mut builder = StringViewBuilder :: with_capacity ( string_view_array. len ( ) ) ;
496584
497- for val in string_view_array. iter ( ) {
498- if let Some ( val) = val {
499- let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
500- builder. append_value ( result) ;
501- } else {
502- builder. append_null ( ) ;
585+ if let Some ( ref extract_re) = extract_re {
586+ let mut result = String :: new ( ) ;
587+ for val in string_view_array. iter ( ) {
588+ if let Some ( val) = val {
589+ if let Some ( caps) = extract_re. captures ( val) {
590+ result. clear ( ) ;
591+ caps. expand ( replacement. as_str ( ) , & mut result) ;
592+ builder. append_value ( & result) ;
593+ } else {
594+ builder. append_value ( val) ;
595+ }
596+ } else {
597+ builder. append_null ( ) ;
598+ }
599+ }
600+ } else {
601+ for val in string_view_array. iter ( ) {
602+ if let Some ( val) = val {
603+ let result = re. replacen ( val, limit, replacement. as_str ( ) ) ;
604+ builder. append_value ( result) ;
605+ } else {
606+ builder. append_null ( ) ;
607+ }
503608 }
504609 }
505610
0 commit comments