Skip to content

Commit f488a90

Browse files
perf: Optimize scalar fast path for `regexp_like` and reject `g` inside combined flags like `ig` (#20354)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- Part of apache/datafusion-comet#2986

## Rationale for this change

`regexp_like` was converting scalar inputs into single-element arrays, adding avoidable overhead for constant folding and scalar-only evaluations.

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

## What changes are included in this PR?

- Add a scalar fast path in `RegexpLikeFunc::invoke_with_args` that evaluates `regexp_like` directly for scalar inputs.
- Add a benchmark.
- Fix `regexp_like` to reject the global flag even when it is provided in combined flags (e.g., `ig`) across the scalar and array+scalar execution paths; add tests for both branches.

| Type | Before | After | Speedup |
|------|--------|-------|---------|
| regexp_like_scalar_utf8 | 12.092 µs | 10.943 µs | 1.10x |

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

## Are these changes tested?

Yes

<!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

## Are there any user-facing changes?

No

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->

---------

Co-authored-by: Jeffrey Vo <jeffrey.vo.australia@gmail.com>
1 parent cfdd7c1 commit f488a90

2 files changed

Lines changed: 251 additions & 30 deletions

File tree

datafusion/functions/benches/regx.rs

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,27 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::hint::black_box;
19+
use std::iter;
20+
use std::sync::Arc;
21+
1822
use arrow::array::builder::StringBuilder;
1923
use arrow::array::{ArrayRef, AsArray, Int64Array, StringArray, StringViewArray};
2024
use arrow::compute::cast;
21-
use arrow::datatypes::DataType;
25+
use arrow::datatypes::{DataType, Field};
2226
use criterion::{Criterion, criterion_group, criterion_main};
27+
use datafusion_common::ScalarValue;
28+
use datafusion_common::config::ConfigOptions;
29+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
2330
use datafusion_functions::regex::regexpcount::regexp_count_func;
2431
use datafusion_functions::regex::regexpinstr::regexp_instr_func;
25-
use datafusion_functions::regex::regexplike::regexp_like;
32+
use datafusion_functions::regex::regexplike::{RegexpLikeFunc, regexp_like};
2633
use datafusion_functions::regex::regexpmatch::regexp_match;
2734
use datafusion_functions::regex::regexpreplace::regexp_replace;
2835
use rand::Rng;
2936
use rand::distr::Alphanumeric;
3037
use rand::prelude::IndexedRandom;
3138
use rand::rngs::ThreadRng;
32-
use std::hint::black_box;
33-
use std::iter;
34-
use std::sync::Arc;
3539
fn data(rng: &mut ThreadRng) -> StringArray {
3640
let mut data: Vec<String> = vec![];
3741
for _ in 0..1000 {
@@ -105,6 +109,8 @@ fn subexp(rng: &mut ThreadRng) -> Int64Array {
105109
}
106110

107111
fn criterion_benchmark(c: &mut Criterion) {
112+
let regexp_like_func = RegexpLikeFunc::new();
113+
let config_options = Arc::new(ConfigOptions::default());
108114
c.bench_function("regexp_count_1000 string", |b| {
109115
let mut rng = rand::rng();
110116
let data = Arc::new(data(&mut rng)) as ArrayRef;
@@ -219,6 +225,32 @@ fn criterion_benchmark(c: &mut Criterion) {
219225
})
220226
});
221227

228+
let scalar_args = vec![
229+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("foobarbequebaz".to_string()))),
230+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("(bar)(beque)".to_string()))),
231+
];
232+
let scalar_arg_fields = vec![
233+
Field::new("arg_0", DataType::Utf8, false).into(),
234+
Field::new("arg_1", DataType::Utf8, false).into(),
235+
];
236+
let return_field = Field::new("f", DataType::Boolean, true).into();
237+
238+
c.bench_function("regexp_like scalar utf8", |b| {
239+
b.iter(|| {
240+
black_box(
241+
regexp_like_func
242+
.invoke_with_args(ScalarFunctionArgs {
243+
args: scalar_args.clone(),
244+
arg_fields: scalar_arg_fields.clone(),
245+
number_rows: 1,
246+
return_field: Arc::clone(&return_field),
247+
config_options: Arc::clone(&config_options),
248+
})
249+
.expect("regexp_like scalar should work on valid values"),
250+
)
251+
})
252+
});
253+
222254
c.bench_function("regexp_match_1000", |b| {
223255
let mut rng = rand::rng();
224256
let data = Arc::new(data(&mut rng)) as ArrayRef;

datafusion/functions/src/regex/regexplike.rs

Lines changed: 214 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
//! Regex expressions
1919
20-
use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
20+
use arrow::array::{Array, ArrayRef, AsArray, BooleanArray, GenericStringArray};
2121
use arrow::compute::kernels::regexp;
2222
use arrow::datatypes::DataType;
2323
use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
@@ -34,6 +34,7 @@ use datafusion_macros::user_doc;
3434
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
3535
use datafusion_expr_common::operator::Operator;
3636
use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
37+
use regex::Regex;
3738
use std::any::Any;
3839
use std::sync::Arc;
3940

@@ -130,28 +131,45 @@ impl ScalarUDFImpl for RegexpLikeFunc {
130131
args: datafusion_expr::ScalarFunctionArgs,
131132
) -> Result<ColumnarValue> {
132133
let args = &args.args;
133-
134-
let len = args
135-
.iter()
136-
.fold(Option::<usize>::None, |acc, arg| match arg {
137-
ColumnarValue::Scalar(_) => acc,
138-
ColumnarValue::Array(a) => Some(a.len()),
139-
});
140-
141-
let is_scalar = len.is_none();
142-
let inferred_length = len.unwrap_or(1);
143-
let args = args
144-
.iter()
145-
.map(|arg| arg.to_array(inferred_length))
146-
.collect::<Result<Vec<_>>>()?;
147-
148-
let result = regexp_like(&args);
149-
if is_scalar {
150-
// If all inputs are scalar, keeps output as scalar
151-
let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
152-
result.map(ColumnarValue::Scalar)
153-
} else {
154-
result.map(ColumnarValue::Array)
134+
match args.as_slice() {
135+
[ColumnarValue::Scalar(value), ColumnarValue::Scalar(pattern)] => {
136+
let value = scalar_string(value)?;
137+
let pattern = scalar_string(pattern)?;
138+
regexp_like_scalar(value, pattern, None)
139+
}
140+
[
141+
ColumnarValue::Scalar(value),
142+
ColumnarValue::Scalar(pattern),
143+
ColumnarValue::Scalar(flags),
144+
] => {
145+
let value = scalar_string(value)?;
146+
let pattern = scalar_string(pattern)?;
147+
let flags = scalar_string(flags)?;
148+
regexp_like_scalar(value, pattern, flags)
149+
}
150+
[ColumnarValue::Array(values), ColumnarValue::Scalar(pattern)] => {
151+
let pattern = scalar_string(pattern)?;
152+
let array = regexp_like_array_scalar(values, pattern, None)?;
153+
Ok(ColumnarValue::Array(array))
154+
}
155+
[
156+
ColumnarValue::Array(values),
157+
ColumnarValue::Scalar(pattern),
158+
ColumnarValue::Scalar(flags),
159+
] => {
160+
let flags = scalar_string(flags)?;
161+
if flags.is_some_and(|flagz| flagz.contains('g')) {
162+
plan_err!("regexp_like() does not support the \"global\" option")
163+
} else {
164+
let pattern = scalar_string(pattern)?;
165+
let array = regexp_like_array_scalar(values, pattern, flags)?;
166+
Ok(ColumnarValue::Array(array))
167+
}
168+
}
169+
_ => {
170+
let args = ColumnarValue::values_to_arrays(args)?;
171+
regexp_like(&args).map(ColumnarValue::Array)
172+
}
155173
}
156174
}
157175

@@ -302,7 +320,10 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
302320
}
303321
};
304322

305-
if flags.iter().any(|s| s == Some("g")) {
323+
if flags
324+
.iter()
325+
.any(|s| s.is_some_and(|flagz| flagz.contains('g')))
326+
{
306327
return plan_err!("regexp_like() does not support the \"global\" option");
307328
}
308329

@@ -314,6 +335,83 @@ pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
314335
}
315336
}
316337

338+
fn scalar_string(value: &ScalarValue) -> Result<Option<&str>> {
339+
match value.try_as_str() {
340+
Some(v) => Ok(v),
341+
None => internal_err!(
342+
"Unsupported data type {:?} for function `regexp_like`",
343+
value.data_type()
344+
),
345+
}
346+
}
347+
348+
fn regexp_like_array_scalar(
349+
values: &ArrayRef,
350+
pattern: Option<&str>,
351+
flags: Option<&str>,
352+
) -> Result<ArrayRef> {
353+
use DataType::*;
354+
355+
let Some(pattern) = pattern else {
356+
return Ok(Arc::new(BooleanArray::new_null(values.len())));
357+
};
358+
let array = match values.data_type() {
359+
Utf8 => {
360+
let array = values.as_string::<i32>();
361+
regexp::regexp_is_match_scalar(array, pattern, flags)?
362+
}
363+
Utf8View => {
364+
let array = values.as_string_view();
365+
regexp::regexp_is_match_scalar(array, pattern, flags)?
366+
}
367+
LargeUtf8 => {
368+
let array = values.as_string::<i64>();
369+
regexp::regexp_is_match_scalar(array, pattern, flags)?
370+
}
371+
other => {
372+
return internal_err!(
373+
"Unsupported data type {other:?} for function `regexp_like`"
374+
);
375+
}
376+
};
377+
378+
Ok(Arc::new(array))
379+
}
380+
381+
fn regexp_like_scalar(
382+
value: Option<&str>,
383+
pattern: Option<&str>,
384+
flags: Option<&str>,
385+
) -> Result<ColumnarValue> {
386+
if flags.is_some_and(|flagz| flagz.contains('g')) {
387+
return plan_err!("regexp_like() does not support the \"global\" option");
388+
}
389+
390+
if value.is_none() || pattern.is_none() {
391+
return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
392+
}
393+
394+
let value = value.unwrap();
395+
let pattern = pattern.unwrap();
396+
let pattern = match flags {
397+
Some(flagz) => format!("(?{flagz}){pattern}"),
398+
None => pattern.to_string(),
399+
};
400+
401+
let result = if pattern.is_empty() {
402+
true
403+
} else {
404+
let re = Regex::new(pattern.as_str()).map_err(|e| {
405+
datafusion_common::DataFusionError::Execution(format!(
406+
"Regular expression did not compile: {e:?}"
407+
))
408+
})?;
409+
re.is_match(value)
410+
};
411+
412+
Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(result))))
413+
}
414+
317415
fn handle_regexp_like(
318416
values: &ArrayRef,
319417
patterns: &ArrayRef,
@@ -399,8 +497,37 @@ mod tests {
399497

400498
use arrow::array::StringArray;
401499
use arrow::array::{BooleanBuilder, StringViewArray};
500+
use arrow::datatypes::{DataType, Field};
501+
use datafusion_common::config::ConfigOptions;
502+
use datafusion_common::{Result, ScalarValue};
503+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
402504

403-
use crate::regex::regexplike::regexp_like;
505+
use crate::regex::regexplike::{RegexpLikeFunc, regexp_like};
506+
507+
fn invoke_regexp_like(args: Vec<ColumnarValue>) -> Result<ColumnarValue> {
508+
let number_rows = args
509+
.iter()
510+
.find_map(|arg| match arg {
511+
ColumnarValue::Array(array) => Some(array.len()),
512+
_ => None,
513+
})
514+
.unwrap_or(1);
515+
let arg_fields = args
516+
.iter()
517+
.enumerate()
518+
.map(|(idx, arg)| {
519+
Arc::new(Field::new(format!("arg_{idx}"), arg.data_type(), true))
520+
})
521+
.collect::<Vec<_>>();
522+
523+
RegexpLikeFunc::new().invoke_with_args(ScalarFunctionArgs {
524+
args,
525+
arg_fields,
526+
number_rows,
527+
return_field: Arc::new(Field::new("f", DataType::Boolean, true)),
528+
config_options: Arc::new(ConfigOptions::default()),
529+
})
530+
}
404531

405532
#[test]
406533
fn test_case_sensitive_regexp_like_utf8() {
@@ -499,4 +626,66 @@ mod tests {
499626
"Error during planning: regexp_like() does not support the \"global\" option"
500627
);
501628
}
629+
630+
#[test]
631+
fn test_regexp_like_scalar_invoke() {
632+
let args = vec![
633+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("foobarbequebaz".to_string()))),
634+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("(bar)(beque)".to_string()))),
635+
];
636+
let result = invoke_regexp_like(args).unwrap();
637+
match result {
638+
ColumnarValue::Scalar(ScalarValue::Boolean(Some(true))) => {}
639+
other => panic!("Unexpected result {other:?}"),
640+
}
641+
}
642+
643+
#[test]
644+
fn test_regexp_like_array_scalar_invoke() {
645+
let values = Arc::new(StringArray::from(vec!["abc", "xyz"]));
646+
let args = vec![
647+
ColumnarValue::Array(values),
648+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("^(a)".to_string()))),
649+
];
650+
let result = invoke_regexp_like(args).unwrap();
651+
let mut expected_builder = BooleanBuilder::new();
652+
expected_builder.append_value(true);
653+
expected_builder.append_value(false);
654+
let expected = expected_builder.finish();
655+
match result {
656+
ColumnarValue::Array(array) => {
657+
assert_eq!(array.as_ref(), &expected);
658+
}
659+
other => panic!("Unexpected result {other:?}"),
660+
}
661+
}
662+
663+
#[test]
664+
fn test_regexp_like_scalar_flags_with_global() {
665+
let args = vec![
666+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("abc".to_string()))),
667+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("^(a)".to_string()))),
668+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("ig".to_string()))),
669+
];
670+
let err = invoke_regexp_like(args).expect_err("global flag should be rejected");
671+
assert_eq!(
672+
err.strip_backtrace(),
673+
"Error during planning: regexp_like() does not support the \"global\" option"
674+
);
675+
}
676+
677+
#[test]
678+
fn test_regexp_like_array_scalar_flags_with_global() {
679+
let values = Arc::new(StringArray::from(vec!["abc", "xyz"]));
680+
let args = vec![
681+
ColumnarValue::Array(values),
682+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("^(a)".to_string()))),
683+
ColumnarValue::Scalar(ScalarValue::Utf8(Some("ig".to_string()))),
684+
];
685+
let err = invoke_regexp_like(args).expect_err("global flag should be rejected");
686+
assert_eq!(
687+
err.strip_backtrace(),
688+
"Error during planning: regexp_like() does not support the \"global\" option"
689+
);
690+
}
502691
}

0 commit comments

Comments
 (0)