Skip to content

Commit f779e96

Browse files
authored
perf: Optimize scalar path for chr function (#20073)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Part of apache/datafusion-comet#2986. ## Rationale for this change `chr` currently routes scalar inputs through `make_scalar_function(chr, vec![])`, which performs a scalar → size-1 array → scalar roundtrip. This adds unnecessary overhead for constant folding / scalar evaluation. This PR adds a match-based scalar fast path and removes reliance on `make_scalar_function` for `chr`, while keeping the array behavior unchanged. <!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. --> ## What changes are included in this PR? - Refactor `ChrFunc::invoke_with_args` to: - Handle `ColumnarValue::Scalar(Int64)` directly (scalar fast path) - Handle `ColumnarValue::Array(Int64Array)` using the existing conversion logic - Add scalar benchmark to `benches/chr.rs` (`chr/scalar`) outside any size loop | Type | Before | After | Speedup | |------|--------|-------|---------| | `chr/scalar` | 342.05 ns | 87.339 ns | **3.92x** | <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? Yes <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? No <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent 9de192a commit f779e96

3 files changed

Lines changed: 127 additions & 40 deletions

File tree

datafusion/functions/benches/chr.rs

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ extern crate criterion;
1919

2020
use arrow::{array::PrimitiveArray, datatypes::Int64Type};
2121
use criterion::{Criterion, criterion_group, criterion_main};
22+
use datafusion_common::ScalarValue;
2223
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
2324
use datafusion_functions::string::chr;
2425
use rand::{Rng, SeedableRng};
@@ -35,11 +36,32 @@ pub fn seedable_rng() -> StdRng {
3536
}
3637

3738
fn criterion_benchmark(c: &mut Criterion) {
38-
let cot_fn = chr();
39+
let chr_fn = chr();
40+
let config_options = Arc::new(ConfigOptions::default());
41+
42+
// Scalar benchmarks
43+
c.bench_function("chr/scalar", |b| {
44+
let args = vec![ColumnarValue::Scalar(ScalarValue::Int64(Some(65)))];
45+
let arg_fields = vec![Field::new("arg_0", DataType::Int64, true).into()];
46+
b.iter(|| {
47+
black_box(
48+
chr_fn
49+
.invoke_with_args(ScalarFunctionArgs {
50+
args: args.clone(),
51+
arg_fields: arg_fields.clone(),
52+
number_rows: 1,
53+
return_field: Field::new("f", DataType::Utf8, true).into(),
54+
config_options: Arc::clone(&config_options),
55+
})
56+
.unwrap(),
57+
)
58+
})
59+
});
60+
3961
let size = 1024;
4062
let input: PrimitiveArray<Int64Type> = {
4163
let null_density = 0.2;
42-
let mut rng = StdRng::seed_from_u64(42);
64+
let mut rng = seedable_rng();
4365
(0..size)
4466
.map(|_| {
4567
if rng.random::<f32>() < null_density {
@@ -57,12 +79,11 @@ fn criterion_benchmark(c: &mut Criterion) {
5779
.enumerate()
5880
.map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
5981
.collect::<Vec<_>>();
60-
let config_options = Arc::new(ConfigOptions::default());
6182

62-
c.bench_function("chr", |b| {
83+
c.bench_function("chr/array", |b| {
6384
b.iter(|| {
6485
black_box(
65-
cot_fn
86+
chr_fn
6687
.invoke_with_args(ScalarFunctionArgs {
6788
args: args.clone(),
6889
arg_fields: arg_fields.clone(),

datafusion/functions/src/string/chr.rs

Lines changed: 91 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -18,24 +18,21 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21-
use arrow::array::ArrayRef;
22-
use arrow::array::GenericStringBuilder;
21+
use arrow::array::{ArrayRef, GenericStringBuilder, Int64Array};
2322
use arrow::datatypes::DataType;
2423
use arrow::datatypes::DataType::Int64;
2524
use arrow::datatypes::DataType::Utf8;
2625

27-
use crate::utils::make_scalar_function;
2826
use datafusion_common::cast::as_int64_array;
29-
use datafusion_common::{Result, exec_err};
27+
use datafusion_common::utils::take_function_args;
28+
use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
3029
use datafusion_expr::{ColumnarValue, Documentation, Volatility};
3130
use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
3231
use datafusion_macros::user_doc;
3332

3433
/// Returns the character with the given code.
3534
/// chr(65) = 'A'
36-
fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
37-
let integer_array = as_int64_array(&args[0])?;
38-
35+
fn chr_array(integer_array: &Int64Array) -> Result<ArrayRef> {
3936
let mut builder = GenericStringBuilder::<i32>::with_capacity(
4037
integer_array.len(),
4138
// 1 byte per character, assuming that is the common case
@@ -56,15 +53,11 @@ fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
5653

5754
return exec_err!("invalid Unicode scalar value: {integer}");
5855
}
59-
None => {
60-
builder.append_null();
61-
}
56+
None => builder.append_null(),
6257
}
6358
}
6459

65-
let result = builder.finish();
66-
67-
Ok(Arc::new(result) as ArrayRef)
60+
Ok(Arc::new(builder.finish()) as ArrayRef)
6861
}
6962

7063
#[user_doc(
@@ -119,7 +112,32 @@ impl ScalarUDFImpl for ChrFunc {
119112
}
120113

121114
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
122-
make_scalar_function(chr, vec![])(&args.args)
115+
let [arg] = take_function_args(self.name(), args.args)?;
116+
117+
match arg {
118+
ColumnarValue::Scalar(ScalarValue::Int64(Some(code_point))) => {
119+
if let Ok(u) = u32::try_from(code_point)
120+
&& let Some(c) = core::char::from_u32(u)
121+
{
122+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
123+
c.to_string(),
124+
))))
125+
} else {
126+
exec_err!("invalid Unicode scalar value: {code_point}")
127+
}
128+
}
129+
ColumnarValue::Scalar(ScalarValue::Int64(None)) => {
130+
Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
131+
}
132+
ColumnarValue::Array(array) => {
133+
let integer_array = as_int64_array(&array)?;
134+
Ok(ColumnarValue::Array(chr_array(integer_array)?))
135+
}
136+
other => internal_err!(
137+
"Unexpected data type {:?} for function chr",
138+
other.data_type()
139+
),
140+
}
123141
}
124142

125143
fn documentation(&self) -> Option<&Documentation> {
@@ -130,13 +148,27 @@ impl ScalarUDFImpl for ChrFunc {
130148
#[cfg(test)]
131149
mod tests {
132150
use super::*;
151+
133152
use arrow::array::{Array, Int64Array, StringArray};
153+
use arrow::datatypes::Field;
134154
use datafusion_common::assert_contains;
155+
use datafusion_common::config::ConfigOptions;
156+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
157+
158+
fn invoke_chr(arg: ColumnarValue, number_rows: usize) -> Result<ColumnarValue> {
159+
ChrFunc::new().invoke_with_args(ScalarFunctionArgs {
160+
args: vec![arg],
161+
arg_fields: vec![Field::new("a", Int64, true).into()],
162+
number_rows,
163+
return_field: Field::new("f", Utf8, true).into(),
164+
config_options: Arc::new(ConfigOptions::default()),
165+
})
166+
}
135167

136168
#[test]
137169
fn test_chr_normal() {
138170
let input = Arc::new(Int64Array::from(vec![
139-
Some(0), // null
171+
Some(0), // \u{0000}
140172
Some(65), // A
141173
Some(66), // B
142174
Some(67), // C
@@ -149,8 +181,13 @@ mod tests {
149181
Some(9), // tab
150182
Some(0x10FFFF), // 0x10FFFF, the largest Unicode code point
151183
]));
152-
let result = chr(&[input]).unwrap();
153-
let string_array = result.as_any().downcast_ref::<StringArray>().unwrap();
184+
185+
let result = invoke_chr(ColumnarValue::Array(input), 12).unwrap();
186+
let ColumnarValue::Array(arr) = result else {
187+
panic!("Expected array");
188+
};
189+
let string_array = arr.as_any().downcast_ref::<StringArray>().unwrap();
190+
154191
let expected = [
155192
"\u{0000}",
156193
"A",
@@ -174,55 +211,48 @@ mod tests {
174211

175212
#[test]
176213
fn test_chr_error() {
177-
// invalid Unicode code points (too large)
178214
let input = Arc::new(Int64Array::from(vec![i64::MAX]));
179-
let result = chr(&[input]);
215+
let result = invoke_chr(ColumnarValue::Array(input), 1);
180216
assert!(result.is_err());
181217
assert_contains!(
182218
result.err().unwrap().to_string(),
183219
"invalid Unicode scalar value: 9223372036854775807"
184220
);
185221

186-
// invalid Unicode code points (too large) case 2
187222
let input = Arc::new(Int64Array::from(vec![0x10FFFF + 1]));
188-
let result = chr(&[input]);
223+
let result = invoke_chr(ColumnarValue::Array(input), 1);
189224
assert!(result.is_err());
190225
assert_contains!(
191226
result.err().unwrap().to_string(),
192227
"invalid Unicode scalar value: 1114112"
193228
);
194229

195-
// invalid Unicode code points (surrogate code point)
196-
// link: <https://learn.microsoft.com/en-us/globalization/encoding/unicode-standard#surrogate-pairs>
197230
let input = Arc::new(Int64Array::from(vec![0xD800 + 1]));
198-
let result = chr(&[input]);
231+
let result = invoke_chr(ColumnarValue::Array(input), 1);
199232
assert!(result.is_err());
200233
assert_contains!(
201234
result.err().unwrap().to_string(),
202235
"invalid Unicode scalar value: 55297"
203236
);
204237

205-
// negative input
206-
let input = Arc::new(Int64Array::from(vec![i64::MIN + 2i64])); // will be 2 if cast to u32
207-
let result = chr(&[input]);
238+
let input = Arc::new(Int64Array::from(vec![i64::MIN + 2i64]));
239+
let result = invoke_chr(ColumnarValue::Array(input), 1);
208240
assert!(result.is_err());
209241
assert_contains!(
210242
result.err().unwrap().to_string(),
211243
"invalid Unicode scalar value: -9223372036854775806"
212244
);
213245

214-
// negative input case 2
215246
let input = Arc::new(Int64Array::from(vec![-1]));
216-
let result = chr(&[input]);
247+
let result = invoke_chr(ColumnarValue::Array(input), 1);
217248
assert!(result.is_err());
218249
assert_contains!(
219250
result.err().unwrap().to_string(),
220251
"invalid Unicode scalar value: -1"
221252
);
222253

223-
// one error with valid values after
224-
let input = Arc::new(Int64Array::from(vec![65, -1, 66])); // A, -1, B
225-
let result = chr(&[input]);
254+
let input = Arc::new(Int64Array::from(vec![65, -1, 66]));
255+
let result = invoke_chr(ColumnarValue::Array(input), 3);
226256
assert!(result.is_err());
227257
assert_contains!(
228258
result.err().unwrap().to_string(),
@@ -232,10 +262,36 @@ mod tests {
232262

233263
#[test]
234264
fn test_chr_empty() {
235-
// empty input array
236265
let input = Arc::new(Int64Array::from(Vec::<i64>::new()));
237-
let result = chr(&[input]).unwrap();
238-
let string_array = result.as_any().downcast_ref::<StringArray>().unwrap();
266+
let result = invoke_chr(ColumnarValue::Array(input), 0).unwrap();
267+
let ColumnarValue::Array(arr) = result else {
268+
panic!("Expected array");
269+
};
270+
let string_array = arr.as_any().downcast_ref::<StringArray>().unwrap();
239271
assert_eq!(string_array.len(), 0);
240272
}
273+
274+
#[test]
275+
fn test_chr_scalar() {
276+
let result =
277+
invoke_chr(ColumnarValue::Scalar(ScalarValue::Int64(Some(65))), 1).unwrap();
278+
279+
match result {
280+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(s))) => {
281+
assert_eq!(s, "A");
282+
}
283+
other => panic!("Unexpected result: {other:?}"),
284+
}
285+
}
286+
287+
#[test]
288+
fn test_chr_scalar_null() {
289+
let result =
290+
invoke_chr(ColumnarValue::Scalar(ScalarValue::Int64(None)), 1).unwrap();
291+
292+
match result {
293+
ColumnarValue::Scalar(ScalarValue::Utf8(None)) => {}
294+
other => panic!("Unexpected result: {other:?}"),
295+
}
296+
}
241297
}

datafusion/sqllogictest/test_files/expr.slt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,16 @@ SELECT chr(CAST(0 AS int))
432432
statement error DataFusion error: Execution error: invalid Unicode scalar value: 9223372036854775807
433433
SELECT chr(CAST(9223372036854775807 AS bigint))
434434

435+
statement error DataFusion error: Execution error: invalid Unicode scalar value: 1114112
436+
SELECT chr(CAST(1114112 AS bigint))
437+
438+
statement error DataFusion error: Execution error: invalid Unicode scalar value: -1
439+
SELECT chr(CAST(-1 AS bigint))
440+
441+
# surrogate code point (invalid scalar value)
442+
statement error DataFusion error: Execution error: invalid Unicode scalar value: 55297
443+
SELECT chr(CAST(55297 AS bigint))
444+
435445
query T
436446
SELECT concat('a','b','c')
437447
----

0 commit comments

Comments
 (0)