Skip to content

Commit d87d8f6

Browse files
authored
perf: Optimize string_to_array for scalar args (#21131)
## Which issue does this PR close?

- Closes #21129.

## Rationale for this change

When the delimiter (and null string, if supplied) are scalars, we can implement `string_to_array` more efficiently. In particular, we can construct a `memmem::Finder` once and reuse it to search for delimiters in every row.

This PR implements this optimization; it also fixes a place where we were allocating an intermediate `String` for every character when the delimiter is `NULL`. (This isn't a common case but worth fixing.)

Benchmarks (M4 Max):

```
single_char_delim/5: 34.8 µs (was 61.1 µs) -43%
single_char_delim/20: 145.1 µs (was 220.7 µs) -34%
single_char_delim/100: 679.4 µs (was 1.04 ms) -35%
multi_char_delim/5: 41.7 µs (was 56.7 µs) -27%
multi_char_delim/20: 158.9 µs (was 185.1 µs) -14%
multi_char_delim/100: 731.4 µs (was 858.3 µs) -15%
with_null_str/5: 43.1 µs (was 68.7 µs) -37%
with_null_str/20: 179.3 µs (was 244.3 µs) -27%
with_null_str/100: 895.8 µs (was 1.16 ms) -23%
null_delim/5: 17.4 µs (was 64.1 µs) -73%
null_delim/20: 63.0 µs (was 233.4 µs) -73%
null_delim/100: 280.2 µs (was 1.12 ms) -75%
columnar_delim/5: 65.2 µs (was 60.2 µs) +8%
columnar_delim/20: 217.2 µs (was 224.1 µs) -3%
columnar_delim/100: 1.02 ms (was 1.05 ms) -3%
```

## What changes are included in this PR?

* Add benchmark for `string_to_array`
* Implement optimizations described above
* Refactor columnar (fallback) path to get rid of a lot of type dispatch boilerplate
* Improve SLT test coverage for the "columnar string, scalar other-args" case

## Are these changes tested?

Yes.

## Are there any user-facing changes?

No.
1 parent 0bf9def commit d87d8f6

5 files changed

Lines changed: 585 additions & 277 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/functions-nested/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ hashbrown = { workspace = true }
6161
itertools = { workspace = true, features = ["use_std"] }
6262
itoa = { workspace = true }
6363
log = { workspace = true }
64+
memchr = { workspace = true }
6465

6566
[dev-dependencies]
6667
criterion = { workspace = true, features = ["async_tokio"] }
@@ -117,3 +118,7 @@ name = "array_position"
117118
[[bench]]
118119
harness = false
119120
name = "array_sort"
121+
122+
[[bench]]
123+
harness = false
124+
name = "string_to_array"
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, StringArray};
19+
use arrow::datatypes::{DataType, Field};
20+
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
21+
use datafusion_common::ScalarValue;
22+
use datafusion_common::config::ConfigOptions;
23+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
24+
use datafusion_functions_nested::string::StringToArray;
25+
use rand::rngs::StdRng;
26+
use rand::{Rng, SeedableRng};
27+
use std::hint::black_box;
28+
use std::sync::Arc;
29+
30+
/// Number of rows in every generated input array; also passed as
/// `number_rows` on each benchmarked invocation.
const NUM_ROWS: usize = 1000;
31+
/// Fixed seed for the `StdRng` used by the input generators, so each
/// generator produces the same data on every run.
const SEED: u64 = 42;
32+
33+
fn criterion_benchmark(c: &mut Criterion) {
34+
// Single-char delimiter
35+
let comma = ColumnarValue::Scalar(ScalarValue::Utf8(Some(",".to_string())));
36+
bench_string_to_array(
37+
c,
38+
"string_to_array_single_char_delim",
39+
create_csv_strings,
40+
&comma,
41+
None,
42+
);
43+
44+
// Multi-char delimiter
45+
let double_colon = ColumnarValue::Scalar(ScalarValue::Utf8(Some("::".to_string())));
46+
bench_string_to_array(
47+
c,
48+
"string_to_array_multi_char_delim",
49+
create_multi_delim_strings,
50+
&double_colon,
51+
None,
52+
);
53+
54+
// With null_str argument
55+
let null_str = ColumnarValue::Scalar(ScalarValue::Utf8(Some("NULL".to_string())));
56+
bench_string_to_array(
57+
c,
58+
"string_to_array_with_null_str",
59+
create_csv_strings_with_nulls,
60+
&comma,
61+
Some(&null_str),
62+
);
63+
64+
// NULL delimiter
65+
let null_delim = ColumnarValue::Scalar(ScalarValue::Utf8(None));
66+
bench_string_to_array(
67+
c,
68+
"string_to_array_null_delim",
69+
create_short_strings,
70+
&null_delim,
71+
None,
72+
);
73+
74+
// Columnar delimiter (fall-back path)
75+
bench_string_to_array_columnar_delim(c);
76+
}
77+
78+
fn bench_string_to_array_columnar_delim(c: &mut Criterion) {
79+
let mut group = c.benchmark_group("string_to_array_columnar_delim");
80+
81+
for &num_elements in &[5, 20, 100] {
82+
let string_array = create_csv_strings(num_elements);
83+
let delimiter_array: ArrayRef =
84+
Arc::new(StringArray::from(vec![Some(","); NUM_ROWS]));
85+
86+
let args = vec![
87+
ColumnarValue::Array(string_array.clone()),
88+
ColumnarValue::Array(delimiter_array),
89+
];
90+
let arg_fields = vec![
91+
Field::new("str", DataType::Utf8, true).into(),
92+
Field::new("delimiter", DataType::Utf8, false).into(),
93+
];
94+
95+
let return_field = Field::new(
96+
"result",
97+
DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))),
98+
true,
99+
);
100+
101+
group.bench_with_input(
102+
BenchmarkId::from_parameter(num_elements),
103+
&num_elements,
104+
|b, _| {
105+
let udf = StringToArray::new();
106+
b.iter(|| {
107+
black_box(
108+
udf.invoke_with_args(ScalarFunctionArgs {
109+
args: args.clone(),
110+
arg_fields: arg_fields.clone(),
111+
number_rows: NUM_ROWS,
112+
return_field: return_field.clone().into(),
113+
config_options: Arc::new(ConfigOptions::default()),
114+
})
115+
.unwrap(),
116+
)
117+
})
118+
},
119+
);
120+
}
121+
122+
group.finish();
123+
}
124+
125+
fn bench_string_to_array(
126+
c: &mut Criterion,
127+
group_name: &str,
128+
make_strings: fn(usize) -> ArrayRef,
129+
delimiter: &ColumnarValue,
130+
null_str: Option<&ColumnarValue>,
131+
) {
132+
let mut group = c.benchmark_group(group_name);
133+
134+
for &num_elements in &[5, 20, 100] {
135+
let string_array = make_strings(num_elements);
136+
137+
let mut args = vec![
138+
ColumnarValue::Array(string_array.clone()),
139+
delimiter.clone(),
140+
];
141+
let mut arg_fields = vec![
142+
Field::new("str", DataType::Utf8, true).into(),
143+
Field::new("delimiter", DataType::Utf8, true).into(),
144+
];
145+
if let Some(ns) = null_str {
146+
args.push(ns.clone());
147+
arg_fields.push(Field::new("null_str", DataType::Utf8, true).into());
148+
}
149+
150+
let return_field = Field::new(
151+
"result",
152+
DataType::List(Arc::new(Field::new_list_field(DataType::Utf8, true))),
153+
true,
154+
);
155+
156+
group.bench_with_input(
157+
BenchmarkId::from_parameter(num_elements),
158+
&num_elements,
159+
|b, _| {
160+
let udf = StringToArray::new();
161+
b.iter(|| {
162+
black_box(
163+
udf.invoke_with_args(ScalarFunctionArgs {
164+
args: args.clone(),
165+
arg_fields: arg_fields.clone(),
166+
number_rows: NUM_ROWS,
167+
return_field: return_field.clone().into(),
168+
config_options: Arc::new(ConfigOptions::default()),
169+
})
170+
.unwrap(),
171+
)
172+
})
173+
},
174+
);
175+
}
176+
177+
group.finish();
178+
}
179+
180+
/// Creates strings like "val1,val2,val3,...,valN" with `num_elements` elements.
181+
fn create_csv_strings(num_elements: usize) -> ArrayRef {
182+
let mut rng = StdRng::seed_from_u64(SEED);
183+
let strings: StringArray = (0..NUM_ROWS)
184+
.map(|_| {
185+
let parts: Vec<String> = (0..num_elements)
186+
.map(|_| format!("val{}", rng.random_range(0..1000)))
187+
.collect();
188+
Some(parts.join(","))
189+
})
190+
.collect();
191+
Arc::new(strings)
192+
}
193+
194+
/// Creates strings like "val1::val2::val3::...::valN".
195+
fn create_multi_delim_strings(num_elements: usize) -> ArrayRef {
196+
let mut rng = StdRng::seed_from_u64(SEED);
197+
let strings: StringArray = (0..NUM_ROWS)
198+
.map(|_| {
199+
let parts: Vec<String> = (0..num_elements)
200+
.map(|_| format!("val{}", rng.random_range(0..1000)))
201+
.collect();
202+
Some(parts.join("::"))
203+
})
204+
.collect();
205+
Arc::new(strings)
206+
}
207+
208+
/// Creates CSV strings where ~10% of elements are the literal "NULL".
209+
fn create_csv_strings_with_nulls(num_elements: usize) -> ArrayRef {
210+
let mut rng = StdRng::seed_from_u64(SEED);
211+
let strings: StringArray = (0..NUM_ROWS)
212+
.map(|_| {
213+
let parts: Vec<String> = (0..num_elements)
214+
.map(|_| {
215+
if rng.random::<f64>() < 0.1 {
216+
"NULL".to_string()
217+
} else {
218+
format!("val{}", rng.random_range(0..1000))
219+
}
220+
})
221+
.collect();
222+
Some(parts.join(","))
223+
})
224+
.collect();
225+
Arc::new(strings)
226+
}
227+
228+
/// Creates short strings (length = `num_chars`) for the NULL-delimiter
229+
/// (split-into-characters) benchmark.
230+
fn create_short_strings(num_chars: usize) -> ArrayRef {
231+
let mut rng = StdRng::seed_from_u64(SEED);
232+
let strings: StringArray = (0..NUM_ROWS)
233+
.map(|_| {
234+
let s: String = (0..num_chars)
235+
.map(|_| rng.random_range(b'a'..=b'z') as char)
236+
.collect();
237+
Some(s)
238+
})
239+
.collect();
240+
Arc::new(strings)
241+
}
242+
243+
// Declare the `benches` group, whose only target is `criterion_benchmark`.
criterion_group!(benches, criterion_benchmark);
244+
// Criterion entry point for the `benches` group (the `[[bench]]` target sets
// `harness = false`, so the default test harness is not used).
criterion_main!(benches);

0 commit comments

Comments
 (0)