Skip to content

Commit 924037e

Browse files
andygrove, martin-g, and alamb
authored
perf: Improve performance of split_part (#19570)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

- Closes #.

## Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

I ran microbenchmarks comparing DataFusion with DuckDB for string functions (see apache/datafusion-benchmarks#26) and noticed that DF was very slow for `split_part`. This PR fixes some obvious performance issues.

Speedups are:

| Benchmark                         | Before | After | Speedup      |
|-----------------------------------|--------|-------|--------------|
| single_char_delim/pos_first       | 1.27ms | 140µs | 9.1x faster  |
| single_char_delim/pos_middle      | 1.39ms | 396µs | 3.5x faster  |
| single_char_delim/pos_last        | 1.47ms | 738µs | 2.0x faster  |
| single_char_delim/pos_negative    | 1.35ms | 148µs | 9.1x faster  |
| multi_char_delim/pos_first        | 1.22ms | 174µs | 7.0x faster  |
| multi_char_delim/pos_middle       | 1.22ms | 407µs | 3.0x faster  |
| string_view_single_char/pos_first | 1.42ms | 139µs | 10.2x faster |
| many_parts_20/pos_second          | 2.48ms | 201µs | 12.3x faster |
| long_strings_50_parts/pos_first   | 8.18ms | 178µs | 46x faster   |

## What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

## Are these changes tested?

<!-- We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code

If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

## Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->

---------

Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 1037f0a commit 924037e

3 files changed

Lines changed: 411 additions & 14 deletions

File tree

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,11 @@ harness = false
296296
name = "levenshtein"
297297
required-features = ["unicode_expressions"]
298298

299+
[[bench]]
300+
harness = false
301+
name = "split_part"
302+
required-features = ["string_expressions"]
303+
299304
[[bench]]
300305
harness = false
301306
name = "left"
Lines changed: 382 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,382 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use arrow::array::{ArrayRef, Int64Array, StringArray, StringViewArray};
21+
use arrow::datatypes::{DataType, Field};
22+
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
23+
use datafusion_common::config::ConfigOptions;
24+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
25+
use datafusion_functions::string::split_part;
26+
use rand::distr::Alphanumeric;
27+
use rand::prelude::StdRng;
28+
use rand::{Rng, SeedableRng};
29+
use std::hint::black_box;
30+
use std::sync::Arc;
31+
32+
/// Number of rows in every generated input array; also passed as
/// `number_rows` on each benchmarked `split_part` invocation.
const N_ROWS: usize = 8192;
33+
34+
/// Generate test data for split_part benchmarks
35+
/// Creates strings with multiple parts separated by the delimiter
36+
fn gen_split_part_data(
37+
n_rows: usize,
38+
num_parts: usize, // number of parts in each string (separated by delimiter)
39+
part_len: usize, // length of each part
40+
delimiter: &str, // the delimiter to use
41+
use_string_view: bool, // false -> StringArray, true -> StringViewArray
42+
) -> (ColumnarValue, ColumnarValue) {
43+
let mut rng = StdRng::seed_from_u64(42);
44+
45+
let mut strings: Vec<String> = Vec::with_capacity(n_rows);
46+
for _ in 0..n_rows {
47+
let mut parts: Vec<String> = Vec::with_capacity(num_parts);
48+
for _ in 0..num_parts {
49+
let part: String = (&mut rng)
50+
.sample_iter(&Alphanumeric)
51+
.take(part_len)
52+
.map(char::from)
53+
.collect();
54+
parts.push(part);
55+
}
56+
strings.push(parts.join(delimiter));
57+
}
58+
59+
let delimiters: Vec<String> = vec![delimiter.to_string(); n_rows];
60+
61+
if use_string_view {
62+
let string_array: StringViewArray = strings.into_iter().map(Some).collect();
63+
let delimiter_array: StringViewArray = delimiters.into_iter().map(Some).collect();
64+
(
65+
ColumnarValue::Array(Arc::new(string_array) as ArrayRef),
66+
ColumnarValue::Array(Arc::new(delimiter_array) as ArrayRef),
67+
)
68+
} else {
69+
let string_array: StringArray = strings.into_iter().map(Some).collect();
70+
let delimiter_array: StringArray = delimiters.into_iter().map(Some).collect();
71+
(
72+
ColumnarValue::Array(Arc::new(string_array) as ArrayRef),
73+
ColumnarValue::Array(Arc::new(delimiter_array) as ArrayRef),
74+
)
75+
}
76+
}
77+
78+
fn gen_positions(n_rows: usize, position: i64) -> ColumnarValue {
79+
let positions: Vec<i64> = vec![position; n_rows];
80+
ColumnarValue::Array(Arc::new(Int64Array::from(positions)) as ArrayRef)
81+
}
82+
83+
fn criterion_benchmark(c: &mut Criterion) {
84+
let split_part_func = split_part();
85+
let config_options = Arc::new(ConfigOptions::default());
86+
87+
let mut group = c.benchmark_group("split_part");
88+
89+
// Test different scenarios
90+
// Scenario 1: Single-char delimiter, first position (should be fastest with optimization)
91+
{
92+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
93+
let positions = gen_positions(N_ROWS, 1);
94+
let args = vec![strings, delimiters, positions];
95+
let arg_fields: Vec<_> = args
96+
.iter()
97+
.enumerate()
98+
.map(|(idx, arg)| {
99+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
100+
})
101+
.collect();
102+
let return_field = Field::new("f", DataType::Utf8, true).into();
103+
104+
group.bench_function(BenchmarkId::new("single_char_delim", "pos_first"), |b| {
105+
b.iter(|| {
106+
black_box(
107+
split_part_func
108+
.invoke_with_args(ScalarFunctionArgs {
109+
args: args.clone(),
110+
arg_fields: arg_fields.clone(),
111+
number_rows: N_ROWS,
112+
return_field: Arc::clone(&return_field),
113+
config_options: Arc::clone(&config_options),
114+
})
115+
.expect("split_part should work"),
116+
)
117+
})
118+
});
119+
}
120+
121+
// Scenario 2: Single-char delimiter, middle position
122+
{
123+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
124+
let positions = gen_positions(N_ROWS, 5);
125+
let args = vec![strings, delimiters, positions];
126+
let arg_fields: Vec<_> = args
127+
.iter()
128+
.enumerate()
129+
.map(|(idx, arg)| {
130+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
131+
})
132+
.collect();
133+
let return_field = Field::new("f", DataType::Utf8, true).into();
134+
135+
group.bench_function(BenchmarkId::new("single_char_delim", "pos_middle"), |b| {
136+
b.iter(|| {
137+
black_box(
138+
split_part_func
139+
.invoke_with_args(ScalarFunctionArgs {
140+
args: args.clone(),
141+
arg_fields: arg_fields.clone(),
142+
number_rows: N_ROWS,
143+
return_field: Arc::clone(&return_field),
144+
config_options: Arc::clone(&config_options),
145+
})
146+
.expect("split_part should work"),
147+
)
148+
})
149+
});
150+
}
151+
152+
// Scenario 3: Single-char delimiter, last position
153+
{
154+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
155+
let positions = gen_positions(N_ROWS, 10);
156+
let args = vec![strings, delimiters, positions];
157+
let arg_fields: Vec<_> = args
158+
.iter()
159+
.enumerate()
160+
.map(|(idx, arg)| {
161+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
162+
})
163+
.collect();
164+
let return_field = Field::new("f", DataType::Utf8, true).into();
165+
166+
group.bench_function(BenchmarkId::new("single_char_delim", "pos_last"), |b| {
167+
b.iter(|| {
168+
black_box(
169+
split_part_func
170+
.invoke_with_args(ScalarFunctionArgs {
171+
args: args.clone(),
172+
arg_fields: arg_fields.clone(),
173+
number_rows: N_ROWS,
174+
return_field: Arc::clone(&return_field),
175+
config_options: Arc::clone(&config_options),
176+
})
177+
.expect("split_part should work"),
178+
)
179+
})
180+
});
181+
}
182+
183+
// Scenario 4: Single-char delimiter, negative position (last element)
184+
{
185+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", false);
186+
let positions = gen_positions(N_ROWS, -1);
187+
let args = vec![strings, delimiters, positions];
188+
let arg_fields: Vec<_> = args
189+
.iter()
190+
.enumerate()
191+
.map(|(idx, arg)| {
192+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
193+
})
194+
.collect();
195+
let return_field = Field::new("f", DataType::Utf8, true).into();
196+
197+
group.bench_function(
198+
BenchmarkId::new("single_char_delim", "pos_negative"),
199+
|b| {
200+
b.iter(|| {
201+
black_box(
202+
split_part_func
203+
.invoke_with_args(ScalarFunctionArgs {
204+
args: args.clone(),
205+
arg_fields: arg_fields.clone(),
206+
number_rows: N_ROWS,
207+
return_field: Arc::clone(&return_field),
208+
config_options: Arc::clone(&config_options),
209+
})
210+
.expect("split_part should work"),
211+
)
212+
})
213+
},
214+
);
215+
}
216+
217+
// Scenario 5: Multi-char delimiter, first position
218+
{
219+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~", false);
220+
let positions = gen_positions(N_ROWS, 1);
221+
let args = vec![strings, delimiters, positions];
222+
let arg_fields: Vec<_> = args
223+
.iter()
224+
.enumerate()
225+
.map(|(idx, arg)| {
226+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
227+
})
228+
.collect();
229+
let return_field = Field::new("f", DataType::Utf8, true).into();
230+
231+
group.bench_function(BenchmarkId::new("multi_char_delim", "pos_first"), |b| {
232+
b.iter(|| {
233+
black_box(
234+
split_part_func
235+
.invoke_with_args(ScalarFunctionArgs {
236+
args: args.clone(),
237+
arg_fields: arg_fields.clone(),
238+
number_rows: N_ROWS,
239+
return_field: Arc::clone(&return_field),
240+
config_options: Arc::clone(&config_options),
241+
})
242+
.expect("split_part should work"),
243+
)
244+
})
245+
});
246+
}
247+
248+
// Scenario 6: Multi-char delimiter, middle position
249+
{
250+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, "~@~", false);
251+
let positions = gen_positions(N_ROWS, 5);
252+
let args = vec![strings, delimiters, positions];
253+
let arg_fields: Vec<_> = args
254+
.iter()
255+
.enumerate()
256+
.map(|(idx, arg)| {
257+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
258+
})
259+
.collect();
260+
let return_field = Field::new("f", DataType::Utf8, true).into();
261+
262+
group.bench_function(BenchmarkId::new("multi_char_delim", "pos_middle"), |b| {
263+
b.iter(|| {
264+
black_box(
265+
split_part_func
266+
.invoke_with_args(ScalarFunctionArgs {
267+
args: args.clone(),
268+
arg_fields: arg_fields.clone(),
269+
number_rows: N_ROWS,
270+
return_field: Arc::clone(&return_field),
271+
config_options: Arc::clone(&config_options),
272+
})
273+
.expect("split_part should work"),
274+
)
275+
})
276+
});
277+
}
278+
279+
// Scenario 7: StringViewArray, single-char delimiter, first position
280+
{
281+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 10, 8, ".", true);
282+
let positions = gen_positions(N_ROWS, 1);
283+
let args = vec![strings, delimiters, positions];
284+
let arg_fields: Vec<_> = args
285+
.iter()
286+
.enumerate()
287+
.map(|(idx, arg)| {
288+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
289+
})
290+
.collect();
291+
let return_field = Field::new("f", DataType::Utf8, true).into();
292+
293+
group.bench_function(
294+
BenchmarkId::new("string_view_single_char", "pos_first"),
295+
|b| {
296+
b.iter(|| {
297+
black_box(
298+
split_part_func
299+
.invoke_with_args(ScalarFunctionArgs {
300+
args: args.clone(),
301+
arg_fields: arg_fields.clone(),
302+
number_rows: N_ROWS,
303+
return_field: Arc::clone(&return_field),
304+
config_options: Arc::clone(&config_options),
305+
})
306+
.expect("split_part should work"),
307+
)
308+
})
309+
},
310+
);
311+
}
312+
313+
// Scenario 8: Many parts (20), position near end - shows benefit of early termination
314+
{
315+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 20, 8, ".", false);
316+
let positions = gen_positions(N_ROWS, 2);
317+
let args = vec![strings, delimiters, positions];
318+
let arg_fields: Vec<_> = args
319+
.iter()
320+
.enumerate()
321+
.map(|(idx, arg)| {
322+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
323+
})
324+
.collect();
325+
let return_field = Field::new("f", DataType::Utf8, true).into();
326+
327+
group.bench_function(BenchmarkId::new("many_parts_20", "pos_second"), |b| {
328+
b.iter(|| {
329+
black_box(
330+
split_part_func
331+
.invoke_with_args(ScalarFunctionArgs {
332+
args: args.clone(),
333+
arg_fields: arg_fields.clone(),
334+
number_rows: N_ROWS,
335+
return_field: Arc::clone(&return_field),
336+
config_options: Arc::clone(&config_options),
337+
})
338+
.expect("split_part should work"),
339+
)
340+
})
341+
});
342+
}
343+
344+
// Scenario 9: Long strings with many parts - worst case for old implementation
345+
{
346+
let (strings, delimiters) = gen_split_part_data(N_ROWS, 50, 16, "/", false);
347+
let positions = gen_positions(N_ROWS, 1);
348+
let args = vec![strings, delimiters, positions];
349+
let arg_fields: Vec<_> = args
350+
.iter()
351+
.enumerate()
352+
.map(|(idx, arg)| {
353+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
354+
})
355+
.collect();
356+
let return_field = Field::new("f", DataType::Utf8, true).into();
357+
358+
group.bench_function(
359+
BenchmarkId::new("long_strings_50_parts", "pos_first"),
360+
|b| {
361+
b.iter(|| {
362+
black_box(
363+
split_part_func
364+
.invoke_with_args(ScalarFunctionArgs {
365+
args: args.clone(),
366+
arg_fields: arg_fields.clone(),
367+
number_rows: N_ROWS,
368+
return_field: Arc::clone(&return_field),
369+
config_options: Arc::clone(&config_options),
370+
})
371+
.expect("split_part should work"),
372+
)
373+
})
374+
},
375+
);
376+
}
377+
378+
group.finish();
379+
}
380+
381+
criterion_group!(benches, criterion_benchmark);
382+
criterion_main!(benches);

0 commit comments

Comments
 (0)