Skip to content

Commit ec7c9ab

Browse files
authored
chore: Add substr() benchmarks, refactor (#20803)
## Which issue does this PR close? N/A ## Rationale for this change I'd like to optimize `substr` for scalar `start`/`count` inputs, but the code would benefit from some refactoring and cleanup first. I also added benchmarks for `substr` with scalar args. ## What changes are included in this PR? - Refactor `string_view_substr` and `string_substr` to use a single loop - Change `get_true_start_end` to validate its own inputs, cleanup UTF8 path - Add benchmark cases for scalar `start` and `count` arguments - Improve docs ## Are these changes tested? Yes. ## Are there any user-facing changes? No, other than an error message wording change. ## AI usage Multiple AI tools were used to iterate on this PR. I have reviewed and understand the resulting code.
1 parent 2b7d4f9 commit ec7c9ab

5 files changed

Lines changed: 240 additions & 276 deletions

File tree

datafusion/functions/benches/substr.rs

Lines changed: 131 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -21,80 +21,62 @@ use arrow::util::bench_util::{
2121
create_string_array_with_len, create_string_view_array_with_len,
2222
};
2323
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
24-
use datafusion_common::DataFusionError;
2524
use datafusion_common::config::ConfigOptions;
25+
use datafusion_common::{DataFusionError, ScalarValue};
2626
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
2727
use datafusion_functions::unicode;
2828
use std::hint::black_box;
2929
use std::sync::Arc;
3030

31+
fn make_i64_arg(value: i64, size: usize, as_scalar: bool) -> ColumnarValue {
32+
if as_scalar {
33+
ColumnarValue::Scalar(ScalarValue::from(value))
34+
} else {
35+
ColumnarValue::Array(Arc::new(Int64Array::from(vec![value; size])))
36+
}
37+
}
38+
3139
fn create_args_without_count<O: OffsetSizeTrait>(
3240
size: usize,
3341
str_len: usize,
3442
start_half_way: bool,
3543
force_view_types: bool,
44+
scalar_start: bool,
3645
) -> Vec<ColumnarValue> {
37-
let start_array = Arc::new(Int64Array::from(
38-
(0..size)
39-
.map(|_| {
40-
if start_half_way {
41-
(str_len / 2) as i64
42-
} else {
43-
1i64
44-
}
45-
})
46-
.collect::<Vec<_>>(),
47-
));
48-
49-
if force_view_types {
50-
let string_array =
51-
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
52-
vec![
53-
ColumnarValue::Array(string_array),
54-
ColumnarValue::Array(start_array),
55-
]
46+
let start_val = if start_half_way {
47+
(str_len / 2) as i64
5648
} else {
57-
let string_array =
58-
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
49+
1i64
50+
};
51+
let start = make_i64_arg(start_val, size, scalar_start);
5952

60-
vec![
61-
ColumnarValue::Array(string_array),
62-
ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef),
63-
]
64-
}
53+
let string_array: ArrayRef = if force_view_types {
54+
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false))
55+
} else {
56+
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len))
57+
};
58+
59+
vec![ColumnarValue::Array(string_array), start]
6560
}
6661

6762
fn create_args_with_count<O: OffsetSizeTrait>(
6863
size: usize,
6964
str_len: usize,
7065
count_max: usize,
7166
force_view_types: bool,
67+
scalar_args: bool,
7268
) -> Vec<ColumnarValue> {
73-
let start_array =
74-
Arc::new(Int64Array::from((0..size).map(|_| 1).collect::<Vec<_>>()));
7569
let count = count_max.min(str_len) as i64;
76-
let count_array = Arc::new(Int64Array::from(
77-
(0..size).map(|_| count).collect::<Vec<_>>(),
78-
));
79-
80-
if force_view_types {
81-
let string_array =
82-
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
83-
vec![
84-
ColumnarValue::Array(string_array),
85-
ColumnarValue::Array(start_array),
86-
ColumnarValue::Array(count_array),
87-
]
70+
let start = make_i64_arg(1i64, size, scalar_args);
71+
let count = make_i64_arg(count, size, scalar_args);
72+
73+
let string_array: ArrayRef = if force_view_types {
74+
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false))
8875
} else {
89-
let string_array =
90-
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
91-
92-
vec![
93-
ColumnarValue::Array(string_array),
94-
ColumnarValue::Array(Arc::clone(&start_array) as ArrayRef),
95-
ColumnarValue::Array(Arc::clone(&count_array) as ArrayRef),
96-
]
97-
}
76+
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len))
77+
};
78+
79+
vec![ColumnarValue::Array(string_array), start, count]
9880
}
9981

10082
#[expect(clippy::needless_pass_by_value)]
@@ -122,22 +104,22 @@ fn criterion_benchmark(c: &mut Criterion) {
122104
for size in [1024, 4096] {
123105
// string_len = 12, substring_len=6 (see `create_args_without_count`)
124106
let len = 12;
125-
let mut group = c.benchmark_group("SHORTER THAN 12");
107+
let mut group = c.benchmark_group("substr, no count, short strings");
126108
group.sampling_mode(SamplingMode::Flat);
127109
group.sample_size(10);
128110

129-
let args = create_args_without_count::<i32>(size, len, true, true);
111+
let args = create_args_without_count::<i32>(size, len, true, true, false);
130112
group.bench_function(
131113
format!("substr_string_view [size={size}, strlen={len}]"),
132114
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
133115
);
134116

135-
let args = create_args_without_count::<i32>(size, len, false, false);
117+
let args = create_args_without_count::<i32>(size, len, false, false, false);
136118
group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| {
137119
b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
138120
});
139121

140-
let args = create_args_without_count::<i64>(size, len, true, false);
122+
let args = create_args_without_count::<i64>(size, len, true, false, false);
141123
group.bench_function(
142124
format!("substr_large_string [size={size}, strlen={len}]"),
143125
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
@@ -148,23 +130,23 @@ fn criterion_benchmark(c: &mut Criterion) {
148130
// string_len = 128, start=1, count=64, substring_len=64
149131
let len = 128;
150132
let count = 64;
151-
let mut group = c.benchmark_group("LONGER THAN 12");
133+
let mut group = c.benchmark_group("substr, with count, long strings");
152134
group.sampling_mode(SamplingMode::Flat);
153135
group.sample_size(10);
154136

155-
let args = create_args_with_count::<i32>(size, len, count, true);
137+
let args = create_args_with_count::<i32>(size, len, count, true, false);
156138
group.bench_function(
157139
format!("substr_string_view [size={size}, count={count}, strlen={len}]",),
158140
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
159141
);
160142

161-
let args = create_args_with_count::<i32>(size, len, count, false);
143+
let args = create_args_with_count::<i32>(size, len, count, false, false);
162144
group.bench_function(
163145
format!("substr_string [size={size}, count={count}, strlen={len}]",),
164146
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
165147
);
166148

167-
let args = create_args_with_count::<i64>(size, len, count, false);
149+
let args = create_args_with_count::<i64>(size, len, count, false, false);
168150
group.bench_function(
169151
format!("substr_large_string [size={size}, count={count}, strlen={len}]",),
170152
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
@@ -175,29 +157,116 @@ fn criterion_benchmark(c: &mut Criterion) {
175157
// string_len = 128, start=1, count=6, substring_len=6
176158
let len = 128;
177159
let count = 6;
178-
let mut group = c.benchmark_group("SRC_LEN > 12, SUB_LEN < 12");
160+
let mut group = c.benchmark_group("substr, short count, long strings");
179161
group.sampling_mode(SamplingMode::Flat);
180162
group.sample_size(10);
181163

182-
let args = create_args_with_count::<i32>(size, len, count, true);
164+
let args = create_args_with_count::<i32>(size, len, count, true, false);
183165
group.bench_function(
184166
format!("substr_string_view [size={size}, count={count}, strlen={len}]",),
185167
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
186168
);
187169

188-
let args = create_args_with_count::<i32>(size, len, count, false);
170+
let args = create_args_with_count::<i32>(size, len, count, false, false);
189171
group.bench_function(
190172
format!("substr_string [size={size}, count={count}, strlen={len}]",),
191173
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
192174
);
193175

194-
let args = create_args_with_count::<i64>(size, len, count, false);
176+
let args = create_args_with_count::<i64>(size, len, count, false, false);
195177
group.bench_function(
196178
format!("substr_large_string [size={size}, count={count}, strlen={len}]",),
197179
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
198180
);
199181

200182
group.finish();
183+
184+
// Scalar start, no count, short strings
185+
let len = 12;
186+
let mut group =
187+
c.benchmark_group("substr, scalar start, no count, short strings");
188+
group.sampling_mode(SamplingMode::Flat);
189+
group.sample_size(10);
190+
191+
let args = create_args_without_count::<i32>(size, len, true, true, true);
192+
group.bench_function(
193+
format!("substr_string_view [size={size}, strlen={len}]"),
194+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
195+
);
196+
197+
let args = create_args_without_count::<i32>(size, len, false, false, true);
198+
group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| {
199+
b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
200+
});
201+
202+
group.finish();
203+
204+
// Scalar start, no count, long strings
205+
let len = 128;
206+
let mut group = c.benchmark_group("substr, scalar start, no count, long strings");
207+
group.sampling_mode(SamplingMode::Flat);
208+
group.sample_size(10);
209+
210+
let args = create_args_without_count::<i32>(size, len, true, true, true);
211+
group.bench_function(
212+
format!("substr_string_view [size={size}, strlen={len}]"),
213+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
214+
);
215+
216+
let args = create_args_without_count::<i32>(size, len, false, false, true);
217+
group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| {
218+
b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
219+
});
220+
221+
group.finish();
222+
223+
// Scalar start and count, short strings
224+
let len = 12;
225+
let count = 6;
226+
let mut group = c.benchmark_group("substr, scalar args, short strings");
227+
group.sampling_mode(SamplingMode::Flat);
228+
group.sample_size(10);
229+
230+
let args = create_args_with_count::<i32>(size, len, count, true, true);
231+
group.bench_function(
232+
format!("substr_string_view [size={size}, count={count}, strlen={len}]"),
233+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
234+
);
235+
236+
let args = create_args_with_count::<i32>(size, len, count, false, true);
237+
group.bench_function(
238+
format!("substr_string [size={size}, count={count}, strlen={len}]"),
239+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
240+
);
241+
242+
group.finish();
243+
244+
// Scalar start and count, long strings
245+
let len = 128;
246+
let count = 64;
247+
let mut group = c.benchmark_group("substr, scalar args, long strings");
248+
group.sampling_mode(SamplingMode::Flat);
249+
group.sample_size(10);
250+
251+
let args = create_args_with_count::<i32>(size, len, count, true, true);
252+
group.bench_function(
253+
format!("substr_string_view [size={size}, count={count}, strlen={len}]"),
254+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
255+
);
256+
257+
let args = create_args_with_count::<i32>(size, len, count, false, true);
258+
group.bench_function(
259+
format!("substr_string [size={size}, count={count}, strlen={len}]"),
260+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
261+
);
262+
263+
let args = create_args_with_count::<i64>(size, len, count, false, true);
264+
group.bench_function(
265+
format!("substr_large_string [size={size}, count={count}, strlen={len}]"),
266+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
267+
);
268+
269+
group.finish();
201270
}
202271
}
203272

0 commit comments

Comments
 (0)