Skip to content

Commit 5a427cb

Browse files
authored
perf: Optimize substr for Utf8, LargeUtf8 (#21366)
## Which issue does this PR close? - Closes #21364. ## Rationale for this change For `Utf8` and `LargeUtf8` inputs, we can optimize `substr` to avoid copying the output strings; instead, we can return a `StringViewArray` that points into the input value buffer. Benchmarks (M4 Max): no count, short strings (size=1024): - string_view: 5.97 µs -> 5.96 µs (-0.2%) - string: 7.80 µs -> 4.99 µs (-36.1%) - large_string: 8.47 µs -> 4.90 µs (-42.2%) no count, short strings (size=4096): - string_view: 23.10 µs -> 22.90 µs (-0.9%) - string: 31.24 µs -> 18.31 µs (-41.4%) - large_string: 34.10 µs -> 17.70 µs (-48.1%) with count, long strings (size=1024, count=64, strlen=128): - string_view: 10.16 µs -> 10.79 µs (+6.2%) - string: 11.90 µs -> 8.38 µs (-29.6%) - large_string: 11.93 µs -> 8.30 µs (-30.5%) with count, long strings (size=4096, count=64, strlen=128): - string_view: 39.37 µs -> 38.79 µs (-1.5%) - string: 46.22 µs -> 30.25 µs (-34.6%) - large_string: 46.57 µs -> 30.49 µs (-34.5%) short count, long strings (size=1024, count=6, strlen=128): - string_view: 11.65 µs -> 11.57 µs (-0.7%) - string: 14.97 µs -> 11.37 µs (-24.1%) - large_string: 14.92 µs -> 11.37 µs (-23.8%) short count, long strings (size=4096, count=6, strlen=128): - string_view: 45.88 µs -> 43.82 µs (-4.5%) - string: 58.38 µs -> 43.55 µs (-25.4%) - large_string: 58.59 µs -> 43.58 µs (-25.6%) scalar start, no count, short strings (size=1024, strlen=12): - string_view: 6.07 µs -> 6.10 µs (+0.5%) - string: 7.81 µs -> 5.06 µs (-35.2%) scalar start, no count, short strings (size=4096, strlen=12): - string_view: 23.08 µs -> 22.62 µs (-2.0%) - string: 31.07 µs -> 18.86 µs (-39.3%) scalar start, no count, long strings (size=1024, strlen=128): - string_view: 9.99 µs -> 10.65 µs (+6.6%) - string: 12.01 µs -> 8.17 µs (-32.0%) scalar start, no count, long strings (size=4096, strlen=128): - string_view: 38.57 µs -> 39.79 µs (+3.2%) - string: 46.83 µs -> 31.67 µs (-32.4%) scalar start=1, no count, long strings (size=1024, strlen=128): - string_view: 9.78 µs -> 10.48 µs (+7.2%) - string: 12.02 µs -> 8.16 µs (-32.1%) scalar start=1, no count, long strings (size=4096, strlen=128): - string_view: 38.54 µs -> 40.18 µs (+4.3%) - string: 46.36 µs -> 31.73 µs (-31.6%) scalar args, short strings (size=1024, count=6, strlen=12): - string_view: 11.30 µs -> 11.23 µs (-0.7%) - string: 15.04 µs -> 11.52 µs (-23.4%) scalar args, short strings (size=4096, count=6, strlen=12): - string_view: 44.34 µs -> 43.98 µs (-0.8%) - string: 59.63 µs -> 45.02 µs (-24.5%) scalar args, long strings (size=1024, count=64, strlen=128): - string_view: 10.51 µs -> 12.05 µs (+14.6%) - string: 12.21 µs -> 8.67 µs (-28.9%) - large_string: 12.20 µs -> 8.66 µs (-29.0%) scalar args, long strings (size=4096, count=64, strlen=128): - string_view: 40.13 µs -> 41.89 µs (+4.4%) - string: 46.96 µs -> 32.44 µs (-30.9%) - large_string: 47.24 µs -> 32.49 µs (-31.2%) This PR doesn't modify the `string_view` code path; I've included the benchmark results above for completeness, but any changes should just be benchmarking noise. ## What changes are included in this PR? * Implement optimization * Other minor code cleanup * Add a benchmark (only somewhat related to this optimization but related to future optimization work) ## Are these changes tested? Yes. ## Are there any user-facing changes? No.
1 parent fab3b71 commit 5a427cb

3 files changed

Lines changed: 153 additions & 21 deletions

File tree

datafusion/functions/benches/substr.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
201201

202202
group.finish();
203203

204-
// Scalar start, no count, long strings
204+
// Scalar start, no count, long strings, start near middle
205205
let len = 128;
206206
let mut group = c.benchmark_group("substr, scalar start, no count, long strings");
207207
group.sampling_mode(SamplingMode::Flat);
@@ -220,6 +220,26 @@ fn criterion_benchmark(c: &mut Criterion) {
220220

221221
group.finish();
222222

223+
// Scalar start=1, no count, long strings
224+
let len = 128;
225+
let mut group =
226+
c.benchmark_group("substr, scalar start=1, no count, long strings");
227+
group.sampling_mode(SamplingMode::Flat);
228+
group.sample_size(10);
229+
230+
let args = create_args_without_count::<i32>(size, len, false, true, true);
231+
group.bench_function(
232+
format!("substr_string_view [size={size}, strlen={len}]"),
233+
|b| b.iter(|| black_box(invoke_substr_with_args(args.clone(), size))),
234+
);
235+
236+
let args = create_args_without_count::<i32>(size, len, false, false, true);
237+
group.bench_function(format!("substr_string [size={size}, strlen={len}]"), |b| {
238+
b.iter(|| black_box(invoke_substr_with_args(args.clone(), size)))
239+
});
240+
241+
group.finish();
242+
223243
// Scalar start and count, short strings
224244
let len = 12;
225245
let count = 6;

datafusion/functions/src/strings.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ impl LargeStringArrayBuilder {
424424
/// - start_offset: The start offset of the substring in the view
425425
///
426426
/// LLVM is apparently overly eager to inline this function into some hot loops,
427-
/// which bloats them and regresses performance, so we disable inling for now.
427+
/// which bloats them and regresses performance, so we disable inlining for now.
428428
#[inline(never)]
429429
pub fn append_view(
430430
views_buffer: &mut Vec<u128>,

datafusion/functions/src/unicode/substr.rs

Lines changed: 131 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ use std::sync::Arc;
2020
use crate::strings::append_view;
2121
use crate::utils::make_scalar_function;
2222
use arrow::array::{
23-
Array, ArrayRef, AsArray, Int64Array, StringArrayType, StringViewArray,
24-
StringViewBuilder,
23+
Array, ArrayRef, AsArray, GenericStringArray, Int64Array, OffsetSizeTrait,
24+
StringArrayType, StringViewArray, StringViewBuilder, make_view,
2525
};
2626
use arrow::buffer::{NullBuffer, ScalarBuffer};
2727
use arrow::datatypes::DataType;
@@ -134,11 +134,11 @@ fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
134134
match args[0].data_type() {
135135
DataType::Utf8 => {
136136
let string_array = args[0].as_string::<i32>();
137-
string_substr::<_>(string_array, &args[1..])
137+
generic_string_substr(string_array, &args[1..])
138138
}
139139
DataType::LargeUtf8 => {
140140
let string_array = args[0].as_string::<i64>();
141-
string_substr::<_>(string_array, &args[1..])
141+
generic_string_substr(string_array, &args[1..])
142142
}
143143
DataType::Utf8View => {
144144
let string_array = args[0].as_string_view();
@@ -275,7 +275,7 @@ fn string_view_substr(
275275
let start_array = as_int64_array(&args[0])?;
276276
let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?;
277277

278-
let enable_ascii_fast_path =
278+
let is_ascii =
279279
enable_ascii_fast_path(&string_view_array, start_array, count_array_opt);
280280

281281
// Combine null bitmaps from all inputs in bulk.
@@ -296,11 +296,10 @@ fn string_view_substr(
296296
let start = start_array.value(i);
297297
let count = count_array_opt.map(|a| a.value(i));
298298

299-
let (start, end) =
300-
get_true_start_end(string, start, count, enable_ascii_fast_path)?;
301-
let substr = &string[start..end];
299+
let (byte_start, byte_end) = get_true_start_end(string, start, count, is_ascii)?;
300+
let substr = &string[byte_start..byte_end];
302301

303-
append_view(&mut views_buf, raw_view, substr, start as u32);
302+
append_view(&mut views_buf, raw_view, substr, byte_start as u32);
304303
}
305304

306305
let views_buf = ScalarBuffer::from(views_buf);
@@ -319,15 +318,104 @@ fn string_view_substr(
319318
}
320319
}
321320

322-
fn string_substr<'a, V>(string_array: V, args: &[ArrayRef]) -> Result<ArrayRef>
323-
where
324-
V: StringArrayType<'a> + Copy,
325-
{
321+
fn values_fit_in_i32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>) -> bool {
322+
// The Arrow spec defines StringView offset fields as signed 32-bit
323+
// integers, so the maximum representable offset is i32::MAX.
324+
string_array
325+
.offsets()
326+
.last()
327+
.map(|offset| offset.as_usize() <= i32::MAX as usize)
328+
.unwrap_or(true)
329+
}
330+
331+
#[inline]
332+
fn append_view_from_buffer(
333+
views_buf: &mut Vec<u128>,
334+
substr: &str,
335+
byte_offset: usize,
336+
) -> bool {
337+
let byte_offset =
338+
u32::try_from(byte_offset).expect("validated string buffer offset fits in i32");
339+
let view = make_view(substr.as_bytes(), 0, byte_offset);
340+
views_buf.push(view);
341+
substr.len() > 12
342+
}
343+
344+
#[expect(clippy::needless_range_loop)]
345+
fn generic_string_substr<T: OffsetSizeTrait>(
346+
string_array: &GenericStringArray<T>,
347+
args: &[ArrayRef],
348+
) -> Result<ArrayRef> {
349+
// We'd like to return a StringViewArray that points into the input string
350+
// array's values buffer. Since the Arrow spec defines StringView offsets
351+
// as i32, we can't use this approach when the values buffer is >2GB, so
352+
// fallback to copying.
353+
if !values_fit_in_i32(string_array) {
354+
return generic_string_substr_copy(string_array, args);
355+
}
356+
326357
let start_array = as_int64_array(&args[0])?;
327358
let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?;
328359

329-
let enable_ascii_fast_path =
330-
enable_ascii_fast_path(&string_array, start_array, count_array_opt);
360+
let is_ascii = enable_ascii_fast_path(&string_array, start_array, count_array_opt);
361+
let offsets = string_array.value_offsets();
362+
let mut views_buf = Vec::with_capacity(string_array.len());
363+
let mut has_out_of_line = false;
364+
365+
// Combine null bitmaps from all inputs in bulk.
366+
let nulls = NullBuffer::union(
367+
NullBuffer::union(string_array.nulls(), start_array.nulls()).as_ref(),
368+
count_array_opt.and_then(|a| a.nulls()),
369+
);
370+
371+
for i in 0..string_array.len() {
372+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
373+
views_buf.push(0);
374+
continue;
375+
}
376+
377+
let string = string_array.value(i);
378+
let source_offset = offsets[i].as_usize();
379+
let start = start_array.value(i);
380+
let count = count_array_opt.map(|a| a.value(i));
381+
382+
let (byte_start, byte_end) = get_true_start_end(string, start, count, is_ascii)?;
383+
has_out_of_line |= append_view_from_buffer(
384+
&mut views_buf,
385+
&string[byte_start..byte_end],
386+
source_offset + byte_start,
387+
);
388+
}
389+
390+
let views_buf = ScalarBuffer::from(views_buf);
391+
392+
// If all result strings are stored inline, we don't need to retain the
393+
// input string array.
394+
let data_buffers = if has_out_of_line {
395+
vec![string_array.values().clone()]
396+
} else {
397+
vec![]
398+
};
399+
400+
// Safety:
401+
// (1) The blocks of the given views are all provided
402+
// (2) Each referenced range in the source values buffer is within bounds
403+
unsafe {
404+
let array = StringViewArray::new_unchecked(views_buf, data_buffers, nulls);
405+
Ok(Arc::new(array) as ArrayRef)
406+
}
407+
}
408+
409+
// Fallback for `generic_string_substr` if we can't use zerocopy because the
410+
// input string array is too large.
411+
fn generic_string_substr_copy<T: OffsetSizeTrait>(
412+
string_array: &GenericStringArray<T>,
413+
args: &[ArrayRef],
414+
) -> Result<ArrayRef> {
415+
let start_array = as_int64_array(&args[0])?;
416+
let count_array_opt = args.get(1).map(|a| as_int64_array(a)).transpose()?;
417+
418+
let is_ascii = enable_ascii_fast_path(&string_array, start_array, count_array_opt);
331419

332420
// Combine null bitmaps from all inputs in bulk.
333421
let nulls = NullBuffer::union(
@@ -347,17 +435,20 @@ where
347435
let start = start_array.value(i);
348436
let count = count_array_opt.map(|a| a.value(i));
349437

350-
let (start, end) =
351-
get_true_start_end(string, start, count, enable_ascii_fast_path)?;
352-
result_builder.append_value(&string[start..end]);
438+
let (byte_start, byte_end) = get_true_start_end(string, start, count, is_ascii)?;
439+
result_builder.append_value(&string[byte_start..byte_end]);
353440
}
354441

355442
Ok(Arc::new(result_builder.finish()) as ArrayRef)
356443
}
357444

358445
#[cfg(test)]
359446
mod tests {
360-
use arrow::array::{Array, StringViewArray};
447+
use std::sync::Arc;
448+
449+
use arrow::array::{
450+
Array, ArrayRef, AsArray, Int64Array, StringArray, StringViewArray,
451+
};
361452
use arrow::datatypes::DataType::Utf8View;
362453

363454
use datafusion_common::{Result, ScalarValue, exec_err};
@@ -734,4 +825,25 @@ mod tests {
734825

735826
Ok(())
736827
}
828+
829+
#[test]
830+
fn test_sliced_string_array_array_args() -> Result<()> {
831+
// Use strings longer than 12 bytes so the result views are out-of-line.
832+
let string_array = Arc::new(StringArray::from(vec![
833+
"skipped_prefix_value",
834+
"alphabet_long_string",
835+
"joséésojanother_long",
836+
])) as ArrayRef;
837+
let string_array = string_array.slice(1, 2);
838+
let start_array = Arc::new(Int64Array::from(vec![3, 5])) as ArrayRef;
839+
let count_array = Arc::new(Int64Array::from(vec![15, 14])) as ArrayRef;
840+
841+
let result = super::substr(&[string_array, start_array, count_array])?;
842+
let result = result.as_string_view();
843+
844+
assert_eq!(result.value(0), "phabet_long_str");
845+
assert_eq!(result.value(1), "ésojanother_lo");
846+
847+
Ok(())
848+
}
737849
}

0 commit comments

Comments
 (0)