Skip to content

Commit 190b104

Browse files
authored
perf: Optimize split_part for Utf8View (#21420)
## Which issue does this PR close?

- Closes #21410.

## Rationale for this change

When `split_part` is invoked with a `StringViewArray`, we can avoid copying bytes when constructing the return value: instead, the output views point into the data buffers of the input `StringViewArray`.

This PR only applies this optimization to the code path for scalar `delimiter` and `position`, because that is the most common usage mode in practice. We could also optimize the array-args case, but it didn't seem worth the extra code.

Benchmarks (M4 Max):

- scalar_utf8view_very_long_parts/pos_first: 102 µs → 68 µs (-33%)
- scalar_utf8view_long_parts/pos_middle: 164 µs → 137 µs (-15%)
- scalar_utf8_single_char/pos_first: 42.5 µs → 42.9 µs (no change)
- scalar_utf8_single_char/pos_middle: 96.5 µs → 99.5 µs (noise)
- scalar_utf8_single_char/pos_negative: 48.3 µs → 48.6 µs (no change)
- scalar_utf8_multi_char/pos_middle: 132 µs → 132 µs (no change)
- scalar_utf8_long_strings/pos_middle: 1.06 ms → 1.08 ms (noise)
- array_utf8_single_char/pos_middle: 355 µs → 365 µs (noise)
- array_utf8_multi_char/pos_middle: 357 µs → 360 µs (no change)

## What changes are included in this PR?

* Implement the optimization
* Add a benchmark that covers this case
* Improve SLT test coverage for this code path

## Are these changes tested?

Yes.

## Are there any user-facing changes?

No.
1 parent cdddd76 commit 190b104

3 files changed

Lines changed: 253 additions & 9 deletions

File tree

datafusion/functions/benches/split_part.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,23 @@ fn criterion_benchmark(c: &mut Criterion) {
210210
);
211211
}
212212

213+
// Utf8View, very long parts (256 bytes), position 1
214+
{
215+
let strings = gen_string_array(N_ROWS, 5, 256, ".", true);
216+
let delimiter = ColumnarValue::Scalar(ScalarValue::Utf8View(Some(".".into())));
217+
let position = ColumnarValue::Scalar(ScalarValue::Int64(Some(1)));
218+
bench_split_part(
219+
&mut group,
220+
&split_part_func,
221+
&config_options,
222+
"scalar_utf8view_very_long_parts",
223+
"pos_first",
224+
strings,
225+
delimiter,
226+
position,
227+
);
228+
}
229+
213230
// ── Array delimiter and position ─────────────────
214231

215232
// Utf8, single-char delimiter, array args

datafusion/functions/src/string/split_part.rs

Lines changed: 145 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@
1717

1818
use crate::utils::utf8_to_str_type;
1919
use arrow::array::{
20-
Array, ArrayRef, AsArray, GenericStringBuilder, Int64Array, StringArrayType,
21-
StringLikeArrayBuilder, StringViewBuilder, new_null_array,
20+
Array, ArrayRef, AsArray, ByteView, GenericStringBuilder, Int64Array,
21+
StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder,
22+
make_view, new_null_array,
2223
};
24+
use arrow::buffer::ScalarBuffer;
2325
use arrow::datatypes::DataType;
2426
use datafusion_common::ScalarValue;
2527
use datafusion_common::cast::as_int64_array;
@@ -279,12 +281,9 @@ fn split_part_scalar(
279281
}
280282

281283
let result = match string_array.data_type() {
282-
DataType::Utf8View => split_part_scalar_impl(
283-
string_array.as_string_view(),
284-
delimiter,
285-
position,
286-
StringViewBuilder::with_capacity(string_array.len()),
287-
),
284+
DataType::Utf8View => {
285+
split_part_scalar_view(string_array.as_string_view(), delimiter, position)
286+
}
288287
DataType::Utf8 => {
289288
let arr = string_array.as_string::<i32>();
290289
// Conservative under-estimate for data capacity: split_part output
@@ -425,6 +424,116 @@ fn rsplit_nth_finder<'a>(
425424
}
426425
}
427426

427+
/// Zero-copy scalar fast path for `StringViewArray` inputs.
428+
///
429+
/// Instead of copying substring bytes into a new buffer, constructs
430+
/// `StringView` entries that point back into the original array's data
431+
/// buffers.
432+
fn split_part_scalar_view(
433+
string_view_array: &StringViewArray,
434+
delimiter: &str,
435+
position: i64,
436+
) -> Result<ArrayRef> {
437+
let len = string_view_array.len();
438+
let mut views_buf = Vec::with_capacity(len);
439+
let views = string_view_array.views();
440+
441+
if delimiter.is_empty() {
442+
// PostgreSQL: empty delimiter treats input as a single field.
443+
let empty_view = make_view(b"", 0, 0);
444+
let return_input = position == 1 || position == -1;
445+
for i in 0..len {
446+
if string_view_array.is_null(i) {
447+
views_buf.push(0);
448+
} else if return_input {
449+
views_buf.push(views[i]);
450+
} else {
451+
views_buf.push(empty_view);
452+
}
453+
}
454+
} else if position > 0 {
455+
let idx: usize = (position - 1).try_into().map_err(|_| {
456+
exec_datafusion_err!(
457+
"split_part index {position} exceeds maximum supported value"
458+
)
459+
})?;
460+
let finder = memmem::Finder::new(delimiter.as_bytes());
461+
split_view_loop(string_view_array, views, &mut views_buf, |s| {
462+
split_nth_finder(s, &finder, delimiter.len(), idx)
463+
});
464+
} else {
465+
let idx: usize = (position.unsigned_abs() - 1).try_into().map_err(|_| {
466+
exec_datafusion_err!(
467+
"split_part index {position} exceeds minimum supported value"
468+
)
469+
})?;
470+
let finder_rev = memmem::FinderRev::new(delimiter.as_bytes());
471+
split_view_loop(string_view_array, views, &mut views_buf, |s| {
472+
rsplit_nth_finder(s, &finder_rev, delimiter.len(), idx)
473+
});
474+
}
475+
476+
let views_buf = ScalarBuffer::from(views_buf);
477+
478+
// Nulls pass through unchanged, so we can use the input's null array.
479+
let nulls = string_view_array.nulls().cloned();
480+
481+
// Safety: each view is either copied unchanged from the input, or built
482+
// by `substr_view` from a substring that is a contiguous sub-range of the
483+
// original string value stored in the input's data buffers.
484+
unsafe {
485+
Ok(Arc::new(StringViewArray::new_unchecked(
486+
views_buf,
487+
string_view_array.data_buffers().to_vec(),
488+
nulls,
489+
)) as ArrayRef)
490+
}
491+
}
492+
493+
/// Creates a `StringView` referencing a substring of an existing view's buffer.
494+
/// For substrings ≤ 12 bytes, creates an inline view instead.
495+
#[inline]
496+
fn substr_view(original_view: &u128, substr: &str, start_offset: u32) -> u128 {
497+
if substr.len() > 12 {
498+
let view = ByteView::from(*original_view);
499+
make_view(
500+
substr.as_bytes(),
501+
view.buffer_index,
502+
view.offset + start_offset,
503+
)
504+
} else {
505+
make_view(substr.as_bytes(), 0, 0)
506+
}
507+
}
508+
509+
/// Applies `split_fn` to each non-null string and appends the resulting view to
510+
/// `views_buf`.
511+
#[inline(always)]
512+
fn split_view_loop<F>(
513+
string_view_array: &StringViewArray,
514+
views: &[u128],
515+
views_buf: &mut Vec<u128>,
516+
split_fn: F,
517+
) where
518+
F: Fn(&str) -> Option<&str>,
519+
{
520+
let empty_view = make_view(b"", 0, 0);
521+
for (i, raw_view) in views.iter().enumerate() {
522+
if string_view_array.is_null(i) {
523+
views_buf.push(0);
524+
continue;
525+
}
526+
let string = string_view_array.value(i);
527+
match split_fn(string) {
528+
Some(substr) => {
529+
let start_offset = substr.as_ptr() as usize - string.as_ptr() as usize;
530+
views_buf.push(substr_view(raw_view, substr, start_offset as u32));
531+
}
532+
None => views_buf.push(empty_view),
533+
}
534+
}
535+
}
536+
428537
fn split_part_impl<'a, StringArrType, DelimiterArrType, B>(
429538
string_array: &StringArrType,
430539
delimiter_array: &DelimiterArrType,
@@ -490,7 +599,7 @@ where
490599

491600
#[cfg(test)]
492601
mod tests {
493-
use arrow::array::{Array, StringArray};
602+
use arrow::array::{Array, AsArray, StringArray, StringViewArray};
494603
use arrow::datatypes::DataType::Utf8;
495604

496605
use datafusion_common::ScalarValue;
@@ -686,4 +795,31 @@ mod tests {
686795

687796
Ok(())
688797
}
798+
799+
#[test]
fn test_split_part_stringview_sliced() -> Result<()> {
    use super::split_part_scalar_view;

    let strings = StringViewArray::from_iter([
        Some("skip_this.value"),
        Some("this_is_a_long_prefix.suffix"),
        Some("short.val"),
        Some("another_long_result.rest"),
        None,
    ]);

    // Drop the first row so the array under test has a non-zero offset.
    let sliced = strings.slice(1, 4);

    let output = split_part_scalar_view(&sliced, ".", 1)?;
    let actual: Vec<Option<&str>> = output.as_string_view().iter().collect();

    // One entry per sliced row: two long results, one short result, and the
    // trailing null.
    let expected = vec![
        Some("this_is_a_long_prefix"),
        Some("short"),
        Some("another_long_result"),
        None,
    ];
    assert_eq!(actual, expected);

    Ok(())
}
689825
}

datafusion/sqllogictest/test_files/string/string_view.slt

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -954,6 +954,97 @@ SELECT arrow_typeof(split_part(arrow_cast('a.b.c', 'Utf8View'), '.', 2));
954954
----
955955
Utf8View
956956

957+
# SPLIT_PART with Utf8View column and scalar delimiter/position (exercises the zero-copy Utf8View fast path)
958+
query T
959+
SELECT split_part(column1_utf8view, 'ph', 1) FROM test;
960+
----
961+
Andrew
962+
Xiangpeng
963+
Ra
964+
(empty)
965+
NULL
966+
967+
query T
968+
SELECT split_part(column1_utf8view, 'ph', 2) FROM test;
969+
----
970+
(empty)
971+
(empty)
972+
ael
973+
(empty)
974+
NULL
975+
976+
# Negative position
977+
query T
978+
SELECT split_part(column1_utf8view, 'ph', -1) FROM test;
979+
----
980+
Andrew
981+
Xiangpeng
982+
ael
983+
(empty)
984+
NULL
985+
986+
# Delimiter not found returns full string
987+
query T
988+
SELECT split_part(column1_utf8view, 'ZZZ', 1) FROM test;
989+
----
990+
Andrew
991+
Xiangpeng
992+
Raphael
993+
(empty)
994+
NULL
995+
996+
# Empty delimiter with column
997+
query T
998+
SELECT split_part(column1_utf8view, '', 1) FROM test;
999+
----
1000+
Andrew
1001+
Xiangpeng
1002+
Raphael
1003+
(empty)
1004+
NULL
1005+
1006+
# Single-char delimiter with column
1007+
query T
1008+
SELECT split_part(column1_utf8view, 'a', 1) FROM test;
1009+
----
1010+
Andrew
1011+
Xi
1012+
R
1013+
(empty)
1014+
NULL
1015+
1016+
# Verify array path also returns Utf8View
1017+
query T
1018+
SELECT arrow_typeof(split_part(column1_utf8view, '.', 1)) FROM test LIMIT 1;
1019+
----
1020+
Utf8View
1021+
1022+
# Long strings (>12 bytes) exercise out-of-line StringView construction in split_part
1023+
query T
1024+
SELECT split_part(arrow_cast(column1, 'Utf8View'), '.', 1) FROM
1025+
(VALUES ('this_is_over_12.suffix'), ('short.val'), (NULL)) AS t(column1);
1026+
----
1027+
this_is_over_12
1028+
short
1029+
NULL
1030+
1031+
query T
1032+
SELECT split_part(arrow_cast(column1, 'Utf8View'), '.', -1) FROM
1033+
(VALUES ('prefix.this_is_over_12'), ('a.short'), (NULL)) AS t(column1);
1034+
----
1035+
this_is_over_12
1036+
short
1037+
NULL
1038+
1039+
# Results at the 12-byte inline/out-of-line boundary
1040+
query T
1041+
SELECT split_part(arrow_cast(column1, 'Utf8View'), '.', 1) FROM
1042+
(VALUES ('exactly12byt.rest'), ('thirteen_byte.rest'), ('twelve_bytes.rest')) AS t(column1);
1043+
----
1044+
exactly12byt
1045+
thirteen_byte
1046+
twelve_bytes
1047+
9571048
## Ensure no casts for STRPOS
9581049
query TT
9591050
EXPLAIN SELECT

0 commit comments

Comments
 (0)