Skip to content

Commit ec92925

Browse files
neilconwaymartin-g
andauthored
perf: Optimize substr_index to use bulk-NULL string builder (#21877)
## Which issue does this PR close? - Closes #21876. ## Rationale for this change As with other recent optimizations, we can optimize NULL handling in `substr_index` by using the new bulk-NULL string builders. Benchmarks: Utf8 - utf8_100_array_long_delimiter: 10.0 µs → 10.1 µs (+1.00%) - utf8_100_array_single_delimiter: 2.9 µs → 2.5 µs (−13.79%) - utf8_100_scalar_long_delimiter_neg: 4.1 µs → 3.5 µs (−14.63%) - utf8_100_scalar_long_delimiter_pos: 2.9 µs → 2.7 µs (−6.90%) - utf8_100_scalar_single_delimiter_neg: 2.2 µs → 1.993 µs (−9.41%) - utf8_100_scalar_single_delimiter_pos: 2.1 µs → 1.845 µs (−12.13%) - utf8_1000_array_long_delimiter: 101.0 µs → 101.1 µs (+0.10%) - utf8_1000_array_single_delimiter: 36.8 µs → 31.7 µs (−13.86%) - utf8_1000_scalar_long_delimiter_neg: 38.9 µs → 36.9 µs (−5.14%) - utf8_1000_scalar_long_delimiter_pos: 25.1 µs → 23.3 µs (−7.17%) - utf8_1000_scalar_single_delimiter_neg: 19.3 µs → 17.7 µs (−8.29%) - utf8_1000_scalar_single_delimiter_pos: 18.2 µs → 16.6 µs (−8.79%) - utf8_10000_array_long_delimiter: 1083.4 µs → 1038.2 µs (−4.17%) - utf8_10000_array_single_delimiter: 461.8 µs → 414.7 µs (−10.20%) - utf8_10000_scalar_long_delimiter_neg: 392.4 µs → 379.3 µs (−3.34%) - utf8_10000_scalar_long_delimiter_pos: 246.5 µs → 227.4 µs (−7.75%) - utf8_10000_scalar_single_delimiter_neg: 191.3 µs → 177.5 µs (−7.21%) - utf8_10000_scalar_single_delimiter_pos: 179.4 µs → 168.8 µs (−5.91%) Utf8View - utf8view_100_array_long_delimiter: 9.5 µs → 9.8 µs (+3.16%) - utf8view_100_array_single_delimiter: 2.6 µs → 2.6 µs (0.00%) - utf8view_100_scalar_long_delimiter_neg: 4.0 µs → 4.0 µs (0.00%) - utf8view_100_scalar_long_delimiter_pos: 2.8 µs → 2.8 µs (0.00%) - utf8view_100_scalar_single_delimiter_neg: 2.3 µs → 2.3 µs (0.00%) - utf8view_100_scalar_single_delimiter_pos: 2.2 µs → 2.1 µs (−4.55%) - utf8view_1000_array_long_delimiter: 94.8 µs → 99.2 µs (+4.64%) - utf8view_1000_array_single_delimiter: 31.5 µs → 32.0 µs (+1.59%) - utf8view_1000_scalar_long_delimiter_neg: 38.7 µs → 39.0 µs (+0.78%) - utf8view_1000_scalar_long_delimiter_pos: 25.4 µs → 25.4 µs (0.00%) - utf8view_1000_scalar_single_delimiter_neg: 21.4 µs → 21.8 µs (+1.87%) - utf8view_1000_scalar_single_delimiter_pos: 20.8 µs → 20.9 µs (+0.48%) - utf8view_10000_array_long_delimiter: 998.4 µs → 1025.4 µs (+2.70%) - utf8view_10000_array_single_delimiter: 414.9 µs → 415.7 µs (+0.19%) - utf8view_10000_scalar_long_delimiter_neg: 393.7 µs → 395.9 µs (+0.56%) - utf8view_10000_scalar_long_delimiter_pos: 253.4 µs → 252.7 µs (−0.28%) - utf8view_10000_scalar_single_delimiter_neg: 214.5 µs → 217.3 µs (+1.31%) - utf8view_10000_scalar_single_delimiter_pos: 207.9 µs → 208.7 µs (+0.38%) This PR doesn't touch the Utf8View code path, so the Utf8View regressions above are likely measurement noise. ## What changes are included in this PR? * Optimize `substr_index` by switching from Arrow string builders to bulk-NULL string builders ## Are these changes tested? Yes, covered by existing tests. ## Are there any user-facing changes? No. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent 22bb4e6 commit ec92925

1 file changed

Lines changed: 47 additions & 31 deletions

File tree

datafusion/functions/src/unicode/substrindex.rs

Lines changed: 47 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@
1818
use std::sync::Arc;
1919

2020
use arrow::array::{
21-
Array, ArrayRef, AsArray, ByteView, GenericStringArray, GenericStringBuilder,
22-
OffsetSizeTrait, PrimitiveArray, StringArrayType, StringLikeArrayBuilder,
23-
StringViewArray, make_view, new_null_array,
21+
Array, ArrayRef, AsArray, ByteView, GenericStringArray, OffsetSizeTrait,
22+
PrimitiveArray, StringArrayType, StringViewArray, make_view, new_null_array,
2423
};
2524
use arrow::buffer::ScalarBuffer;
2625
use arrow::datatypes::{DataType, Int64Type};
2726
use arrow_buffer::NullBuffer;
2827

28+
use crate::strings::GenericStringArrayBuilder;
2929
use crate::utils::make_scalar_function;
3030
use datafusion_common::{
3131
Result, ScalarValue, exec_datafusion_err, exec_err, utils::take_function_args,
@@ -151,7 +151,7 @@ fn substr_index(args: &[ArrayRef]) -> Result<ArrayRef> {
151151
string_array,
152152
delimiter_array,
153153
count_array,
154-
GenericStringBuilder::<i32>::with_capacity(
154+
GenericStringArrayBuilder::<i32>::with_capacity(
155155
string_array.len(),
156156
visible_string_bytes(string_array),
157157
),
@@ -165,7 +165,7 @@ fn substr_index(args: &[ArrayRef]) -> Result<ArrayRef> {
165165
string_array,
166166
delimiter_array,
167167
count_array,
168-
GenericStringBuilder::<i64>::with_capacity(
168+
GenericStringArrayBuilder::<i64>::with_capacity(
169169
string_array.len(),
170170
visible_string_bytes(string_array),
171171
),
@@ -229,7 +229,7 @@ fn substr_index_scalar(
229229
arr,
230230
delimiter,
231231
count,
232-
GenericStringBuilder::<i32>::with_capacity(
232+
GenericStringArrayBuilder::<i32>::with_capacity(
233233
arr.len(),
234234
visible_string_bytes(arr),
235235
),
@@ -241,7 +241,7 @@ fn substr_index_scalar(
241241
arr,
242242
delimiter,
243243
count,
244-
GenericStringBuilder::<i64>::with_capacity(
244+
GenericStringArrayBuilder::<i64>::with_capacity(
245245
arr.len(),
246246
visible_string_bytes(arr),
247247
),
@@ -261,30 +261,37 @@ fn visible_string_bytes<T: OffsetSizeTrait>(
261261
offsets[offsets.len() - 1].as_usize() - offsets[0].as_usize()
262262
}
263263

264-
fn substr_index_general<'a, S, B>(
264+
fn substr_index_general<'a, S, O>(
265265
string_array: S,
266266
delimiter_array: S,
267267
count_array: &PrimitiveArray<Int64Type>,
268-
mut builder: B,
268+
mut builder: GenericStringArrayBuilder<O>,
269269
) -> Result<ArrayRef>
270270
where
271271
S: StringArrayType<'a> + Copy,
272-
B: StringLikeArrayBuilder,
272+
O: OffsetSizeTrait,
273273
{
274-
for ((string, delimiter), n) in string_array
275-
.iter()
276-
.zip(delimiter_array.iter())
277-
.zip(count_array.iter())
278-
{
279-
match (string, delimiter, n) {
280-
(Some(string), Some(delimiter), Some(n)) => {
281-
builder.append_value(substr_index_slice(string, delimiter, n));
282-
}
283-
_ => builder.append_null(),
274+
let num_rows = string_array.len();
275+
// Output is null IFF any input is null.
276+
let nulls = NullBuffer::union(
277+
NullBuffer::union(string_array.nulls(), delimiter_array.nulls()).as_ref(),
278+
count_array.nulls(),
279+
);
280+
281+
for i in 0..num_rows {
282+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
283+
builder.append_placeholder();
284+
continue;
284285
}
286+
// SAFETY: `i < num_rows` and the union of input nulls is valid at i,
287+
// so each input is also valid at i.
288+
let string = unsafe { string_array.value_unchecked(i) };
289+
let delimiter = unsafe { delimiter_array.value_unchecked(i) };
290+
let n = unsafe { count_array.value_unchecked(i) };
291+
builder.append_value(substr_index_slice(string, delimiter, n));
285292
}
286293

287-
Ok(Arc::new(builder.finish()) as ArrayRef)
294+
Ok(Arc::new(builder.finish(nulls)?) as ArrayRef)
288295
}
289296

290297
fn substr_index_view(
@@ -332,15 +339,15 @@ fn substr_index_view(
332339
}
333340
}
334341

335-
fn substr_index_scalar_impl<'a, S, B>(
342+
fn substr_index_scalar_impl<'a, S, O>(
336343
string_array: S,
337344
delimiter: &str,
338345
count: i64,
339-
builder: B,
346+
builder: GenericStringArrayBuilder<O>,
340347
) -> Result<ArrayRef>
341348
where
342349
S: StringArrayType<'a> + Copy,
343-
B: StringLikeArrayBuilder,
350+
O: OffsetSizeTrait,
344351
{
345352
if count == 0 || delimiter.is_empty() {
346353
return map_strings(string_array, builder, |string| &string[..0]);
@@ -465,19 +472,28 @@ fn substr_index_scalar_view(
465472
}
466473
}
467474

468-
fn map_strings<'a, S, B, F>(string_array: S, mut builder: B, f: F) -> Result<ArrayRef>
475+
fn map_strings<'a, S, O, F>(
476+
string_array: S,
477+
mut builder: GenericStringArrayBuilder<O>,
478+
f: F,
479+
) -> Result<ArrayRef>
469480
where
470481
S: StringArrayType<'a> + Copy,
471-
B: StringLikeArrayBuilder,
482+
O: OffsetSizeTrait,
472483
F: Fn(&'a str) -> &'a str,
473484
{
474-
for string in string_array.iter() {
475-
match string {
476-
Some(s) => builder.append_value(f(s)),
477-
None => builder.append_null(),
485+
let nulls = string_array.nulls().cloned();
486+
for i in 0..string_array.len() {
487+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
488+
builder.append_placeholder();
489+
continue;
478490
}
491+
// SAFETY: `i < string_array.len()` and `nulls` is valid at i, so the
492+
// input is also valid at i.
493+
let s = unsafe { string_array.value_unchecked(i) };
494+
builder.append_value(f(s));
479495
}
480-
Ok(Arc::new(builder.finish()) as ArrayRef)
496+
Ok(Arc::new(builder.finish(nulls)?) as ArrayRef)
481497
}
482498

483499
#[inline]

0 commit comments

Comments
 (0)