Skip to content

Commit 794f30e

Browse files
authored
perf: Add bulk NULL-aware string builders, use in lower and upper (#21789)
## Which issue does this PR close?

- Part of #21684

## Rationale for this change

Introduce three new string array builders with bulk null tracking:

- `StringArrayBuilder` (Utf8)
- `LargeStringArrayBuilder` (LargeUtf8)
- `StringViewArrayBuilder` (Utf8View)

Each builder has the following API:

- `append_value(&str)` -- add a non-NULL value (row)
- `append_placeholder()` -- add a NULL row placeholder
- `finish(Option<NullBuffer>)` -- finish the build, specify NULLs

These are the counterparts of Arrow's `GenericStringBuilder` / `StringViewBuilder`, but they skip per-row NULL buffer maintenance, which lets callers compute the NULL buffer in bulk when possible.

This PR also switches `case_conversion` to the new APIs; `case_conversion` is used to implement `lower`, `upper`, and the Spark equivalents. This improves `lower` / `upper` performance by 3-15% on microbenchmarks.

More UDFs (~10) will be converted to use this API in future PRs.

## What changes are included in this PR?

* Add new builders
* Add unit tests
* Adopt builders in `case_conversion`

## Are these changes tested?

Yes.

## Are there any user-facing changes?

No.
1 parent 7d5ddca commit 794f30e

2 files changed

Lines changed: 491 additions & 28 deletions

File tree

datafusion/functions/src/string/common.rs

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,10 @@
1919
2020
use std::sync::Arc;
2121

22-
use crate::strings::append_view;
22+
use crate::strings::{GenericStringArrayBuilder, StringViewArrayBuilder, append_view};
2323
use arrow::array::{
24-
Array, ArrayRef, GenericStringArray, GenericStringBuilder, NullBufferBuilder,
25-
OffsetSizeTrait, StringViewArray, StringViewBuilder, new_null_array,
24+
Array, ArrayRef, GenericStringArray, NullBufferBuilder, OffsetSizeTrait,
25+
StringViewArray, new_null_array,
2626
};
2727
use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer};
2828
use arrow::datatypes::DataType;
@@ -349,18 +349,30 @@ where
349349
>(array, op)?)),
350350
DataType::Utf8View => {
351351
let string_array = as_string_view_array(array)?;
352-
let mut string_builder =
353-
StringViewBuilder::with_capacity(string_array.len());
354-
355-
for str in string_array.iter() {
356-
if let Some(str) = str {
357-
string_builder.append_value(op(str));
358-
} else {
359-
string_builder.append_null();
352+
let item_len = string_array.len();
353+
// Null-preserving: reuse the input null buffer as the output null buffer.
354+
let nulls = string_array.nulls().cloned();
355+
let mut builder = StringViewArrayBuilder::with_capacity(item_len);
356+
357+
if let Some(ref n) = nulls {
358+
for i in 0..item_len {
359+
if n.is_null(i) {
360+
builder.append_placeholder();
361+
} else {
362+
// SAFETY: `i < item_len == string_array.len()`, so the index is in bounds; the slot is also non-null.
363+
let s = unsafe { string_array.value_unchecked(i) };
364+
builder.append_value(&op(s));
365+
}
366+
}
367+
} else {
368+
for i in 0..item_len {
369+
// SAFETY: `i < item_len == string_array.len()`, so the index is in bounds (and there are no nulls).
370+
let s = unsafe { string_array.value_unchecked(i) };
371+
builder.append_value(&op(s));
360372
}
361373
}
362374

363-
Ok(ColumnarValue::Array(Arc::new(string_builder.finish())))
375+
Ok(ColumnarValue::Array(Arc::new(builder.finish(nulls)?)))
364376
}
365377
other => exec_err!("Unsupported data type {other:?} for function {name}"),
366378
},
@@ -400,17 +412,28 @@ where
400412
let start = offsets.first().unwrap().as_usize();
401413
let end = offsets.last().unwrap().as_usize();
402414
let capacity = (end - start) + PRE_ALLOC_BYTES;
403-
let mut builder = GenericStringBuilder::<O>::with_capacity(item_len, capacity);
415+
// Null-preserving: reuse the input null buffer as the output null buffer.
416+
let nulls = string_array.nulls().cloned();
417+
let mut builder = GenericStringArrayBuilder::<O>::with_capacity(item_len, capacity);
404418

405-
if string_array.null_count() == 0 {
406-
let iter =
407-
(0..item_len).map(|i| Some(op(unsafe { string_array.value_unchecked(i) })));
408-
builder.extend(iter);
419+
if let Some(ref n) = nulls {
420+
for i in 0..item_len {
421+
if n.is_null(i) {
422+
builder.append_placeholder();
423+
} else {
424+
// SAFETY: `n.is_null(i)` was false in the branch above.
425+
let s = unsafe { string_array.value_unchecked(i) };
426+
builder.append_value(&op(s));
427+
}
428+
}
409429
} else {
410-
let iter = string_array.iter().map(|string| string.map(&op));
411-
builder.extend(iter);
430+
for i in 0..item_len {
431+
// SAFETY: no null buffer means every index is valid.
432+
let s = unsafe { string_array.value_unchecked(i) };
433+
builder.append_value(&op(s));
434+
}
412435
}
413-
Ok(Arc::new(builder.finish()))
436+
Ok(Arc::new(builder.finish(nulls)?))
414437
}
415438

416439
/// Fast path for case conversion on an all-ASCII string array. ASCII case

0 commit comments

Comments
 (0)