Skip to content

Commit 0144570

Browse files
neilconwaymartin-gblaginin
authored
perf: Add BulkNullStringArrayBuilder trait, use in repeat (#21854)
Add a DataFusion-side trait that abstracts over the bulk-NULL string array builders (GenericStringArrayBuilder<O> and StringViewArrayBuilder), so that functions which dispatch over Utf8/LargeUtf8/Utf8View can adopt the new builders without giving up their single-bodied generic implementation. Convert `repeat` as the first call site. The output is null iff either input is null, so the per-row null match becomes a single NullBuffer::union over the input null buffers, evaluated once before the loop. Also mark the inherent append_value/append_placeholder methods on the new builders as #[inline]; without this, calls through the trait wrapper end up going through a non-inlined inherent and slow down small-output paths. ## Which issue does this PR close? - Closes #21853. ## Rationale for this change Optimize NULL handling in `repeat` using the bulk-NULL string builders that have recently been added. This requires adding `BulkNullStringArrayBuilder`, a trait that is similar in spirit to Arrow's `StringLikeArrayBuilder`. Benchmarks: - repeat_string overflow [size=1024, repeat_times=1073741824]: 1022.5ns → 1054.5ns (+3.13%) - repeat_string overflow [size=4096, repeat_times=1073741824]: 1016.6ns → 1055.3ns (+3.81%) - repeat_large_string [size=1024, repeat_times=3]: 32.4µs → 26.6µs (−17.90%) - repeat_large_string [size=4096, repeat_times=3]: 127.4µs → 104.0µs (−18.37%) - repeat_string [size=1024, repeat_times=3]: 32.6µs → 26.8µs (−17.79%) - repeat_string [size=4096, repeat_times=3]: 127.4µs → 105.5µs (−17.19%) - repeat_string_view [size=1024, repeat_times=3]: 37.3µs → 31.7µs (−15.01%) - repeat_string_view [size=4096, repeat_times=3]: 146.5µs → 124.5µs (−15.02%) - repeat_large_string [size=1024, repeat_times=30]: 82.0µs → 80.4µs (−1.95%) - repeat_large_string [size=4096, repeat_times=30]: 344.2µs → 338.7µs (−1.60%) - repeat_string [size=1024, repeat_times=30]: 81.7µs → 79.7µs (−2.45%) - repeat_string [size=4096, repeat_times=30]: 352.2µs → 334.7µs (−4.97%) - repeat_string_view [size=1024, repeat_times=30]: 88.1µs → 83.1µs (−5.68%) - repeat_string_view [size=4096, repeat_times=30]: 368.8µs → 342.6µs (−7.10%) - repeat/scalar_utf8: 174.7ns → 179.2ns (+2.58%) - repeat/scalar_utf8view: 174.5ns → 180.5ns (+3.44%) ## What changes are included in this PR? * Add `BulkNullStringArrayBuilder` * Optimize `repeat` using `BulkNullStringArrayBuilder` * Inline some functions in GenericStringBuilder; benchmarking suggests this is a win ## Are these changes tested? Yes. ## Are there any user-facing changes? No. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com> Co-authored-by: Dmitrii Blaginin <dmitrii@blaginin.me>
1 parent 0bb17bc commit 0144570

3 files changed

Lines changed: 282 additions & 47 deletions

File tree

datafusion/functions/src/string/repeat.rs

Lines changed: 103 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,12 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::sync::Arc;
19-
20-
use crate::utils::utf8_to_str_type;
21-
use arrow::array::{
22-
Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
23-
StringArrayType, StringLikeArrayBuilder, StringViewArray, StringViewBuilder,
18+
use crate::strings::{
19+
BulkNullStringArrayBuilder, GenericStringArrayBuilder, StringViewArrayBuilder,
2420
};
21+
use crate::utils::utf8_to_str_type;
22+
use arrow::array::{Array, ArrayRef, AsArray, Int64Array, StringArrayType};
23+
use arrow::buffer::NullBuffer;
2524
use arrow::datatypes::DataType;
2625
use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
2726
use datafusion_common::cast::as_int64_array;
@@ -190,43 +189,28 @@ fn repeat(string_array: &ArrayRef, count_array: &ArrayRef) -> Result<ArrayRef> {
190189
number_array,
191190
i32::MAX as usize,
192191
)?;
193-
let builder = StringViewBuilder::with_capacity(string_array.len());
194-
repeat_impl::<&StringViewArray, StringViewBuilder>(
195-
&string_view_array,
196-
number_array,
197-
max_item_capacity,
198-
builder,
199-
)
192+
let builder = StringViewArrayBuilder::with_capacity(string_array.len());
193+
repeat_impl(&string_view_array, number_array, max_item_capacity, builder)
200194
}
201195
Utf8 => {
202196
let string_arr = string_array.as_string::<i32>();
203197
let (total_capacity, max_item_capacity) =
204198
calculate_capacities(&string_arr, number_array, i32::MAX as usize)?;
205-
let builder = GenericStringBuilder::<i32>::with_capacity(
199+
let builder = GenericStringArrayBuilder::<i32>::with_capacity(
206200
string_array.len(),
207201
total_capacity,
208202
);
209-
repeat_impl::<&GenericStringArray<i32>, GenericStringBuilder<i32>>(
210-
&string_arr,
211-
number_array,
212-
max_item_capacity,
213-
builder,
214-
)
203+
repeat_impl(&string_arr, number_array, max_item_capacity, builder)
215204
}
216205
LargeUtf8 => {
217206
let string_arr = string_array.as_string::<i64>();
218207
let (total_capacity, max_item_capacity) =
219208
calculate_capacities(&string_arr, number_array, i64::MAX as usize)?;
220-
let builder = GenericStringBuilder::<i64>::with_capacity(
209+
let builder = GenericStringArrayBuilder::<i64>::with_capacity(
221210
string_array.len(),
222211
total_capacity,
223212
);
224-
repeat_impl::<&GenericStringArray<i64>, GenericStringBuilder<i64>>(
225-
&string_arr,
226-
number_array,
227-
max_item_capacity,
228-
builder,
229-
)
213+
repeat_impl(&string_arr, number_array, max_item_capacity, builder)
230214
}
231215
other => exec_err!(
232216
"Unsupported data type {other:?} for function repeat. \
@@ -278,7 +262,7 @@ fn repeat_impl<'a, S, B>(
278262
) -> Result<ArrayRef>
279263
where
280264
S: StringArrayType<'a> + 'a,
281-
B: StringLikeArrayBuilder,
265+
B: BulkNullStringArrayBuilder,
282266
{
283267
// Reusable buffer to avoid allocations in string.repeat()
284268
let mut buffer = Vec::<u8>::with_capacity(max_item_capacity);
@@ -301,12 +285,18 @@ where
301285
}
302286
}
303287

304-
// Fast path: no nulls in either array
305-
if string_array.null_count() == 0 && number_array.null_count() == 0 {
288+
// Output is null IFF either input is null
289+
let nulls = NullBuffer::union(string_array.nulls(), number_array.nulls());
290+
291+
if let Some(ref n) = nulls {
306292
for i in 0..string_array.len() {
307-
// SAFETY: i is within bounds (0..len) and null_count() == 0 guarantees valid value
293+
if n.is_null(i) {
294+
builder.append_placeholder();
295+
continue;
296+
}
297+
// SAFETY: index `i` in both arrays is valid
308298
let string = unsafe { string_array.value_unchecked(i) };
309-
let count = number_array.value(i);
299+
let count = unsafe { number_array.value_unchecked(i) };
310300
if count > 0 {
311301
repeat_to_buffer(&mut buffer, string, count as usize);
312302
// SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
@@ -316,27 +306,30 @@ where
316306
}
317307
}
318308
} else {
319-
// Slow path: handle nulls
320-
for (string, number) in string_array.iter().zip(number_array.iter()) {
321-
match (string, number) {
322-
(Some(string), Some(count)) if count > 0 => {
323-
repeat_to_buffer(&mut buffer, string, count as usize);
324-
// SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
325-
builder
326-
.append_value(unsafe { std::str::from_utf8_unchecked(&buffer) });
327-
}
328-
(Some(_), Some(_)) => builder.append_value(""),
329-
_ => builder.append_null(),
309+
for i in 0..string_array.len() {
310+
// SAFETY: no nulls, so every index in both arrays is valid
311+
let string = unsafe { string_array.value_unchecked(i) };
312+
let count = unsafe { number_array.value_unchecked(i) };
313+
if count > 0 {
314+
repeat_to_buffer(&mut buffer, string, count as usize);
315+
// SAFETY: buffer contains valid UTF-8 since we only copy from a valid &str
316+
builder.append_value(unsafe { std::str::from_utf8_unchecked(&buffer) });
317+
} else {
318+
builder.append_value("");
330319
}
331320
}
332321
}
333322

334-
Ok(Arc::new(builder.finish()) as ArrayRef)
323+
builder.finish(nulls)
335324
}
336325

337326
#[cfg(test)]
338327
mod tests {
339-
use arrow::array::{Array, LargeStringArray, StringArray, StringViewArray};
328+
use std::sync::Arc;
329+
330+
use arrow::array::{
331+
Array, ArrayRef, Int64Array, LargeStringArray, StringArray, StringViewArray,
332+
};
340333
use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
341334

342335
use datafusion_common::ScalarValue;
@@ -444,4 +437,69 @@ mod tests {
444437

445438
Ok(())
446439
}
440+
441+
// Slicing the input arrays produces a NullBuffer with a non-zero offset.
442+
// The tests below use 6-row inputs sliced to (1, 4) so that:
443+
// slot 0 (orig 1): "a" × 3 → "aaa"
444+
// slot 1 (orig 2): "bb" × 2 → "bbbb"
445+
// slot 2 (orig 3): "c" × NULL → NULL (count-side null)
446+
// slot 3 (orig 4): NULL × 1 → NULL (string-side null)
447+
fn sliced_offset_inputs<F>(make_strings: F) -> (ArrayRef, ArrayRef)
448+
where
449+
F: FnOnce(Vec<Option<&'static str>>) -> ArrayRef,
450+
{
451+
let strings = make_strings(vec![
452+
None,
453+
Some("a"),
454+
Some("bb"),
455+
Some("c"),
456+
None,
457+
Some("d"),
458+
]);
459+
let counts: ArrayRef = Arc::new(Int64Array::from(vec![
460+
Some(2),
461+
Some(3),
462+
Some(2),
463+
None,
464+
Some(1),
465+
Some(2),
466+
]));
467+
(strings.slice(1, 4), counts.slice(1, 4))
468+
}
469+
470+
fn assert_sliced_offset_output<A: Array + 'static>(result: ArrayRef)
471+
where
472+
for<'a> &'a A: arrow::array::ArrayAccessor<Item = &'a str>,
473+
{
474+
let result = result.as_any().downcast_ref::<A>().unwrap();
475+
assert_eq!(result.len(), 4);
476+
assert_eq!(arrow::array::ArrayAccessor::value(&result, 0), "aaa");
477+
assert_eq!(arrow::array::ArrayAccessor::value(&result, 1), "bbbb");
478+
assert!(result.is_null(2));
479+
assert!(result.is_null(3));
480+
assert_eq!(result.null_count(), 2);
481+
}
482+
483+
#[test]
484+
fn test_repeat_sliced_string_with_null_offset() {
485+
let (strings, counts) = sliced_offset_inputs(|v| Arc::new(StringArray::from(v)));
486+
let result = super::repeat(&strings, &counts).unwrap();
487+
assert_sliced_offset_output::<StringArray>(result);
488+
}
489+
490+
#[test]
491+
fn test_repeat_sliced_large_string_with_null_offset() {
492+
let (strings, counts) =
493+
sliced_offset_inputs(|v| Arc::new(LargeStringArray::from(v)));
494+
let result = super::repeat(&strings, &counts).unwrap();
495+
assert_sliced_offset_output::<LargeStringArray>(result);
496+
}
497+
498+
#[test]
499+
fn test_repeat_sliced_string_view_with_null_offset() {
500+
let (strings, counts) =
501+
sliced_offset_inputs(|v| Arc::new(StringViewArray::from(v)));
502+
let result = super::repeat(&strings, &counts).unwrap();
503+
assert_sliced_offset_output::<StringViewArray>(result);
504+
}
447505
}

datafusion/functions/src/strings.rs

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@
1717

1818
use std::marker::PhantomData;
1919
use std::mem::size_of;
20+
use std::sync::Arc;
2021

2122
use datafusion_common::{Result, exec_datafusion_err, internal_err};
2223

2324
use arrow::array::{
24-
Array, ArrayAccessor, ArrayDataBuilder, BinaryArray, ByteView, GenericStringArray,
25-
LargeStringArray, OffsetSizeTrait, StringArray, StringViewArray, make_view,
25+
Array, ArrayAccessor, ArrayDataBuilder, ArrayRef, BinaryArray, ByteView,
26+
GenericStringArray, LargeStringArray, OffsetSizeTrait, StringArray, StringViewArray,
27+
make_view,
2628
};
2729
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer};
2830
use arrow::datatypes::DataType;
@@ -473,6 +475,7 @@ impl<O: OffsetSizeTrait> GenericStringArrayBuilder<O> {
473475
/// # Panics
474476
///
475477
/// Panics if the cumulative byte length exceeds `O::MAX`.
478+
#[inline]
476479
pub fn append_value(&mut self, value: &str) {
477480
self.value_buffer.extend_from_slice(value.as_bytes());
478481
let next_offset =
@@ -482,6 +485,7 @@ impl<O: OffsetSizeTrait> GenericStringArrayBuilder<O> {
482485

483486
/// Append an empty placeholder row. The corresponding slot must be masked
484487
/// as null by the null buffer passed to `finish`.
488+
#[inline]
485489
pub fn append_placeholder(&mut self) {
486490
let next_offset =
487491
O::from_usize(self.value_buffer.len()).expect("byte array offset overflow");
@@ -672,6 +676,47 @@ impl StringViewArrayBuilder {
672676
}
673677
}
674678

679+
/// Trait abstracting over the bulk-NULL string array builders.
680+
///
681+
/// Similar to Arrow's `StringLikeArrayBuilder`, this allows generic dispatch
682+
/// over the three string array types (Utf8, LargeUtf8, Utf8View) when the
683+
/// function body is uniform across them.
684+
pub(crate) trait BulkNullStringArrayBuilder {
685+
fn append_value(&mut self, value: &str);
686+
fn append_placeholder(&mut self);
687+
fn finish(self, nulls: Option<NullBuffer>) -> Result<ArrayRef>;
688+
}
689+
690+
impl<O: OffsetSizeTrait> BulkNullStringArrayBuilder for GenericStringArrayBuilder<O> {
691+
#[inline]
692+
fn append_value(&mut self, value: &str) {
693+
GenericStringArrayBuilder::<O>::append_value(self, value)
694+
}
695+
#[inline]
696+
fn append_placeholder(&mut self) {
697+
GenericStringArrayBuilder::<O>::append_placeholder(self)
698+
}
699+
fn finish(self, nulls: Option<NullBuffer>) -> Result<ArrayRef> {
700+
Ok(Arc::new(GenericStringArrayBuilder::<O>::finish(
701+
self, nulls,
702+
)?))
703+
}
704+
}
705+
706+
impl BulkNullStringArrayBuilder for StringViewArrayBuilder {
707+
#[inline]
708+
fn append_value(&mut self, value: &str) {
709+
StringViewArrayBuilder::append_value(self, value)
710+
}
711+
#[inline]
712+
fn append_placeholder(&mut self) {
713+
StringViewArrayBuilder::append_placeholder(self)
714+
}
715+
fn finish(self, nulls: Option<NullBuffer>) -> Result<ArrayRef> {
716+
Ok(Arc::new(StringViewArrayBuilder::finish(self, nulls)?))
717+
}
718+
}
719+
675720
/// Append a new view to the views buffer with the given substr.
676721
///
677722
/// Callers are responsible for their own null tracking.
@@ -962,4 +1007,65 @@ mod tests {
9621007
assert_eq!(array.value(i), value);
9631008
}
9641009
}
1010+
1011+
/// Build an array via `BulkNullStringArrayBuilder` to verify that the
1012+
/// trait methods produce the same result as the inherent methods.
1013+
fn build_via_trait<B: BulkNullStringArrayBuilder>(
1014+
mut builder: B,
1015+
nulls: Option<NullBuffer>,
1016+
) -> ArrayRef {
1017+
builder.append_value("a");
1018+
builder.append_placeholder();
1019+
builder.append_value("hello world!");
1020+
builder.finish(nulls).unwrap()
1021+
}
1022+
1023+
#[test]
1024+
fn bulk_null_trait_string_i32() {
1025+
let builder = GenericStringArrayBuilder::<i32>::with_capacity(3, 16);
1026+
let nulls = NullBuffer::from(vec![true, false, true]);
1027+
let array = build_via_trait(builder, Some(nulls));
1028+
let array = array.as_any().downcast_ref::<StringArray>().unwrap();
1029+
assert_eq!(array.len(), 3);
1030+
assert_eq!(array.value(0), "a");
1031+
assert!(array.is_null(1));
1032+
assert_eq!(array.value(2), "hello world!");
1033+
}
1034+
1035+
#[test]
1036+
fn bulk_null_trait_string_i64() {
1037+
let builder = GenericStringArrayBuilder::<i64>::with_capacity(3, 16);
1038+
let nulls = NullBuffer::from(vec![true, false, true]);
1039+
let array = build_via_trait(builder, Some(nulls));
1040+
let array = array.as_any().downcast_ref::<LargeStringArray>().unwrap();
1041+
assert_eq!(array.len(), 3);
1042+
assert_eq!(array.value(0), "a");
1043+
assert!(array.is_null(1));
1044+
assert_eq!(array.value(2), "hello world!");
1045+
}
1046+
1047+
#[test]
1048+
fn bulk_null_trait_string_view() {
1049+
let builder = StringViewArrayBuilder::with_capacity(3);
1050+
let nulls = NullBuffer::from(vec![true, false, true]);
1051+
let array = build_via_trait(builder, Some(nulls));
1052+
let array = array.as_any().downcast_ref::<StringViewArray>().unwrap();
1053+
assert_eq!(array.len(), 3);
1054+
assert_eq!(array.value(0), "a");
1055+
assert!(array.is_null(1));
1056+
assert_eq!(array.value(2), "hello world!");
1057+
}
1058+
1059+
#[test]
1060+
fn bulk_null_trait_no_nulls() {
1061+
let mut builder = GenericStringArrayBuilder::<i32>::with_capacity(2, 8);
1062+
BulkNullStringArrayBuilder::append_value(&mut builder, "x");
1063+
BulkNullStringArrayBuilder::append_value(&mut builder, "yy");
1064+
let array = BulkNullStringArrayBuilder::finish(builder, None).unwrap();
1065+
let array = array.as_any().downcast_ref::<StringArray>().unwrap();
1066+
assert_eq!(array.len(), 2);
1067+
assert_eq!(array.value(0), "x");
1068+
assert_eq!(array.value(1), "yy");
1069+
assert_eq!(array.null_count(), 0);
1070+
}
9651071
}

0 commit comments

Comments
 (0)