Skip to content

Commit 8939726

Browse files
authored
perf: Optimize NULL handling in substr (#21519)
## Which issue does this PR close? - Closes #21518. ## Rationale for this change Similar to other recent changes, `substr` currently checks for NULLs and builds the result NULL bitmap on a per-row basis. It is faster to instead compute the result NULL bitmap in bulk via bitwise AND. Benchmarks (ARM64): ``` - substr, no count, short strings/substr_large_string [size=1024]: 21.4µs → 20.9µs (-2.3%) - substr, no count, short strings/substr_large_string [size=4096]: 83.1µs → 83.0µs (-0.1%) - substr, no count, short strings/substr_string [size=1024]: 20.5µs → 19.8µs (-3.4%) - substr, no count, short strings/substr_string [size=4096]: 78.8µs → 77.0µs (-2.3%) - substr, no count, short strings/substr_string_view [size=1024]: 18.9µs → 16.1µs (-14.8%) - substr, no count, short strings/substr_string_view [size=4096]: 74.0µs → 61.6µs (-16.8%) - substr, scalar args, long strings/substr_large_string [size=1024]: 35.2µs → 34.0µs (-3.4%) - substr, scalar args, long strings/substr_large_string [size=4096]: 140.6µs → 134.5µs (-4.3%) - substr, scalar args, long strings/substr_string [size=1024]: 35.5µs → 33.8µs (-4.8%) - substr, scalar args, long strings/substr_string [size=4096]: 138.9µs → 134.2µs (-3.4%) - substr, scalar args, long strings/substr_string_view [size=1024]: 34.0µs → 31.0µs (-8.8%) - substr, scalar args, long strings/substr_string_view [size=4096]: 132.0µs → 121.8µs (-7.7%) - substr, scalar args, short strings/substr_string [size=1024]: 31.0µs → 29.2µs (-5.8%) - substr, scalar args, short strings/substr_string [size=4096]: 120.8µs → 111.5µs (-7.7%) - substr, scalar args, short strings/substr_string_view [size=1024]: 26.8µs → 23.1µs (-13.8%) - substr, scalar args, short strings/substr_string_view [size=4096]: 101.6µs → 86.4µs (-14.9%) - substr, scalar start, no count, long strings/substr_string [size=1024]: 34.5µs → 33.2µs (-3.8%) - substr, scalar start, no count, long strings/substr_string [size=4096]: 134.4µs → 133.6µs (-0.6%) - substr, scalar start, no count, long strings/substr_string_view [size=1024]: 32.9µs → 29.4µs (-10.6%) - substr, scalar start, no count, long strings/substr_string_view [size=4096]: 126.1µs → 115.2µs (-8.6%) - substr, scalar start, no count, short strings/substr_string [size=1024]: 20.9µs → 20.1µs (-3.8%) - substr, scalar start, no count, short strings/substr_string [size=4096]: 80.1µs → 77.5µs (-3.2%) - substr, scalar start, no count, short strings/substr_string_view [size=1024]: 19.9µs → 16.7µs (-16.1%) - substr, scalar start, no count, short strings/substr_string_view [size=4096]: 74.4µs → 62.4µs (-16.1%) - substr, short count, long strings/substr_large_string [size=1024]: 30.3µs → 28.4µs (-6.3%) - substr, short count, long strings/substr_large_string [size=4096]: 117.1µs → 112.0µs (-4.4%) - substr, short count, long strings/substr_string [size=1024]: 30.2µs → 28.3µs (-6.3%) - substr, short count, long strings/substr_string [size=4096]: 118.0µs → 111.0µs (-5.9%) - substr, short count, long strings/substr_string_view [size=1024]: 26.1µs → 22.8µs (-12.6%) - substr, short count, long strings/substr_string_view [size=4096]: 101.5µs → 87.7µs (-13.6%) - substr, with count, long strings/substr_large_string [size=1024]: 34.6µs → 32.8µs (-5.2%) - substr, with count, long strings/substr_large_string [size=4096]: 136.7µs → 133.0µs (-2.7%) - substr, with count, long strings/substr_string [size=1024]: 34.2µs → 32.7µs (-4.4%) - substr, with count, long strings/substr_string [size=4096]: 136.6µs → 132.3µs (-3.1%) - substr, with count, long strings/substr_string_view [size=1024]: 33.3µs → 30.3µs (-9.0%) - substr, with count, long strings/substr_string_view [size=4096]: 129.1µs → 119.6µs (-7.4%) ``` ## What changes are included in this PR? * Implement optimization * Rename `make_and_append_view` to `append_view`, and have callers deal with NULL handling; making it part of `append_view` encourages per-row NULL computations, which should be avoided when possible. * Mark `append_view` as never-inline; this avoids a performance regression on some of the `substr` microbenchmarks, where LLVM is a little eager to inline a large-ish function into a hot loop. ## Are these changes tested? Yes. ## Are there any user-facing changes? No.
1 parent e8d217a commit 8939726

3 files changed

Lines changed: 38 additions & 49 deletions

File tree

datafusion/functions/src/string/common.rs

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
2020
use std::sync::Arc;
2121

22-
use crate::strings::make_and_append_view;
22+
use crate::strings::append_view;
2323
use arrow::array::{
2424
Array, ArrayRef, GenericStringArray, GenericStringBuilder, NullBufferBuilder,
2525
OffsetSizeTrait, StringViewArray, StringViewBuilder, new_null_array,
@@ -152,13 +152,8 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
152152
{
153153
if let Some(src_str) = src_str_opt {
154154
let (trimmed, offset) = Tr::trim_ascii_char(src_str, b' ');
155-
make_and_append_view(
156-
&mut views_buf,
157-
&mut null_builder,
158-
raw_view,
159-
trimmed,
160-
offset,
161-
);
155+
append_view(&mut views_buf, raw_view, trimmed, offset);
156+
null_builder.append_non_null();
162157
} else {
163158
null_builder.append_null();
164159
views_buf.push(0);
@@ -204,13 +199,8 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
204199
pattern.clear();
205200
pattern.extend(characters.chars());
206201
let (trimmed, offset) = Tr::trim(src_str, &pattern);
207-
make_and_append_view(
208-
&mut views_buf,
209-
&mut null_builder,
210-
raw_view,
211-
trimmed,
212-
offset,
213-
);
202+
append_view(&mut views_buf, raw_view, trimmed, offset);
203+
null_builder.append_non_null();
214204
} else {
215205
null_builder.append_null();
216206
views_buf.push(0);
@@ -261,7 +251,8 @@ fn trim_and_append_view<Tr: Trimmer>(
261251
) {
262252
if let Some(src_str) = src_str_opt {
263253
let (trimmed, offset) = Tr::trim(src_str, pattern);
264-
make_and_append_view(views_buf, null_builder, original_view, trimmed, offset);
254+
append_view(views_buf, original_view, trimmed, offset);
255+
null_builder.append_non_null();
265256
} else {
266257
null_builder.append_null();
267258
views_buf.push(0);

datafusion/functions/src/strings.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use datafusion_common::{Result, exec_datafusion_err, internal_err};
2121

2222
use arrow::array::{
2323
Array, ArrayAccessor, ArrayDataBuilder, BinaryArray, ByteView, LargeStringArray,
24-
NullBufferBuilder, StringArray, StringViewArray, StringViewBuilder, make_view,
24+
StringArray, StringViewArray, StringViewBuilder, make_view,
2525
};
2626
use arrow::buffer::{MutableBuffer, NullBuffer};
2727
use arrow::datatypes::DataType;
@@ -372,7 +372,9 @@ impl LargeStringArrayBuilder {
372372
}
373373
}
374374

375-
/// Append a new view to the views buffer with the given substr
375+
/// Append a new view to the views buffer with the given substr.
376+
///
377+
/// Callers are responsible for their own null tracking.
376378
///
377379
/// # Safety
378380
///
@@ -381,13 +383,15 @@ impl LargeStringArrayBuilder {
381383
///
382384
/// # Arguments
383385
/// - views_buffer: The buffer to append the new view to
384-
/// - null_builder: The buffer to append the null value to
385386
/// - original_view: The original view value
386387
/// - substr: The substring to append. Must be a valid substring of the original view
387388
/// - start_offset: The start offset of the substring in the view
388-
pub fn make_and_append_view(
389+
///
390+
/// LLVM is apparently overly eager to inline this function into some hot loops,
391+
/// which bloats them and regresses performance, so we disable inling for now.
392+
#[inline(never)]
393+
pub fn append_view(
389394
views_buffer: &mut Vec<u128>,
390-
null_builder: &mut NullBufferBuilder,
391395
original_view: &u128,
392396
substr: &str,
393397
start_offset: u32,
@@ -401,11 +405,9 @@ pub fn make_and_append_view(
401405
view.offset + start_offset,
402406
)
403407
} else {
404-
// inline value does not need block id or offset
405408
make_view(substr.as_bytes(), 0, 0)
406409
};
407410
views_buffer.push(sub_view);
408-
null_builder.append_non_null();
409411
}
410412

411413
#[derive(Debug)]

datafusion/functions/src/unicode/substr.rs

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717

1818
use std::sync::Arc;
1919

20-
use crate::strings::make_and_append_view;
20+
use crate::strings::append_view;
2121
use crate::utils::make_scalar_function;
2222
use arrow::array::{
23-
Array, ArrayRef, AsArray, Int64Array, NullBufferBuilder, StringArrayType,
24-
StringViewArray, StringViewBuilder,
23+
Array, ArrayRef, AsArray, Int64Array, StringArrayType, StringViewArray,
24+
StringViewBuilder,
2525
};
26-
use arrow::buffer::ScalarBuffer;
26+
use arrow::buffer::{NullBuffer, ScalarBuffer};
2727
use arrow::datatypes::DataType;
2828
use datafusion_common::cast::as_int64_array;
2929
use datafusion_common::types::{
@@ -278,39 +278,32 @@ fn string_view_substr(
278278
let enable_ascii_fast_path =
279279
enable_ascii_fast_path(&string_view_array, start_array, count_array_opt);
280280

281+
// Combine null bitmaps from all inputs in bulk.
282+
let nulls = NullBuffer::union(
283+
NullBuffer::union(string_view_array.nulls(), start_array.nulls()).as_ref(),
284+
count_array_opt.and_then(|a| a.nulls()),
285+
);
286+
281287
let mut views_buf = Vec::with_capacity(string_view_array.len());
282-
let mut null_builder = NullBufferBuilder::new(string_view_array.len());
283-
284-
for i in 0..string_view_array.len() {
285-
if string_view_array.is_null(i)
286-
|| start_array.is_null(i)
287-
|| count_array_opt.map(|a| a.is_null(i)).unwrap_or(false)
288-
{
289-
null_builder.append_null();
288+
289+
for (i, raw_view) in string_view_array.views().iter().enumerate() {
290+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
290291
views_buf.push(0);
291292
continue;
292293
}
293294

294295
let string = string_view_array.value(i);
295296
let start = start_array.value(i);
296297
let count = count_array_opt.map(|a| a.value(i));
297-
let raw_view = string_view_array.views()[i];
298298

299299
let (start, end) =
300300
get_true_start_end(string, start, count, enable_ascii_fast_path)?;
301301
let substr = &string[start..end];
302302

303-
make_and_append_view(
304-
&mut views_buf,
305-
&mut null_builder,
306-
&raw_view,
307-
substr,
308-
start as u32,
309-
);
303+
append_view(&mut views_buf, raw_view, substr, start as u32);
310304
}
311305

312306
let views_buf = ScalarBuffer::from(views_buf);
313-
let nulls_buf = null_builder.finish();
314307

315308
// Safety:
316309
// (1) The blocks of the given views are all provided
@@ -320,7 +313,7 @@ fn string_view_substr(
320313
let array = StringViewArray::new_unchecked(
321314
views_buf,
322315
string_view_array.data_buffers().to_vec(),
323-
nulls_buf,
316+
nulls,
324317
);
325318
Ok(Arc::new(array) as ArrayRef)
326319
}
@@ -336,13 +329,16 @@ where
336329
let enable_ascii_fast_path =
337330
enable_ascii_fast_path(&string_array, start_array, count_array_opt);
338331

332+
// Combine null bitmaps from all inputs in bulk.
333+
let nulls = NullBuffer::union(
334+
NullBuffer::union(string_array.nulls(), start_array.nulls()).as_ref(),
335+
count_array_opt.and_then(|a| a.nulls()),
336+
);
337+
339338
let mut result_builder = StringViewBuilder::new();
340339

341340
for i in 0..string_array.len() {
342-
if string_array.is_null(i)
343-
|| start_array.is_null(i)
344-
|| count_array_opt.map(|a| a.is_null(i)).unwrap_or(false)
345-
{
341+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
346342
result_builder.append_null();
347343
continue;
348344
}

0 commit comments

Comments
 (0)