Skip to content

Commit 7d5ddca

Browse files
authored
perf: Optimize lower, upper for sliced arrays (#21814)
## Which issue does this PR close?

- Closes #21804.

## Rationale for this change

`case_conversion_ascii_array` operates directly on the underlying values buffer, but it neglects to ensure it only looks at bytes within the visible slice. For sliced arrays, this can lead to doing substantial unnecessary work.

## What changes are included in this PR?

* Optimize `case_conversion_ascii_array` for sliced arrays.
* Add a unit test.
* Add a benchmark. We can make the "sliced array" case arbitrarily extreme, so the raw benchmark number here is less important; it is more important that this benchmark confirms that the work we do scales with the visible size of a sliced array, which it does.

## Are these changes tested?

Yes.

## Are there any user-facing changes?

No.
1 parent 85e75e2 commit 7d5ddca

3 files changed

Lines changed: 103 additions & 30 deletions

File tree

datafusion/functions/benches/lower.rs

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::array::{ArrayRef, StringArray, StringViewBuilder};
18+
use arrow::array::{Array, ArrayRef, StringArray, StringViewBuilder};
1919
use arrow::datatypes::{DataType, Field};
2020
use arrow::util::bench_util::{
2121
create_string_array_with_len, create_string_view_array_with_len,
@@ -195,6 +195,43 @@ fn criterion_benchmark(c: &mut Criterion) {
195195
);
196196
}
197197

198+
{
199+
let parent_size = 65536;
200+
let slice_len = 128;
201+
let str_len = 32;
202+
let parent = Arc::new(create_string_array_with_len::<i32>(
203+
parent_size,
204+
0.2,
205+
str_len,
206+
)) as ArrayRef;
207+
let offset = (parent_size - slice_len) / 2;
208+
let sliced = parent.slice(offset, slice_len);
209+
let args = vec![ColumnarValue::Array(sliced)];
210+
let arg_fields = args
211+
.iter()
212+
.enumerate()
213+
.map(|(idx, arg)| {
214+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
215+
})
216+
.collect::<Vec<_>>();
217+
218+
c.bench_function(
219+
&format!("lower_sliced_ascii: parent={parent_size}, slice={slice_len}, str_len={str_len}"),
220+
|b| {
221+
b.iter(|| {
222+
let args_cloned = args.clone();
223+
black_box(lower.invoke_with_args(ScalarFunctionArgs {
224+
args: args_cloned,
225+
arg_fields: arg_fields.clone(),
226+
number_rows: slice_len,
227+
return_field: Field::new("f", DataType::Utf8, true).into(),
228+
config_options: Arc::clone(&config_options),
229+
}))
230+
})
231+
},
232+
);
233+
}
234+
198235
let sizes = [4096, 8192];
199236
let str_lens = [10, 64, 128];
200237
let mixes = [true, false];

datafusion/functions/src/string/common.rs

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use arrow::array::{
2424
Array, ArrayRef, GenericStringArray, GenericStringBuilder, NullBufferBuilder,
2525
OffsetSizeTrait, StringViewArray, StringViewBuilder, new_null_array,
2626
};
27-
use arrow::buffer::{Buffer, ScalarBuffer};
27+
use arrow::buffer::{Buffer, OffsetBuffer, ScalarBuffer};
2828
use arrow::datatypes::DataType;
2929
use datafusion_common::Result;
3030
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
@@ -390,16 +390,16 @@ where
390390
const PRE_ALLOC_BYTES: usize = 8;
391391

392392
let string_array = as_generic_string_array::<O>(array)?;
393-
let value_data = string_array.value_data();
394-
395-
// All values are ASCII.
396-
if value_data.is_ascii() {
393+
if string_array.is_ascii() {
397394
return case_conversion_ascii_array::<O, _>(string_array, op);
398395
}
399396

400397
// Values contain non-ASCII.
401398
let item_len = string_array.len();
402-
let capacity = string_array.value_data().len() + PRE_ALLOC_BYTES;
399+
let offsets = string_array.value_offsets();
400+
let start = offsets.first().unwrap().as_usize();
401+
let end = offsets.last().unwrap().as_usize();
402+
let capacity = (end - start) + PRE_ALLOC_BYTES;
403403
let mut builder = GenericStringBuilder::<O>::with_capacity(item_len, capacity);
404404

405405
if string_array.null_count() == 0 {
@@ -413,9 +413,10 @@ where
413413
Ok(Arc::new(builder.finish()))
414414
}
415415

416-
/// All values of string_array are ASCII, and when converting case, there is no changes in the byte
417-
/// array length. Therefore, the StringArray can be treated as a complete ASCII string for
418-
/// case conversion, and we can reuse the offsets buffer and the nulls buffer.
416+
/// Fast path for case conversion on an all-ASCII string array. ASCII case
417+
/// conversion is byte-length-preserving, so we can convert the entire addressed
418+
/// range in one call and reuse the offsets and nulls buffers — rebasing the
419+
/// offsets when the input is a sliced array.
419420
fn case_conversion_ascii_array<'a, O, F>(
420421
string_array: &'a GenericStringArray<O>,
421422
op: F,
@@ -424,21 +425,33 @@ where
424425
O: OffsetSizeTrait,
425426
F: Fn(&'a str) -> String,
426427
{
427-
let value_data = string_array.value_data();
428-
// SAFETY: all items stored in value_data satisfy UTF8.
429-
// ref: impl ByteArrayNativeType for str {...}
430-
let str_values = unsafe { std::str::from_utf8_unchecked(value_data) };
428+
let value_offsets = string_array.value_offsets();
429+
let start = value_offsets.first().unwrap().as_usize();
430+
let end = value_offsets.last().unwrap().as_usize();
431+
let relevant = &string_array.value_data()[start..end];
432+
433+
// SAFETY: `relevant` is a subslice of the string array's value buffer,
434+
// which is valid UTF-8.
435+
let str_values = unsafe { std::str::from_utf8_unchecked(relevant) };
431436

432-
// conversion
433437
let converted_values = op(str_values);
434-
assert_eq!(converted_values.len(), str_values.len());
435-
let bytes = converted_values.into_bytes();
438+
debug_assert_eq!(converted_values.len(), str_values.len());
439+
let values = Buffer::from_vec(converted_values.into_bytes());
440+
441+
// Shift offsets from `start`-based to 0-based so they index into `values`.
442+
let offsets = if start == 0 {
443+
string_array.offsets().clone()
444+
} else {
445+
let s = O::usize_as(start);
446+
let rebased: Vec<O> = value_offsets.iter().map(|&o| o - s).collect();
447+
// SAFETY: subtracting a constant from monotonic offsets preserves
448+
// monotonicity, and `start` is the minimum offset so no underflow.
449+
unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(rebased)) }
450+
};
436451

437-
// build result
438-
let values = Buffer::from_vec(bytes);
439-
let offsets = string_array.offsets().clone();
440452
let nulls = string_array.nulls().cloned();
441-
// SAFETY: offsets and nulls are consistent with the input array.
453+
// SAFETY: offsets are monotonic and in-bounds for `values`; nulls
454+
// (if any) match the slice length.
442455
Ok(Arc::new(unsafe {
443456
GenericStringArray::<O>::new_unchecked(offsets, values, nulls)
444457
}))

datafusion/functions/src/string/lower.rs

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -96,22 +96,24 @@ mod tests {
9696
use datafusion_common::config::ConfigOptions;
9797
use std::sync::Arc;
9898

99-
fn to_lower(input: ArrayRef, expected: ArrayRef) -> Result<()> {
99+
fn invoke_lower(input: ArrayRef) -> Result<ArrayRef> {
100100
let func = LowerFunc::new();
101-
let arg_fields = vec![Field::new("a", input.data_type().clone(), true).into()];
102-
101+
let data_type = input.data_type().clone();
103102
let args = ScalarFunctionArgs {
104103
number_rows: input.len(),
105104
args: vec![ColumnarValue::Array(input)],
106-
arg_fields,
107-
return_field: Field::new("f", expected.data_type().clone(), true).into(),
105+
arg_fields: vec![Field::new("a", data_type.clone(), true).into()],
106+
return_field: Field::new("f", data_type, true).into(),
108107
config_options: Arc::new(ConfigOptions::default()),
109108
};
110-
111-
let result = match func.invoke_with_args(args)? {
112-
ColumnarValue::Array(result) => result,
109+
match func.invoke_with_args(args)? {
110+
ColumnarValue::Array(r) => Ok(r),
113111
_ => unreachable!("lower"),
114-
};
112+
}
113+
}
114+
115+
fn to_lower(input: ArrayRef, expected: ArrayRef) -> Result<()> {
116+
let result = invoke_lower(input)?;
115117
assert_eq!(&expected, &result);
116118
Ok(())
117119
}
@@ -207,4 +209,25 @@ mod tests {
207209

208210
to_lower(input, expected)
209211
}
212+
213+
#[test]
214+
fn lower_sliced_utf8() -> Result<()> {
215+
let parent = Arc::new(StringArray::from(vec![
216+
Some("AAAAAAAA"),
217+
Some("HELLO"),
218+
Some("WORLD"),
219+
Some(""),
220+
Some("ZZZZZZZZ"),
221+
])) as ArrayRef;
222+
let sliced = parent.slice(1, 3);
223+
let result = invoke_lower(sliced)?;
224+
let result_sa = result.as_any().downcast_ref::<StringArray>().unwrap();
225+
226+
let expected = StringArray::from(vec![Some("hello"), Some("world"), Some("")]);
227+
assert_eq!(result_sa, &expected);
228+
// The slice's addressed bytes are "HELLO" + "WORLD" = 10; the ASCII
229+
// fast path must produce a tight output buffer (not the parent's).
230+
assert_eq!(result_sa.value_data().len(), 10);
231+
Ok(())
232+
}
210233
}

0 commit comments

Comments
 (0)