Skip to content

Commit 97172e2

Browse files
authored
perf: Optimize left, right to reduce copying (#21442)
## Which issue does this PR close? - Closes #21441. ## Rationale for this change This PR makes two distinct optimizations to the `left` and `right` builtin UDFs: 1. The `left` and `right` built-in UDFs have a zero-copy path for `Utf8View` input, but they always copy for `Utf8` and `LargeUtf8` inputs. If we make these functions always return `Utf8View`, we can add a zero-copy path for `Utf8` and `LargeUtf8` paths as well. We can't take this path in the case when the largest offset in the input string array is > 4GB, but that is rare. This follows the recent optimization for `substr` (#21366) 2. In the code path that handles `Utf8View` input, we were constructing the return value via `StringViewArray::try_new`, which does some fairly expensive validation. We know the return value is correct by construction, so we can use `StringViewArray::new_unchecked` instead. Benchmarks (ARM64): ``` - left/string short_result: 179.6µs → 127.1µs (-29.2%) - left/string long_result: 324.3µs → 262.2µs (-19.1%) - left/string_view short_result: 220.9µs → 122.5µs (-44.5%) - left/string_view long_result: 383.1µs → 212.0µs (-44.7%) - right/string short_result: 180.4µs → 126.0µs (-30.2%) - right/string long_result: 392.0µs → 343.9µs (-12.3%) - right/string_view short_result: 228.7µs → 125.3µs (-45.2%) - right/string_view long_result: 393.6µs → 238.0µs (-39.5%) ``` ## What changes are included in this PR? * Update benchmarks to measure both inline and out-of-line string results * Change `left` and `right` return types to be `Utf8View` * Optimize `left` and `right` string array path to do zero-copy when possible * Optimize `left` and `right` string view path, and refactor it to be more similar to the array path * Add more SLT tests to cover modified code paths * Update various test expectations to reflect the new return type ## Are these changes tested? Yes; benchmarked and new tests added. ## Are there any user-facing changes? The return value of these functions have changed. This shouldn't typically break any user logic, although it might result in the planner inserting or removing casts for downstream operators, and the performance of downstream operators might either be better or worse, depending on whether the downstream code is better suited for `Utf8` or `Utf8View` string representations.
1 parent 769e214 commit 97172e2

5 files changed

Lines changed: 344 additions & 192 deletions

File tree

datafusion/functions/benches/left_right.rs

Lines changed: 54 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -16,40 +16,40 @@
1616
// under the License.
1717

1818
use std::hint::black_box;
19+
use std::ops::Range;
1920
use std::sync::Arc;
2021

2122
use arrow::array::{ArrayRef, Int64Array};
2223
use arrow::datatypes::{DataType, Field};
2324
use arrow::util::bench_util::{
2425
create_string_array_with_len, create_string_view_array_with_len,
2526
};
26-
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
27+
use criterion::{Criterion, criterion_group, criterion_main};
2728
use datafusion_common::config::ConfigOptions;
2829
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
2930
use datafusion_functions::unicode::{left, right};
3031

32+
const BATCH_SIZE: usize = 8192;
33+
3134
fn create_args(
32-
size: usize,
3335
str_len: usize,
34-
use_negative: bool,
36+
n_range: Range<i64>,
3537
is_string_view: bool,
3638
) -> Vec<ColumnarValue> {
3739
let string_arg = if is_string_view {
3840
ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
39-
size, 0.1, str_len, true,
41+
BATCH_SIZE, 0.1, str_len, true,
4042
)))
4143
} else {
4244
ColumnarValue::Array(Arc::new(create_string_array_with_len::<i32>(
43-
size, 0.1, str_len,
45+
BATCH_SIZE, 0.1, str_len,
4446
)))
4547
};
4648

47-
// For negative n, we want to trigger the double-iteration code path
48-
let n_values: Vec<i64> = if use_negative {
49-
(0..size).map(|i| -((i % 10 + 1) as i64)).collect()
50-
} else {
51-
(0..size).map(|i| (i % 10 + 1) as i64).collect()
52-
};
49+
let n_span = (n_range.end - n_range.start) as usize;
50+
let n_values: Vec<i64> = (0..BATCH_SIZE)
51+
.map(|i| n_range.start + (i % n_span) as i64)
52+
.collect();
5353
let n_array = Arc::new(Int64Array::from(n_values));
5454

5555
vec![
@@ -59,68 +59,55 @@ fn create_args(
5959
}
6060

6161
fn criterion_benchmark(c: &mut Criterion) {
62-
let left_function = left();
63-
let right_function = right();
62+
// Short results (1-10 chars) produce inline StringView entries (≤12 bytes).
63+
// Long results (20-29 chars) produce out-of-line entries.
64+
let cases = [
65+
("short_result", 32, 1..11_i64),
66+
("long_result", 32, 20..30_i64),
67+
];
6468

65-
for function in [left_function, right_function] {
66-
for is_string_view in [false, true] {
67-
for is_negative in [false, true] {
68-
for size in [1024, 4096] {
69-
let function_name = function.name();
70-
let mut group =
71-
c.benchmark_group(format!("{function_name} size={size}"));
69+
for function in [left(), right()] {
70+
let mut group = c.benchmark_group(function.name().to_string());
7271

73-
let bench_name = format!(
74-
"{} {} n",
75-
if is_string_view {
76-
"string_view_array"
77-
} else {
78-
"string_array"
79-
},
80-
if is_negative { "negative" } else { "positive" },
81-
);
82-
let return_type = if is_string_view {
83-
DataType::Utf8View
84-
} else {
85-
DataType::Utf8
86-
};
87-
88-
let args = create_args(size, 32, is_negative, is_string_view);
89-
group.bench_function(BenchmarkId::new(bench_name, size), |b| {
90-
let arg_fields = args
91-
.iter()
92-
.enumerate()
93-
.map(|(idx, arg)| {
94-
Field::new(format!("arg_{idx}"), arg.data_type(), true)
95-
.into()
96-
})
97-
.collect::<Vec<_>>();
98-
let config_options = Arc::new(ConfigOptions::default());
72+
for is_string_view in [false, true] {
73+
let array_type = if is_string_view {
74+
"string_view"
75+
} else {
76+
"string"
77+
};
9978

100-
b.iter(|| {
101-
black_box(
102-
function
103-
.invoke_with_args(ScalarFunctionArgs {
104-
args: args.clone(),
105-
arg_fields: arg_fields.clone(),
106-
number_rows: size,
107-
return_field: Field::new(
108-
"f",
109-
return_type.clone(),
110-
true,
111-
)
112-
.into(),
113-
config_options: Arc::clone(&config_options),
114-
})
115-
.expect("should work"),
116-
)
117-
})
118-
});
79+
for (case_name, str_len, n_range) in &cases {
80+
let bench_name = format!("{array_type} {case_name}");
81+
let args = create_args(*str_len, n_range.clone(), is_string_view);
82+
let arg_fields: Vec<_> = args
83+
.iter()
84+
.enumerate()
85+
.map(|(idx, arg)| {
86+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
87+
})
88+
.collect();
89+
let config_options = Arc::new(ConfigOptions::default());
90+
let return_field = Field::new("f", DataType::Utf8View, true).into();
11991

120-
group.finish();
121-
}
92+
group.bench_function(&bench_name, |b| {
93+
b.iter(|| {
94+
black_box(
95+
function
96+
.invoke_with_args(ScalarFunctionArgs {
97+
args: args.clone(),
98+
arg_fields: arg_fields.clone(),
99+
number_rows: BATCH_SIZE,
100+
return_field: Arc::clone(&return_field),
101+
config_options: Arc::clone(&config_options),
102+
})
103+
.expect("should work"),
104+
)
105+
})
106+
});
122107
}
123108
}
109+
110+
group.finish();
124111
}
125112
}
126113

datafusion/functions/src/unicode/common.rs

Lines changed: 112 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,12 @@
1818
//! Common utilities for implementing unicode functions
1919
2020
use arrow::array::{
21-
Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array,
22-
OffsetSizeTrait, StringViewArray, make_view,
21+
Array, ArrayRef, ByteView, GenericStringArray, Int64Array, OffsetSizeTrait,
22+
StringViewArray, make_view,
2323
};
2424
use arrow::datatypes::DataType;
2525
use arrow_buffer::{NullBuffer, ScalarBuffer};
26+
use datafusion_common::Result;
2627
use datafusion_common::ScalarValue;
2728
use datafusion_common::cast::{
2829
as_generic_string_array, as_int64_array, as_string_view_array,
@@ -130,17 +131,17 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
130131
/// General implementation for `left` and `right` functions
131132
pub(crate) fn general_left_right<F: LeftRightSlicer>(
132133
args: &[ArrayRef],
133-
) -> datafusion_common::Result<ArrayRef> {
134+
) -> Result<ArrayRef> {
134135
let n_array = as_int64_array(&args[1])?;
135136

136137
match args[0].data_type() {
137138
DataType::Utf8 => {
138139
let string_array = as_generic_string_array::<i32>(&args[0])?;
139-
general_left_right_array::<i32, _, F>(string_array, n_array)
140+
general_left_right_array::<i32, F>(string_array, n_array)
140141
}
141142
DataType::LargeUtf8 => {
142143
let string_array = as_generic_string_array::<i64>(&args[0])?;
143-
general_left_right_array::<i64, _, F>(string_array, n_array)
144+
general_left_right_array::<i64, F>(string_array, n_array)
144145
}
145146
DataType::Utf8View => {
146147
let string_view_array = as_string_view_array(&args[0])?;
@@ -150,83 +151,125 @@ pub(crate) fn general_left_right<F: LeftRightSlicer>(
150151
}
151152
}
152153

153-
/// `general_left_right` implementation for strings
154-
fn general_left_right_array<
155-
'a,
156-
T: OffsetSizeTrait,
157-
V: ArrayAccessor<Item = &'a str>,
158-
F: LeftRightSlicer,
159-
>(
160-
string_array: V,
154+
/// Returns true if all offsets in the array fit in i32, meaning the values
155+
/// buffer can be referenced by StringView's offset field.
156+
fn values_fit_in_i32<T: OffsetSizeTrait>(string_array: &GenericStringArray<T>) -> bool {
157+
string_array
158+
.offsets()
159+
.last()
160+
.map(|offset| offset.as_usize() <= i32::MAX as usize)
161+
.unwrap_or(true)
162+
}
163+
164+
/// `left`/`right` for Utf8/LargeUtf8 input.
165+
///
166+
/// When offsets fit in i32, produces a zero-copy `StringViewArray` with views
167+
/// pointing into the input values buffer. Otherwise falls back to building a
168+
/// `StringViewArray` by copying.
169+
fn general_left_right_array<T: OffsetSizeTrait, F: LeftRightSlicer>(
170+
string_array: &GenericStringArray<T>,
161171
n_array: &Int64Array,
162-
) -> datafusion_common::Result<ArrayRef> {
163-
let iter = ArrayIter::new(string_array);
164-
let result = iter
165-
.zip(n_array.iter())
166-
.map(|(string, n)| match (string, n) {
167-
(Some(string), Some(n)) => {
168-
let range = F::slice(string, n);
169-
// Extract a given range from a byte-indexed slice
170-
Some(&string[range])
171-
}
172-
_ => None,
173-
})
174-
.collect::<GenericStringArray<T>>();
172+
) -> Result<ArrayRef> {
173+
if !values_fit_in_i32(string_array) {
174+
let result = string_array
175+
.iter()
176+
.zip(n_array.iter())
177+
.map(|(string, n)| match (string, n) {
178+
(Some(string), Some(n)) => Some(&string[F::slice(string, n)]),
179+
_ => None,
180+
})
181+
.collect::<StringViewArray>();
182+
return Ok(Arc::new(result) as ArrayRef);
183+
}
184+
185+
let len = string_array.len();
186+
let offsets = string_array.value_offsets();
187+
let nulls = NullBuffer::union(string_array.nulls(), n_array.nulls());
188+
189+
let mut views_buf = Vec::with_capacity(len);
190+
let mut has_out_of_line = false;
191+
192+
for (i, offset) in offsets.iter().enumerate().take(len) {
193+
if nulls.as_ref().is_some_and(|n| n.is_null(i)) {
194+
views_buf.push(0);
195+
continue;
196+
}
197+
198+
// SAFETY: we just checked validity above
199+
let string = unsafe { string_array.value_unchecked(i) };
200+
let n = n_array.value(i);
201+
let range = F::slice(string, n);
202+
let result_bytes = &string.as_bytes()[range.clone()];
203+
if result_bytes.len() > 12 {
204+
has_out_of_line = true;
205+
}
206+
207+
let buf_offset = offset.as_usize() as u32 + range.start as u32;
208+
views_buf.push(make_view(result_bytes, 0, buf_offset));
209+
}
175210

176-
Ok(Arc::new(result) as ArrayRef)
211+
let views = ScalarBuffer::from(views_buf);
212+
let data_buffers = if has_out_of_line {
213+
vec![string_array.values().clone()]
214+
} else {
215+
vec![]
216+
};
217+
218+
// SAFETY:
219+
// - Each view is produced by `make_view` with correct bytes and offset
220+
// - Out-of-line views reference buffer index 0, which is the original
221+
// values buffer included in data_buffers when has_out_of_line is true
222+
// - values_fit_in_i32 guarantees all offsets fit in i32
223+
unsafe {
224+
let array = StringViewArray::new_unchecked(views, data_buffers, nulls);
225+
Ok(Arc::new(array) as ArrayRef)
226+
}
177227
}
178228

179-
/// `general_left_right` implementation for StringViewArray
229+
/// `general_left_right` for StringViewArray input.
180230
fn general_left_right_view<F: LeftRightSlicer>(
181231
string_view_array: &StringViewArray,
182232
n_array: &Int64Array,
183-
) -> datafusion_common::Result<ArrayRef> {
184-
let len = n_array.len();
185-
233+
) -> Result<ArrayRef> {
186234
let views = string_view_array.views();
187-
// Every string in StringViewArray has one corresponding view in `views`
188-
debug_assert!(views.len() == string_view_array.len());
189-
190-
// Compose null buffer at once
191-
let string_nulls = string_view_array.nulls();
192-
let n_nulls = n_array.nulls();
193-
let new_nulls = NullBuffer::union(string_nulls, n_nulls);
235+
let new_nulls = NullBuffer::union(string_view_array.nulls(), n_array.nulls());
236+
let len = n_array.len();
237+
let mut has_out_of_line = false;
194238

195239
let new_views = (0..len)
196240
.map(|idx| {
197-
let view = views[idx];
198-
199-
let is_valid = match &new_nulls {
200-
Some(nulls_buf) => nulls_buf.is_valid(idx),
201-
None => true,
202-
};
203-
204-
if is_valid {
205-
let string: &str = string_view_array.value(idx);
206-
let n = n_array.value(idx);
207-
208-
// Input string comes from StringViewArray, so it should fit in 32-bit length
209-
let range = F::slice(string, n);
210-
let result_bytes = &string.as_bytes()[range.clone()];
211-
212-
let byte_view = ByteView::from(view);
213-
// New offset starts at 0 for left, and at `range.start` for right,
214-
// which is encoded in the given range
215-
let new_offset = byte_view.offset + (range.start as u32);
216-
// Reuse buffer
217-
make_view(result_bytes, byte_view.buffer_index, new_offset)
218-
} else {
219-
// For nulls, keep the original view
220-
view
241+
if new_nulls.as_ref().is_some_and(|n| n.is_null(idx)) {
242+
return 0;
221243
}
244+
245+
// SAFETY: we just checked validity above
246+
let string: &str = unsafe { string_view_array.value_unchecked(idx) };
247+
let n = n_array.value(idx);
248+
249+
let range = F::slice(string, n);
250+
let result_bytes = &string.as_bytes()[range.clone()];
251+
if result_bytes.len() > 12 {
252+
has_out_of_line = true;
253+
}
254+
255+
let byte_view = ByteView::from(views[idx]);
256+
let new_offset = byte_view.offset + (range.start as u32);
257+
make_view(result_bytes, byte_view.buffer_index, new_offset)
222258
})
223259
.collect::<Vec<u128>>();
224260

225-
// Buffers are unchanged
226-
let result = StringViewArray::try_new(
227-
ScalarBuffer::from(new_views),
228-
Vec::from(string_view_array.data_buffers()),
229-
new_nulls,
230-
)?;
231-
Ok(Arc::new(result) as ArrayRef)
261+
let views = ScalarBuffer::from(new_views);
262+
let data_buffers = if has_out_of_line {
263+
string_view_array.data_buffers().to_vec()
264+
} else {
265+
vec![]
266+
};
267+
268+
// SAFETY:
269+
// - Each view is produced by `make_view` with correct bytes and offset
270+
// - Out-of-line views reuse the original buffer index and adjusted offset
271+
unsafe {
272+
let array = StringViewArray::new_unchecked(views, data_buffers, new_nulls);
273+
Ok(Arc::new(array) as ArrayRef)
274+
}
232275
}

0 commit comments

Comments
 (0)