Skip to content

Commit 8152b44

Browse files
authored
feat: optimise copying in left for Utf8 and LargeUtf8 (#19980)
## Which issue does this PR close?

- Closes #19749.

## Rationale for this change

A follow-up to the optimisation of the `left` function in #19571.

## What changes are included in this PR?

- Improve memory performance to O(1) by eliminating more string copies: find the byte offset of the last character, for both positive and negative length arguments, and slice the bytes directly.
- For `Utf8View` (`StringViewArray`), implement a zero-copy slice operation that reuses the same Arrow buffers. This is possible for both views because the string only shrinks; only the German-style prefix needs to be adjusted.
- An Arrow view construction helper, `shrink_string_view_array_view`, is included in this PR. Unfortunately, the string view builders do not provide a way to reuse Arrow buffers. I believe the helper would be better placed in the core Arrow crates; I can follow up on that.

## Are these changes tested?

- Additional unit tests
- SLTs

## Are there any user-facing changes?

No
1 parent 92f60ad commit 8152b44

3 files changed

Lines changed: 294 additions & 100 deletions

File tree

datafusion/functions/benches/left.rs

Lines changed: 85 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,29 @@ use std::sync::Arc;
2222

2323
use arrow::array::{ArrayRef, Int64Array};
2424
use arrow::datatypes::{DataType, Field};
25-
use arrow::util::bench_util::create_string_array_with_len;
25+
use arrow::util::bench_util::{
26+
create_string_array_with_len, create_string_view_array_with_len,
27+
};
2628
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
2729
use datafusion_common::config::ConfigOptions;
2830
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
2931
use datafusion_functions::unicode::left;
3032

31-
fn create_args(size: usize, str_len: usize, use_negative: bool) -> Vec<ColumnarValue> {
32-
let string_array = Arc::new(create_string_array_with_len::<i32>(size, 0.1, str_len));
33+
fn create_args(
34+
size: usize,
35+
str_len: usize,
36+
use_negative: bool,
37+
is_string_view: bool,
38+
) -> Vec<ColumnarValue> {
39+
let string_arg = if is_string_view {
40+
ColumnarValue::Array(Arc::new(create_string_view_array_with_len(
41+
size, 0.1, str_len, true,
42+
)))
43+
} else {
44+
ColumnarValue::Array(Arc::new(create_string_array_with_len::<i32>(
45+
size, 0.1, str_len,
46+
)))
47+
};
3348

3449
// For negative n, we want to trigger the double-iteration code path
3550
let n_values: Vec<i64> = if use_negative {
@@ -40,70 +55,84 @@ fn create_args(size: usize, str_len: usize, use_negative: bool) -> Vec<ColumnarV
4055
let n_array = Arc::new(Int64Array::from(n_values));
4156

4257
vec![
43-
ColumnarValue::Array(string_array),
58+
string_arg,
4459
ColumnarValue::Array(Arc::clone(&n_array) as ArrayRef),
4560
]
4661
}
4762

4863
fn criterion_benchmark(c: &mut Criterion) {
49-
for size in [1024, 4096] {
50-
let mut group = c.benchmark_group(format!("left size={size}"));
64+
for is_string_view in [false, true] {
65+
for size in [1024, 4096] {
66+
let mut group = c.benchmark_group(format!("left size={size}"));
5167

52-
// Benchmark with positive n (no optimization needed)
53-
let args = create_args(size, 32, false);
54-
group.bench_function(BenchmarkId::new("positive n", size), |b| {
55-
let arg_fields = args
56-
.iter()
57-
.enumerate()
58-
.map(|(idx, arg)| {
59-
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
68+
// Benchmark with positive n (no optimization needed)
69+
let mut function_name = if is_string_view {
70+
"string_view_array positive n"
71+
} else {
72+
"string_array positive n"
73+
};
74+
let args = create_args(size, 32, false, is_string_view);
75+
group.bench_function(BenchmarkId::new(function_name, size), |b| {
76+
let arg_fields = args
77+
.iter()
78+
.enumerate()
79+
.map(|(idx, arg)| {
80+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
81+
})
82+
.collect::<Vec<_>>();
83+
let config_options = Arc::new(ConfigOptions::default());
84+
85+
b.iter(|| {
86+
black_box(
87+
left()
88+
.invoke_with_args(ScalarFunctionArgs {
89+
args: args.clone(),
90+
arg_fields: arg_fields.clone(),
91+
number_rows: size,
92+
return_field: Field::new("f", DataType::Utf8, true)
93+
.into(),
94+
config_options: Arc::clone(&config_options),
95+
})
96+
.expect("left should work"),
97+
)
6098
})
61-
.collect::<Vec<_>>();
62-
let config_options = Arc::new(ConfigOptions::default());
99+
});
63100

64-
b.iter(|| {
65-
black_box(
66-
left()
67-
.invoke_with_args(ScalarFunctionArgs {
68-
args: args.clone(),
69-
arg_fields: arg_fields.clone(),
70-
number_rows: size,
71-
return_field: Field::new("f", DataType::Utf8, true).into(),
72-
config_options: Arc::clone(&config_options),
73-
})
74-
.expect("left should work"),
75-
)
76-
})
77-
});
101+
// Benchmark with negative n (triggers optimization)
102+
function_name = if is_string_view {
103+
"string_view_array negative n"
104+
} else {
105+
"string_array negative n"
106+
};
107+
let args = create_args(size, 32, true, is_string_view);
108+
group.bench_function(BenchmarkId::new(function_name, size), |b| {
109+
let arg_fields = args
110+
.iter()
111+
.enumerate()
112+
.map(|(idx, arg)| {
113+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
114+
})
115+
.collect::<Vec<_>>();
116+
let config_options = Arc::new(ConfigOptions::default());
78117

79-
// Benchmark with negative n (triggers optimization)
80-
let args = create_args(size, 32, true);
81-
group.bench_function(BenchmarkId::new("negative n", size), |b| {
82-
let arg_fields = args
83-
.iter()
84-
.enumerate()
85-
.map(|(idx, arg)| {
86-
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
118+
b.iter(|| {
119+
black_box(
120+
left()
121+
.invoke_with_args(ScalarFunctionArgs {
122+
args: args.clone(),
123+
arg_fields: arg_fields.clone(),
124+
number_rows: size,
125+
return_field: Field::new("f", DataType::Utf8, true)
126+
.into(),
127+
config_options: Arc::clone(&config_options),
128+
})
129+
.expect("left should work"),
130+
)
87131
})
88-
.collect::<Vec<_>>();
89-
let config_options = Arc::new(ConfigOptions::default());
90-
91-
b.iter(|| {
92-
black_box(
93-
left()
94-
.invoke_with_args(ScalarFunctionArgs {
95-
args: args.clone(),
96-
arg_fields: arg_fields.clone(),
97-
number_rows: size,
98-
return_field: Field::new("f", DataType::Utf8, true).into(),
99-
config_options: Arc::clone(&config_options),
100-
})
101-
.expect("left should work"),
102-
)
103-
})
104-
});
132+
});
105133

106-
group.finish();
134+
group.finish();
135+
}
107136
}
108137
}
109138

0 commit comments

Comments
 (0)