Skip to content

Commit b7f6090

Browse files
neilconway and martin-g authored
perf: Optimize lpad, rpad for ASCII strings (#20278)
The previous implementation incurred the overhead of Unicode machinery, even for the common case that both the input string and the fill string consist only of ASCII characters. For the ASCII-only case, we can assume that the length in bytes equals the length in characters, and avoid expensive grapheme-based segmentation. This follows similar optimizations applied elsewhere in the codebase. Benchmarks indicate this is a significant performance win for ASCII-only input (4x-10x faster) but only a mild regression for Unicode input (2-5% slower). Along the way: * Replace a few instances of `write_str(str)? + append_value("")` with `append_value(str)`, which saves a few cycles * Add a missing test case for truncating the input string * Add benchmarks for Unicode input ## Which issue does this PR close? - Closes #20277. ## Are these changes tested? Covered by existing tests. Added new benchmarks for Unicode inputs. ## Are there any user-facing changes? No. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent 3157a2e commit b7f6090

4 files changed

Lines changed: 348 additions & 54 deletions

File tree

datafusion/functions/benches/pad.rs

Lines changed: 181 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::array::{ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
18+
use arrow::array::{
19+
ArrowPrimitiveType, GenericStringBuilder, OffsetSizeTrait, PrimitiveArray,
20+
StringViewBuilder,
21+
};
1922
use arrow::datatypes::{DataType, Field, Int64Type};
2023
use arrow::util::bench_util::{
2124
create_string_array_with_len, create_string_view_array_with_len,
@@ -30,6 +33,51 @@ use std::hint::black_box;
3033
use std::sync::Arc;
3134
use std::time::Duration;
3235

36+
const UNICODE_STRINGS: &[&str] = &[
37+
"Ñandú",
38+
"Íslensku",
39+
"Þjóðarinnar",
40+
"Ελληνική",
41+
"Иванович",
42+
"データフュージョン",
43+
"José García",
44+
"Ölçü bïrïmï",
45+
"Ÿéšṱëṟḏàÿ",
46+
"Ährenstraße",
47+
];
48+
49+
fn create_unicode_string_array<O: OffsetSizeTrait>(
50+
size: usize,
51+
null_density: f32,
52+
) -> arrow::array::GenericStringArray<O> {
53+
let mut rng = rand::rng();
54+
let mut builder = GenericStringBuilder::<O>::new();
55+
for i in 0..size {
56+
if rng.random::<f32>() < null_density {
57+
builder.append_null();
58+
} else {
59+
builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
60+
}
61+
}
62+
builder.finish()
63+
}
64+
65+
fn create_unicode_string_view_array(
66+
size: usize,
67+
null_density: f32,
68+
) -> arrow::array::StringViewArray {
69+
let mut rng = rand::rng();
70+
let mut builder = StringViewBuilder::with_capacity(size);
71+
for i in 0..size {
72+
if rng.random::<f32>() < null_density {
73+
builder.append_null();
74+
} else {
75+
builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
76+
}
77+
}
78+
builder.finish()
79+
}
80+
3381
struct Filter<Dist> {
3482
dist: Dist,
3583
}
@@ -67,6 +115,34 @@ where
67115
.collect()
68116
}
69117

118+
/// Create args for pad benchmark with Unicode strings
119+
fn create_unicode_pad_args(
120+
size: usize,
121+
target_len: usize,
122+
use_string_view: bool,
123+
) -> Vec<ColumnarValue> {
124+
let length_array =
125+
Arc::new(create_primitive_array::<Int64Type>(size, 0.0, target_len));
126+
127+
if use_string_view {
128+
let string_array = create_unicode_string_view_array(size, 0.1);
129+
let fill_array = create_unicode_string_view_array(size, 0.1);
130+
vec![
131+
ColumnarValue::Array(Arc::new(string_array)),
132+
ColumnarValue::Array(length_array),
133+
ColumnarValue::Array(Arc::new(fill_array)),
134+
]
135+
} else {
136+
let string_array = create_unicode_string_array::<i32>(size, 0.1);
137+
let fill_array = create_unicode_string_array::<i32>(size, 0.1);
138+
vec![
139+
ColumnarValue::Array(Arc::new(string_array)),
140+
ColumnarValue::Array(length_array),
141+
ColumnarValue::Array(Arc::new(fill_array)),
142+
]
143+
}
144+
}
145+
70146
/// Create args for pad benchmark
71147
fn create_pad_args<O: OffsetSizeTrait>(
72148
size: usize,
@@ -208,6 +284,58 @@ fn criterion_benchmark(c: &mut Criterion) {
208284
},
209285
);
210286

287+
// Utf8 type with Unicode strings
288+
let args = create_unicode_pad_args(size, 20, false);
289+
let arg_fields = args
290+
.iter()
291+
.enumerate()
292+
.map(|(idx, arg)| {
293+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
294+
})
295+
.collect::<Vec<_>>();
296+
297+
group.bench_function(
298+
format!("lpad utf8 unicode [size={size}, target=20]"),
299+
|b| {
300+
b.iter(|| {
301+
let args_cloned = args.clone();
302+
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
303+
args: args_cloned,
304+
arg_fields: arg_fields.clone(),
305+
number_rows: size,
306+
return_field: Field::new("f", DataType::Utf8, true).into(),
307+
config_options: Arc::clone(&config_options),
308+
}))
309+
})
310+
},
311+
);
312+
313+
// StringView type with Unicode strings
314+
let args = create_unicode_pad_args(size, 20, true);
315+
let arg_fields = args
316+
.iter()
317+
.enumerate()
318+
.map(|(idx, arg)| {
319+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
320+
})
321+
.collect::<Vec<_>>();
322+
323+
group.bench_function(
324+
format!("lpad stringview unicode [size={size}, target=20]"),
325+
|b| {
326+
b.iter(|| {
327+
let args_cloned = args.clone();
328+
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
329+
args: args_cloned,
330+
arg_fields: arg_fields.clone(),
331+
number_rows: size,
332+
return_field: Field::new("f", DataType::Utf8View, true).into(),
333+
config_options: Arc::clone(&config_options),
334+
}))
335+
})
336+
},
337+
);
338+
211339
group.finish();
212340
}
213341

@@ -322,6 +450,58 @@ fn criterion_benchmark(c: &mut Criterion) {
322450
},
323451
);
324452

453+
// Utf8 type with Unicode strings
454+
let args = create_unicode_pad_args(size, 20, false);
455+
let arg_fields = args
456+
.iter()
457+
.enumerate()
458+
.map(|(idx, arg)| {
459+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
460+
})
461+
.collect::<Vec<_>>();
462+
463+
group.bench_function(
464+
format!("rpad utf8 unicode [size={size}, target=20]"),
465+
|b| {
466+
b.iter(|| {
467+
let args_cloned = args.clone();
468+
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
469+
args: args_cloned,
470+
arg_fields: arg_fields.clone(),
471+
number_rows: size,
472+
return_field: Field::new("f", DataType::Utf8, true).into(),
473+
config_options: Arc::clone(&config_options),
474+
}))
475+
})
476+
},
477+
);
478+
479+
// StringView type with Unicode strings
480+
let args = create_unicode_pad_args(size, 20, true);
481+
let arg_fields = args
482+
.iter()
483+
.enumerate()
484+
.map(|(idx, arg)| {
485+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
486+
})
487+
.collect::<Vec<_>>();
488+
489+
group.bench_function(
490+
format!("rpad stringview unicode [size={size}, target=20]"),
491+
|b| {
492+
b.iter(|| {
493+
let args_cloned = args.clone();
494+
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
495+
args: args_cloned,
496+
arg_fields: arg_fields.clone(),
497+
number_rows: size,
498+
return_field: Field::new("f", DataType::Utf8View, true).into(),
499+
config_options: Arc::clone(&config_options),
500+
}))
501+
})
502+
},
503+
);
504+
325505
group.finish();
326506
}
327507
}

datafusion/functions/src/unicode/lpad.rs

Lines changed: 77 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@ use datafusion_macros::user_doc;
4949
+---------------------------------------------+
5050
```"#,
5151
standard_argument(name = "str", prefix = "String"),
52-
argument(name = "n", description = "String length to pad to."),
52+
argument(
53+
name = "n",
54+
description = "String length to pad to. If the input string is longer than this length, it is truncated (on the right)."
55+
),
5356
argument(
5457
name = "padding_str",
5558
description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
@@ -225,24 +228,47 @@ where
225228
continue;
226229
}
227230

228-
// Reuse buffers by clearing and refilling
229-
graphemes_buf.clear();
230-
graphemes_buf.extend(string.graphemes(true));
231-
232-
fill_chars_buf.clear();
233-
fill_chars_buf.extend(fill.chars());
234-
235-
if length < graphemes_buf.len() {
236-
builder.append_value(graphemes_buf[..length].concat());
237-
} else if fill_chars_buf.is_empty() {
238-
builder.append_value(string);
231+
if string.is_ascii() && fill.is_ascii() {
232+
// ASCII fast path: byte length == character length,
233+
// so we skip expensive grapheme segmentation.
234+
let str_len = string.len();
235+
if length < str_len {
236+
builder.append_value(&string[..length]);
237+
} else if fill.is_empty() {
238+
builder.append_value(string);
239+
} else {
240+
let pad_len = length - str_len;
241+
let fill_len = fill.len();
242+
let full_reps = pad_len / fill_len;
243+
let remainder = pad_len % fill_len;
244+
for _ in 0..full_reps {
245+
builder.write_str(fill)?;
246+
}
247+
if remainder > 0 {
248+
builder.write_str(&fill[..remainder])?;
249+
}
250+
builder.append_value(string);
251+
}
239252
} else {
240-
for l in 0..length - graphemes_buf.len() {
241-
let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
242-
builder.write_char(c)?;
253+
// Reuse buffers by clearing and refilling
254+
graphemes_buf.clear();
255+
graphemes_buf.extend(string.graphemes(true));
256+
257+
fill_chars_buf.clear();
258+
fill_chars_buf.extend(fill.chars());
259+
260+
if length < graphemes_buf.len() {
261+
builder.append_value(graphemes_buf[..length].concat());
262+
} else if fill_chars_buf.is_empty() {
263+
builder.append_value(string);
264+
} else {
265+
for l in 0..length - graphemes_buf.len() {
266+
let c =
267+
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
268+
builder.write_char(c)?;
269+
}
270+
builder.append_value(string);
243271
}
244-
builder.write_str(string)?;
245-
builder.append_value("");
246272
}
247273
} else {
248274
builder.append_null();
@@ -266,17 +292,30 @@ where
266292
continue;
267293
}
268294

269-
// Reuse buffer by clearing and refilling
270-
graphemes_buf.clear();
271-
graphemes_buf.extend(string.graphemes(true));
272-
273-
if length < graphemes_buf.len() {
274-
builder.append_value(graphemes_buf[..length].concat());
295+
if string.is_ascii() {
296+
// ASCII fast path: byte length == character length
297+
let str_len = string.len();
298+
if length < str_len {
299+
builder.append_value(&string[..length]);
300+
} else {
301+
for _ in 0..(length - str_len) {
302+
builder.write_str(" ")?;
303+
}
304+
builder.append_value(string);
305+
}
275306
} else {
276-
builder
277-
.write_str(" ".repeat(length - graphemes_buf.len()).as_str())?;
278-
builder.write_str(string)?;
279-
builder.append_value("");
307+
// Reuse buffer by clearing and refilling
308+
graphemes_buf.clear();
309+
graphemes_buf.extend(string.graphemes(true));
310+
311+
if length < graphemes_buf.len() {
312+
builder.append_value(graphemes_buf[..length].concat());
313+
} else {
314+
for _ in 0..(length - graphemes_buf.len()) {
315+
builder.write_str(" ")?;
316+
}
317+
builder.append_value(string);
318+
}
280319
}
281320
} else {
282321
builder.append_null();
@@ -523,6 +562,17 @@ mod tests {
523562
None,
524563
Ok(None)
525564
);
565+
test_lpad!(
566+
Some("hello".into()),
567+
ScalarValue::Int64(Some(2i64)),
568+
Ok(Some("he"))
569+
);
570+
test_lpad!(
571+
Some("hi".into()),
572+
ScalarValue::Int64(Some(6i64)),
573+
Some("xy".into()),
574+
Ok(Some("xyxyhi"))
575+
);
526576
test_lpad!(
527577
Some("josé".into()),
528578
ScalarValue::Int64(Some(10i64)),

0 commit comments

Comments
 (0)