Skip to content

Commit ace9cd4

Browse files
neilconway and martin-g authored
perf: Optimize trim UDFs for single-character trims (#20328)
## Which issue does this PR close?

- Closes #20327

## Rationale for this change

By default, btrim(), ltrim(), and rtrim() trim space characters; it is also reasonably common for queries to specify a non-default trim pattern that is still a single ASCII character. We can optimize for this case by doing a byte-level scan, rather than invoking the more heavyweight std::string machinery used for more complex trim scenarios.

## What changes are included in this PR?

Add a benchmark for trimming spaces, and implement the optimization described above. Also fixed an error in the documentation.

## Are these changes tested?

Yes, and benchmarked.

## Are there any user-facing changes?

No.

---------

Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent ba267ac commit ace9cd4

7 files changed

Lines changed: 224 additions & 29 deletions

File tree

datafusion/functions/benches/trim.rs

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,45 @@ fn create_args(
141141
]
142142
}
143143

144+
/// Create args for trim benchmark where space characters are being trimmed
145+
fn create_space_trim_args(
146+
size: usize,
147+
pad_len: usize,
148+
remaining_len: usize,
149+
string_array_type: StringArrayType,
150+
trim_type: TrimType,
151+
) -> Vec<ColumnarValue> {
152+
let rng = &mut StdRng::seed_from_u64(42);
153+
let spaces = " ".repeat(pad_len);
154+
155+
let string_iter = (0..size).map(|_| {
156+
if rng.random::<f32>() < 0.1 {
157+
None
158+
} else {
159+
let content: String = rng
160+
.sample_iter(&Alphanumeric)
161+
.take(remaining_len)
162+
.map(char::from)
163+
.collect();
164+
165+
let value = match trim_type {
166+
TrimType::Ltrim => format!("{spaces}{content}"),
167+
TrimType::Rtrim => format!("{content}{spaces}"),
168+
TrimType::Btrim => format!("{spaces}{content}{spaces}"),
169+
};
170+
Some(value)
171+
}
172+
});
173+
174+
let string_array: ArrayRef = match string_array_type {
175+
StringArrayType::Utf8View => Arc::new(string_iter.collect::<StringViewArray>()),
176+
StringArrayType::Utf8 => Arc::new(string_iter.collect::<StringArray>()),
177+
StringArrayType::LargeUtf8 => Arc::new(string_iter.collect::<LargeStringArray>()),
178+
};
179+
180+
vec![ColumnarValue::Array(string_array)]
181+
}
182+
144183
#[expect(clippy::too_many_arguments)]
145184
fn run_with_string_type<M: Measurement>(
146185
group: &mut BenchmarkGroup<'_, M>,
@@ -221,6 +260,60 @@ fn run_trim_benchmark(
221260
group.finish();
222261
}
223262

263+
#[expect(clippy::too_many_arguments)]
264+
fn run_space_trim_benchmark(
265+
c: &mut Criterion,
266+
group_name: &str,
267+
trim_func: &ScalarUDF,
268+
trim_type: TrimType,
269+
string_types: &[StringArrayType],
270+
size: usize,
271+
pad_len: usize,
272+
remaining_len: usize,
273+
) {
274+
let mut group = c.benchmark_group(group_name);
275+
group.sampling_mode(SamplingMode::Flat);
276+
group.sample_size(10);
277+
278+
let total_len = match trim_type {
279+
TrimType::Btrim => 2 * pad_len + remaining_len,
280+
_ => pad_len + remaining_len,
281+
};
282+
283+
for string_type in string_types {
284+
let args =
285+
create_space_trim_args(size, pad_len, remaining_len, *string_type, trim_type);
286+
let arg_fields = args
287+
.iter()
288+
.enumerate()
289+
.map(|(idx, arg)| {
290+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
291+
})
292+
.collect::<Vec<_>>();
293+
let config_options = Arc::new(ConfigOptions::default());
294+
295+
group.bench_function(
296+
format!(
297+
"{trim_type} {string_type} [size={size}, len={total_len}, pad={pad_len}]",
298+
),
299+
|b| {
300+
b.iter(|| {
301+
let args_cloned = args.clone();
302+
black_box(trim_func.invoke_with_args(ScalarFunctionArgs {
303+
args: args_cloned,
304+
arg_fields: arg_fields.clone(),
305+
number_rows: size,
306+
return_field: Field::new("f", DataType::Utf8, true).into(),
307+
config_options: Arc::clone(&config_options),
308+
}))
309+
})
310+
},
311+
);
312+
}
313+
314+
group.finish();
315+
}
316+
224317
fn criterion_benchmark(c: &mut Criterion) {
225318
let ltrim = string::ltrim();
226319
let rtrim = string::rtrim();
@@ -295,6 +388,45 @@ fn criterion_benchmark(c: &mut Criterion) {
295388
&trimmed,
296389
remaining_len,
297390
);
391+
392+
// Scenario 4: Trim spaces, short strings (len <= 12)
393+
// pad_len=4, remaining_len=8
394+
run_space_trim_benchmark(
395+
c,
396+
"trim spaces, short strings (len <= 12)",
397+
trim_func,
398+
*trim_type,
399+
&string_types,
400+
size,
401+
4,
402+
8,
403+
);
404+
405+
// Scenario 5: Trim spaces, long strings (len > 12)
406+
// pad_len=4, remaining_len=60
407+
run_space_trim_benchmark(
408+
c,
409+
"trim spaces, long strings",
410+
trim_func,
411+
*trim_type,
412+
&string_types,
413+
size,
414+
4,
415+
60,
416+
);
417+
418+
// Scenario 6: Trim spaces, long strings, heavy padding
419+
// pad_len=56, remaining_len=8
420+
run_space_trim_benchmark(
421+
c,
422+
"trim spaces, heavy padding",
423+
trim_func,
424+
*trim_type,
425+
&string_types,
426+
size,
427+
56,
428+
8,
429+
);
298430
}
299431
}
300432
}

datafusion/functions/src/string/btrim.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use datafusion_macros::user_doc;
3030
use std::any::Any;
3131
use std::sync::Arc;
3232

33-
/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed.
33+
/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, spaces are removed.
3434
/// btrim('xyxtrimyyx', 'xyz') = 'trim'
3535
fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
3636
let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -45,7 +45,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
4545

4646
#[user_doc(
4747
doc_section(label = "String Functions"),
48-
description = "Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string.",
48+
description = "Trims the specified trim string from the start and end of a string. If no trim string is provided, all spaces are removed from the start and end of the input string.",
4949
syntax_example = "btrim(str[, trim_str])",
5050
sql_example = r#"```sql
5151
> select btrim('__datafusion____', '_');
@@ -58,7 +58,7 @@ fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
5858
standard_argument(name = "str", prefix = "String"),
5959
argument(
6060
name = "trim_str",
61-
description = r"String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._"
61+
description = r"String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is a space._"
6262
),
6363
alternative_syntax = "trim(BOTH trim_str FROM str)",
6464
alternative_syntax = "trim(trim_str FROM str)",

datafusion/functions/src/string/common.rs

Lines changed: 68 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,22 @@ use datafusion_expr::ColumnarValue;
3838
/// from the beginning of the input string where the trimmed result starts.
3939
pub(crate) trait Trimmer {
4040
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32);
41+
42+
/// Optimized trim for a single ASCII byte.
43+
/// Uses byte-level scanning instead of char-level iteration.
44+
fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32);
45+
}
46+
47+
/// Returns the number of leading bytes matching `byte`
48+
#[inline]
49+
fn leading_bytes(bytes: &[u8], byte: u8) -> usize {
50+
bytes.iter().take_while(|&&b| b == byte).count()
51+
}
52+
53+
/// Returns the number of trailing bytes matching `byte`
54+
#[inline]
55+
fn trailing_bytes(bytes: &[u8], byte: u8) -> usize {
56+
bytes.iter().rev().take_while(|&&b| b == byte).count()
4157
}
4258

4359
/// Left trim - removes leading characters
@@ -46,10 +62,19 @@ pub(crate) struct TrimLeft;
4662
impl Trimmer for TrimLeft {
4763
#[inline]
4864
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
65+
if pattern.len() == 1 && pattern[0].is_ascii() {
66+
return Self::trim_ascii_char(input, pattern[0] as u8);
67+
}
4968
let trimmed = input.trim_start_matches(pattern);
5069
let offset = (input.len() - trimmed.len()) as u32;
5170
(trimmed, offset)
5271
}
72+
73+
#[inline]
74+
fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
75+
let start = leading_bytes(input.as_bytes(), byte);
76+
(&input[start..], start as u32)
77+
}
5378
}
5479

5580
/// Right trim - removes trailing characters
@@ -58,9 +83,19 @@ pub(crate) struct TrimRight;
5883
impl Trimmer for TrimRight {
5984
#[inline]
6085
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
86+
if pattern.len() == 1 && pattern[0].is_ascii() {
87+
return Self::trim_ascii_char(input, pattern[0] as u8);
88+
}
6189
let trimmed = input.trim_end_matches(pattern);
6290
(trimmed, 0)
6391
}
92+
93+
#[inline]
94+
fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
95+
let bytes = input.as_bytes();
96+
let end = bytes.len() - trailing_bytes(bytes, byte);
97+
(&input[..end], 0)
98+
}
6499
}
65100

66101
/// Both trim - removes both leading and trailing characters
@@ -69,11 +104,22 @@ pub(crate) struct TrimBoth;
69104
impl Trimmer for TrimBoth {
70105
#[inline]
71106
fn trim<'a>(input: &'a str, pattern: &[char]) -> (&'a str, u32) {
107+
if pattern.len() == 1 && pattern[0].is_ascii() {
108+
return Self::trim_ascii_char(input, pattern[0] as u8);
109+
}
72110
let left_trimmed = input.trim_start_matches(pattern);
73111
let offset = (input.len() - left_trimmed.len()) as u32;
74112
let trimmed = left_trimmed.trim_end_matches(pattern);
75113
(trimmed, offset)
76114
}
115+
116+
#[inline]
117+
fn trim_ascii_char(input: &str, byte: u8) -> (&str, u32) {
118+
let bytes = input.as_bytes();
119+
let start = leading_bytes(bytes, byte);
120+
let end = bytes.len() - trailing_bytes(&bytes[start..], byte);
121+
(&input[start..end], start as u32)
122+
}
77123
}
78124

79125
pub(crate) fn general_trim<T: OffsetSizeTrait, Tr: Trimmer>(
@@ -99,19 +145,24 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
99145

100146
match args.len() {
101147
1 => {
102-
// Default whitespace trim - pattern is just space
103-
let pattern = [' '];
148+
// Trim spaces by default
104149
for (src_str_opt, raw_view) in string_view_array
105150
.iter()
106151
.zip(string_view_array.views().iter())
107152
{
108-
trim_and_append_view::<Tr>(
109-
src_str_opt,
110-
&pattern,
111-
&mut views_buf,
112-
&mut null_builder,
113-
raw_view,
114-
);
153+
if let Some(src_str) = src_str_opt {
154+
let (trimmed, offset) = Tr::trim_ascii_char(src_str, b' ');
155+
make_and_append_view(
156+
&mut views_buf,
157+
&mut null_builder,
158+
raw_view,
159+
trimmed,
160+
offset,
161+
);
162+
} else {
163+
null_builder.append_null();
164+
views_buf.push(0);
165+
}
115166
}
116167
}
117168
2 => {
@@ -141,6 +192,7 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
141192
}
142193
} else {
143194
// Per-row pattern - must compute pattern chars for each row
195+
let mut pattern: Vec<char> = Vec::new();
144196
for ((src_str_opt, raw_view), characters_opt) in string_view_array
145197
.iter()
146198
.zip(string_view_array.views().iter())
@@ -149,7 +201,8 @@ fn string_view_trim<Tr: Trimmer>(args: &[ArrayRef]) -> Result<ArrayRef> {
149201
if let (Some(src_str), Some(characters)) =
150202
(src_str_opt, characters_opt)
151203
{
152-
let pattern: Vec<char> = characters.chars().collect();
204+
pattern.clear();
205+
pattern.extend(characters.chars());
153206
let (trimmed, offset) = Tr::trim(src_str, &pattern);
154207
make_and_append_view(
155208
&mut views_buf,
@@ -225,11 +278,10 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
225278

226279
match args.len() {
227280
1 => {
228-
// Default whitespace trim - pattern is just space
229-
let pattern = [' '];
281+
// Trim spaces by default
230282
let result = string_array
231283
.iter()
232-
.map(|string| string.map(|s| Tr::trim(s, &pattern).0))
284+
.map(|string| string.map(|s| Tr::trim_ascii_char(s, b' ').0))
233285
.collect::<GenericStringArray<T>>();
234286

235287
Ok(Arc::new(result) as ArrayRef)
@@ -255,12 +307,14 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
255307
}
256308

257309
// Per-row pattern - must compute pattern chars for each row
310+
let mut pattern: Vec<char> = Vec::new();
258311
let result = string_array
259312
.iter()
260313
.zip(characters_array.iter())
261314
.map(|(string, characters)| match (string, characters) {
262315
(Some(s), Some(c)) => {
263-
let pattern: Vec<char> = c.chars().collect();
316+
pattern.clear();
317+
pattern.extend(c.chars());
264318
Some(Tr::trim(s, &pattern).0)
265319
}
266320
_ => None,

datafusion/functions/src/string/ltrim.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ use datafusion_expr::{
3131
};
3232
use datafusion_macros::user_doc;
3333

34-
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
34+
/// Returns the longest string with leading characters removed. If the characters are not specified, spaces are removed.
3535
/// ltrim('zzzytest', 'xyz') = 'test'
3636
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
3737
let use_string_view = args[0].data_type() == &DataType::Utf8View;
@@ -46,7 +46,7 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
4646

4747
#[user_doc(
4848
doc_section(label = "String Functions"),
49-
description = "Trims the specified trim string from the beginning of a string. If no trim string is provided, all whitespace is removed from the start of the input string.",
49+
description = "Trims the specified trim string from the beginning of a string. If no trim string is provided, spaces are removed from the start of the input string.",
5050
syntax_example = "ltrim(str[, trim_str])",
5151
sql_example = r#"```sql
5252
> select ltrim(' datafusion ');
@@ -65,7 +65,7 @@ fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
6565
standard_argument(name = "str", prefix = "String"),
6666
argument(
6767
name = "trim_str",
68-
description = r"String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._"
68+
description = r"String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is a space._"
6969
),
7070
alternative_syntax = "trim(LEADING trim_str FROM str)",
7171
related_udf(name = "btrim"),

0 commit comments

Comments
 (0)