Skip to content

Commit 62ad66b

Browse files
neilconwayalamb
andauthored
perf: Use bulk-NULL builder in chr (#21847)
## Which issue does this PR close? - Closes #21846. ## Rationale for this change Optimize `chr` by avoiding per-row NULL bitmap maintenance, and also split the hot loop to avoid taking a branch when no NULL bitmap is given. Benchmarks: ``` chr/array: 3.8768 µs → 3.1548 µs, −18.57% (`p < 0.05`) ``` ## What changes are included in this PR? * Optimize `chr` to reduce NULL-handling overhead ## Are these changes tested? Yes. ## Are there any user-facing changes? No. --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 2676297 commit 62ad66b

1 file changed

Lines changed: 40 additions & 18 deletions

File tree

  • datafusion/functions/src/string

datafusion/functions/src/string/chr.rs

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717

1818
use std::sync::Arc;
1919

20-
use arrow::array::{ArrayRef, GenericStringBuilder, Int64Array};
20+
use arrow::array::{Array, ArrayRef, Int64Array};
2121
use arrow::datatypes::DataType;
2222
use arrow::datatypes::DataType::Int64;
2323
use arrow::datatypes::DataType::Utf8;
2424

25+
use crate::strings::GenericStringArrayBuilder;
2526
use datafusion_common::cast::as_int64_array;
2627
use datafusion_common::utils::take_function_args;
2728
use datafusion_common::{Result, ScalarValue, exec_err, internal_err};
@@ -32,31 +33,46 @@ use datafusion_macros::user_doc;
3233
/// Returns the character with the given code.
3334
/// chr(65) = 'A'
3435
fn chr_array(integer_array: &Int64Array) -> Result<ArrayRef> {
35-
let mut builder = GenericStringBuilder::<i32>::with_capacity(
36-
integer_array.len(),
37-
// 1 byte per character, assuming that is the common case
38-
integer_array.len(),
36+
let len = integer_array.len();
37+
let mut builder = GenericStringArrayBuilder::<i32>::with_capacity(
38+
len, // 1 byte per character, assuming that is the common case
39+
len,
3940
);
4041

4142
let mut buf = [0u8; 4];
43+
let nulls = integer_array.nulls();
4244

43-
for integer in integer_array {
44-
match integer {
45-
Some(integer) => {
46-
if let Ok(u) = u32::try_from(integer)
47-
&& let Some(c) = core::char::from_u32(u)
48-
{
49-
builder.append_value(c.encode_utf8(&mut buf));
50-
continue;
51-
}
52-
53-
return exec_err!("invalid Unicode scalar value: {integer}");
45+
if let Some(n) = nulls {
46+
for i in 0..len {
47+
if n.is_null(i) {
48+
builder.append_placeholder();
49+
continue;
5450
}
55-
None => builder.append_null(),
51+
// SAFETY: bounds + null check above.
52+
let integer = unsafe { integer_array.value_unchecked(i) };
53+
if let Ok(u) = u32::try_from(integer)
54+
&& let Some(c) = core::char::from_u32(u)
55+
{
56+
builder.append_value(c.encode_utf8(&mut buf));
57+
continue;
58+
}
59+
return exec_err!("invalid Unicode scalar value: {integer}");
60+
}
61+
} else {
62+
for i in 0..len {
63+
// SAFETY: no null buffer means every index is valid.
64+
let integer = unsafe { integer_array.value_unchecked(i) };
65+
if let Ok(u) = u32::try_from(integer)
66+
&& let Some(c) = core::char::from_u32(u)
67+
{
68+
builder.append_value(c.encode_utf8(&mut buf));
69+
continue;
70+
}
71+
return exec_err!("invalid Unicode scalar value: {integer}");
5672
}
5773
}
5874

59-
Ok(Arc::new(builder.finish()) as ArrayRef)
75+
Ok(Arc::new(builder.finish(nulls.cloned())?) as ArrayRef)
6076
}
6177

6278
#[user_doc(
@@ -198,7 +214,13 @@ mod tests {
198214
];
199215

200216
assert_eq!(string_array.len(), expected.len());
217+
assert_eq!(string_array.null_count(), 1);
218+
assert!(string_array.is_null(7));
201219
for (i, e) in expected.iter().enumerate() {
220+
if i == 7 {
221+
continue;
222+
}
223+
assert!(!string_array.is_null(i));
202224
assert_eq!(string_array.value(i), *e);
203225
}
204226
}

0 commit comments

Comments
 (0)