Skip to content

Commit 766dff1

Browse files
authored
chore: Rename concat-specific string builders, make pub(crate) (#21695)
## Which issue does this PR close? - Part of #21684 but does not close it. ## Rationale for this change Rename the three builders in `datafusion/functions/src/strings.rs` to make their special-purpose nature explicit: - `StringArrayBuilder` -> `ConcatStringBuilder` - `LargeStringArrayBuilder` -> `ConcatLargeStringBuilder` - `StringViewArrayBuilder` -> `ConcatStringViewBuilder` These builders are used only by `concat` and `concat_ws`, and their APIs are specific enough to the needs of `concat`-like callers that it seems unlikely that other call-sites will emerge in the future. This also frees the previous names of these builders to be used by a more broadly useful string builder API. Also make these types `pub(crate)`, instead of `pub`; there is no good reason to make them part of the public API. This is a breaking API change. ## What changes are included in this PR? * Rename types * Make types `pub(crate)`, not `pub` * Add section to v54 migration guide ## Are these changes tested? Yes. ## Are there any user-facing changes? Yes, breaking API change -- although these types should probably not have been part of the public API to begin with, and we already made a breaking API change to them as part of other work in v54 (#21538)
1 parent 83c2c01 commit 766dff1

4 files changed

Lines changed: 44 additions & 28 deletions

File tree

datafusion/functions/src/string/concat.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ use std::sync::Arc;
2222

2323
use crate::string::concat;
2424
use crate::strings::{
25-
ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
25+
ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder,
26+
ConcatStringViewBuilder,
2627
};
2728
use datafusion_common::cast::{as_binary_array, as_string_array, as_string_view_array};
2829
use datafusion_common::{
@@ -242,7 +243,7 @@ impl ScalarUDFImpl for ConcatFunc {
242243

243244
match return_datatype {
244245
DataType::Utf8 => {
245-
let mut builder = StringArrayBuilder::with_capacity(len, data_size);
246+
let mut builder = ConcatStringBuilder::with_capacity(len, data_size);
246247
for i in 0..len {
247248
columns
248249
.iter()
@@ -254,7 +255,7 @@ impl ScalarUDFImpl for ConcatFunc {
254255
Ok(ColumnarValue::Array(Arc::new(string_array)))
255256
}
256257
DataType::Utf8View => {
257-
let mut builder = StringViewArrayBuilder::with_capacity(len, data_size);
258+
let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size);
258259
for i in 0..len {
259260
columns
260261
.iter()
@@ -266,7 +267,7 @@ impl ScalarUDFImpl for ConcatFunc {
266267
Ok(ColumnarValue::Array(Arc::new(string_array)))
267268
}
268269
DataType::LargeUtf8 => {
269-
let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size);
270+
let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size);
270271
for i in 0..len {
271272
columns
272273
.iter()

datafusion/functions/src/string/concat_ws.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ use crate::string::concat;
2424
use crate::string::concat::simplify_concat;
2525
use crate::string::concat_ws;
2626
use crate::strings::{
27-
ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
27+
ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder,
28+
ConcatStringViewBuilder,
2829
};
2930
use datafusion_common::cast::{
3031
as_large_string_array, as_string_array, as_string_view_array,
@@ -311,7 +312,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
311312

312313
match return_datatype {
313314
DataType::Utf8View => {
314-
let mut builder = StringViewArrayBuilder::with_capacity(len, data_size);
315+
let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size);
315316
for i in 0..len {
316317
if !sep.is_valid(i) {
317318
builder.append_offset()?;
@@ -332,7 +333,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
332333
Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?)))
333334
}
334335
DataType::LargeUtf8 => {
335-
let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size);
336+
let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size);
336337
for i in 0..len {
337338
if !sep.is_valid(i) {
338339
builder.append_offset()?;
@@ -353,7 +354,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
353354
Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?)))
354355
}
355356
_ => {
356-
let mut builder = StringArrayBuilder::with_capacity(len, data_size);
357+
let mut builder = ConcatStringBuilder::with_capacity(len, data_size);
357358
for i in 0..len {
358359
if !sep.is_valid(i) {
359360
builder.append_offset()?;

datafusion/functions/src/strings.rs

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,20 @@ use arrow::array::{
2626
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer};
2727
use arrow::datatypes::DataType;
2828

29-
/// Optimized version of the StringBuilder in Arrow that:
30-
/// 1. Precalculating the expected length of the result, avoiding reallocations.
31-
/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
32-
pub struct StringArrayBuilder {
29+
/// Builder used by `concat`/`concat_ws` to assemble a [`StringArray`] one row
30+
/// at a time from multiple input columns.
31+
///
32+
/// Each row is written via repeated `write` calls, followed by a single
33+
/// `append_offset` call to commit the row. The output null buffer is supplied
34+
/// by the caller at `finish` time.
35+
pub(crate) struct ConcatStringBuilder {
3336
offsets_buffer: MutableBuffer,
3437
value_buffer: MutableBuffer,
3538
/// If true, a safety check is required during the `finish` call
3639
tainted: bool,
3740
}
3841

39-
impl StringArrayBuilder {
42+
impl ConcatStringBuilder {
4043
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
4144
let capacity = item_capacity
4245
.checked_add(1)
@@ -151,21 +154,21 @@ impl StringArrayBuilder {
151154
}
152155
}
153156

154-
/// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs
155-
/// on a row-by-row basis, the caller should provide nulls when calling
156-
/// [`finish`](Self::finish). This allows callers to compute nulls more
157-
/// efficiently (e.g., via bulk bitmap operations).
157+
/// Builder used by `concat`/`concat_ws` to assemble a [`StringViewArray`] one
158+
/// row at a time from multiple input columns.
158159
///
159-
/// [`StringViewBuilder`]: arrow::array::StringViewBuilder
160-
pub struct StringViewArrayBuilder {
160+
/// Each row is written via repeated `write` calls, followed by a single
161+
/// `append_offset` call to commit the row as a single string view. The output
162+
/// null buffer is supplied by the caller at `finish` time.
163+
pub(crate) struct ConcatStringViewBuilder {
161164
views: Vec<u128>,
162165
data: Vec<u8>,
163166
block: Vec<u8>,
164167
/// If true, a safety check is required during the `append_offset` call
165168
tainted: bool,
166169
}
167170

168-
impl StringViewArrayBuilder {
171+
impl ConcatStringViewBuilder {
169172
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
170173
Self {
171174
views: Vec::with_capacity(item_capacity),
@@ -286,14 +289,17 @@ impl StringViewArrayBuilder {
286289
}
287290
}
288291

289-
pub struct LargeStringArrayBuilder {
292+
/// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one
293+
/// row at a time from multiple input columns. See [`ConcatStringBuilder`] for
294+
/// details on the row-composition contract.
295+
pub(crate) struct ConcatLargeStringBuilder {
290296
offsets_buffer: MutableBuffer,
291297
value_buffer: MutableBuffer,
292298
/// If true, a safety check is required during the `finish` call
293299
tainted: bool,
294300
}
295301

296-
impl LargeStringArrayBuilder {
302+
impl ConcatLargeStringBuilder {
297303
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
298304
let capacity = item_capacity
299305
.checked_add(1)
@@ -426,7 +432,7 @@ impl LargeStringArrayBuilder {
426432
/// LLVM is apparently overly eager to inline this function into some hot loops,
427433
/// which bloats them and regresses performance, so we disable inlining for now.
428434
#[inline(never)]
429-
pub fn append_view(
435+
pub(crate) fn append_view(
430436
views_buffer: &mut Vec<u128>,
431437
original_view: &u128,
432438
substr: &str,
@@ -447,7 +453,7 @@ pub fn append_view(
447453
}
448454

449455
#[derive(Debug)]
450-
pub enum ColumnarValueRef<'a> {
456+
pub(crate) enum ColumnarValueRef<'a> {
451457
Scalar(&'a [u8]),
452458
NullableArray(&'a StringArray),
453459
NonNullableArray(&'a StringArray),
@@ -497,13 +503,13 @@ mod tests {
497503

498504
#[test]
499505
#[should_panic(expected = "capacity integer overflow")]
500-
fn test_overflow_string_array_builder() {
501-
let _builder = StringArrayBuilder::with_capacity(usize::MAX, usize::MAX);
506+
fn test_overflow_concat_string_builder() {
507+
let _builder = ConcatStringBuilder::with_capacity(usize::MAX, usize::MAX);
502508
}
503509

504510
#[test]
505511
#[should_panic(expected = "capacity integer overflow")]
506-
fn test_overflow_large_string_array_builder() {
507-
let _builder = LargeStringArrayBuilder::with_capacity(usize::MAX, usize::MAX);
512+
fn test_overflow_concat_large_string_builder() {
513+
let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX);
508514
}
509515
}

docs/source/library-user-guide/upgrading/54.0.0.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,14 @@ The difference is only observable for strings containing combining characters
320320
clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text,
321321
behavior is unchanged.
322322

323+
### Items in `datafusion_functions::strings` are no longer public
324+
325+
`StringArrayBuilder`, `LargeStringArrayBuilder`, `StringViewArrayBuilder`,
326+
`ColumnarValueRef`, and `append_view` have been reduced to `pub(crate)`. They
327+
were only ever used to implement `concat` and `concat_ws` inside the crate. If
328+
you were importing them externally, use Arrow's corresponding builders with a
329+
caller-computed `NullBuffer`.
330+
323331
[#17861]: https://github.com/apache/datafusion/pull/17861
324332

325333
### `approx_percentile_cont`, `approx_percentile_cont_with_weight`, `approx_median` now coerce to floats

0 commit comments

Comments
 (0)