Skip to content

Commit f0de02f

Browse files
etk18, Eesh Sagar Singh, and Dandandan
authored
Use BooleanBufferBuilder rather than Vec<bool> in ArrowBytesViewMap (#20064)
## Which issue does this PR close?

Closes #20053

## Rationale for this change

`ArrowBytesViewMap` previously used `Vec<bool>` to track null values. This PR replaces it with Arrow's `NullBufferBuilder` (a validity-bitmap builder built on top of `BooleanBufferBuilder`), which is significantly more memory-efficient and faster, and aligns with Apache Arrow best practices for building validity bitmaps.

This change improves performance and memory usage without changing behavior.

## What changed

- Replaced `Vec<bool>` with `NullBufferBuilder` for null tracking
- Updated null buffer construction to use Arrow-native buffers
- Kept ordering and semantics unchanged

## Tests

- `cargo test -p datafusion-physical-expr-common`

> Note: Full workspace tests require `protoc`; this PR was validated with the affected crate tests.

https://github.com/user-attachments/assets/88b1af34-a905-43bd-b9e9-065858d0781d

---------

Co-authored-by: Eesh Sagar Singh <etk18@Eeshs-MacBook-Air.local>
Co-authored-by: Daniël Heres <danielheres@gmail.com>
1 parent 7c3ea05 commit f0de02f

1 file changed

Lines changed: 8 additions & 16 deletions

File tree

datafusion/physical-expr-common/src/binary_view_map.rs

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,10 @@
1919
//! `StringViewArray`/`BinaryViewArray`.
2020
use crate::binary_map::OutputType;
2121
use ahash::RandomState;
22+
use arrow::array::NullBufferBuilder;
2223
use arrow::array::cast::AsArray;
2324
use arrow::array::{Array, ArrayRef, BinaryViewArray, ByteView, make_view};
24-
use arrow::buffer::{Buffer, NullBuffer, ScalarBuffer};
25+
use arrow::buffer::{Buffer, ScalarBuffer};
2526
use arrow::datatypes::{BinaryViewType, ByteViewType, DataType, StringViewType};
2627
use datafusion_common::hash_utils::create_hashes;
2728
use datafusion_common::utils::proxy::{HashTableAllocExt, VecAllocExt};
@@ -134,7 +135,7 @@ where
134135
/// Completed buffers containing string data
135136
completed: Vec<Buffer>,
136137
/// Tracks which views are null, one entry per view, recorded via
/// `append_null` / `append_non_null`
137-
nulls: Vec<bool>,
138+
nulls: NullBufferBuilder,
138139

139140
/// random state used to generate hashes
140141
random_state: RandomState,
@@ -161,7 +162,7 @@ where
161162
views: Vec::new(),
162163
in_progress: Vec::new(),
163164
completed: Vec::new(),
164-
nulls: Vec::new(),
165+
nulls: NullBufferBuilder::new(0),
165166
random_state: RandomState::new(),
166167
hashes_buffer: vec![],
167168
null: None,
@@ -281,7 +282,7 @@ where
281282
let payload = make_payload_fn(None);
282283
let null_index = self.views.len();
283284
self.views.push(0);
284-
self.nulls.push(true);
285+
self.nulls.append_null();
285286
self.null = Some((payload, null_index));
286287
payload
287288
};
@@ -371,16 +372,7 @@ where
371372
}
372373

373374
// Build null buffer if we have any nulls
374-
let null_buffer = if self.nulls.iter().any(|&is_null| is_null) {
375-
Some(NullBuffer::from(
376-
self.nulls
377-
.iter()
378-
.map(|&is_null| !is_null)
379-
.collect::<Vec<_>>(),
380-
))
381-
} else {
382-
None
383-
};
375+
let null_buffer = self.nulls.finish();
384376

385377
let views = ScalarBuffer::from(self.views);
386378
let array =
@@ -420,7 +412,7 @@ where
420412
};
421413

422414
self.views.push(view);
423-
self.nulls.push(false);
415+
self.nulls.append_non_null();
424416
view
425417
}
426418

@@ -445,7 +437,7 @@ where
445437
let views_size = self.views.len() * size_of::<u128>();
446438
let in_progress_size = self.in_progress.capacity();
447439
let completed_size: usize = self.completed.iter().map(|b| b.len()).sum();
448-
let nulls_size = self.nulls.len();
440+
let nulls_size = self.nulls.len() / 8;
449441

450442
self.map_size
451443
+ views_size

0 commit comments

Comments
 (0)