Skip to content

Commit 22b7e1a

Browse files
committed
fix: account for initial hash table allocation in ArrowBytesMap/ArrowBytesViewMap
ArrowBytesMap and ArrowBytesViewMap allocate their hash tables with HashTable::with_capacity(INITIAL_MAP_CAPACITY) but initialize map_size to 0. insert_accounted only tracks incremental growth beyond the current capacity, so the initial allocation is never counted and size() understates memory usage until the first resize. Initialize map_size with map.allocation_size() in both constructors, and add regression tests covering both map types. Closes #21248.
1 parent 067ba4b commit 22b7e1a

2 files changed

Lines changed: 86 additions & 4 deletions

File tree

datafusion/physical-expr-common/src/binary_map.rs

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,10 +244,12 @@ where
244244
V: Debug + PartialEq + Eq + Clone + Copy + Default,
245245
{
246246
pub fn new(output_type: OutputType) -> Self {
247+
let map = hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY);
248+
let map_size = map.allocation_size();
247249
Self {
248250
output_type,
249-
map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY),
250-
map_size: 0,
251+
map,
252+
map_size,
251253
buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY),
252254
offsets: vec![O::default()], // first offset is always 0
253255
random_state: RandomState::default(),
@@ -874,6 +876,47 @@ mod tests {
874876
assert!(size_after_values2 > total_strings1_len + total_strings2_len);
875877
}
876878

879+
/// Verify that `size()` accounts for the initial hash table allocation
880+
/// from `HashTable::with_capacity(INITIAL_MAP_CAPACITY)`.
881+
///
882+
/// Regression test: `map_size` was previously initialized to 0 despite
883+
/// the hash table pre-allocating memory, causing `size()` to undercount
884+
/// until the first hash table resize.
885+
#[test]
886+
fn test_size_accounts_for_initial_hash_table_allocation() {
887+
let set = ArrowBytesSet::<i32>::new(OutputType::Utf8);
888+
let initial_size = set.size();
889+
890+
// Compute the exact allocation for a HashTable with the same
891+
// capacity and entry type used by ArrowBytesMap
892+
let expected_ht_alloc =
893+
hashbrown::hash_table::HashTable::<Entry<i32, ()>>::with_capacity(
894+
INITIAL_MAP_CAPACITY,
895+
)
896+
.allocation_size();
897+
898+
// The hash table must allocate non-trivial memory for 128 entries
899+
assert!(
900+
expected_ht_alloc > 0,
901+
"hash table should allocate memory for {INITIAL_MAP_CAPACITY} entries"
902+
);
903+
904+
// size() = map_size + buffer_capacity + offsets + hashes_buffer
905+
// At creation the non-map components are:
906+
// buffer: INITIAL_BUFFER_CAPACITY bytes
907+
// offsets: vec with 1 element
908+
// hashes: empty vec (0 bytes)
909+
//
910+
// Before the fix (map_size=0), size() ≈ INITIAL_BUFFER_CAPACITY + sizeof(i32),
911+
// missing the hash table allocation entirely.
912+
// After the fix, size() includes expected_ht_alloc as well.
913+
assert!(
914+
initial_size >= expected_ht_alloc + INITIAL_BUFFER_CAPACITY,
915+
"size() ({initial_size}) should include both hash table allocation \
916+
({expected_ht_alloc}) and buffer capacity ({INITIAL_BUFFER_CAPACITY})"
917+
);
918+
}
919+
877920
#[test]
878921
fn test_map() {
879922
let input = vec![

datafusion/physical-expr-common/src/binary_view_map.rs

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,12 @@ where
155155
V: Debug + PartialEq + Eq + Clone + Copy + Default,
156156
{
157157
pub fn new(output_type: OutputType) -> Self {
158+
let map = hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY);
159+
let map_size = map.allocation_size();
158160
Self {
159161
output_type,
160-
map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY),
161-
map_size: 0,
162+
map,
163+
map_size,
162164
views: Vec::new(),
163165
in_progress: Vec::new(),
164166
completed: Vec::new(),
@@ -678,6 +680,43 @@ mod tests {
678680
assert_eq!(set.len(), 10);
679681
}
680682

683+
/// Verify that `size()` accounts for the initial hash table allocation
684+
/// from `HashTable::with_capacity(INITIAL_MAP_CAPACITY)`.
685+
///
686+
/// Regression test: `map_size` was previously initialized to 0 despite
687+
/// the hash table pre-allocating memory, causing `size()` to undercount
688+
/// until the first hash table resize.
689+
#[test]
690+
fn test_size_accounts_for_initial_hash_table_allocation() {
691+
let set = ArrowBytesViewSet::new(OutputType::Utf8View);
692+
let initial_size = set.size();
693+
694+
// Compute the exact allocation for a HashTable with the same
695+
// capacity and entry type used by ArrowBytesViewMap
696+
let expected_ht_alloc =
697+
hashbrown::hash_table::HashTable::<Entry<()>>::with_capacity(
698+
INITIAL_MAP_CAPACITY,
699+
)
700+
.allocation_size();
701+
702+
assert!(
703+
expected_ht_alloc > 0,
704+
"hash table should allocate memory for {INITIAL_MAP_CAPACITY} entries"
705+
);
706+
707+
// For ArrowBytesViewMap, all non-map components (views, in_progress,
708+
// completed, nulls, hashes_buffer) start empty, so:
709+
// size() = map_size + 0
710+
//
711+
// Before the fix (map_size=0), size() was 0 at creation,
712+
// missing the hash table allocation entirely.
713+
// After the fix, size() == expected_ht_alloc.
714+
assert!(
715+
initial_size >= expected_ht_alloc,
716+
"size() ({initial_size}) should include hash table allocation ({expected_ht_alloc})"
717+
);
718+
}
719+
681720
#[derive(Debug, PartialEq, Eq, Default, Clone, Copy)]
682721
struct TestPayload {
683722
// store the string value to check against input

0 commit comments

Comments
 (0)