Skip to content

Commit b23dc9b

Browse files
committed
test: add regression tests for initial hash table allocation tracking
Verify that ArrowBytesMap::size() and ArrowBytesViewMap::size() account for the hash table memory pre-allocated by HashTable::with_capacity(). Both tests fail without the fix (map_size: 0) and pass with it.
1 parent 0673088 commit b23dc9b

2 files changed

Lines changed: 78 additions & 0 deletions

File tree

datafusion/physical-expr-common/src/binary_map.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,47 @@ mod tests {
876876
assert!(size_after_values2 > total_strings1_len + total_strings2_len);
877877
}
878878

879+
/// Verify that `size()` accounts for the initial hash table allocation
880+
/// from `HashTable::with_capacity(INITIAL_MAP_CAPACITY)`.
881+
///
882+
/// Regression test: `map_size` was previously initialized to 0 despite
883+
/// the hash table pre-allocating memory, causing `size()` to undercount
884+
/// until the first hash table resize.
885+
#[test]
886+
fn test_size_accounts_for_initial_hash_table_allocation() {
887+
let set = ArrowBytesSet::<i32>::new(OutputType::Utf8);
888+
let initial_size = set.size();
889+
890+
// Compute the exact allocation for a HashTable with the same
891+
// capacity and entry type used by ArrowBytesMap
892+
let expected_ht_alloc =
893+
hashbrown::hash_table::HashTable::<Entry<i32, ()>>::with_capacity(
894+
INITIAL_MAP_CAPACITY,
895+
)
896+
.allocation_size();
897+
898+
// The hash table must allocate non-trivial memory for 128 entries
899+
assert!(
900+
expected_ht_alloc > 0,
901+
"hash table should allocate memory for {INITIAL_MAP_CAPACITY} entries"
902+
);
903+
904+
// size() = map_size + buffer_capacity + offsets + hashes_buffer
905+
// At creation the non-map components are:
906+
// buffer: INITIAL_BUFFER_CAPACITY bytes
907+
// offsets: vec with 1 element
908+
// hashes: empty vec (0 bytes)
909+
//
910+
// Before the fix (map_size=0), size() ≈ INITIAL_BUFFER_CAPACITY + sizeof(i32),
911+
// missing the hash table allocation entirely.
912+
// After the fix, size() includes expected_ht_alloc as well.
913+
assert!(
914+
initial_size >= expected_ht_alloc + INITIAL_BUFFER_CAPACITY,
915+
"size() ({initial_size}) should include both hash table allocation \
916+
({expected_ht_alloc}) and buffer capacity ({INITIAL_BUFFER_CAPACITY})"
917+
);
918+
}
919+
879920
#[test]
880921
fn test_map() {
881922
let input = vec![

datafusion/physical-expr-common/src/binary_view_map.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -680,6 +680,43 @@ mod tests {
680680
assert_eq!(set.len(), 10);
681681
}
682682

683+
/// Verify that `size()` accounts for the initial hash table allocation
684+
/// from `HashTable::with_capacity(INITIAL_MAP_CAPACITY)`.
685+
///
686+
/// Regression test: `map_size` was previously initialized to 0 despite
687+
/// the hash table pre-allocating memory, causing `size()` to undercount
688+
/// until the first hash table resize.
689+
#[test]
690+
fn test_size_accounts_for_initial_hash_table_allocation() {
691+
let set = ArrowBytesViewSet::new(OutputType::Utf8View);
692+
let initial_size = set.size();
693+
694+
// Compute the exact allocation for a HashTable with the same
695+
// capacity and entry type used by ArrowBytesViewMap
696+
let expected_ht_alloc =
697+
hashbrown::hash_table::HashTable::<Entry<()>>::with_capacity(
698+
INITIAL_MAP_CAPACITY,
699+
)
700+
.allocation_size();
701+
702+
assert!(
703+
expected_ht_alloc > 0,
704+
"hash table should allocate memory for {INITIAL_MAP_CAPACITY} entries"
705+
);
706+
707+
// For ArrowBytesViewMap, all non-map components (views, in_progress,
708+
// completed, nulls, hashes_buffer) start empty, so:
709+
// size() = map_size + 0
710+
//
711+
// Before the fix (map_size=0), size() was 0 at creation,
712+
// missing the hash table allocation entirely.
713+
// After the fix, size() == expected_ht_alloc.
714+
assert!(
715+
initial_size >= expected_ht_alloc,
716+
"size() ({initial_size}) should include hash table allocation ({expected_ht_alloc})"
717+
);
718+
}
719+
683720
#[derive(Debug, PartialEq, Eq, Default, Clone, Copy)]
684721
struct TestPayload {
685722
// store the string value to check against input

0 commit comments

Comments
 (0)