Skip to content

Commit 7cf63f1

Browse files
authored
Refactor ListArray hashing to consider only sliced values (#19500)
## Which issue does this PR close?

<!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. -->

N/A

## Rationale for this change

<!-- Why are you proposing this change? If this is already explained clearly in the issue then this section is not needed. Explaining clearly why changes are proposed helps reviewers understand your changes and offer better suggestions for fixes. -->

When hashing list arrays we hash all bytes of the child array, even if we slice to a certain range of values. Refactor to slice only the needed bytes; also do some general refactors.

## What changes are included in this PR?

<!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. -->

Refactor list array hashing.

## Are these changes tested?

<!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code. If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? -->

Added test.

## Are there any user-facing changes?

<!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. -->

No.

<!-- If there are any breaking changes to public APIs, please add the `api change` label. -->
1 parent cf76352 commit 7cf63f1

3 files changed

Lines changed: 56 additions & 12 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ half = { workspace = true }
7272
hashbrown = { workspace = true }
7373
hex = { workspace = true, optional = true }
7474
indexmap = { workspace = true }
75+
itertools = { workspace = true }
7576
libc = "0.2.180"
7677
log = { workspace = true }
7778
object_store = { workspace = true, optional = true }

datafusion/common/src/hash_utils.rs

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ use arrow::array::*;
2323
use arrow::datatypes::*;
2424
#[cfg(not(feature = "force_hash_collisions"))]
2525
use arrow::{downcast_dictionary_array, downcast_primitive_array};
26+
use itertools::Itertools;
2627

2728
#[cfg(not(feature = "force_hash_collisions"))]
2829
use crate::cast::{
@@ -514,24 +515,41 @@ fn hash_list_array<OffsetSize>(
514515
where
515516
OffsetSize: OffsetSizeTrait,
516517
{
517-
let values = array.values();
518-
let offsets = array.value_offsets();
519-
let nulls = array.nulls();
520-
let mut values_hashes = vec![0u64; values.len()];
521-
create_hashes([values], random_state, &mut values_hashes)?;
522-
if let Some(nulls) = nulls {
523-
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
524-
if nulls.is_valid(i) {
518+
// In case values is sliced, hash only the bytes used by the offsets of this ListArray
519+
let first_offset = array.value_offsets().first().cloned().unwrap_or_default();
520+
let last_offset = array.value_offsets().last().cloned().unwrap_or_default();
521+
let value_bytes_len = (last_offset - first_offset).as_usize();
522+
let mut values_hashes = vec![0u64; value_bytes_len];
523+
create_hashes(
524+
[array
525+
.values()
526+
.slice(first_offset.as_usize(), value_bytes_len)],
527+
random_state,
528+
&mut values_hashes,
529+
)?;
530+
531+
if array.null_count() > 0 {
532+
for (i, (start, stop)) in array.value_offsets().iter().tuple_windows().enumerate()
533+
{
534+
if array.is_valid(i) {
525535
let hash = &mut hashes_buffer[i];
526-
for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
536+
for values_hash in &values_hashes[(*start - first_offset).as_usize()
537+
..(*stop - first_offset).as_usize()]
538+
{
527539
*hash = combine_hashes(*hash, *values_hash);
528540
}
529541
}
530542
}
531543
} else {
532-
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
533-
let hash = &mut hashes_buffer[i];
534-
for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
544+
for ((start, stop), hash) in array
545+
.value_offsets()
546+
.iter()
547+
.tuple_windows()
548+
.zip(hashes_buffer.iter_mut())
549+
{
550+
for values_hash in &values_hashes
551+
[(*start - first_offset).as_usize()..(*stop - first_offset).as_usize()]
552+
{
535553
*hash = combine_hashes(*hash, *values_hash);
536554
}
537555
}
@@ -1176,6 +1194,30 @@ mod tests {
11761194
assert_eq!(hashes[1], hashes[6]); // null vs empty list
11771195
}
11781196

1197+
#[test]
1198+
#[cfg(not(feature = "force_hash_collisions"))]
1199+
fn create_hashes_for_sliced_list_arrays() {
1200+
let data = vec![
1201+
Some(vec![Some(0), Some(1), Some(2)]),
1202+
None,
1203+
// Slice from here
1204+
Some(vec![Some(3), None, Some(5)]),
1205+
Some(vec![Some(3), None, Some(5)]),
1206+
None,
1207+
// To here
1208+
Some(vec![Some(0), Some(1), Some(2)]),
1209+
Some(vec![]),
1210+
];
1211+
let list_array =
1212+
Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
1213+
let list_array = list_array.slice(2, 3);
1214+
let random_state = RandomState::with_seeds(0, 0, 0, 0);
1215+
let mut hashes = vec![0; list_array.len()];
1216+
create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
1217+
assert_eq!(hashes[0], hashes[1]);
1218+
assert_ne!(hashes[1], hashes[2]);
1219+
}
1220+
11791221
#[test]
11801222
// Tests actual values of hashes, which are different if forcing collisions
11811223
#[cfg(not(feature = "force_hash_collisions"))]

0 commit comments

Comments (0)