Skip to content

Commit b83c0e0

Browse files
[branch-53] fix: move overflow guard before dense ratio in hash join to prevent overflows (apache#20998) (apache#21008)
(cherry picked from commit e74e58f) I confirmed that this fixed CometFuzzTestSuite "join", which found the original issue while testing `branch-53`. ## Which issue does this PR close? - Closes apache#20995. ## Rationale for this change apache#20995 has the details, but the issue is very straightforward: the `dense_ratio` calculation overflows because the overflow guard is placed after the calculation instead of before it. ## What changes are included in this PR? Moves the overflow guard before the `dense_ratio` calculation to prevent the hash join overflow, and adds a unit test for it - backports apache#20998 from @buraksenn ## Are these changes tested? Added a test case covering both the min and max scenarios. Co-authored-by: Burak Şen <buraksenb@gmail.com>
1 parent 28bb951 commit b83c0e0

1 file changed

Lines changed: 43 additions & 9 deletions

File tree

  • datafusion/physical-plan/src/joins/hash_join

datafusion/physical-plan/src/joins/hash_join/exec.rs

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -154,23 +154,23 @@ fn try_create_array_map(
154154

155155
let range = ArrayMap::calculate_range(min_val, max_val);
156156
let num_row: usize = batches.iter().map(|x| x.num_rows()).sum();
157-
let dense_ratio = (num_row as f64) / ((range + 1) as f64);
158157

159158
// TODO: support create ArrayMap<u64>
160159
if num_row >= u32::MAX as usize {
161160
return Ok(None);
162161
}
163162

164-
if range >= perfect_hash_join_small_build_threshold as u64
165-
&& dense_ratio <= perfect_hash_join_min_key_density
166-
{
163+
// When the key range spans the full integer domain (e.g. i64::MIN to i64::MAX),
164+
// range is u64::MAX and `range + 1` below would overflow.
165+
if range == usize::MAX as u64 {
167166
return Ok(None);
168167
}
169168

170-
// If range equals usize::MAX, then range + 1 would overflow to 0, which would cause
171-
// ArrayMap to allocate an invalid zero-sized array or cause indexing issues.
172-
// This check prevents such overflow and ensures valid array allocation.
173-
if range == usize::MAX as u64 {
169+
let dense_ratio = (num_row as f64) / ((range + 1) as f64);
170+
171+
if range >= perfect_hash_join_small_build_threshold as u64
172+
&& dense_ratio <= perfect_hash_join_min_key_density
173+
{
174174
return Ok(None);
175175
}
176176

@@ -2082,7 +2082,9 @@ mod tests {
20822082
test::exec::MockExec,
20832083
};
20842084

2085-
use arrow::array::{Date32Array, Int32Array, StructArray, UInt32Array, UInt64Array};
2085+
use arrow::array::{
2086+
Date32Array, Int32Array, Int64Array, StructArray, UInt32Array, UInt64Array,
2087+
};
20862088
use arrow::buffer::NullBuffer;
20872089
use arrow::datatypes::{DataType, Field};
20882090
use arrow_schema::Schema;
@@ -5499,6 +5501,38 @@ mod tests {
54995501
Ok(())
55005502
}
55015503

5504+
#[tokio::test]
5505+
async fn test_perfect_hash_join_overflow_full_int64_range() -> Result<()> {
5506+
let task_ctx = prepare_task_ctx(8192, true);
5507+
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
5508+
let batch = RecordBatch::try_new(
5509+
Arc::clone(&schema),
5510+
vec![Arc::new(Int64Array::from(vec![i64::MIN, i64::MAX]))],
5511+
)?;
5512+
let left = TestMemoryExec::try_new_exec(
5513+
&[vec![batch.clone()]],
5514+
Arc::clone(&schema),
5515+
None,
5516+
)?;
5517+
let right = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?;
5518+
let on: JoinOn = vec![(
5519+
Arc::new(Column::new_with_schema("a", &left.schema())?) as _,
5520+
Arc::new(Column::new_with_schema("a", &right.schema())?) as _,
5521+
)];
5522+
let (_columns, batches, _metrics) = join_collect(
5523+
left,
5524+
right,
5525+
on,
5526+
&JoinType::Inner,
5527+
NullEquality::NullEqualsNothing,
5528+
task_ctx,
5529+
)
5530+
.await?;
5531+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
5532+
assert_eq!(total_rows, 2);
5533+
Ok(())
5534+
}
5535+
55025536
#[apply(hash_join_exec_configs)]
55035537
#[tokio::test]
55045538
async fn test_phj_null_equals_null_build_no_nulls_probe_has_nulls(

0 commit comments

Comments
 (0)