Skip to content

Commit e74e58f

Browse files
fix: move overflow guard before dense ratio in hash join to prevent overflows (#20998)
## Which issue does this PR close? - Closes #20995. ## Rationale for this change #20995 has details but it is very straightforward. `dense_ratio` calculation overflows since overflow guard is after not before ## What changes are included in this PR? Prevent hash join overflow and unit test for it ## Are these changes tested? Added a test case for both min and max scenario Co-authored-by: Matt Butrovich <mbutrovich@users.noreply.github.com>
1 parent 8142308 commit e74e58f

1 file changed

Lines changed: 43 additions & 9 deletions

File tree

  • datafusion/physical-plan/src/joins/hash_join

datafusion/physical-plan/src/joins/hash_join/exec.rs

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -155,23 +155,23 @@ fn try_create_array_map(
155155

156156
let range = ArrayMap::calculate_range(min_val, max_val);
157157
let num_row: usize = batches.iter().map(|x| x.num_rows()).sum();
158-
let dense_ratio = (num_row as f64) / ((range + 1) as f64);
159158

160159
// TODO: support create ArrayMap<u64>
161160
if num_row >= u32::MAX as usize {
162161
return Ok(None);
163162
}
164163

165-
if range >= perfect_hash_join_small_build_threshold as u64
166-
&& dense_ratio <= perfect_hash_join_min_key_density
167-
{
164+
// When the key range spans the full integer domain (e.g. i64::MIN to i64::MAX),
165+
// range is u64::MAX and `range + 1` below would overflow.
166+
if range == usize::MAX as u64 {
168167
return Ok(None);
169168
}
170169

171-
// If range equals usize::MAX, then range + 1 would overflow to 0, which would cause
172-
// ArrayMap to allocate an invalid zero-sized array or cause indexing issues.
173-
// This check prevents such overflow and ensures valid array allocation.
174-
if range == usize::MAX as u64 {
170+
let dense_ratio = (num_row as f64) / ((range + 1) as f64);
171+
172+
if range >= perfect_hash_join_small_build_threshold as u64
173+
&& dense_ratio <= perfect_hash_join_min_key_density
174+
{
175175
return Ok(None);
176176
}
177177

@@ -2142,7 +2142,9 @@ mod tests {
21422142
test::exec::MockExec,
21432143
};
21442144

2145-
use arrow::array::{Date32Array, Int32Array, StructArray, UInt32Array, UInt64Array};
2145+
use arrow::array::{
2146+
Date32Array, Int32Array, Int64Array, StructArray, UInt32Array, UInt64Array,
2147+
};
21462148
use arrow::buffer::NullBuffer;
21472149
use arrow::datatypes::{DataType, Field};
21482150
use arrow_schema::Schema;
@@ -5559,6 +5561,38 @@ mod tests {
55595561
Ok(())
55605562
}
55615563

5564+
#[tokio::test]
5565+
async fn test_perfect_hash_join_overflow_full_int64_range() -> Result<()> {
5566+
let task_ctx = prepare_task_ctx(8192, true);
5567+
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, true)]));
5568+
let batch = RecordBatch::try_new(
5569+
Arc::clone(&schema),
5570+
vec![Arc::new(Int64Array::from(vec![i64::MIN, i64::MAX]))],
5571+
)?;
5572+
let left = TestMemoryExec::try_new_exec(
5573+
&[vec![batch.clone()]],
5574+
Arc::clone(&schema),
5575+
None,
5576+
)?;
5577+
let right = TestMemoryExec::try_new_exec(&[vec![batch]], schema, None)?;
5578+
let on: JoinOn = vec![(
5579+
Arc::new(Column::new_with_schema("a", &left.schema())?) as _,
5580+
Arc::new(Column::new_with_schema("a", &right.schema())?) as _,
5581+
)];
5582+
let (_columns, batches, _metrics) = join_collect(
5583+
left,
5584+
right,
5585+
on,
5586+
&JoinType::Inner,
5587+
NullEquality::NullEqualsNothing,
5588+
task_ctx,
5589+
)
5590+
.await?;
5591+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
5592+
assert_eq!(total_rows, 2);
5593+
Ok(())
5594+
}
5595+
55625596
#[apply(hash_join_exec_configs)]
55635597
#[tokio::test]
55645598
async fn test_phj_null_equals_null_build_no_nulls_probe_has_nulls(

0 commit comments

Comments
 (0)