-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Skip probe-side consumption when hash join build side is empty #21068
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
39c22f5
8e4c6eb
dcb4cd9
072cbea
30f2a3e
626f771
45ee6d1
12a52ef
48cdea9
4d6367b
c48f212
f0681be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -855,6 +855,22 @@ pub(crate) fn need_produce_result_in_final(join_type: JoinType) -> bool { | |
| ) | ||
| } | ||
|
|
||
| /// Returns true when an empty build side necessarily produces an empty result. | ||
| /// | ||
| /// This is the shared source of truth for both state-machine short-circuiting | ||
| /// and `build_batch_empty_build_side`. | ||
| pub(crate) fn empty_build_side_produces_empty_result(join_type: JoinType) -> bool { | ||
| matches!( | ||
| join_type, | ||
| JoinType::Inner | ||
| | JoinType::Left | ||
| | JoinType::LeftSemi | ||
| | JoinType::LeftAnti | ||
| | JoinType::LeftMark | ||
| | JoinType::RightSemi | ||
| ) | ||
| } | ||
|
|
||
| pub(crate) fn get_final_indices_from_shared_bitmap( | ||
| shared_bitmap: &SharedBitmapBuilder, | ||
| join_type: JoinType, | ||
|
|
@@ -1060,47 +1076,35 @@ pub(crate) fn build_batch_empty_build_side( | |
| column_indices: &[ColumnIndex], | ||
| join_type: JoinType, | ||
| ) -> Result<RecordBatch> { | ||
| match join_type { | ||
| // these join types only return data if the left side is not empty, so we return an | ||
| // empty RecordBatch | ||
| JoinType::Inner | ||
| | JoinType::Left | ||
| | JoinType::LeftSemi | ||
| | JoinType::RightSemi | ||
| | JoinType::LeftAnti | ||
| | JoinType::LeftMark => Ok(RecordBatch::new_empty(Arc::new(schema.clone()))), | ||
| if empty_build_side_produces_empty_result(join_type) { | ||
| // These join types only return data if the left side is not empty. | ||
| return Ok(RecordBatch::new_empty(Arc::new(schema.clone()))); | ||
| } | ||
|
Comment on lines
+1063
to
+1066
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: should we keep this? this is technically unreachable no? 🤔
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @LiaCastaneda But
|
||
|
|
||
| // the remaining joins will return data for the right columns and null for the left ones | ||
| JoinType::Right | JoinType::Full | JoinType::RightAnti | JoinType::RightMark => { | ||
| let num_rows = probe_batch.num_rows(); | ||
| if schema.fields().is_empty() { | ||
| return new_empty_schema_batch(schema, num_rows); | ||
| } | ||
| let mut columns: Vec<Arc<dyn Array>> = | ||
| Vec::with_capacity(schema.fields().len()); | ||
|
|
||
| for column_index in column_indices { | ||
| let array = match column_index.side { | ||
| // left -> null array | ||
| JoinSide::Left => new_null_array( | ||
| build_batch.column(column_index.index).data_type(), | ||
| num_rows, | ||
| ), | ||
| // right -> respective right array | ||
| JoinSide::Right => Arc::clone(probe_batch.column(column_index.index)), | ||
| // right mark -> unset boolean array as there are no matches on the left side | ||
| JoinSide::None => Arc::new(BooleanArray::new( | ||
| BooleanBuffer::new_unset(num_rows), | ||
| None, | ||
| )), | ||
| }; | ||
| // The remaining joins return right-side rows and nulls for the left side. | ||
| let num_rows = probe_batch.num_rows(); | ||
| if schema.fields().is_empty() { | ||
| return new_empty_schema_batch(schema, num_rows); | ||
| } | ||
|
|
||
| columns.push(array); | ||
| let columns = column_indices | ||
| .iter() | ||
| .map(|column_index| match column_index.side { | ||
| // left -> null array | ||
| JoinSide::Left => new_null_array( | ||
| build_batch.column(column_index.index).data_type(), | ||
| num_rows, | ||
| ), | ||
| // right -> respective right array | ||
| JoinSide::Right => Arc::clone(probe_batch.column(column_index.index)), | ||
| // right mark -> unset boolean array as there are no matches on the left side | ||
| JoinSide::None => { | ||
| Arc::new(BooleanArray::new(BooleanBuffer::new_unset(num_rows), None)) | ||
| } | ||
| }) | ||
| .collect(); | ||
|
|
||
| Ok(RecordBatch::try_new(Arc::new(schema.clone()), columns)?) | ||
| } | ||
| } | ||
| Ok(RecordBatch::try_new(Arc::new(schema.clone()), columns)?) | ||
| } | ||
|
|
||
| /// The input is the matched indices for left and right and | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe a method on
JoinType?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree this reads more like join semantics than a hash-join utility. Moving it onto
JoinTypeas something likeJoinType::empty_build_side_produces_empty_result()would make the call sites a bit clearer and keep the rule next to the other join-type predicates.