Skip to content

Commit 39cbd35

Browse files
committed
more tests and less things that go wrong
1 parent 232a7bf commit 39cbd35

3 files changed

Lines changed: 93 additions & 15 deletions

File tree

datafusion/core/src/physical_planner.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1650,6 +1650,7 @@ impl DefaultPhysicalPlanner {
16501650
} else if session_state.config().target_partitions() > 1
16511651
&& session_state.config().repartition_joins()
16521652
&& !prefer_hash_join
1653+
&& !*null_aware
16531654
{
16541655
// Use SortMergeJoin if hash join is not preferred
16551656
let join_on_len = join_on.len();

datafusion/physical-plan/src/joins/sort_merge_join/bitwise_stream.rs

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,7 @@ use std::sync::Arc;
126126
use std::task::{Context, Poll};
127127

128128
use crate::RecordBatchStream;
129-
use crate::joins::utils::{
130-
JoinFilter, build_null_aware_left_mark_column, compare_join_arrays,
131-
};
129+
use crate::joins::utils::{JoinFilter, compare_join_arrays};
132130
use crate::metrics::{
133131
BaselineMetrics, Count, ExecutionPlanMetricsSet, Gauge, MetricBuilder,
134132
};

datafusion/sqllogictest/test_files/null_aware_mark_join.slt

Lines changed: 91 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,37 @@ physical_plan
7272
04)------DataSourceExec: partitions=1, partition_sizes=[1]
7373
05)------DataSourceExec: partitions=1, partition_sizes=[1]
7474

75+
query IT rowsort
76+
SELECT id, value
77+
FROM outer_table
78+
WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL;
79+
----
80+
1 a
81+
3 c
82+
4 d
83+
NULL e
84+
85+
query TT
86+
EXPLAIN
87+
SELECT value
88+
FROM outer_table
89+
WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL;
90+
----
91+
logical_plan
92+
01)Projection: outer_table.value
93+
02)--Filter: NOT __correlated_sq_1.mark IS NULL
94+
03)----Projection: outer_table.value, __correlated_sq_1.mark
95+
04)------LeftMark Join: outer_table.id = __correlated_sq_1.id
96+
05)--------TableScan: outer_table projection=[id, value]
97+
06)--------SubqueryAlias: __correlated_sq_1
98+
07)----------TableScan: inner_table_with_null projection=[id]
99+
physical_plan
100+
01)FilterExec: NOT mark@1 IS NULL, projection=[value@0]
101+
02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
102+
03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2]
103+
04)------DataSourceExec: partitions=1, partition_sizes=[1]
104+
05)------DataSourceExec: partitions=1, partition_sizes=[1]
105+
75106
query T rowsort
76107
SELECT value
77108
FROM outer_table
@@ -121,8 +152,30 @@ c
121152
d
122153
e
123154

155+
query TT
156+
EXPLAIN
157+
SELECT value
158+
FROM outer_table
159+
WHERE (id NOT IN (SELECT id FROM empty_table)) IS TRUE;
160+
----
161+
logical_plan
162+
01)Projection: outer_table.value
163+
02)--Filter: NOT __correlated_sq_1.mark IS TRUE
164+
03)----Projection: outer_table.value, __correlated_sq_1.mark
165+
04)------LeftMark Join: outer_table.id = __correlated_sq_1.id
166+
05)--------TableScan: outer_table projection=[id, value]
167+
06)--------SubqueryAlias: __correlated_sq_1
168+
07)----------TableScan: empty_table projection=[id]
169+
physical_plan
170+
01)FilterExec: NOT mark@1 IS NOT DISTINCT FROM true, projection=[value@0]
171+
02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
172+
03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2]
173+
04)------DataSourceExec: partitions=1, partition_sizes=[1]
174+
05)------DataSourceExec: partitions=1, partition_sizes=[0]
175+
124176
###################################
125177
## Sort-merge join null-aware mark
178+
As of this work, sort-merge join actually don't support null-aware semantics, so they still end up using a hash-join.
126179
###################################
127180

128181
statement ok
@@ -149,12 +202,20 @@ logical_plan
149202
06)--------TableScan: inner_table_with_null projection=[id]
150203
physical_plan
151204
01)FilterExec: NOT mark@2 IS NULL, projection=[id@0, value@1]
152-
02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true
153-
03)----SortMergeJoinExec: join_type=LeftMark, on=[(id@0, id@0)]
154-
04)------SortExec: expr=[id@0 ASC], preserve_partitioning=[false]
155-
05)--------DataSourceExec: partitions=1, partition_sizes=[1]
156-
06)------SortExec: expr=[id@0 ASC], preserve_partitioning=[false]
157-
07)--------DataSourceExec: partitions=1, partition_sizes=[1]
205+
02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
206+
03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)]
207+
04)------DataSourceExec: partitions=1, partition_sizes=[1]
208+
05)------DataSourceExec: partitions=1, partition_sizes=[1]
209+
210+
query IT rowsort
211+
SELECT id, value
212+
FROM outer_table
213+
WHERE (id NOT IN (SELECT id FROM inner_table_with_null)) IS NULL;
214+
----
215+
1 a
216+
3 c
217+
4 d
218+
NULL e
158219

159220
query TT
160221
EXPLAIN
@@ -173,12 +234,9 @@ logical_plan
173234
physical_plan
174235
01)FilterExec: NOT mark@1 IS NULL, projection=[value@0]
175236
02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
176-
03)----ProjectionExec: expr=[value@1 as value, mark@2 as mark]
177-
04)------SortMergeJoinExec: join_type=LeftMark, on=[(id@0, id@0)]
178-
05)--------SortExec: expr=[id@0 ASC], preserve_partitioning=[false]
179-
06)----------DataSourceExec: partitions=1, partition_sizes=[1]
180-
07)--------SortExec: expr=[id@0 ASC], preserve_partitioning=[false]
181-
08)----------DataSourceExec: partitions=1, partition_sizes=[1]
237+
03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2]
238+
04)------DataSourceExec: partitions=1, partition_sizes=[1]
239+
05)------DataSourceExec: partitions=1, partition_sizes=[1]
182240

183241
query T rowsort
184242
SELECT value
@@ -197,6 +255,27 @@ WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL;
197255
----
198256
e
199257

258+
query TT
259+
EXPLAIN
260+
SELECT value
261+
FROM outer_table
262+
WHERE (id NOT IN (SELECT id FROM inner_table_no_null)) IS NULL;
263+
----
264+
logical_plan
265+
01)Projection: outer_table.value
266+
02)--Filter: NOT __correlated_sq_1.mark IS NULL
267+
03)----Projection: outer_table.value, __correlated_sq_1.mark
268+
04)------LeftMark Join: outer_table.id = __correlated_sq_1.id
269+
05)--------TableScan: outer_table projection=[id, value]
270+
06)--------SubqueryAlias: __correlated_sq_1
271+
07)----------TableScan: inner_table_no_null projection=[id]
272+
physical_plan
273+
01)FilterExec: NOT mark@1 IS NULL, projection=[value@0]
274+
02)--RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
275+
03)----HashJoinExec: mode=CollectLeft, join_type=LeftMark, on=[(id@0, id@0)], projection=[value@1, mark@2]
276+
04)------DataSourceExec: partitions=1, partition_sizes=[1]
277+
05)------DataSourceExec: partitions=1, partition_sizes=[1]
278+
200279
####################################
201280
## Nested loop mark join with NULLs
202281
####################################

0 commit comments

Comments
 (0)