Skip to content

Commit 40b209e

Browse files
akoshchiyalamb
andauthored
feat: remove __unnest_placeholder from struct unnest projection (#21725)
## Which issue does this PR close? <!-- We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. For example `Closes #123` indicates that this PR will close issue #123. --> - Closes #16894 ## What changes are included in this PR? - User-visible struct-UNNEST columns no longer expose `__unnest_placeholder`. - Internal Unnest planning still uses placeholders. - This adds a final Projection layer in explain plans wherever struct UNNEST output is published. <!-- There is no need to duplicate the description in the issue here but it is sometimes worth providing a summary of the individual changes in this PR. --> ## Are these changes tested? Yes. <!-- We typically require tests for all PRs in order to: 1. Prevent the code from being accidentally broken by subsequent changes 2. Serve as another way to document the expected behavior of the code If tests are not included in your PR, please explain why (for example, are they covered by existing tests)? --> ## Are there any user-facing changes? Yes. The aliases of unnested columns will be changed. <!-- If there are user-facing changes then we may require documentation to be updated before approving the PR. --> <!-- If there are any breaking changes to public APIs, please add the `api change` label. --> --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 62ad66b commit 40b209e

5 files changed

Lines changed: 106 additions & 51 deletions

File tree

datafusion/sql/src/select.rs

Lines changed: 58 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,17 @@ use crate::query::to_order_by_exprs_with_select;
2424
use crate::utils::{
2525
CheckColumnsMustReferenceAggregatePurpose, CheckColumnsSatisfyExprsPurpose,
2626
check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs,
27-
resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up,
27+
resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnest_bottom_up,
28+
rewrite_recursive_unnests_bottom_up,
2829
};
2930

31+
use arrow::datatypes::DataType;
3032
use datafusion_common::error::DataFusionErrorBuilder;
3133
use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
3234
use datafusion_common::{Column, DFSchema, DFSchemaRef, Result, not_impl_err, plan_err};
3335
use datafusion_common::{RecursionUnnestOption, UnnestOptions};
36+
use datafusion_expr::ExprSchemable;
37+
use datafusion_expr::builder::get_struct_unnested_columns;
3438
use datafusion_expr::expr::{PlannedReplaceSelectItem, WildcardOptions};
3539
use datafusion_expr::expr_rewriter::{
3640
normalize_col, normalize_col_with_schemas_and_ambiguity_check, normalize_sorts,
@@ -463,15 +467,30 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
463467

464468
// expr returned here maybe different from the originals in inner_projection_exprs
465469
// for example:
466-
// - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2
467-
// - unnest(array_col) will be transformed into unnest(array_col).element
468-
// - unnest(array_col) + 1 will be transformed into unnest(array_col).element +1
469-
let outer_projection_exprs = rewrite_recursive_unnests_bottom_up(
470-
&intermediate_plan,
471-
&mut unnest_columns,
472-
&mut inner_projection_exprs,
473-
&intermediate_select_exprs,
474-
)?;
470+
// - unnest(struct_col) will be transformed into struct_col.field1, struct_col.field2
471+
// - unnest(array_col) will be transformed into array_col.element
472+
// - unnest(array_col) + 1 will be transformed into array_col.element +1
473+
let mut outer_projection_exprs = vec![];
474+
for expr in &intermediate_select_exprs {
475+
let mut rewritten_exprs = rewrite_recursive_unnest_bottom_up(
476+
&intermediate_plan,
477+
&mut unnest_columns,
478+
&mut inner_projection_exprs,
479+
expr,
480+
)?;
481+
482+
if let Some(columns) =
483+
self.get_struct_unnest_columns(&intermediate_plan, expr)?
484+
{
485+
rewritten_exprs = rewritten_exprs
486+
.into_iter()
487+
.zip(columns)
488+
.map(|(expr, column)| expr.alias(column.flat_name()))
489+
.collect();
490+
}
491+
492+
outer_projection_exprs.extend(rewritten_exprs);
493+
}
475494

476495
// No more unnest is possible
477496
if unnest_columns.is_empty() {
@@ -516,6 +535,35 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
516535
.build()
517536
}
518537

538+
fn get_struct_unnest_columns(
539+
&self,
540+
input: &LogicalPlan,
541+
expr: &Expr,
542+
) -> Result<Option<Vec<Column>>> {
543+
let unnest_expr = match expr {
544+
Expr::Unnest(unnest_expr) => Some(unnest_expr),
545+
Expr::Alias(alias) => match alias.expr.as_ref() {
546+
Expr::Unnest(unnest_expr) => Some(unnest_expr),
547+
_ => None,
548+
},
549+
_ => None,
550+
};
551+
552+
let Some(unnest_expr) = unnest_expr else {
553+
return Ok(None);
554+
};
555+
556+
let field = unnest_expr.expr.to_field(input.schema())?.1;
557+
let DataType::Struct(inner_fields) = field.data_type() else {
558+
return Ok(None);
559+
};
560+
561+
Ok(Some(get_struct_unnested_columns(
562+
&unnest_expr.expr.schema_name().to_string(),
563+
inner_fields,
564+
)))
565+
}
566+
519567
fn try_process_aggregate_unnest(&self, input: LogicalPlan) -> Result<LogicalPlan> {
520568
match input {
521569
// Fast path if there are no unnest in group by

datafusion/sql/tests/cases/plan_to_sql.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -988,7 +988,7 @@ fn test_unnest_logical_plan() -> Result<()> {
988988
assert_snapshot!(
989989
plan,
990990
@r"
991-
Projection: __unnest_placeholder(unnest_table.struct_col).field1, __unnest_placeholder(unnest_table.struct_col).field2, __unnest_placeholder(unnest_table.array_col,depth=1) AS UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
991+
Projection: __unnest_placeholder(unnest_table.struct_col).field1 AS unnest_table.struct_col.field1, __unnest_placeholder(unnest_table.struct_col).field2 AS unnest_table.struct_col.field2, __unnest_placeholder(unnest_table.array_col,depth=1) AS UNNEST(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
992992
Unnest: lists[__unnest_placeholder(unnest_table.array_col)|depth=1] structs[__unnest_placeholder(unnest_table.struct_col)]
993993
Projection: unnest_table.struct_col AS __unnest_placeholder(unnest_table.struct_col), unnest_table.array_col AS __unnest_placeholder(unnest_table.array_col), unnest_table.struct_col, unnest_table.array_col
994994
TableScan: unnest_table

datafusion/sqllogictest/test_files/push_down_filter_unnest.slt

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -130,19 +130,20 @@ statement ok
130130
CREATE TABLE d AS VALUES (named_struct('a', 1, 'b', 2)), (named_struct('a', 3, 'b', 4)), (named_struct('a', 5, 'b', 6));
131131

132132
query II
133-
select * from (select unnest(column1) from d) where "__unnest_placeholder(d.column1).b" > 5;
133+
select * from (select unnest(column1) from d) where "d.column1.b" > 5;
134134
----
135135
5 6
136136

137137
query TT
138-
explain select * from (select unnest(column1) from d) where "__unnest_placeholder(d.column1).b" > 5;
138+
explain select * from (select unnest(column1) from d) where "d.column1.b" > 5;
139139
----
140140
physical_plan
141-
01)FilterExec: __unnest_placeholder(d.column1).b@1 > 5
142-
02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
143-
03)----UnnestExec
144-
04)------ProjectionExec: expr=[column1@0 as __unnest_placeholder(d.column1)]
145-
05)--------DataSourceExec: partitions=1, partition_sizes=[1]
141+
01)ProjectionExec: expr=[__unnest_placeholder(d.column1).a@0 as d.column1.a, __unnest_placeholder(d.column1).b@1 as d.column1.b]
142+
02)--FilterExec: __unnest_placeholder(d.column1).b@1 > 5
143+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
144+
04)------UnnestExec
145+
05)--------ProjectionExec: expr=[column1@0 as __unnest_placeholder(d.column1)]
146+
06)----------DataSourceExec: partitions=1, partition_sizes=[1]
146147

147148
statement ok
148149
drop table d;

datafusion/sqllogictest/test_files/unnest.slt

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@ select unnest(struct(1,2,3)) as ignored_alias;
6868
query TTT
6969
describe select unnest(struct(1,2,3)) as ignored_alias;
7070
----
71-
__unnest_placeholder(struct(Int64(1),Int64(2),Int64(3))).c0 Int64 YES
72-
__unnest_placeholder(struct(Int64(1),Int64(2),Int64(3))).c1 Int64 YES
73-
__unnest_placeholder(struct(Int64(1),Int64(2),Int64(3))).c2 Int64 YES
71+
struct(Int64(1),Int64(2),Int64(3)).c0 Int64 YES
72+
struct(Int64(1),Int64(2),Int64(3)).c1 Int64 YES
73+
struct(Int64(1),Int64(2),Int64(3)).c2 Int64 YES
7474

7575
## Basic unnest list expression in from clause
7676
query I
@@ -608,18 +608,20 @@ query TT
608608
explain select unnest(unnest(column3)), column3 from recursive_unnest_table;
609609
----
610610
logical_plan
611-
01)Unnest: lists[] structs[__unnest_placeholder(UNNEST(recursive_unnest_table.column3))]
612-
02)--Projection: __unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)), recursive_unnest_table.column3
613-
03)----Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[]
614-
04)------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3
615-
05)--------TableScan: recursive_unnest_table projection=[column3]
611+
01)Projection: __unnest_placeholder(UNNEST(recursive_unnest_table.column3)).c0 AS UNNEST(recursive_unnest_table.column3).c0, __unnest_placeholder(UNNEST(recursive_unnest_table.column3)).c1 AS UNNEST(recursive_unnest_table.column3).c1, recursive_unnest_table.column3
612+
02)--Unnest: lists[] structs[__unnest_placeholder(UNNEST(recursive_unnest_table.column3))]
613+
03)----Projection: __unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)), recursive_unnest_table.column3
614+
04)------Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[]
615+
05)--------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3
616+
06)----------TableScan: recursive_unnest_table projection=[column3]
616617
physical_plan
617-
01)UnnestExec
618-
02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
619-
03)----ProjectionExec: expr=[__unnest_placeholder(recursive_unnest_table.column3,depth=1)@0 as __unnest_placeholder(UNNEST(recursive_unnest_table.column3)), column3@1 as column3]
620-
04)------UnnestExec
621-
05)--------ProjectionExec: expr=[column3@0 as __unnest_placeholder(recursive_unnest_table.column3), column3@0 as column3]
622-
06)----------DataSourceExec: partitions=1, partition_sizes=[1]
618+
01)ProjectionExec: expr=[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)).c0@0 as UNNEST(recursive_unnest_table.column3).c0, __unnest_placeholder(UNNEST(recursive_unnest_table.column3)).c1@1 as UNNEST(recursive_unnest_table.column3).c1, column3@2 as column3]
619+
02)--UnnestExec
620+
03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
621+
04)------ProjectionExec: expr=[__unnest_placeholder(recursive_unnest_table.column3,depth=1)@0 as __unnest_placeholder(UNNEST(recursive_unnest_table.column3)), column3@1 as column3]
622+
05)--------UnnestExec
623+
06)----------ProjectionExec: expr=[column3@0 as __unnest_placeholder(recursive_unnest_table.column3), column3@0 as column3]
624+
07)------------DataSourceExec: partitions=1, partition_sizes=[1]
623625

624626
## unnest->field_access->unnest->unnest
625627
query I?
@@ -824,9 +826,9 @@ d e {c0: f}
824826
query TTT
825827
describe select unnest(column1) c1 from nested_unnest_table;
826828
----
827-
__unnest_placeholder(nested_unnest_table.column1).c0 Utf8 YES
828-
__unnest_placeholder(nested_unnest_table.column1).c1 Utf8 YES
829-
__unnest_placeholder(nested_unnest_table.column1).c2 Struct("c0": Utf8) YES
829+
nested_unnest_table.column1.c0 Utf8 YES
830+
nested_unnest_table.column1.c1 Utf8 YES
831+
nested_unnest_table.column1.c2 Struct("c0": Utf8) YES
830832

831833
query II??I??
832834
select unnest(column5), * from unnest_table;
@@ -1098,15 +1100,17 @@ EXPLAIN WITH unnested AS (
10981100
) SELECT * FROM unnested order by 1;
10991101
----
11001102
logical_plan
1101-
01)Sort: unnested.__unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0 ASC NULLS LAST
1103+
01)Sort: unnested.struct(t.column1,t.column2,t.column3).c0 ASC NULLS LAST
11021104
02)--SubqueryAlias: unnested
1103-
03)----Unnest: lists[] structs[__unnest_placeholder(struct(t.column1,t.column2,t.column3))]
1104-
04)------Projection: struct(t.column1, t.column2, t.column3) AS __unnest_placeholder(struct(t.column1,t.column2,t.column3))
1105-
05)--------TableScan: t projection=[column1, column2, column3]
1105+
03)----Projection: __unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0 AS struct(t.column1,t.column2,t.column3).c0, __unnest_placeholder(struct(t.column1,t.column2,t.column3)).c1 AS struct(t.column1,t.column2,t.column3).c1, __unnest_placeholder(struct(t.column1,t.column2,t.column3)).c2 AS struct(t.column1,t.column2,t.column3).c2
1106+
04)------Unnest: lists[] structs[__unnest_placeholder(struct(t.column1,t.column2,t.column3))]
1107+
05)--------Projection: struct(t.column1, t.column2, t.column3) AS __unnest_placeholder(struct(t.column1,t.column2,t.column3))
1108+
06)----------TableScan: t projection=[column1, column2, column3]
11061109
physical_plan
1107-
01)SortExec: expr=[__unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0@0 ASC NULLS LAST], preserve_partitioning=[false]
1108-
02)--UnnestExec
1109-
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_tuples.parquet]]}, projection=[struct(column1@0, column2@1, column3@2) as __unnest_placeholder(struct(t.column1,t.column2,t.column3))], file_type=parquet
1110+
01)SortExec: expr=[struct(t.column1,t.column2,t.column3).c0@0 ASC NULLS LAST], preserve_partitioning=[false]
1111+
02)--ProjectionExec: expr=[__unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0@0 as struct(t.column1,t.column2,t.column3).c0, __unnest_placeholder(struct(t.column1,t.column2,t.column3)).c1@1 as struct(t.column1,t.column2,t.column3).c1, __unnest_placeholder(struct(t.column1,t.column2,t.column3)).c2@2 as struct(t.column1,t.column2,t.column3).c2]
1112+
03)----UnnestExec
1113+
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_tuples.parquet]]}, projection=[struct(column1@0, column2@1, column3@2) as __unnest_placeholder(struct(t.column1,t.column2,t.column3))], file_type=parquet
11101114

11111115
# cleanup
11121116
statement ok
@@ -1152,12 +1156,14 @@ EXPLAIN SELECT UNNEST(column1), column2 FROM t ORDER BY column2;
11521156
----
11531157
logical_plan
11541158
01)Sort: t.column2 ASC NULLS LAST
1155-
02)--Unnest: lists[] structs[__unnest_placeholder(t.column1)]
1156-
03)----Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2
1157-
04)------TableScan: t projection=[column1, column2]
1159+
02)--Projection: __unnest_placeholder(t.column1).s1 AS t.column1.s1, __unnest_placeholder(t.column1).s2 AS t.column1.s2, __unnest_placeholder(t.column1).s3 AS t.column1.s3, t.column2
1160+
03)----Unnest: lists[] structs[__unnest_placeholder(t.column1)]
1161+
04)------Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2
1162+
05)--------TableScan: t projection=[column1, column2]
11581163
physical_plan
1159-
01)UnnestExec
1160-
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet
1164+
01)ProjectionExec: expr=[__unnest_placeholder(t.column1).s1@0 as t.column1.s1, __unnest_placeholder(t.column1).s2@1 as t.column1.s2, __unnest_placeholder(t.column1).s3@2 as t.column1.s3, column2@3 as column2]
1165+
02)--UnnestExec
1166+
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet
11611167

11621168
# cleanup
11631169
statement ok
@@ -1220,12 +1226,12 @@ EXPLAIN SELECT UNNEST(UNNEST(column1)), UNNEST(column2), UNNEST(column3), column
12201226
----
12211227
logical_plan
12221228
01)Sort: t.column4 ASC NULLS LAST
1223-
02)--Projection: __unnest_placeholder(t.column1,depth=2) AS UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1) AS UNNEST(t.column2), __unnest_placeholder(t.column3).s1, __unnest_placeholder(t.column3).s2, __unnest_placeholder(t.column3).s3, t.column4
1229+
02)--Projection: __unnest_placeholder(t.column1,depth=2) AS UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1) AS UNNEST(t.column2), __unnest_placeholder(t.column3).s1 AS t.column3.s1, __unnest_placeholder(t.column3).s2 AS t.column3.s2, __unnest_placeholder(t.column3).s3 AS t.column3.s3, t.column4
12241230
03)----Unnest: lists[__unnest_placeholder(t.column1)|depth=2, __unnest_placeholder(t.column2)|depth=1] structs[__unnest_placeholder(t.column3)]
12251231
04)------Projection: t.column1 AS __unnest_placeholder(t.column1), t.column2 AS __unnest_placeholder(t.column2), t.column3 AS __unnest_placeholder(t.column3), t.column4
12261232
05)--------TableScan: t projection=[column1, column2, column3, column4]
12271233
physical_plan
1228-
01)ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=2)@0 as UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1)@1 as UNNEST(t.column2), __unnest_placeholder(t.column3).s1@2 as __unnest_placeholder(t.column3).s1, __unnest_placeholder(t.column3).s2@3 as __unnest_placeholder(t.column3).s2, __unnest_placeholder(t.column3).s3@4 as __unnest_placeholder(t.column3).s3, column4@5 as column4]
1234+
01)ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=2)@0 as UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1)@1 as UNNEST(t.column2), __unnest_placeholder(t.column3).s1@2 as t.column3.s1, __unnest_placeholder(t.column3).s2@3 as t.column3.s2, __unnest_placeholder(t.column3).s3@4 as t.column3.s3, column4@5 as column4]
12291235
02)--UnnestExec
12301236
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct_arrays.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2@1 as __unnest_placeholder(t.column2), column3@2 as __unnest_placeholder(t.column3), column4], output_ordering=[column4@3 ASC NULLS LAST], file_type=parquet
12311237

docs/source/user-guide/sql/special_functions.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ Expands an array or map into rows.
6969
### `unnest (struct)`
7070

7171
Expand a struct fields into individual columns.
72-
Each field of the struct will be prefixed with `__unnest_placeholder` and could be accessed via `"__unnest_placeholder(<struct>).<field>"`.
72+
Each field of the struct can be accessed via `"<table>.<struct>.<field>"`.
7373

7474
#### Arguments
7575

@@ -93,7 +93,7 @@ Each field of the struct will be prefixed with `__unnest_placeholder` and could
9393

9494
> select unnest(struct_column) from foov;
9595
+--------------------------------------------+--------------------------------------------+
96-
| __unnest_placeholder(foov.struct_column).a | __unnest_placeholder(foov.struct_column).b |
96+
| foov.struct_column.a | foov.struct_column.b |
9797
+--------------------------------------------+--------------------------------------------+
9898
| 5 | a string |
9999
| 6 | another string |

0 commit comments

Comments
 (0)