Skip to content

Commit db4927c

Browse files
committed
Support duplicate column names in SQL projections (fix #6543)
Add a deduplication pass in the SQL planner that auto-suffixes duplicate expression names with :{count} before projection, so queries like SELECT x, x or TPC-DS Q39 no longer error. The fix is scoped to the SQL path only. The Rust API (LogicalPlanBuilder::project) still rejects duplicates via validate_unique_names, keeping optimizer invariants intact.
1 parent 646d183 commit db4927c

6 files changed

Lines changed: 269 additions & 35 deletions

File tree

datafusion/sql/src/select.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
2323
use crate::query::to_order_by_exprs_with_select;
2424
use crate::utils::{
2525
CheckColumnsMustReferenceAggregatePurpose, CheckColumnsSatisfyExprsPurpose,
26-
check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs,
27-
resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up,
26+
check_columns_satisfy_exprs, deduplicate_select_expr_names, extract_aliases,
27+
rebase_expr, resolve_aliases_to_exprs, resolve_columns, resolve_positions_to_exprs,
28+
rewrite_recursive_unnests_bottom_up,
2829
};
2930

3031
use datafusion_common::error::DataFusionErrorBuilder;
@@ -109,6 +110,10 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
109110
planner_context,
110111
)?;
111112

113+
// Auto-suffix duplicate expression names (e.g. cov, cov → cov, cov:1)
114+
// before projection so that the unique-name constraint is satisfied.
115+
let select_exprs = deduplicate_select_expr_names(select_exprs);
116+
112117
// Having and group by clause may reference aliases defined in select projection
113118
let projected_plan = self.project(base_plan.clone(), select_exprs)?;
114119
let select_exprs = projected_plan.expressions();

datafusion/sql/src/utils.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ use datafusion_expr::expr::{
3434
Alias, GroupingSet, Unnest, WindowFunction, WindowFunctionParams,
3535
};
3636
use datafusion_expr::utils::{expr_as_column_expr, find_column_exprs};
37+
use datafusion_expr::select_expr::SelectExpr;
3738
use datafusion_expr::{
3839
ColumnUnnestList, Expr, ExprSchemable, LogicalPlan, col, expr_vec_fmt,
3940
};
@@ -633,6 +634,42 @@ fn push_projection_dedupl(projection: &mut Vec<Expr>, expr: Expr) {
633634
projection.push(expr);
634635
}
635636
}
637+
638+
/// Auto-suffix duplicate SELECT expression names with `:{count}`.
639+
///
640+
/// The first occurrence keeps its original name so that ORDER BY / HAVING
641+
/// references resolve correctly. Wildcards are left untouched because they
642+
/// are expanded later in `project_with_validation`.
643+
///
644+
/// Duplicates are detected by the schema name of each expression, which
645+
/// identifies logically identical expressions before column normalization.
646+
pub(crate) fn deduplicate_select_expr_names(
647+
exprs: Vec<SelectExpr>,
648+
) -> Vec<SelectExpr> {
649+
let mut seen: HashMap<String, usize> = HashMap::new();
650+
exprs
651+
.into_iter()
652+
.map(|select_expr| match select_expr {
653+
SelectExpr::Expression(expr) => {
654+
let name = expr.schema_name().to_string();
655+
let count = seen.entry(name.clone()).or_insert(0);
656+
let result = if *count > 0 {
657+
let (_qualifier, field_name) = expr.qualified_name();
658+
SelectExpr::Expression(
659+
expr.alias(format!("{field_name}:{count}")),
660+
)
661+
} else {
662+
SelectExpr::Expression(expr)
663+
};
664+
*count += 1;
665+
result
666+
}
667+
// Leave wildcards alone — they are expanded later
668+
other => other,
669+
})
670+
.collect()
671+
}
672+
636673
/// The context is we want to rewrite unnest() into InnerProjection->Unnest->OuterProjection
637674
/// Given an expression which contains unnest expr as one of its children,
638675
/// Try transform depends on unnest type

datafusion/sql/tests/sql_integration.rs

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -790,11 +790,13 @@ fn select_column_does_not_exist() {
790790
#[test]
791791
fn select_repeated_column() {
792792
let sql = "SELECT age, age FROM person";
793-
let err = logical_plan(sql).expect_err("query should have failed");
794-
793+
let plan = logical_plan(sql).unwrap();
795794
assert_snapshot!(
796-
err.strip_backtrace(),
797-
@r#"Error during planning: Projections require unique expression names but the expression "person.age" at position 0 and "person.age" at position 1 have the same name. Consider aliasing ("AS") one of them."#
795+
plan,
796+
@r"
797+
Projection: person.age, person.age AS age:1
798+
TableScan: person
799+
"
798800
);
799801
}
800802

@@ -1532,11 +1534,14 @@ fn select_simple_aggregate_column_does_not_exist() {
15321534
#[test]
15331535
fn select_simple_aggregate_repeated_aggregate() {
15341536
let sql = "SELECT MIN(age), MIN(age) FROM person";
1535-
let err = logical_plan(sql).expect_err("query should have failed");
1536-
1537+
let plan = logical_plan(sql).unwrap();
15371538
assert_snapshot!(
1538-
err.strip_backtrace(),
1539-
@r#"Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 0 and "min(person.age)" at position 1 have the same name. Consider aliasing ("AS") one of them."#
1539+
plan,
1540+
@r"
1541+
Projection: min(person.age), min(person.age) AS min(person.age):1
1542+
Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
1543+
TableScan: person
1544+
"
15401545
);
15411546
}
15421547

@@ -1585,11 +1590,14 @@ fn select_from_typed_string_values() {
15851590
#[test]
15861591
fn select_simple_aggregate_repeated_aggregate_with_repeated_aliases() {
15871592
let sql = "SELECT MIN(age) AS a, MIN(age) AS a FROM person";
1588-
let err = logical_plan(sql).expect_err("query should have failed");
1589-
1593+
let plan = logical_plan(sql).unwrap();
15901594
assert_snapshot!(
1591-
err.strip_backtrace(),
1592-
@r#"Error during planning: Projections require unique expression names but the expression "min(person.age) AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them."#
1595+
plan,
1596+
@r"
1597+
Projection: min(person.age) AS a, min(person.age) AS a AS a:1
1598+
Aggregate: groupBy=[[]], aggr=[[min(person.age)]]
1599+
TableScan: person
1600+
"
15931601
);
15941602
}
15951603

@@ -1626,11 +1634,14 @@ fn select_simple_aggregate_with_groupby_with_aliases() {
16261634
#[test]
16271635
fn select_simple_aggregate_with_groupby_with_aliases_repeated() {
16281636
let sql = "SELECT state AS a, MIN(age) AS a FROM person GROUP BY state";
1629-
let err = logical_plan(sql).expect_err("query should have failed");
1630-
1637+
let plan = logical_plan(sql).unwrap();
16311638
assert_snapshot!(
1632-
err.strip_backtrace(),
1633-
@r#"Error during planning: Projections require unique expression names but the expression "person.state AS a" at position 0 and "min(person.age) AS a" at position 1 have the same name. Consider aliasing ("AS") one of them."#
1639+
plan,
1640+
@r"
1641+
Projection: person.state AS a, min(person.age) AS a AS a:1
1642+
Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
1643+
TableScan: person
1644+
"
16341645
);
16351646
}
16361647

@@ -1751,11 +1762,14 @@ fn select_simple_aggregate_with_groupby_can_use_alias() {
17511762
#[test]
17521763
fn select_simple_aggregate_with_groupby_aggregate_repeated() {
17531764
let sql = "SELECT state, MIN(age), MIN(age) FROM person GROUP BY state";
1754-
let err = logical_plan(sql).expect_err("query should have failed");
1755-
1765+
let plan = logical_plan(sql).unwrap();
17561766
assert_snapshot!(
1757-
err.strip_backtrace(),
1758-
@r#"Error during planning: Projections require unique expression names but the expression "min(person.age)" at position 1 and "min(person.age)" at position 2 have the same name. Consider aliasing ("AS") one of them."#
1767+
plan,
1768+
@r"
1769+
Projection: person.state, min(person.age), min(person.age) AS min(person.age):1
1770+
Aggregate: groupBy=[[person.state]], aggr=[[min(person.age)]]
1771+
TableScan: person
1772+
"
17591773
);
17601774
}
17611775

datafusion/sqllogictest/test_files/aggregate.slt

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7941,21 +7941,16 @@ select count(), count() * count() from t;
79417941
----
79427942
2 4
79437943

7944-
# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\)\)" at position 0 and "count\(Int64\(1\)\)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\.
7945-
query error
7944+
# Duplicate aggregate expressions are now auto-suffixed
7945+
query II
79467946
select count(1), count(1) from t;
7947+
----
7948+
2 2
79477949

7948-
# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\)\)" at position 0 and "count\(Int64\(1\)\)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\.
7949-
query error
7950-
explain select count(1), count(1) from t;
7951-
7952-
# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\) AS \)" at position 0 and "count\(Int64\(1\) AS \)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\.
7953-
query error
7950+
query II
79547951
select count(), count() from t;
7955-
7956-
# DataFusion error: Error during planning: Projections require unique expression names but the expression "count\(Int64\(1\) AS \)" at position 0 and "count\(Int64\(1\) AS \)" at position 1 have the same name\. Consider aliasing \("AS"\) one of them\.
7957-
query error
7958-
explain select count(), count() from t;
7952+
----
7953+
2 2
79597954

79607955
query II
79617956
select count(1), count(2) from t;
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Tests for duplicate column names/aliases in projections.
19+
# DataFusion auto-suffixes duplicates with :{count} (e.g. cov, cov:1).
20+
21+
# Setup
22+
statement ok
23+
CREATE TABLE t(x INT, y INT) AS VALUES (1, 2), (3, 4);
24+
25+
#
26+
# Basic duplicate alias
27+
#
28+
query II
29+
SELECT x AS c1, y AS c1 FROM t;
30+
----
31+
1 2
32+
3 4
33+
34+
#
35+
# Duplicate literal expressions
36+
#
37+
query II
38+
SELECT 1, 1;
39+
----
40+
1 1
41+
42+
#
43+
# Same column selected twice
44+
#
45+
query II
46+
SELECT x, x FROM t;
47+
----
48+
1 1
49+
3 3
50+
51+
#
52+
# Subquery with duplicate column names
53+
#
54+
query II
55+
SELECT * FROM (SELECT x AS c1, y AS c1 FROM t);
56+
----
57+
1 2
58+
3 4
59+
60+
#
61+
# ORDER BY referencing a duplicated alias resolves to first occurrence
62+
#
63+
query II
64+
SELECT x AS c1, y AS c1 FROM t ORDER BY c1;
65+
----
66+
1 2
67+
3 4
68+
69+
#
70+
# CTE join producing duplicate column names (TPC-DS Q39 pattern)
71+
#
72+
statement ok
73+
CREATE TABLE inv(warehouse_sk INT, item_sk INT, moy INT, cov DOUBLE) AS VALUES
74+
(1, 10, 1, 1.5),
75+
(1, 10, 2, 2.0),
76+
(2, 20, 1, 0.8),
77+
(2, 20, 2, 1.2);
78+
79+
query IIIRIIIR
80+
WITH inv1 AS (
81+
SELECT warehouse_sk, item_sk, moy, cov FROM inv WHERE moy = 1
82+
),
83+
inv2 AS (
84+
SELECT warehouse_sk, item_sk, moy, cov FROM inv WHERE moy = 2
85+
)
86+
SELECT inv1.warehouse_sk, inv1.item_sk, inv1.moy, inv1.cov,
87+
inv2.warehouse_sk, inv2.item_sk, inv2.moy, inv2.cov
88+
FROM inv1 JOIN inv2
89+
ON inv1.item_sk = inv2.item_sk AND inv1.warehouse_sk = inv2.warehouse_sk
90+
ORDER BY inv1.warehouse_sk, inv1.item_sk;
91+
----
92+
1 10 1 1.5 1 10 2 2
93+
2 20 1 0.8 2 20 2 1.2
94+
95+
#
96+
# Three-way duplicate
97+
#
98+
query III
99+
SELECT x AS a, y AS a, x + y AS a FROM t;
100+
----
101+
1 2 3
102+
3 4 7
103+
104+
#
105+
# CAST produces same schema_name as original column (TPC-DS Q39 pattern).
106+
# CAST is transparent to schema_name, so CAST(x AS DOUBLE) and x
107+
# both have schema_name "t.x" — this must be deduped.
108+
#
109+
query RI
110+
SELECT CAST(x AS DOUBLE), x FROM t;
111+
----
112+
1 1
113+
3 3
114+
115+
#
116+
# GROUP BY with duplicate expressions in SELECT
117+
#
118+
query II
119+
SELECT x, x FROM t GROUP BY x;
120+
----
121+
1 1
122+
3 3
123+
124+
#
125+
# Aggregate with GROUP BY producing duplicate column names
126+
#
127+
query III
128+
SELECT x, SUM(y) AS total, SUM(y) AS total FROM t GROUP BY x ORDER BY x;
129+
----
130+
1 2 2
131+
3 4 4
132+
133+
#
134+
# ORDER BY referencing the second (renamed) column by position
135+
#
136+
query II
137+
SELECT y AS c1, x AS c1 FROM t ORDER BY 2;
138+
----
139+
2 1
140+
4 3
141+
142+
#
143+
# Function calls that produce the same schema_name after argument
144+
# normalization (reported in issue #6543 for iszero).
145+
#
146+
query BB
147+
SELECT iszero(0.0), iszero(-0.0);
148+
----
149+
true true
150+
151+
#
152+
# Duplicate expressions inside a UNION subquery
153+
#
154+
query II
155+
SELECT * FROM (SELECT x AS a, y AS a FROM t UNION ALL SELECT y AS a, x AS a FROM t) ORDER BY 1, 2;
156+
----
157+
1 2
158+
2 1
159+
3 4
160+
4 3
161+
162+
#
163+
# Known limitation: wildcard expansion happens after dedup, so
164+
# SELECT *, col FROM t still errors when col overlaps with *.
165+
# This will be addressed in a follow-up PR.
166+
#
167+
query error DataFusion error: Error during planning: Projections require unique expression names but the expression "t\.x" at position 0 and "t\.x" at position 2 have the same name\. Consider aliasing \("AS"\) one of them\.
168+
SELECT *, x FROM t;
169+
170+
# Cleanup
171+
statement ok
172+
DROP TABLE t;
173+
174+
statement ok
175+
DROP TABLE inv;

datafusion/sqllogictest/test_files/unnest.slt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -547,8 +547,16 @@ select unnest(column1) from (select * from (values([1,2,3]), ([4,5,6])) limit 1
547547
5
548548
6
549549

550-
query error DataFusion error: Error during planning: Projections require unique expression names but the expression "UNNEST\(unnest_table.column1\)" at position 0 and "UNNEST\(unnest_table.column1\)" at position 1 have the same name. Consider aliasing \("AS"\) one of them.
550+
query II
551551
select unnest(column1), unnest(column1) from unnest_table;
552+
----
553+
1 1
554+
2 2
555+
3 3
556+
4 4
557+
5 5
558+
6 6
559+
12 12
552560

553561
query II
554562
select unnest(column1), unnest(column1) u1 from unnest_table;

0 commit comments

Comments
 (0)