Skip to content

Commit d334608

Browse files
committed
fix: propagate ambiguous column names through projections and joins
1 parent c46092a commit d334608

3 files changed

Lines changed: 116 additions & 10 deletions

File tree

datafusion/expr/src/logical_plan/builder.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1745,7 +1745,30 @@ pub fn build_join_schema(
17451745
.collect();
17461746

17471747
let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?;
1748-
dfschema.with_functional_dependencies(func_dependencies)
1748+
let dfschema = dfschema.with_functional_dependencies(func_dependencies)?;
1749+
1750+
// Propagate ambiguous names from both input schemas. A name that was
1751+
// already ambiguous on either side of the join (e.g. because the left
1752+
// input is itself a subquery that wrapped a JOIN) remains ambiguous in
1753+
// the output. We only propagate names that actually appear as field
1754+
// names in the output schema so we don't accumulate stale entries.
1755+
let output_field_names: HashSet<&str> = dfschema
1756+
.fields()
1757+
.iter()
1758+
.map(|f| f.name().as_str())
1759+
.collect();
1760+
let inherited_ambiguous: HashSet<String> = left
1761+
.ambiguous_names()
1762+
.iter()
1763+
.chain(right.ambiguous_names())
1764+
.filter(|n| output_field_names.contains(n.as_str()))
1765+
.cloned()
1766+
.collect();
1767+
if inherited_ambiguous.is_empty() {
1768+
Ok(dfschema)
1769+
} else {
1770+
Ok(dfschema.with_ambiguous_names(inherited_ambiguous))
1771+
}
17491772
}
17501773

17511774
/// (Re)qualify the sides of a join if needed, i.e. if the columns from one side would otherwise

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2375,6 +2375,29 @@ pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result<Arc<DFSc
23752375
exprs, input,
23762376
)?)?;
23772377

2378+
// Propagate ambiguous names from the input for any column passed through
2379+
// unchanged. This prevents a `SELECT * FROM (...) AS alias` wrapper from
2380+
// silently dropping the ambiguity marker set by an inner JOIN or alias.
2381+
let input_ambiguous = input.schema().ambiguous_names();
2382+
if !input_ambiguous.is_empty() {
2383+
// Only bare `Expr::Column(c)` expressions count as pass-throughs; the
2384+
// marker is inherited when `c.name` is in the input's ambiguous set.
2385+
let inherited: HashSet<String> = exprs
2386+
.iter()
2387+
.filter_map(|e| {
2388+
if let Expr::Column(col) = e {
2389+
if input_ambiguous.contains(&col.name) {
2390+
return Some(col.name.clone());
2391+
}
2392+
}
2393+
None
2394+
})
2395+
.collect();
2396+
if !inherited.is_empty() {
2397+
return Ok(Arc::new(schema.with_ambiguous_names(inherited)));
2398+
}
2399+
}
2400+
23782401
Ok(Arc::new(schema))
23792402
}
23802403

@@ -2406,23 +2429,37 @@ impl SubqueryAlias {
24062429
let aliases = unique_field_aliases(plan.schema().fields());
24072430
let is_projection_needed = aliases.iter().any(Option::is_some);
24082431

2409-
// Collect the set of unqualified field names that are ambiguous in this
2410-
// subquery alias's output schema. A name is ambiguous when two or more
2411-
// input columns share the same unqualified name (they come, say, from
2412-
// different sides of a JOIN). `unique_field_aliases` renames the
2413-
// duplicates to keep the Arrow schema free of duplicates, but we still
2414-
// need to reject unqualified references to those names from outer
2415-
// queries.
2432+
// Collect unqualified field names that are ambiguous in this alias's
2433+
// output schema. `unique_field_aliases` renames duplicates (e.g. to
2434+
// "id:1") to keep Arrow happy, but outer queries must still be
2435+
// prevented from referencing those names without qualification.
2436+
// We also inherit names already marked ambiguous by the input schema
2437+
// so nested `SELECT * FROM (...) AS sN` wrappers don't lose the marker.
24162438
let ambiguous_names: HashSet<String> = {
24172439
let mut name_counts: HashMap<&str, usize> = HashMap::new();
24182440
for field in plan.schema().fields() {
24192441
*name_counts.entry(field.name().as_str()).or_insert(0) += 1;
24202442
}
2421-
name_counts
2443+
let mut names: HashSet<String> = name_counts
24222444
.into_iter()
24232445
.filter(|&(_, count)| count >= 2)
24242446
.map(|(name, _)| name.to_string())
2425-
.collect()
2447+
.collect();
2448+
2449+
// Inherit names still visible in the output: only the later duplicates
2450+
// are renamed (e.g. to "id:1"); the first occurrence keeps the name "id".
2451+
let output_field_names: HashSet<&str> = plan
2452+
.schema()
2453+
.fields()
2454+
.iter()
2455+
.map(|f| f.name().as_str())
2456+
.collect();
2457+
for inherited in plan.schema().ambiguous_names() {
2458+
if output_field_names.contains(inherited.as_str()) {
2459+
names.insert(inherited.clone());
2460+
}
2461+
}
2462+
names
24262463
};
24272464

24282465
// Insert a projection node, if needed, to make sure aliases are applied.

datafusion/sqllogictest/test_files/joins.slt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5594,6 +5594,52 @@ FROM (SELECT t_left.id, t_left.age, t_right.score, t_extra.dept
55945594
100 eng
55955595
200 sales
55965596

5597+
# Nested derived table: double SELECT * wrapper must preserve ambiguity.
5598+
# SELECT age FROM ( SELECT * FROM ( SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id ) AS s1 ) AS s2
5599+
# "age" is ambiguous in s1 (from both t_left and t_right) and must stay
5600+
# ambiguous when s1 is wrapped in another SELECT * … AS s2.
5601+
query error DataFusion error: Schema error: Ambiguous reference to unqualified field age
5602+
SELECT age FROM (
5603+
SELECT * FROM (
5604+
SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id
5605+
) AS s1
5606+
) AS s2;
5607+
5608+
# Nested derived table: same for "id"
5609+
query error DataFusion error: Schema error: Ambiguous reference to unqualified field id
5610+
SELECT id FROM (
5611+
SELECT * FROM (
5612+
SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id
5613+
) AS s1
5614+
) AS s2;
5615+
5616+
# Join over subquery + table: ambiguous names from the subquery side must
5617+
# propagate into the outer join schema so that bare "age" is still rejected.
5618+
# Set up a seed table with a single column so only the subquery side has "age".
5619+
statement ok
5620+
CREATE TABLE seed(val INT) AS VALUES (1), (2);
5621+
5622+
query error DataFusion error: Schema error: Ambiguous reference to unqualified field age
5623+
SELECT age FROM (SELECT * FROM t_left JOIN t_right ON t_left.id = t_right.id) sub
5624+
JOIN seed ON true;
5625+
5626+
# Qualified access through the subquery alias is still fine even after joining
5627+
# with another table.
5628+
query II rowsort
5629+
SELECT sub.id, sub.score FROM (
5630+
SELECT t_left.id, t_right.score
5631+
FROM t_left JOIN t_right ON t_left.id = t_right.id
5632+
) sub
5633+
JOIN seed ON true;
5634+
----
5635+
1 100
5636+
1 100
5637+
2 200
5638+
2 200
5639+
5640+
statement ok
5641+
DROP TABLE seed;
5642+
55975643
statement ok
55985644
DROP TABLE t_left;
55995645

0 commit comments

Comments
 (0)