Skip to content

Commit 670dbf4

Browse files
adriangb and claude authored
fix: prevent duplicate alias collision with user-provided __datafusion_extracted names (#20432)
## Summary - Fixes a bug where the optimizer's `AliasGenerator` could produce alias names that collide with `__datafusion_extracted_N` aliases, causing a "Schema contains duplicate unqualified field name" error - I don't expect users themselves to create these aliases, but if you run the optimizers twice (with different `AliasGenerator` instances) you'll hit this. - Adds `AliasGenerator::update_min_id()` to advance the counter past existing aliases - Scans each plan node's expressions during `ExtractLeafExpressions` traversal to seed the generator before any extraction occurs - Switches to controlling the traversal which also means the config-based short circuit more clearly skips the entire rule. Closes #20430 ## Test plan - [x] Unit test: `test_user_provided_extracted_alias_no_collision` in `extract_leaf_expressions` - [x] SLT regression test in `projection_pushdown.slt` with explicit `__datafusion_extracted_2` alias 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 17d770d commit 670dbf4

3 files changed

Lines changed: 87 additions & 5 deletions

File tree

datafusion/common/src/alias.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ impl AliasGenerator {
3737
Self::default()
3838
}
3939

40+
/// Advance the counter to at least `min_id`, ensuring future aliases
41+
/// won't collide with already-existing ones.
42+
///
43+
/// For example, if the query already contains an alias `alias_42`, then calling
44+
/// `update_min_id(42)` will ensure that future aliases generated by this
45+
/// [`AliasGenerator`] will start from `alias_43`.
46+
pub fn update_min_id(&self, min_id: usize) {
47+
self.next_id.fetch_max(min_id + 1, Ordering::Relaxed);
48+
}
49+
4050
/// Return a unique alias with the provided prefix
4151
pub fn next(&self, prefix: &str) -> String {
4252
let id = self.next_id.fetch_add(1, Ordering::Relaxed);

datafusion/optimizer/src/extract_leaf_expressions.rs

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,6 @@ impl OptimizerRule for ExtractLeafExpressions {
114114
"extract_leaf_expressions"
115115
}
116116

117-
fn apply_order(&self) -> Option<ApplyOrder> {
118-
Some(ApplyOrder::TopDown)
119-
}
120-
121117
fn rewrite(
122118
&self,
123119
plan: LogicalPlan,
@@ -127,10 +123,45 @@ impl OptimizerRule for ExtractLeafExpressions {
127123
return Ok(Transformed::no(plan));
128124
}
129125
let alias_generator = config.alias_generator();
130-
extract_from_plan(plan, alias_generator)
126+
127+
// Advance the alias generator past any user-provided __datafusion_extracted_N
128+
// aliases to prevent collisions when generating new extraction aliases.
129+
advance_generator_past_existing(&plan, alias_generator)?;
130+
131+
plan.transform_down_with_subqueries(|plan| {
132+
extract_from_plan(plan, alias_generator)
133+
})
131134
}
132135
}
133136

137+
/// Scans the current plan node's expressions for pre-existing
138+
/// `__datafusion_extracted_N` aliases and advances the generator
139+
/// counter past them to avoid collisions with user-provided aliases.
140+
fn advance_generator_past_existing(
141+
plan: &LogicalPlan,
142+
alias_generator: &AliasGenerator,
143+
) -> Result<()> {
144+
plan.apply(|plan| {
145+
plan.expressions().iter().try_for_each(|expr| {
146+
expr.apply(|e| {
147+
if let Expr::Alias(alias) = e
148+
&& let Some(id) = alias
149+
.name
150+
.strip_prefix(EXTRACTED_EXPR_PREFIX)
151+
.and_then(|s| s.strip_prefix('_'))
152+
.and_then(|s| s.parse().ok())
153+
{
154+
alias_generator.update_min_id(id);
155+
}
156+
Ok(TreeNodeRecursion::Continue)
157+
})?;
158+
Ok::<(), datafusion_common::error::DataFusionError>(())
159+
})?;
160+
Ok(TreeNodeRecursion::Continue)
161+
})
162+
.map(|_| ())
163+
}
164+
134165
/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node.
135166
///
136167
/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes

datafusion/sqllogictest/test_files/projection_pushdown.slt

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1949,3 +1949,44 @@ ORDER BY simple_struct.id;
19491949
3 3
19501950
4 4
19511951
5 5
1952+
1953+
# =========================================================================
1954+
# Regression: user-provided __datafusion_extracted aliases must not
1955+
# collide with optimizer-generated ones
1956+
# (https://github.com/apache/datafusion/issues/20430)
1957+
# =========================================================================
1958+
1959+
statement ok
1960+
COPY ( select {f1: 1, f2: 2} as s
1961+
) TO 'test_files/scratch/projection_pushdown/test.parquet'
1962+
STORED AS PARQUET;
1963+
1964+
statement ok
1965+
CREATE EXTERNAL TABLE t
1966+
STORED AS PARQUET
1967+
LOCATION 'test_files/scratch/projection_pushdown/test.parquet';
1968+
1969+
# Verify that the user-provided __datafusion_extracted_2 alias is preserved
1970+
# and the optimizer skips to _3 and _4 for its generated aliases.
1971+
query TT
1972+
EXPLAIN SELECT
1973+
get_field(s, 'f1') AS __datafusion_extracted_2
1974+
FROM t
1975+
WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
1976+
----
1977+
logical_plan
1978+
01)Projection: __datafusion_extracted_2
1979+
02)--Filter: CASE WHEN __datafusion_extracted_3 IS NOT NULL THEN __datafusion_extracted_3 ELSE __datafusion_extracted_4 END = Int64(1)
1980+
03)----Projection: get_field(t.s, Utf8("f1")) AS __datafusion_extracted_3, get_field(t.s, Utf8("f2")) AS __datafusion_extracted_4, get_field(t.s, Utf8("f1")) AS __datafusion_extracted_2
1981+
04)------TableScan: t projection=[s], partial_filters=[CASE WHEN get_field(t.s, Utf8("f1")) IS NOT NULL THEN get_field(t.s, Utf8("f1")) ELSE get_field(t.s, Utf8("f2")) END = Int64(1)]
1982+
physical_plan
1983+
01)FilterExec: CASE WHEN __datafusion_extracted_3@0 IS NOT NULL THEN __datafusion_extracted_3@0 ELSE __datafusion_extracted_4@1 END = 1, projection=[__datafusion_extracted_2@2]
1984+
02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/test.parquet]]}, projection=[get_field(s@0, f1) as __datafusion_extracted_3, get_field(s@0, f2) as __datafusion_extracted_4, get_field(s@0, f1) as __datafusion_extracted_2], file_type=parquet
1985+
1986+
query I
1987+
SELECT
1988+
get_field(s, 'f1') AS __datafusion_extracted_2
1989+
FROM t
1990+
WHERE COALESCE(get_field(s, 'f1'), get_field(s, 'f2')) = 1;
1991+
----
1992+
1

0 commit comments

Comments
 (0)