Skip to content

Commit ab8acf9

Browse files
Merge branch 'apache:main' into main
2 parents 1667252 + 8f77a3b commit ab8acf9

25 files changed

Lines changed: 2388 additions & 301 deletions

File tree

datafusion/core/tests/dataframe/mod.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2458,9 +2458,8 @@ async fn cache_producer_test() -> Result<()> {
24582458
@r"
24592459
CacheNode
24602460
Projection: aggregate_test_100.c2, aggregate_test_100.c3, CAST(CAST(aggregate_test_100.c2 AS Int64) + CAST(aggregate_test_100.c3 AS Int64) AS Int64) AS sum
2461-
Projection: aggregate_test_100.c2, aggregate_test_100.c3
2462-
Limit: skip=0, fetch=1
2463-
TableScan: aggregate_test_100, fetch=1
2461+
Limit: skip=0, fetch=1
2462+
TableScan: aggregate_test_100 projection=[c2, c3], fetch=1
24642463
"
24652464
);
24662465
Ok(())

datafusion/core/tests/physical_optimizer/partition_statistics.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -935,7 +935,10 @@ mod test {
935935
num_rows: Precision::Exact(0),
936936
total_byte_size: Precision::Absent,
937937
column_statistics: vec![
938-
ColumnStatistics::new_unknown(),
938+
ColumnStatistics {
939+
distinct_count: Precision::Exact(0),
940+
..ColumnStatistics::new_unknown()
941+
},
939942
ColumnStatistics::new_unknown(),
940943
ColumnStatistics::new_unknown(),
941944
],

datafusion/core/tests/physical_optimizer/projection_pushdown.rs

Lines changed: 119 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ use datafusion_physical_optimizer::output_requirements::OutputRequirementExec;
4646
use datafusion_physical_optimizer::projection_pushdown::ProjectionPushdown;
4747
use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
4848
use datafusion_physical_plan::coop::CooperativeExec;
49-
use datafusion_physical_plan::filter::FilterExec;
49+
use datafusion_physical_plan::filter::{FilterExec, FilterExecBuilder};
5050
use datafusion_physical_plan::joins::utils::{ColumnIndex, JoinFilter};
5151
use datafusion_physical_plan::joins::{
5252
HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode,
@@ -1754,3 +1754,121 @@ fn test_hash_join_empty_projection_embeds() -> Result<()> {
17541754

17551755
Ok(())
17561756
}
1757+
1758+
/// Regression test for <https://github.com/apache/datafusion/issues/21459>
///
/// When a `ProjectionExec` sits on top of a `FilterExec` that already carries
/// an embedded projection, the `ProjectionPushdown` optimizer must not panic.
///
/// Before the fix, `FilterExecBuilder::from(self)` copied stale projection
/// indices (e.g. `[0, 1, 2]`). After swapping, the new input was narrower
/// (2 columns), so `.build()` panicked with "project index out of bounds".
#[test]
fn test_filter_with_embedded_projection_after_projection() -> Result<()> {
    // DataSourceExec: [a, b, c, d, e]
    let csv = create_simple_csv_exec();

    // FilterExec: a > 0, projection=[0, 1, 2] → output: [a, b, c]
    let predicate = Arc::new(BinaryExpr::new(
        Arc::new(Column::new("a", 0)),
        Operator::Gt,
        Arc::new(Literal::new(ScalarValue::Int32(Some(0)))),
    ));
    let filter: Arc<dyn ExecutionPlan> = Arc::new(
        FilterExecBuilder::new(predicate, csv)
            .apply_projection(Some(vec![0, 1, 2]))?
            .build()?,
    );

    // ProjectionExec: narrows [a, b, c] → [a, b]
    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
        vec![
            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "a"),
            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "b"),
        ],
        filter,
    )?);

    // Sanity-check the plan shape before optimization.
    // NOTE(review): snapshot body indentation reconstructed from the diff
    // rendering — confirm against the actual insta output.
    let initial = displayable(projection.as_ref()).indent(true).to_string();
    let actual = initial.trim();
    assert_snapshot!(
        actual,
        @r"
    ProjectionExec: expr=[a@0 as a, b@1 as b]
      FilterExec: a@0 > 0, projection=[a@0, b@1, c@2]
        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
    "
    );

    // This must not panic
    let after_optimize =
        ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
    let after_optimize_string = displayable(after_optimize.as_ref())
        .indent(true)
        .to_string();
    let actual = after_optimize_string.trim();
    assert_snapshot!(
        actual,
        @r"
    FilterExec: a@0 > 0
      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b], file_type=csv, has_header=false
    "
    );

    Ok(())
}
1820+
1821+
/// Same as above, but the outer ProjectionExec also renames columns.
/// Ensures the rename is preserved after the projection pushdown swap.
#[test]
fn test_filter_with_embedded_projection_after_renaming_projection() -> Result<()> {
    let csv = create_simple_csv_exec();

    // FilterExec: b > 10, projection=[0, 1, 2, 3] → output: [a, b, c, d]
    let predicate = Arc::new(BinaryExpr::new(
        Arc::new(Column::new("b", 1)),
        Operator::Gt,
        Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
    ));
    let filter: Arc<dyn ExecutionPlan> = Arc::new(
        FilterExecBuilder::new(predicate, csv)
            .apply_projection(Some(vec![0, 1, 2, 3]))?
            .build()?,
    );

    // ProjectionExec: [a as x, b as y] — narrows and renames
    let projection: Arc<dyn ExecutionPlan> = Arc::new(ProjectionExec::try_new(
        vec![
            ProjectionExpr::new(Arc::new(Column::new("a", 0)), "x"),
            ProjectionExpr::new(Arc::new(Column::new("b", 1)), "y"),
        ],
        filter,
    )?);

    // Sanity-check the plan shape before optimization.
    // NOTE(review): snapshot body indentation reconstructed from the diff
    // rendering — confirm against the actual insta output.
    let initial = displayable(projection.as_ref()).indent(true).to_string();
    let actual = initial.trim();
    assert_snapshot!(
        actual,
        @r"
    ProjectionExec: expr=[a@0 as x, b@1 as y]
      FilterExec: b@1 > 10, projection=[a@0, b@1, c@2, d@3]
        DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false
    "
    );

    let after_optimize =
        ProjectionPushdown::new().optimize(projection, &ConfigOptions::new())?;
    let after_optimize_string = displayable(after_optimize.as_ref())
        .indent(true)
        .to_string();
    let actual = after_optimize_string.trim();
    assert_snapshot!(
        actual,
        @r"
    FilterExec: y@1 > 10
      DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as x, b@1 as y], file_type=csv, has_header=false
    "
    );

    Ok(())
}

datafusion/expr/src/lib.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,9 @@ pub use udaf::{
126126
udaf_default_schema_name, udaf_default_window_function_display_name,
127127
udaf_default_window_function_schema_name,
128128
};
129-
pub use udf::{ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl};
129+
pub use udf::{
130+
ReturnFieldArgs, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, StructFieldMapping,
131+
};
130132
pub use udwf::{LimitEffect, ReversedUDWF, WindowUDF, WindowUDFImpl};
131133
pub use window_frame::{WindowFrame, WindowFrameBound, WindowFrameUnits};
132134

datafusion/expr/src/logical_plan/plan.rs

Lines changed: 69 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ use crate::utils::{
4545
grouping_set_expr_count, grouping_set_to_exprlist, split_conjunction,
4646
};
4747
use crate::{
48-
BinaryExpr, CreateMemoryTable, CreateView, Execute, Expr, ExprSchemable,
48+
BinaryExpr, CreateMemoryTable, CreateView, Execute, Expr, ExprSchemable, GroupingSet,
4949
LogicalPlanBuilder, Operator, Prepare, TableProviderFilterPushDown, TableSource,
5050
WindowFunctionDefinition, build_join_schema, expr_vec_fmt, requalify_sides_if_needed,
5151
};
@@ -3595,11 +3595,12 @@ impl Aggregate {
35953595
.into_iter()
35963596
.map(|(q, f)| (q, f.as_ref().clone().with_nullable(true).into()))
35973597
.collect::<Vec<_>>();
3598+
let max_ordinal = max_grouping_set_duplicate_ordinal(&group_expr);
35983599
qualified_fields.push((
35993600
None,
36003601
Field::new(
36013602
Self::INTERNAL_GROUPING_ID,
3602-
Self::grouping_id_type(qualified_fields.len()),
3603+
Self::grouping_id_type(qualified_fields.len(), max_ordinal),
36033604
false,
36043605
)
36053606
.into(),
@@ -3685,15 +3686,24 @@ impl Aggregate {
36853686
}
36863687

36873688
/// Returns the data type of the grouping id.
3688-
/// The grouping ID value is a bitmask where each set bit
3689-
/// indicates that the corresponding grouping expression is
3690-
/// null
3691-
pub fn grouping_id_type(group_exprs: usize) -> DataType {
3692-
if group_exprs <= 8 {
3689+
///
3690+
/// The grouping ID packs two pieces of information into a single integer:
3691+
/// - The low `group_exprs` bits are the semantic bitmask (a set bit means the
3692+
/// corresponding grouping expression is NULL for this grouping set).
3693+
/// - The bits above position `group_exprs` encode a duplicate ordinal that
3694+
/// distinguishes multiple occurrences of the same grouping set pattern.
3695+
///
3696+
/// `max_ordinal` is the highest ordinal value that will appear (0 when there
3697+
/// are no duplicate grouping sets). The type is chosen to be the smallest
3698+
/// unsigned integer that can represent both parts.
3699+
pub fn grouping_id_type(group_exprs: usize, max_ordinal: usize) -> DataType {
3700+
let ordinal_bits = usize::BITS as usize - max_ordinal.leading_zeros() as usize;
3701+
let total_bits = group_exprs + ordinal_bits;
3702+
if total_bits <= 8 {
36933703
DataType::UInt8
3694-
} else if group_exprs <= 16 {
3704+
} else if total_bits <= 16 {
36953705
DataType::UInt16
3696-
} else if group_exprs <= 32 {
3706+
} else if total_bits <= 32 {
36973707
DataType::UInt32
36983708
} else {
36993709
DataType::UInt64
@@ -3702,21 +3712,36 @@ impl Aggregate {
37023712

37033713
/// Internal column used when the aggregation is a grouping set.
37043714
///
3705-
/// This column contains a bitmask where each bit represents a grouping
3706-
/// expression. The least significant bit corresponds to the rightmost
3707-
/// grouping expression. A bit value of 0 indicates that the corresponding
3708-
/// column is included in the grouping set, while a value of 1 means it is excluded.
3715+
/// This column packs two values into a single unsigned integer:
3716+
///
3717+
/// - **Low bits (positions 0 .. n-1)**: a semantic bitmask where each bit
3718+
/// represents one of the `n` grouping expressions. The least significant
3719+
/// bit corresponds to the rightmost grouping expression. A `1` bit means
3720+
/// the corresponding column is replaced with `NULL` for this grouping set;
3721+
/// a `0` bit means it is included.
3722+
/// - **High bits (positions n and above)**: a *duplicate ordinal* that
3723+
/// distinguishes multiple occurrences of the same semantic grouping set
3724+
/// pattern within a single query. The ordinal is `0` for the first
3725+
/// occurrence, `1` for the second, and so on.
3726+
///
3727+
/// The integer type is chosen by [`Self::grouping_id_type`] to be the
3728+
/// smallest `UInt8 / UInt16 / UInt32 / UInt64` that can represent both
3729+
/// parts.
37093730
///
3710-
/// For example, for the grouping expressions CUBE(a, b), the grouping ID
3711-
/// column will have the following values:
3731+
/// For example, for the grouping expressions CUBE(a, b) (no duplicates),
3732+
/// the grouping ID column will have the following values:
37123733
/// 0b00: Both `a` and `b` are included
37133734
/// 0b01: `b` is excluded
37143735
/// 0b10: `a` is excluded
37153736
/// 0b11: Both `a` and `b` are excluded
37163737
///
3717-
/// This internal column is necessary because excluded columns are replaced
3718-
/// with `NULL` values. To handle these cases correctly, we must distinguish
3719-
/// between an actual `NULL` value in a column and a column being excluded from the set.
3738+
/// When the same set appears twice and `n = 2`, the duplicate ordinal is
3739+
/// packed into bit 2:
3740+
/// first occurrence: `0b0_01` (ordinal = 0, mask = 0b01)
3741+
/// second occurrence: `0b1_01` (ordinal = 1, mask = 0b01)
3742+
///
3743+
/// The GROUPING function always masks the value with `(1 << n) - 1` before
3744+
/// interpreting it so the ordinal bits are invisible to user-facing SQL.
37203745
pub const INTERNAL_GROUPING_ID: &'static str = "__grouping_id";
37213746
}
37223747

@@ -3737,6 +3762,24 @@ impl PartialOrd for Aggregate {
37373762
}
37383763
}
37393764

3765+
/// Returns the highest duplicate ordinal across all grouping sets in `group_expr`.
3766+
///
3767+
/// The ordinal for each occurrence of a grouping set pattern is its 0-based
3768+
/// index among identical entries. For example, if the same set appears three
3769+
/// times, the ordinals are 0, 1, 2 and this function returns 2.
3770+
/// Returns 0 when no grouping set is duplicated.
3771+
fn max_grouping_set_duplicate_ordinal(group_expr: &[Expr]) -> usize {
3772+
if let Some(Expr::GroupingSet(GroupingSet::GroupingSets(sets))) = group_expr.first() {
3773+
let mut counts: HashMap<&[Expr], usize> = HashMap::new();
3774+
for set in sets {
3775+
*counts.entry(set).or_insert(0) += 1;
3776+
}
3777+
counts.into_values().max().unwrap_or(0).saturating_sub(1)
3778+
} else {
3779+
0
3780+
}
3781+
}
3782+
37403783
/// Checks whether any expression in `group_expr` contains `Expr::GroupingSet`.
37413784
fn contains_grouping_set(group_expr: &[Expr]) -> bool {
37423785
group_expr
@@ -5053,6 +5096,14 @@ mod tests {
50535096
);
50545097
}
50555098

5099+
#[test]
fn grouping_id_type_accounts_for_duplicate_ordinal_bits() {
    // 8 grouping columns fit in UInt8 when there are no duplicate ordinals,
    // but adding one duplicate ordinal bit widens the type to UInt16.
    assert_eq!(Aggregate::grouping_id_type(8, 0), DataType::UInt8);
    assert_eq!(Aggregate::grouping_id_type(8, 1), DataType::UInt16);
}
5106+
50565107
#[test]
50575108
fn test_filter_is_scalar() {
50585109
// test empty placeholder

datafusion/expr/src/udf.rs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,25 @@ use std::fmt::Debug;
3838
use std::hash::{Hash, Hasher};
3939
use std::sync::Arc;
4040

41+
/// Describes how a struct-producing UDF's output fields correspond to its
42+
/// input arguments. This enables the optimizer to propagate orderings
43+
/// through struct projections (e.g., so that sorting by a struct field
44+
/// can be recognized as equivalent to sorting by the source column).
45+
///
46+
/// See [`ScalarUDFImpl::struct_field_mapping`] for details.
47+
pub struct StructFieldMapping {
48+
/// The UDF used to construct field access expressions on the output.
49+
/// For example, the `get_field` UDF for accessing struct fields.
50+
pub field_accessor: Arc<ScalarUDF>,
51+
/// For each output field: the literal arguments to pass to the
52+
/// `field_accessor` UDF (after the base expression), and the index
53+
/// of the corresponding input argument that produces the field's value.
54+
///
55+
/// For `named_struct('a', col1, 'b', col2)`, this would be:
56+
/// `[(["a"], 1), (["b"], 3)]` — field `"a"` comes from arg index 1.
57+
pub fields: Vec<(Vec<ScalarValue>, usize)>,
58+
}
59+
4160
/// Logical representation of a Scalar User Defined Function.
4261
///
4362
/// A scalar function produces a single row output for each row of input. This
@@ -305,6 +324,14 @@ impl ScalarUDF {
305324
self.inner.evaluate_bounds(inputs)
306325
}
307326

327+
/// See [`ScalarUDFImpl::struct_field_mapping`] for more details.
328+
pub fn struct_field_mapping(
329+
&self,
330+
literal_args: &[Option<ScalarValue>],
331+
) -> Option<StructFieldMapping> {
332+
self.inner.struct_field_mapping(literal_args)
333+
}
334+
308335
/// Updates bounds for child expressions, given a known interval for this
309336
/// function. This is used to propagate constraints down through an expression
310337
/// tree.
@@ -961,6 +988,25 @@ pub trait ScalarUDFImpl: Debug + DynEq + DynHash + Send + Sync + Any {
961988
not_impl_err!("Function {} does not implement coerce_types", self.name())
962989
}
963990

991+
/// For struct-producing functions, return how output fields map to input
992+
/// arguments. This enables the optimizer to propagate orderings through
993+
/// struct projections.
994+
///
995+
/// `literal_args[i]` is `Some(value)` if argument `i` is a known literal,
996+
/// allowing extraction of field names from arguments like
997+
/// `named_struct('field_name', value, ...)`.
998+
///
999+
/// For example, `named_struct('a', col1, 'b', col2)` would return a
1000+
/// mapping indicating that output field `'a'` (accessed via
1001+
/// `get_field(output, 'a')`) corresponds to input argument `col1` at
1002+
/// index 1, and field `'b'` corresponds to `col2` at index 3.
1003+
fn struct_field_mapping(
1004+
&self,
1005+
_literal_args: &[Option<ScalarValue>],
1006+
) -> Option<StructFieldMapping> {
1007+
None
1008+
}
1009+
9641010
/// Returns the documentation for this Scalar UDF.
9651011
///
9661012
/// Documentation can be accessed programmatically as well as generating
@@ -1109,6 +1155,13 @@ impl ScalarUDFImpl for AliasedScalarUDFImpl {
11091155
self.inner.propagate_constraints(interval, inputs)
11101156
}
11111157

1158+
fn struct_field_mapping(
1159+
&self,
1160+
literal_args: &[Option<ScalarValue>],
1161+
) -> Option<StructFieldMapping> {
1162+
self.inner.struct_field_mapping(literal_args)
1163+
}
1164+
11121165
fn output_ordering(&self, inputs: &[ExprProperties]) -> Result<SortProperties> {
11131166
self.inner.output_ordering(inputs)
11141167
}

0 commit comments

Comments (0)