Support simplifying `ORDER BY deptno, total_sal, abs(deptno)` to `ORDER BY deptno, abs(deptno)` when `total_sal` is functionally dependent on `deptno`

xiedeyantu · xiedeyantu · commit 77308cc138ae · 2026-04-06T10:43:38.000+08:00
diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs
@@ -598,19 +598,19 @@ pub fn get_required_sort_exprs_indices(
 ) -> Option<Vec<usize>> {
     let dependencies = schema.functional_dependencies();
     let field_names = schema.field_names();
-    let sort_expr_indices = sort_expr_names
-        .iter()
-        .map(|sort_expr_name| {
-            field_names
-                .iter()
-                .position(|field_name| field_name == sort_expr_name)
-        })
-        .collect::<Option<Vec<_>>>()?;
 
     let mut known_field_indices = HashSet::new();
     let mut required_sort_expr_indices = Vec::new();
 
-    for (sort_expr_idx, field_idx) in sort_expr_indices.into_iter().enumerate() {
+    for (sort_expr_idx, sort_expr_name) in sort_expr_names.iter().enumerate() {
+        let Some(field_idx) = field_names
+            .iter()
+            .position(|field_name| field_name == sort_expr_name)
+        else {
+            required_sort_expr_indices.push(sort_expr_idx);
+            continue;
+        };
+
         let removable = dependencies.deps.iter().any(|dependency| {
             dependency.target_indices.contains(&field_idx)
                 && dependency
diff --git a/datafusion/optimizer/src/eliminate_duplicated_expr.rs b/datafusion/optimizer/src/eliminate_duplicated_expr.rs
@@ -160,7 +160,6 @@ mod tests {
     use crate::assert_optimized_plan_eq_snapshot;
     use crate::test::*;
     use datafusion_expr::{col, logical_plan::builder::LogicalPlanBuilder};
-    use datafusion_functions_aggregate::sum::sum;
     use std::sync::Arc;
 
     macro_rules! assert_optimized_plan_equal {
@@ -169,7 +168,8 @@ mod tests {
             @ $expected:literal $(,)?
         ) => {{
             let optimizer_ctx = OptimizerContext::new().with_max_passes(1);
-            let rules: Vec<Arc<dyn crate::OptimizerRule + Send + Sync>> = vec![Arc::new(EliminateDuplicatedExpr::new())];
+            let rules: Vec<Arc<dyn crate::OptimizerRule + Send + Sync>> =
+                vec![Arc::new(EliminateDuplicatedExpr::new())];
             assert_optimized_plan_eq_snapshot!(
                 optimizer_ctx,
                 rules,
@@ -214,40 +214,4 @@ mod tests {
             TableScan: test
         ")
     }
-
-    #[test]
-    fn eliminate_fd_redundant_sort_expr() -> Result<()> {
-        let table_scan = test_table_scan().unwrap();
-        let plan = LogicalPlanBuilder::from(table_scan)
-            .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_sal")])?
-            .sort(vec![
-                col("a").sort(true, true),
-                col("total_sal").sort(true, true),
-            ])?
-            .build()?;
-
-        assert_optimized_plan_equal!(plan, @r"
-        Sort: test.a ASC NULLS FIRST
-          Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b) AS total_sal]]
-            TableScan: test
-        ")
-    }
-
-    #[test]
-    fn keep_order_by_when_dependency_comes_later() -> Result<()> {
-        let table_scan = test_table_scan().unwrap();
-        let plan = LogicalPlanBuilder::from(table_scan)
-            .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_sal")])?
-            .sort(vec![
-                col("total_sal").sort(true, true),
-                col("a").sort(true, true),
-            ])?
-            .build()?;
-
-        assert_optimized_plan_equal!(plan, @r"
-        Sort: total_sal ASC NULLS FIRST, test.a ASC NULLS FIRST
-          Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b) AS total_sal]]
-            TableScan: test
-        ")
-    }
 }
diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt
@@ -260,6 +260,66 @@ physical_plan
 02)--SortExec: expr=[c2@1 ASC NULLS LAST, c3@2 ASC NULLS LAST], preserve_partitioning=[false]
 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], file_type=csv, has_header=true
 
+
+# Eliminate a sort expr (total_sal) that is functionally dependent on an earlier sort expr (c2)
+query TT
+explain SELECT c2, SUM(c3) AS total_sal FROM aggregate_test_100 GROUP BY c2 ORDER BY c2, total_sal
+----
+logical_plan
+01)Sort: aggregate_test_100.c2 ASC NULLS LAST
+02)--Projection: aggregate_test_100.c2, sum(aggregate_test_100.c3) AS total_sal
+03)----Aggregate: groupBy=[[aggregate_test_100.c2]], aggr=[[sum(CAST(aggregate_test_100.c3 AS Int64))]]
+04)------TableScan: aggregate_test_100 projection=[c2, c3]
+physical_plan
+01)SortPreservingMergeExec: [c2@0 ASC NULLS LAST]
+02)--SortExec: expr=[c2@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[c2@0 as c2, sum(aggregate_test_100.c3)@1 as total_sal]
+04)------AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[sum(aggregate_test_100.c3)]
+05)--------RepartitionExec: partitioning=Hash([c2@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[c2@0 as c2], aggr=[sum(aggregate_test_100.c3)]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+
+# Keep both sort exprs when the determinant (c2) comes after the dependent column (total_sal)
+query TT
+explain SELECT c2, SUM(c3) AS total_sal FROM aggregate_test_100 GROUP BY c2 ORDER BY total_sal, c2
+----
+logical_plan
+01)Sort: total_sal ASC NULLS LAST, aggregate_test_100.c2 ASC NULLS LAST
+02)--Projection: aggregate_test_100.c2, sum(aggregate_test_100.c3) AS total_sal
+03)----Aggregate: groupBy=[[aggregate_test_100.c2]], aggr=[[sum(CAST(aggregate_test_100.c3 AS Int64))]]
+04)------TableScan: aggregate_test_100 projection=[c2, c3]
+physical_plan
+01)SortPreservingMergeExec: [total_sal@1 ASC NULLS LAST, c2@0 ASC NULLS LAST]
+02)--SortExec: expr=[total_sal@1 ASC NULLS LAST, c2@0 ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[c2@0 as c2, sum(aggregate_test_100.c3)@1 as total_sal]
+04)------AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[sum(aggregate_test_100.c3)]
+05)--------RepartitionExec: partitioning=Hash([c2@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[c2@0 as c2], aggr=[sum(aggregate_test_100.c3)]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+
+# Still eliminate the redundant sort expr when ORDER BY also contains an expr that is not a schema column (abs(c2))
+query TT
+explain SELECT c2, SUM(c3) AS total_sal FROM aggregate_test_100 GROUP BY c2 ORDER BY c2, total_sal, abs(c2)
+----
+logical_plan
+01)Sort: aggregate_test_100.c2 ASC NULLS LAST, abs(aggregate_test_100.c2) ASC NULLS LAST
+02)--Projection: aggregate_test_100.c2, sum(aggregate_test_100.c3) AS total_sal
+03)----Aggregate: groupBy=[[aggregate_test_100.c2]], aggr=[[sum(CAST(aggregate_test_100.c3 AS Int64))]]
+04)------TableScan: aggregate_test_100 projection=[c2, c3]
+physical_plan
+01)SortPreservingMergeExec: [c2@0 ASC NULLS LAST, abs(c2@0) ASC NULLS LAST]
+02)--SortExec: expr=[c2@0 ASC NULLS LAST, abs(c2@0) ASC NULLS LAST], preserve_partitioning=[true]
+03)----ProjectionExec: expr=[c2@0 as c2, sum(aggregate_test_100.c3)@1 as total_sal]
+04)------AggregateExec: mode=FinalPartitioned, gby=[c2@0 as c2], aggr=[sum(aggregate_test_100.c3)]
+05)--------RepartitionExec: partitioning=Hash([c2@0], 4), input_partitions=4
+06)----------AggregateExec: mode=Partial, gby=[c2@0 as c2], aggr=[sum(aggregate_test_100.c3)]
+07)------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
+08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], file_type=csv, has_header=true
+
+statement ok
+
 query II
 SELECT c2, c3 FROM aggregate_test_100 ORDER BY c2, c3, c2
 ----
@@ -1637,7 +1697,7 @@ physical_plan
 statement ok
 reset datafusion.catalog.information_schema;
 
-# The SLT runner sets `target_partitions` to 4 instead of using the default, so 
+# The SLT runner sets `target_partitions` to 4 instead of using the default, so
 # reset it explicitly.
 statement ok
 set datafusion.execution.target_partitions = 4;