Skip to content

Commit 1900968

Browse files
address pr comments
- removes test_pushdown_through_aggregate_with_reordered_input_no_pushdown_on_agg_result, as it is basically a duplicate of another test
- adds slt tests for filter pushdown through aggregations (asserting correctness + plans)
1 parent 5c792af commit 1900968

3 files changed

Lines changed: 91 additions & 83 deletions

File tree

datafusion/core/tests/physical_optimizer/filter_pushdown.rs

Lines changed: 0 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -3224,85 +3224,6 @@ fn test_pushdown_through_aggregate_with_reordered_input_columns() {
32243224
);
32253225
}
32263226

3227-
#[test]
3228-
fn test_pushdown_through_aggregate_with_reordered_input_no_pushdown_on_agg_result() {
3229-
let scan = TestScanBuilder::new(schema()).with_support(true).build();
3230-
3231-
let reordered_schema = Arc::new(Schema::new(vec![
3232-
Field::new("c", DataType::Float64, false),
3233-
Field::new("a", DataType::Utf8, false),
3234-
Field::new("b", DataType::Utf8, false),
3235-
]));
3236-
let projection = Arc::new(
3237-
ProjectionExec::try_new(
3238-
vec![
3239-
(col("c", &schema()).unwrap(), "c".to_string()),
3240-
(col("a", &schema()).unwrap(), "a".to_string()),
3241-
(col("b", &schema()).unwrap(), "b".to_string()),
3242-
],
3243-
scan,
3244-
)
3245-
.unwrap(),
3246-
);
3247-
3248-
let aggregate_expr = vec![
3249-
AggregateExprBuilder::new(
3250-
count_udaf(),
3251-
vec![col("c", &reordered_schema).unwrap()],
3252-
)
3253-
.schema(reordered_schema.clone())
3254-
.alias("cnt")
3255-
.build()
3256-
.map(Arc::new)
3257-
.unwrap(),
3258-
];
3259-
3260-
let group_by = PhysicalGroupBy::new_single(vec![
3261-
(col("a", &reordered_schema).unwrap(), "a".to_string()),
3262-
(col("b", &reordered_schema).unwrap(), "b".to_string()),
3263-
]);
3264-
3265-
let aggregate = Arc::new(
3266-
AggregateExec::try_new(
3267-
AggregateMode::Final,
3268-
group_by,
3269-
aggregate_expr,
3270-
vec![None],
3271-
projection,
3272-
reordered_schema,
3273-
)
3274-
.unwrap(),
3275-
);
3276-
3277-
// Filter on cnt@2 (aggregate result, not a grouping column)
3278-
let agg_output_schema = aggregate.schema();
3279-
let predicate = Arc::new(BinaryExpr::new(
3280-
Arc::new(Column::new_with_schema("cnt", &agg_output_schema).unwrap()),
3281-
Operator::Gt,
3282-
Arc::new(Literal::new(ScalarValue::Int64(Some(5)))),
3283-
)) as Arc<dyn PhysicalExpr>;
3284-
let plan = Arc::new(FilterExec::try_new(predicate, aggregate).unwrap());
3285-
3286-
// The filter is not pushed down.
3287-
insta::assert_snapshot!(
3288-
OptimizationTest::new(plan, FilterPushdown::new(), true),
3289-
@r"
3290-
OptimizationTest:
3291-
input:
3292-
- FilterExec: cnt@2 > 5
3293-
- AggregateExec: mode=Final, gby=[a@1 as a, b@2 as b], aggr=[cnt]
3294-
- ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b]
3295-
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
3296-
output:
3297-
Ok:
3298-
- FilterExec: cnt@2 > 5
3299-
- AggregateExec: mode=Final, gby=[a@1 as a, b@2 as b], aggr=[cnt]
3300-
- ProjectionExec: expr=[c@2 as c, a@0 as a, b@1 as b]
3301-
- DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
3302-
"
3303-
);
3304-
}
3305-
33063227
#[test]
33073228
fn test_pushdown_through_aggregate_grouping_sets_with_reordered_input() {
33083229
let scan = TestScanBuilder::new(schema()).with_support(true).build();

datafusion/physical-plan/src/aggregates/mod.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,8 @@ pub struct AggregateExec {
640640
limit_options: Option<LimitOptions>,
641641
/// Input plan, could be a partial aggregate or the input to the aggregate
642642
pub input: Arc<dyn ExecutionPlan>,
643-
/// Schema after the aggregate is applied
643+
/// Schema after the aggregate is applied. Contains the group by columns followed by the
644+
/// aggregate outputs.
644645
schema: SchemaRef,
645646
/// Input schema before any aggregation is applied. For partial aggregate this will be the
646647
/// same as input.schema() but for the final aggregate it will be the same as the input
@@ -1473,9 +1474,12 @@ impl ExecutionPlan for AggregateExec {
14731474
// This optimization is NOT safe for filters on aggregated columns (like filtering on
14741475
// the result of SUM or COUNT), as those require computing all groups first.
14751476

1476-
// Build grouping columns using output indices because parent filters reference the AggregateExec's output schema where grouping
1477-
// columns in the output schema. The grouping expressions reference
1478-
// input columns which may not match the output schema.
1477+
// Build grouping columns using output indices because parent filters reference the
1478+
// AggregateExec's output schema, in which grouping columns appear first. The
1479+
// grouping expressions reference input columns which may not match the output schema.
1480+
//
1481+
// It is safe to assume that the output_schema contains group by columns in the same order
1482+
// as the group by expression. See [`create_schema`] and [`AggregateExec`].
14791483
let output_schema = self.schema();
14801484
let grouping_columns: HashSet<_> = (0..self.group_by.expr().len())
14811485
.map(|i| Column::new(output_schema.field(i).name(), i))
@@ -1599,6 +1603,8 @@ impl ExecutionPlan for AggregateExec {
15991603
}
16001604
}
16011605

1606+
/// Creates the output schema for an [`AggregateExec`] containing the group by columns followed
1607+
/// by the aggregate columns.
16021608
fn create_schema(
16031609
input_schema: &Schema,
16041610
group_by: &PhysicalGroupBy,

datafusion/sqllogictest/test_files/push_down_filter_regression.slt

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,87 @@ physical_plan
185185
04)------AggregateExec: mode=Partial, gby=[(NULL as id), (id@0 as id)], aggr=[max(agg_dyn_test.id)]
186186
05)--------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-01/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-02/j5fUeSDQo22oPyPU.parquet], [WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-03/j5fUeSDQo22oPyPU.parquet, WORKSPACE_ROOT/datafusion/core/tests/data/test_statistics_per_partition/date=2025-03-04/j5fUeSDQo22oPyPU.parquet]]}, projection=[id], file_type=parquet, predicate=id@0 < 10, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 < 10, required_guarantees=[]
187187

188+
statement ok
189+
set datafusion.execution.target_partitions = 1;
190+
191+
# Regression test for https://github.com/apache/datafusion/issues/21065
192+
# Ensure filter pushdown through AggregateExec still works when a ProjectionExec
193+
# reorders aggregate input columns.
194+
statement ok
195+
create external table agg_reordered_pushdown stored as parquet location '../../parquet-testing/data/alltypes_plain.parquet';
196+
197+
# Filter on grouping column b should push below the aggregate even though the
198+
# aggregate input is reordered to (c, a, b).
199+
query TT
200+
explain
201+
select a, b, cnt
202+
from (
203+
select a, b, count(c) as cnt
204+
from (
205+
select id as c, cast(string_col as varchar) as a, bool_col as b
206+
from agg_reordered_pushdown
207+
) t
208+
group by a, b
209+
) q
210+
where b = true;
211+
----
212+
physical_plan
213+
01)ProjectionExec: expr=[a@0 as a, b@1 as b, count(t.c)@2 as cnt]
214+
02)--AggregateExec: mode=Single, gby=[a@1 as a, b@2 as b], aggr=[count(t.c)]
215+
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id@0 as c, CAST(string_col@9 AS Utf8View) as a, bool_col@1 as b], file_type=parquet, predicate=bool_col@1, pruning_predicate=bool_col_min@0 OR bool_col_max@1, required_guarantees=[]
216+
217+
query TBI rowsort
218+
select a, b, cnt
219+
from (
220+
select a, b, count(c) as cnt
221+
from (
222+
select id as c, cast(string_col as varchar) as a, bool_col as b
223+
from agg_reordered_pushdown
224+
) t
225+
group by a, b
226+
) q
227+
where b = true;
228+
----
229+
0 true 4
230+
231+
# Filter on aggregate output must remain above the aggregate.
232+
query TT
233+
explain
234+
select a, b, cnt
235+
from (
236+
select a, b, count(c) as cnt
237+
from (
238+
select id as c, cast(string_col as varchar) as a, bool_col as b
239+
from agg_reordered_pushdown
240+
) t
241+
group by a, b
242+
) q
243+
where cnt > 1;
244+
----
245+
physical_plan
246+
01)ProjectionExec: expr=[a@0 as a, b@1 as b, count(t.c)@2 as cnt]
247+
02)--FilterExec: count(t.c)@2 > 1
248+
03)----AggregateExec: mode=Single, gby=[a@1 as a, b@2 as b], aggr=[count(t.c)]
249+
04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id@0 as c, CAST(string_col@9 AS Utf8View) as a, bool_col@1 as b], file_type=parquet
250+
251+
query TBI rowsort
252+
select a, b, cnt
253+
from (
254+
select a, b, count(c) as cnt
255+
from (
256+
select id as c, cast(string_col as varchar) as a, bool_col as b
257+
from agg_reordered_pushdown
258+
) t
259+
group by a, b
260+
) q
261+
where cnt > 1;
262+
----
263+
0 true 4
264+
1 false 4
265+
266+
statement ok
267+
drop table agg_reordered_pushdown;
268+
188269
statement ok
189270
drop table agg_dyn_test;
190271

0 commit comments

Comments
 (0)