Skip to content

Commit ae9ebd7

Browse files
committed
perf: skip RG reorder when sort column not in file schema
For GROUP BY + ORDER BY queries, the TopK sort column is an aggregate output (e.g. COUNT(*)) that doesn't exist in the parquet file schema. Previously we still created a ReorderByStatistics optimizer, which then tried to look up the column in the statistics — wasted work. Now we check that the column exists in the file schema before creating the optimizer. This eliminates the overhead for non-scan-level TopK queries (ClickBench Q40-Q42 regression fix).
1 parent a269ffd commit ae9ebd7

1 file changed

Lines changed: 27 additions & 15 deletions

File tree

datafusion/datasource-parquet/src/opener.rs

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,23 +1197,35 @@ impl RowGroupsPrunedParquetOpen {
11971197
&& sort_options.len() == 1
11981198
{
11991199
// Build a sort order from DynamicFilter for non-sort-pushdown TopK.
1200-
// Always ASC — reverse handles DESC separately.
1200+
// Quick bail: check if the sort column exists in file schema.
1201+
// For GROUP BY + ORDER BY, the sort column is an aggregate output
1202+
// (not in parquet) — skip to avoid wasted StatisticsConverter work.
12011203
let children = df.children();
12021204
if !children.is_empty() {
1203-
let sort_expr =
1204-
datafusion_physical_expr_common::sort_expr::PhysicalSortExpr {
1205-
expr: Arc::clone(children[0]),
1206-
options: arrow::compute::SortOptions {
1207-
descending: false,
1208-
nulls_first: sort_options[0].nulls_first,
1209-
},
1210-
};
1211-
LexOrdering::new(vec![sort_expr]).map(|order| {
1212-
Box::new(crate::access_plan_optimizer::ReorderByStatistics::new(
1213-
order,
1214-
))
1215-
as Box<dyn crate::access_plan_optimizer::AccessPlanOptimizer>
1216-
})
1205+
let col = find_column_in_expr(children[0]);
1206+
if let Some(ref c) = col
1207+
&& prepared
1208+
.physical_file_schema
1209+
.field_with_name(c.name())
1210+
.is_ok()
1211+
{
1212+
let sort_expr =
1213+
datafusion_physical_expr_common::sort_expr::PhysicalSortExpr {
1214+
expr: Arc::clone(children[0]),
1215+
options: arrow::compute::SortOptions {
1216+
descending: false,
1217+
nulls_first: sort_options[0].nulls_first,
1218+
},
1219+
};
1220+
LexOrdering::new(vec![sort_expr]).map(|order| {
1221+
Box::new(crate::access_plan_optimizer::ReorderByStatistics::new(
1222+
order,
1223+
))
1224+
as Box<dyn crate::access_plan_optimizer::AccessPlanOptimizer>
1225+
})
1226+
} else {
1227+
None
1228+
}
12171229
} else {
12181230
None
12191231
}

0 commit comments

Comments
 (0)