Skip to content

Commit 7731130

Browse files
authored
feat: add a config to disable subquery_sort_elimination (#21614)
## Which issue does this PR close?

- Closes #15886

## Rationale for this change

- See #15886 (comment)

## What changes are included in this PR?

- Added a new SQL parser config option: `datafusion.sql_parser.enable_subquery_sort_elimination`
- Added a `SessionConfig` builder method: `with_enable_subquery_sort_elimination(...)`
- Updated SQL relation planning so that subquery/CTE `ORDER BY` elimination only happens when this config option is enabled
- Kept the current behavior (elimination enabled) as the default to avoid changing existing behavior unexpectedly

## Are these changes tested?

Yes. Added SQL integration tests that verify:

- subquery `ORDER BY` is removed by default
- subquery `ORDER BY` is preserved when `enable_subquery_sort_elimination` is set to `false`

Also added an slt (sqllogictest) test case covering the disabled setting.

## Are there any user-facing changes?

No change to default behavior. The new config option and the `SessionConfig` builder method are additive, and users must explicitly set the option to `false` to preserve subquery ordering.
1 parent 44625fb commit 7731130

7 files changed

Lines changed: 123 additions & 4 deletions

File tree

datafusion/common/src/config.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,15 @@ config_namespace! {
311311
/// By default, `nulls_max` is used to follow Postgres's behavior.
312312
/// postgres rule: <https://www.postgresql.org/docs/current/queries-order.html>
313313
pub default_null_ordering: String, default = "nulls_max".to_string()
314+
315+
/// When set to true, DataFusion may remove `ORDER BY` clauses from
316+
/// subqueries or CTEs during SQL planning when their ordering cannot
317+
/// affect the result, such as when no `LIMIT` or other
318+
/// order-sensitive operator depends on them.
319+
///
320+
/// Disable this option to preserve explicit subquery ordering in the
321+
/// planned query.
322+
pub enable_subquery_sort_elimination: bool, default = true
314323
}
315324
}
316325

datafusion/execution/src/config.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,15 @@ impl SessionConfig {
431431
self
432432
}
433433

434+
/// Enables or disables elimination of `ORDER BY` clauses in subqueries
435+
/// when they are not required by order-sensitive operators.
436+
pub fn with_enable_subquery_sort_elimination(mut self, enabled: bool) -> Self {
437+
self.options_mut()
438+
.sql_parser
439+
.enable_subquery_sort_elimination = enabled;
440+
self
441+
}
442+
434443
/// Set the size of [`sort_spill_reservation_bytes`] to control
435444
/// memory pre-reservation
436445
///

datafusion/sql/src/relation/mod.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
9797
}
9898
};
9999

100-
let optimized_plan = optimize_subquery_sort(planned_relation.plan)?.data;
100+
let optimized_plan = optimize_subquery_sort(
101+
planned_relation.plan,
102+
self.context_provider
103+
.options()
104+
.sql_parser
105+
.enable_subquery_sort_elimination,
106+
)?
107+
.data;
101108
if let Some(alias) = planned_relation.alias {
102109
self.apply_table_alias(optimized_plan, alias)
103110
} else {
@@ -357,7 +364,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
357364
}
358365
}
359366

360-
fn optimize_subquery_sort(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
367+
fn optimize_subquery_sort(
368+
plan: LogicalPlan,
369+
enable_subquery_sort_elimination: bool,
370+
) -> Result<Transformed<LogicalPlan>> {
371+
if !enable_subquery_sort_elimination {
372+
return Ok(Transformed::no(plan));
373+
}
374+
361375
// When initializing subqueries, we examine sort options since they might be unnecessary.
362376
// They are only important if the subquery result is affected by the ORDER BY statement,
363377
// which can happen when we have:

datafusion/sql/tests/sql_integration.rs

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3478,6 +3478,13 @@ fn logical_plan(sql: &str) -> Result<LogicalPlan> {
34783478
logical_plan_with_options(sql, ParserOptions::default())
34793479
}
34803480

3481+
fn logical_plan_with_config(
3482+
sql: &str,
3483+
config_options: datafusion_common::config::ConfigOptions,
3484+
) -> Result<LogicalPlan> {
3485+
logical_plan_with_config_and_options(sql, config_options, ParserOptions::default())
3486+
}
3487+
34813488
fn logical_plan_with_options(sql: &str, options: ParserOptions) -> Result<LogicalPlan> {
34823489
let dialect = &GenericDialect {};
34833490
logical_plan_with_dialect_and_options(sql, dialect, options)
@@ -3497,7 +3504,25 @@ fn logical_plan_with_dialect_and_options(
34973504
dialect: &dyn Dialect,
34983505
options: ParserOptions,
34993506
) -> Result<LogicalPlan> {
3500-
let state = MockSessionState::default()
3507+
let state = mock_session_state();
3508+
3509+
logical_plan_from_state(sql, dialect, options, state)
3510+
}
3511+
3512+
fn logical_plan_with_config_and_options(
3513+
sql: &str,
3514+
config_options: datafusion_common::config::ConfigOptions,
3515+
options: ParserOptions,
3516+
) -> Result<LogicalPlan> {
3517+
let dialect = &GenericDialect {};
3518+
let mut state = mock_session_state();
3519+
state.config_options = config_options;
3520+
3521+
logical_plan_from_state(sql, dialect, options, state)
3522+
}
3523+
3524+
fn mock_session_state() -> MockSessionState {
3525+
MockSessionState::default()
35013526
.with_scalar_function(Arc::new(unicode::character_length().as_ref().clone()))
35023527
.with_scalar_function(Arc::new(string::concat().as_ref().clone()))
35033528
.with_scalar_function(Arc::new(make_udf(
@@ -3534,8 +3559,15 @@ fn logical_plan_with_dialect_and_options(
35343559
.with_aggregate_function(grouping_udaf())
35353560
.with_window_function(rank_udwf())
35363561
.with_window_function(row_number_udwf())
3537-
.with_expr_planner(Arc::new(CoreFunctionPlanner::default()));
3562+
.with_expr_planner(Arc::new(CoreFunctionPlanner::default()))
3563+
}
35383564

3565+
fn logical_plan_from_state(
3566+
sql: &str,
3567+
dialect: &dyn Dialect,
3568+
options: ParserOptions,
3569+
state: MockSessionState,
3570+
) -> Result<LogicalPlan> {
35393571
let context = MockContextProvider { state };
35403572
let planner = SqlToRel::new_with_options(&context, options);
35413573
let result = DFParser::parse_sql_with_dialect(sql, dialect);
@@ -3808,6 +3840,39 @@ fn in_subquery_uncorrelated() {
38083840
);
38093841
}
38103842

3843+
#[test]
3844+
fn subquery_order_by_is_eliminated_by_default() {
3845+
let sql = "SELECT x.* FROM (SELECT id FROM person ORDER BY id) x";
3846+
let plan = logical_plan(sql).unwrap();
3847+
assert_snapshot!(
3848+
plan.display_indent_schema().to_string(),
3849+
@r"
3850+
Projection: x.id [id:UInt32]
3851+
SubqueryAlias: x [id:UInt32]
3852+
Projection: person.id [id:UInt32]
3853+
TableScan: person [id:UInt32, first_name:Utf8, last_name:Utf8, age:Int32, state:Utf8, salary:Float64, birth_date:Timestamp(ns), 😀:Int32]
3854+
"
3855+
);
3856+
}
3857+
3858+
#[test]
3859+
fn subquery_order_by_can_be_preserved() {
3860+
let sql = "SELECT x.* FROM (SELECT id FROM person ORDER BY id) x";
3861+
let mut config_options = datafusion_common::config::ConfigOptions::new();
3862+
config_options.sql_parser.enable_subquery_sort_elimination = false;
3863+
let plan = logical_plan_with_config(sql, config_options).unwrap();
3864+
assert_snapshot!(
3865+
plan.display_indent_schema().to_string(),
3866+
@r"
3867+
Projection: x.id [id:UInt32]
3868+
SubqueryAlias: x [id:UInt32]
3869+
Sort: person.id ASC NULLS LAST [id:UInt32]
3870+
Projection: person.id [id:UInt32]
3871+
TableScan: person [id:UInt32, first_name:Utf8, last_name:Utf8, age:Int32, state:Utf8, salary:Float64, birth_date:Timestamp(ns), 😀:Int32]
3872+
"
3873+
);
3874+
}
3875+
38113876
#[test]
38123877
fn not_in_subquery_correlated() {
38133878
let sql = "SELECT id FROM person p WHERE id NOT IN \

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,7 @@ datafusion.sql_parser.default_null_ordering nulls_max
343343
datafusion.sql_parser.dialect generic
344344
datafusion.sql_parser.enable_ident_normalization true
345345
datafusion.sql_parser.enable_options_value_normalization false
346+
datafusion.sql_parser.enable_subquery_sort_elimination true
346347
datafusion.sql_parser.map_string_types_to_utf8view true
347348
datafusion.sql_parser.parse_float_as_decimal false
348349
datafusion.sql_parser.recursion_limit 50
@@ -489,6 +490,7 @@ datafusion.sql_parser.default_null_ordering nulls_max Specifies the default null
489490
datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks.
490491
datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
491492
datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically.
493+
datafusion.sql_parser.enable_subquery_sort_elimination true When set to true, DataFusion may remove `ORDER BY` clauses from subqueries or CTEs during SQL planning when their ordering cannot affect the result, such as when no `LIMIT` or other order-sensitive operator depends on them. Disable this option to preserve explicit subquery ordering in the planned query.
492494
datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true.
493495
datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type
494496
datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries

datafusion/sqllogictest/test_files/subquery_sort.slt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,25 @@ EXPLAIN SELECT c1 FROM (SELECT c1 FROM sink_table ORDER BY c2)
5151
logical_plan TableScan: sink_table projection=[c1]
5252
physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true
5353

54+
statement ok
55+
SET datafusion.sql_parser.enable_subquery_sort_elimination = false;
56+
57+
query TT
58+
EXPLAIN SELECT c1 FROM (SELECT c1 FROM sink_table ORDER BY c2) AS ttt
59+
----
60+
logical_plan
61+
01)SubqueryAlias: ttt
62+
02)--Projection: sink_table.c1
63+
03)----Sort: sink_table.c2 ASC NULLS LAST
64+
04)------TableScan: sink_table projection=[c1, c2]
65+
physical_plan
66+
01)ProjectionExec: expr=[c1@0 as c1]
67+
02)--SortExec: expr=[c2@1 ASC NULLS LAST], preserve_partitioning=[false]
68+
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2], file_type=csv, has_header=true
69+
70+
statement ok
71+
RESET datafusion.sql_parser.enable_subquery_sort_elimination;
72+
5473

5574
# Do not remove ordering when it's with limit
5675

docs/source/user-guide/configs.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ The following configuration settings are available:
191191
| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. |
192192
| datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries |
193193
| datafusion.sql_parser.default_null_ordering | nulls_max | Specifies the default null ordering for query results. There are 4 options: - `nulls_max`: Nulls appear last in ascending order. - `nulls_min`: Nulls appear first in ascending order. - `nulls_first`: Nulls always be first in any order. - `nulls_last`: Nulls always be last in any order. By default, `nulls_max` is used to follow Postgres's behavior. postgres rule: <https://www.postgresql.org/docs/current/queries-order.html> |
194+
| datafusion.sql_parser.enable_subquery_sort_elimination | true | When set to true, DataFusion may remove `ORDER BY` clauses from subqueries or CTEs during SQL planning when their ordering cannot affect the result, such as when no `LIMIT` or other order-sensitive operator depends on them. Disable this option to preserve explicit subquery ordering in the planned query. |
194195
| datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] |
195196
| datafusion.format.null | | Format string for nulls |
196197
| datafusion.format.date_format | %Y-%m-%d | Date format for date arrays |

0 commit comments

Comments
 (0)