Skip to content

Commit 839fd80

Browse files
committed
Databricks: Add support for OPTIMIZE statement
Add support for the Databricks Delta Lake OPTIMIZE statement syntax: `OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col1, ...)]`. This extends the existing OptimizeTable AST to support both ClickHouse and Databricks syntax by adding three fields: `has_table_keyword`, which distinguishes `OPTIMIZE TABLE` (ClickHouse) from `OPTIMIZE` (Databricks); `predicate`, an optional WHERE clause for partition filtering; and `zorder`, an optional ZORDER BY clause for data colocation.
1 parent 6060a11 commit 839fd80

3 files changed

Lines changed: 134 additions & 9 deletions

File tree

src/ast/mod.rs

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4654,22 +4654,34 @@ pub enum Statement {
46544654
/// Legacy copy-style options.
46554655
options: Vec<CopyLegacyOption>,
46564656
},
4657+
/// ClickHouse:
46574658
/// ```sql
46584659
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
46594660
/// ```
4660-
///
46614661
/// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize>
4662+
///
4663+
/// Databricks:
4664+
/// ```sql
4665+
/// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
4666+
/// ```
4667+
/// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
46624668
OptimizeTable {
46634669
/// Table name to optimize.
46644670
name: ObjectName,
4665-
/// Optional cluster identifier.
4671+
/// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`).
4672+
has_table_keyword: bool,
4673+
/// Optional cluster identifier (ClickHouse).
46664674
on_cluster: Option<Ident>,
4667-
/// Optional partition spec.
4675+
/// Optional partition spec (ClickHouse).
46684676
partition: Option<Partition>,
4669-
/// Whether `FINAL` was specified.
4677+
/// Whether `FINAL` was specified (ClickHouse).
46704678
include_final: bool,
4671-
/// Optional deduplication settings.
4679+
/// Optional deduplication settings (ClickHouse).
46724680
deduplicate: Option<Deduplicate>,
4681+
/// Optional WHERE predicate (Databricks).
4682+
predicate: Option<Expr>,
4683+
/// Optional ZORDER BY columns (Databricks).
4684+
zorder: Option<Vec<Expr>>,
46734685
},
46744686
/// ```sql
46754687
/// LISTEN
@@ -6243,12 +6255,19 @@ impl fmt::Display for Statement {
62436255
}
62446256
Statement::OptimizeTable {
62456257
name,
6258+
has_table_keyword,
62466259
on_cluster,
62476260
partition,
62486261
include_final,
62496262
deduplicate,
6263+
predicate,
6264+
zorder,
62506265
} => {
6251-
write!(f, "OPTIMIZE TABLE {name}")?;
6266+
write!(f, "OPTIMIZE")?;
6267+
if *has_table_keyword {
6268+
write!(f, " TABLE")?;
6269+
}
6270+
write!(f, " {name}")?;
62526271
if let Some(on_cluster) = on_cluster {
62536272
write!(f, " ON CLUSTER {on_cluster}")?;
62546273
}
@@ -6261,6 +6280,12 @@ impl fmt::Display for Statement {
62616280
if let Some(deduplicate) = deduplicate {
62626281
write!(f, " {deduplicate}")?;
62636282
}
6283+
if let Some(predicate) = predicate {
6284+
write!(f, " WHERE {predicate}")?;
6285+
}
6286+
if let Some(zorder) = zorder {
6287+
write!(f, " ZORDER BY ({})", display_comma_separated(zorder))?;
6288+
}
62646289
Ok(())
62656290
}
62666291
Statement::LISTEN { channel } => {

src/parser/mod.rs

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -697,8 +697,10 @@ impl<'a> Parser<'a> {
697697
self.parse_install()
698698
}
699699
Keyword::LOAD => self.parse_load(),
700-
// `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
701-
Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | GenericDialect) => {
700+
// `OPTIMIZE` is clickhouse/databricks specific
701+
// ClickHouse: https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
702+
// Databricks: https://docs.databricks.com/en/sql/language-manual/delta-optimize.html
703+
Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | DatabricksDialect | GenericDialect) => {
702704
self.parse_optimize_table()
703705
}
704706
// `COMMENT` is snowflake specific https://docs.snowflake.com/en/sql-reference/sql/comment
@@ -18204,13 +18206,24 @@ impl<'a> Parser<'a> {
1820418206
}
1820518207
}
1820618208

18209+
/// ClickHouse:
1820718210
/// ```sql
1820818211
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
1820918212
/// ```
1821018213
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
18214+
///
18215+
/// Databricks:
18216+
/// ```sql
18217+
/// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
18218+
/// ```
18219+
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
1821118220
pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
18212-
self.expect_keyword_is(Keyword::TABLE)?;
18221+
// Check for TABLE keyword (ClickHouse uses it, Databricks does not)
18222+
let has_table_keyword = self.parse_keyword(Keyword::TABLE);
18223+
1821318224
let name = self.parse_object_name(false)?;
18225+
18226+
// ClickHouse-specific options
1821418227
let on_cluster = self.parse_optional_on_cluster()?;
1821518228

1821618229
let partition = if self.parse_keyword(Keyword::PARTITION) {
@@ -18224,6 +18237,7 @@ impl<'a> Parser<'a> {
1822418237
};
1822518238

1822618239
let include_final = self.parse_keyword(Keyword::FINAL);
18240+
1822718241
let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) {
1822818242
if self.parse_keyword(Keyword::BY) {
1822918243
Some(Deduplicate::ByExpression(self.parse_expr()?))
@@ -18234,12 +18248,31 @@ impl<'a> Parser<'a> {
1823418248
None
1823518249
};
1823618250

18251+
// Databricks-specific options
18252+
let predicate = if self.parse_keyword(Keyword::WHERE) {
18253+
Some(self.parse_expr()?)
18254+
} else {
18255+
None
18256+
};
18257+
18258+
let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) {
18259+
self.expect_token(&Token::LParen)?;
18260+
let columns = self.parse_comma_separated(|p| p.parse_expr())?;
18261+
self.expect_token(&Token::RParen)?;
18262+
Some(columns)
18263+
} else {
18264+
None
18265+
};
18266+
1823718267
Ok(Statement::OptimizeTable {
1823818268
name,
18269+
has_table_keyword,
1823918270
on_cluster,
1824018271
partition,
1824118272
include_final,
1824218273
deduplicate,
18274+
predicate,
18275+
zorder,
1824318276
})
1824418277
}
1824518278

tests/sqlparser_databricks.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,3 +389,70 @@ fn parse_table_time_travel() {
389389
.parse_sql_statements("SELECT 1 FROM t1 VERSION AS OF 1 - 2",)
390390
.is_err())
391391
}
392+
393+
#[test]
394+
fn parse_optimize_table() {
395+
// Basic OPTIMIZE (Databricks style - no TABLE keyword)
396+
databricks().verified_stmt("OPTIMIZE my_table");
397+
databricks().verified_stmt("OPTIMIZE db.my_table");
398+
databricks().verified_stmt("OPTIMIZE catalog.db.my_table");
399+
400+
// With WHERE clause
401+
databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01'");
402+
databricks().verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' AND date < '2023-02-01'");
403+
404+
// With ZORDER BY clause
405+
databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1)");
406+
databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2)");
407+
databricks().verified_stmt("OPTIMIZE my_table ZORDER BY (col1, col2, col3)");
408+
409+
// Combined WHERE and ZORDER BY
410+
databricks().verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1)");
411+
databricks().verified_stmt("OPTIMIZE my_table WHERE date >= '2023-01-01' ZORDER BY (col1, col2)");
412+
413+
// Verify AST structure
414+
match databricks()
415+
.verified_stmt("OPTIMIZE my_table WHERE date = '2023-01-01' ZORDER BY (col1, col2)")
416+
{
417+
Statement::OptimizeTable {
418+
name,
419+
has_table_keyword,
420+
on_cluster,
421+
partition,
422+
include_final,
423+
deduplicate,
424+
predicate,
425+
zorder,
426+
} => {
427+
assert_eq!(name.to_string(), "my_table");
428+
assert!(!has_table_keyword);
429+
assert!(on_cluster.is_none());
430+
assert!(partition.is_none());
431+
assert!(!include_final);
432+
assert!(deduplicate.is_none());
433+
assert!(predicate.is_some());
434+
assert_eq!(
435+
zorder,
436+
Some(vec![
437+
Expr::Identifier(Ident::new("col1")),
438+
Expr::Identifier(Ident::new("col2")),
439+
])
440+
);
441+
}
442+
_ => unreachable!(),
443+
}
444+
445+
// Negative cases
446+
assert_eq!(
447+
databricks()
448+
.parse_sql_statements("OPTIMIZE my_table ZORDER BY")
449+
.unwrap_err(),
450+
ParserError::ParserError("Expected: (, found: EOF".to_string())
451+
);
452+
assert_eq!(
453+
databricks()
454+
.parse_sql_statements("OPTIMIZE my_table ZORDER BY ()")
455+
.unwrap_err(),
456+
ParserError::ParserError("Expected: an expression, found: )".to_string())
457+
);
458+
}

0 commit comments

Comments (0)