-
Notifications
You must be signed in to change notification settings - Fork 710
Databricks: Add support for OPTIMIZE, PARTITIONED BY, and STRUCT
#2170
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
839fd80
cb1cb49
eea0d0b
aa54943
b00781c
414fc39
0c63e73
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -4654,22 +4654,34 @@ pub enum Statement { | |||||||
| /// Legacy copy-style options. | ||||||||
| options: Vec<CopyLegacyOption>, | ||||||||
| }, | ||||||||
| /// ClickHouse: | ||||||||
| /// ```sql | ||||||||
| /// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]] | ||||||||
| /// ``` | ||||||||
| /// | ||||||||
| /// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize> | ||||||||
| /// | ||||||||
| /// Databricks: | ||||||||
| /// ```sql | ||||||||
| /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])] | ||||||||
| /// ``` | ||||||||
| /// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html> | ||||||||
| OptimizeTable { | ||||||||
| /// Table name to optimize. | ||||||||
| name: ObjectName, | ||||||||
| /// Optional cluster identifier. | ||||||||
| /// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`). | ||||||||
| has_table_keyword: bool, | ||||||||
| /// Optional cluster identifier (ClickHouse). | ||||||||
| on_cluster: Option<Ident>, | ||||||||
| /// Optional partition spec. | ||||||||
| /// Optional partition spec (ClickHouse). | ||||||||
| partition: Option<Partition>, | ||||||||
| /// Whether `FINAL` was specified. | ||||||||
| /// Whether `FINAL` was specified (ClickHouse). | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| include_final: bool, | ||||||||
| /// Optional deduplication settings. | ||||||||
| /// Optional deduplication settings (ClickHouse). | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| deduplicate: Option<Deduplicate>, | ||||||||
| /// Optional WHERE predicate (Databricks). | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| predicate: Option<Expr>, | ||||||||
| /// Optional ZORDER BY columns (Databricks). | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| zorder: Option<Vec<Expr>>, | ||||||||
| }, | ||||||||
| /// ```sql | ||||||||
| /// LISTEN | ||||||||
|
|
@@ -6243,12 +6255,19 @@ impl fmt::Display for Statement { | |||||||
| } | ||||||||
| Statement::OptimizeTable { | ||||||||
| name, | ||||||||
| has_table_keyword, | ||||||||
| on_cluster, | ||||||||
| partition, | ||||||||
| include_final, | ||||||||
| deduplicate, | ||||||||
| predicate, | ||||||||
| zorder, | ||||||||
| } => { | ||||||||
| write!(f, "OPTIMIZE TABLE {name}")?; | ||||||||
| write!(f, "OPTIMIZE")?; | ||||||||
| if *has_table_keyword { | ||||||||
| write!(f, " TABLE")?; | ||||||||
| } | ||||||||
| write!(f, " {name}")?; | ||||||||
| if let Some(on_cluster) = on_cluster { | ||||||||
| write!(f, " ON CLUSTER {on_cluster}")?; | ||||||||
| } | ||||||||
|
|
@@ -6261,6 +6280,12 @@ impl fmt::Display for Statement { | |||||||
| if let Some(deduplicate) = deduplicate { | ||||||||
| write!(f, " {deduplicate}")?; | ||||||||
| } | ||||||||
| if let Some(predicate) = predicate { | ||||||||
| write!(f, " WHERE {predicate}")?; | ||||||||
| } | ||||||||
| if let Some(zorder) = zorder { | ||||||||
| write!(f, " ZORDER BY ({})", display_comma_separated(zorder))?; | ||||||||
| } | ||||||||
| Ok(()) | ||||||||
| } | ||||||||
| Statement::LISTEN { channel } => { | ||||||||
|
|
||||||||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -697,8 +697,10 @@ impl<'a> Parser<'a> { | |||||||
| self.parse_install() | ||||||||
| } | ||||||||
| Keyword::LOAD => self.parse_load(), | ||||||||
| // `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/ | ||||||||
| Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | GenericDialect) => { | ||||||||
| // `OPTIMIZE` is clickhouse/databricks specific | ||||||||
| // ClickHouse: https://clickhouse.tech/docs/en/sql-reference/statements/optimize/ | ||||||||
| // Databricks: https://docs.databricks.com/en/sql/language-manual/delta-optimize.html | ||||||||
| Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | DatabricksDialect | GenericDialect) => { | ||||||||
| self.parse_optimize_table() | ||||||||
| } | ||||||||
| // `COMMENT` is snowflake specific https://docs.snowflake.com/en/sql-reference/sql/comment | ||||||||
|
|
@@ -3332,25 +3334,35 @@ impl<'a> Parser<'a> { | |||||||
| /// Syntax: | ||||||||
| /// | ||||||||
| /// ```sql | ||||||||
| /// -- BigQuery style | ||||||||
| /// [field_name] field_type | ||||||||
| /// -- Databricks/Hive style (colon separator) | ||||||||
| /// field_name: field_type | ||||||||
| /// ``` | ||||||||
| /// | ||||||||
| /// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type | ||||||||
| /// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple | ||||||||
| /// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html | ||||||||
| fn parse_struct_field_def( | ||||||||
| &mut self, | ||||||||
| ) -> Result<(StructField, MatchedTrailingBracket), ParserError> { | ||||||||
| // Look beyond the next item to infer whether both field name | ||||||||
| // and type are specified. | ||||||||
| let is_anonymous_field = !matches!( | ||||||||
| // Supports both: | ||||||||
| // - `field_name field_type` (BigQuery style) | ||||||||
| // - `field_name: field_type` (Databricks/Hive style) | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Thinking we can skip the comment since there are other dialects that are affected, and the code is explanatory in this case |
||||||||
| let is_named_field = matches!( | ||||||||
| (self.peek_nth_token(0).token, self.peek_nth_token(1).token), | ||||||||
| (Token::Word(_), Token::Word(_)) | ||||||||
| (Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon) | ||||||||
| ); | ||||||||
|
|
||||||||
| let field_name = if is_anonymous_field { | ||||||||
| None | ||||||||
| let field_name = if is_named_field { | ||||||||
| let name = self.parse_identifier()?; | ||||||||
| // Consume optional colon separator (Databricks/Hive style) | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| let _ = self.consume_token(&Token::Colon); | ||||||||
| Some(name) | ||||||||
| } else { | ||||||||
| Some(self.parse_identifier()?) | ||||||||
| None | ||||||||
| }; | ||||||||
|
|
||||||||
| let (field_type, trailing_bracket) = self.parse_data_type_helper()?; | ||||||||
|
|
@@ -7878,14 +7890,41 @@ impl<'a> Parser<'a> { | |||||||
| pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> { | ||||||||
| if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) { | ||||||||
| self.expect_token(&Token::LParen)?; | ||||||||
| let columns = self.parse_comma_separated(Parser::parse_column_def)?; | ||||||||
| let columns = self.parse_comma_separated(Parser::parse_column_def_for_partition)?; | ||||||||
| self.expect_token(&Token::RParen)?; | ||||||||
| Ok(HiveDistributionStyle::PARTITIONED { columns }) | ||||||||
| } else { | ||||||||
| Ok(HiveDistributionStyle::NONE) | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| /// Parse column definition for PARTITIONED BY clause. | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
| /// | ||||||||
| /// Databricks allows partition columns without types when referencing | ||||||||
| /// columns already defined in the table specification: | ||||||||
| /// ```sql | ||||||||
| /// CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1) | ||||||||
| /// CREATE TABLE t (col1 STRING) PARTITIONED BY (col2 INT) | ||||||||
| /// ``` | ||||||||
| /// | ||||||||
| /// See [Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html) | ||||||||
| fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef, ParserError> { | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the reason for this rewrite to support an optional datatype? If so, I think it would make sense to introduce a new type that reflects this instead of reusing the struct: `struct PartitionedByColumnDef { name, data_type: Option<DataType> }` — wdyt?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh I see — they're indeed the same cases. It would seem that, for this PR though, this new function seems to duplicate `parse_column_def`: pub fn parse_column_def(&mut self) -> Result<ColumnDef, ParserError> {
parse_column_def_inner(true)
}
fn parse_column_def_inner(&mut self) -> Result<ColumnDef, ParserError> {
// ...
let data_type = if self.is_column_type_sqlite_unspecified() {
DataType::Unspecified
} else {
self
.maybe_parse(|parser| parser.parse_data_type())?
.unwrap_or(DataType::Unspecified)
};
// ...
}
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Makes sense! I've extracted the shared logic into
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @iffyio could you please review? 👀 |
||||||||
| let name = self.parse_identifier()?; | ||||||||
|
|
||||||||
| // Check if the next token indicates there's no type specified | ||||||||
| // (comma or closing paren means end of this column definition) | ||||||||
| let data_type = match self.peek_token().token { | ||||||||
| Token::Comma | Token::RParen => DataType::Unspecified, | ||||||||
| _ => self.parse_data_type()?, | ||||||||
| }; | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we can instead do something like: `let data_type = self.maybe_parse(|parser| parser.parse_data_type())?;` |
||||||||
|
|
||||||||
| Ok(ColumnDef { | ||||||||
| name, | ||||||||
| data_type, | ||||||||
| options: vec![], | ||||||||
| }) | ||||||||
| } | ||||||||
|
|
||||||||
| /// Parse Hive formats. | ||||||||
| pub fn parse_hive_formats(&mut self) -> Result<Option<HiveFormat>, ParserError> { | ||||||||
| let mut hive_format: Option<HiveFormat> = None; | ||||||||
|
|
@@ -11781,7 +11820,8 @@ impl<'a> Parser<'a> { | |||||||
| let field_defs = self.parse_duckdb_struct_type_def()?; | ||||||||
| Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses)) | ||||||||
| } | ||||||||
| Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => { | ||||||||
| Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) => | ||||||||
| { | ||||||||
| self.prev_token(); | ||||||||
| let (field_defs, _trailing_bracket) = | ||||||||
| self.parse_struct_type_def(Self::parse_struct_field_def)?; | ||||||||
|
|
@@ -18204,13 +18244,24 @@ impl<'a> Parser<'a> { | |||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| /// ClickHouse: | ||||||||
| /// ```sql | ||||||||
| /// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]] | ||||||||
| /// ``` | ||||||||
| /// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize) | ||||||||
| /// | ||||||||
| /// Databricks: | ||||||||
| /// ```sql | ||||||||
| /// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])] | ||||||||
| /// ``` | ||||||||
| /// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html) | ||||||||
| pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> { | ||||||||
| self.expect_keyword_is(Keyword::TABLE)?; | ||||||||
| // Check for TABLE keyword (ClickHouse uses it, Databricks does not) | ||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
Thinking we can skip the comment so that it doesn't become stale/incomplete if other dialects support the feature |
||||||||
| let has_table_keyword = self.parse_keyword(Keyword::TABLE); | ||||||||
|
|
||||||||
| let name = self.parse_object_name(false)?; | ||||||||
|
|
||||||||
| // ClickHouse-specific options | ||||||||
| let on_cluster = self.parse_optional_on_cluster()?; | ||||||||
|
|
||||||||
| let partition = if self.parse_keyword(Keyword::PARTITION) { | ||||||||
|
|
@@ -18224,6 +18275,7 @@ impl<'a> Parser<'a> { | |||||||
| }; | ||||||||
|
|
||||||||
| let include_final = self.parse_keyword(Keyword::FINAL); | ||||||||
|
|
||||||||
| let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) { | ||||||||
| if self.parse_keyword(Keyword::BY) { | ||||||||
| Some(Deduplicate::ByExpression(self.parse_expr()?)) | ||||||||
|
|
@@ -18234,12 +18286,31 @@ impl<'a> Parser<'a> { | |||||||
| None | ||||||||
| }; | ||||||||
|
|
||||||||
| // Databricks-specific options | ||||||||
| let predicate = if self.parse_keyword(Keyword::WHERE) { | ||||||||
| Some(self.parse_expr()?) | ||||||||
| } else { | ||||||||
| None | ||||||||
| }; | ||||||||
|
|
||||||||
| let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) { | ||||||||
| self.expect_token(&Token::LParen)?; | ||||||||
| let columns = self.parse_comma_separated(|p| p.parse_expr())?; | ||||||||
| self.expect_token(&Token::RParen)?; | ||||||||
| Some(columns) | ||||||||
| } else { | ||||||||
| None | ||||||||
| }; | ||||||||
|
|
||||||||
| Ok(Statement::OptimizeTable { | ||||||||
| name, | ||||||||
| has_table_keyword, | ||||||||
| on_cluster, | ||||||||
| partition, | ||||||||
| include_final, | ||||||||
| deduplicate, | ||||||||
| predicate, | ||||||||
| zorder, | ||||||||
| }) | ||||||||
| } | ||||||||
|
|
||||||||
|
|
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.