Merged
Changes from 4 commits
37 changes: 31 additions & 6 deletions src/ast/mod.rs
@@ -4654,22 +4654,34 @@ pub enum Statement {
/// Legacy copy-style options.
options: Vec<CopyLegacyOption>,
},
/// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
///
/// See ClickHouse <https://clickhouse.com/docs/en/sql-reference/statements/optimize>
///
/// Databricks:
/// ```sql
/// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
/// ```
/// See Databricks <https://docs.databricks.com/en/sql/language-manual/delta-optimize.html>
OptimizeTable {
/// Table name to optimize.
name: ObjectName,
/// Optional cluster identifier.
/// Whether the `TABLE` keyword was present (ClickHouse uses `OPTIMIZE TABLE`, Databricks uses `OPTIMIZE`).
has_table_keyword: bool,
/// Optional cluster identifier (ClickHouse).
Contributor

Suggested change
/// Optional cluster identifier (ClickHouse).
/// Optional cluster identifier.
/// [Clickhouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)

on_cluster: Option<Ident>,
/// Optional partition spec.
/// Optional partition spec (ClickHouse).
partition: Option<Partition>,
/// Whether `FINAL` was specified.
/// Whether `FINAL` was specified (ClickHouse).
Contributor

Suggested change
/// Whether `FINAL` was specified (ClickHouse).
/// Whether `FINAL` was specified.
/// [Clickhouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)

include_final: bool,
/// Optional deduplication settings.
/// Optional deduplication settings (ClickHouse).
Contributor

Suggested change
/// Optional deduplication settings (ClickHouse).
/// Optional deduplication settings.
/// [Clickhouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)

deduplicate: Option<Deduplicate>,
/// Optional WHERE predicate (Databricks).
Contributor

Suggested change
/// Optional WHERE predicate (Databricks).
/// Optional WHERE predicate.
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)

predicate: Option<Expr>,
/// Optional ZORDER BY columns (Databricks).
Contributor

Suggested change
/// Optional ZORDER BY columns (Databricks).
/// Optional ZORDER BY columns.
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)

zorder: Option<Vec<Expr>>,
},
/// ```sql
/// LISTEN
@@ -6243,12 +6255,19 @@ impl fmt::Display for Statement {
}
Statement::OptimizeTable {
name,
has_table_keyword,
on_cluster,
partition,
include_final,
deduplicate,
predicate,
zorder,
} => {
write!(f, "OPTIMIZE TABLE {name}")?;
write!(f, "OPTIMIZE")?;
if *has_table_keyword {
write!(f, " TABLE")?;
}
write!(f, " {name}")?;
if let Some(on_cluster) = on_cluster {
write!(f, " ON CLUSTER {on_cluster}")?;
}
@@ -6261,6 +6280,12 @@
if let Some(deduplicate) = deduplicate {
write!(f, " {deduplicate}")?;
}
if let Some(predicate) = predicate {
write!(f, " WHERE {predicate}")?;
}
if let Some(zorder) = zorder {
write!(f, " ZORDER BY ({})", display_comma_separated(zorder))?;
}
Ok(())
}
Statement::LISTEN { channel } => {
91 changes: 81 additions & 10 deletions src/parser/mod.rs
@@ -697,8 +697,10 @@ impl<'a> Parser<'a> {
self.parse_install()
}
Keyword::LOAD => self.parse_load(),
// `OPTIMIZE` is clickhouse specific https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | GenericDialect) => {
// `OPTIMIZE` is clickhouse/databricks specific
// ClickHouse: https://clickhouse.tech/docs/en/sql-reference/statements/optimize/
// Databricks: https://docs.databricks.com/en/sql/language-manual/delta-optimize.html
Keyword::OPTIMIZE if dialect_of!(self is ClickHouseDialect | DatabricksDialect | GenericDialect) => {
self.parse_optimize_table()
}
// `COMMENT` is snowflake specific https://docs.snowflake.com/en/sql-reference/sql/comment
@@ -3332,25 +3334,35 @@ impl<'a> Parser<'a> {
/// Syntax:
///
/// ```sql
/// -- BigQuery style
/// [field_name] field_type
/// -- Databricks/Hive style (colon separator)
/// field_name: field_type
/// ```
///
/// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
/// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
/// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
fn parse_struct_field_def(
&mut self,
) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
// Look beyond the next item to infer whether both field name
// and type are specified.
let is_anonymous_field = !matches!(
// Supports both:
// - `field_name field_type` (BigQuery style)
// - `field_name: field_type` (Databricks/Hive style)
Contributor

Suggested change
// Supports both:
// - `field_name field_type` (BigQuery style)
// - `field_name: field_type` (Databricks/Hive style)

thinking we can skip the comment since there are other dialects that are affected, and the code is self-explanatory in this case

let is_named_field = matches!(
(self.peek_nth_token(0).token, self.peek_nth_token(1).token),
(Token::Word(_), Token::Word(_))
(Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
);

let field_name = if is_anonymous_field {
None
let field_name = if is_named_field {
let name = self.parse_identifier()?;
// Consume optional colon separator (Databricks/Hive style)
Contributor

Suggested change
// Consume optional colon separator (Databricks/Hive style)

let _ = self.consume_token(&Token::Colon);
Some(name)
} else {
Some(self.parse_identifier()?)
None
};

let (field_type, trailing_bracket) = self.parse_data_type_helper()?;
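
For illustration, a minimal sketch of what this hunk enables, written against the crate's public API and assuming the `STRUCT` dialect change later in this diff; the SQL string is illustrative and the printed canonical form is not asserted here:

```rust
// Sketch: Databricks/Hive-style colon separators between struct field
// names and types now parse, alongside the BigQuery space-separated style.
use sqlparser::dialect::DatabricksDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "SELECT CAST(NULL AS STRUCT<name: STRING, age: INT>)";
    let stmts = Parser::parse_sql(&DatabricksDialect {}, sql).expect("should parse");
    // The parsed statement displays in the parser's canonical form.
    println!("{}", stmts[0]);
}
```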
@@ -7878,14 +7890,41 @@ impl<'a> Parser<'a> {
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
let columns = self.parse_comma_separated(Parser::parse_column_def)?;
let columns = self.parse_comma_separated(Parser::parse_column_def_for_partition)?;
self.expect_token(&Token::RParen)?;
Ok(HiveDistributionStyle::PARTITIONED { columns })
} else {
Ok(HiveDistributionStyle::NONE)
}
}

/// Parse column definition for PARTITIONED BY clause.
Contributor

Suggested change
/// Parse column definition for PARTITIONED BY clause.
/// Parse column definition for `PARTITIONED BY` clause.

///
/// Databricks allows partition columns without types when referencing
/// columns already defined in the table specification:
/// ```sql
/// CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)
/// CREATE TABLE t (col1 STRING) PARTITIONED BY (col2 INT)
/// ```
///
/// See [Databricks](https://docs.databricks.com/en/sql/language-manual/sql-ref-partition.html)
fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef, ParserError> {
Contributor

Suggested change
fn parse_column_def_for_partition(&mut self) -> Result<ColumnDef, ParserError> {
fn parse_partitioned_by_column_def(&mut self) -> Result<ColumnDef, ParserError> {

Contributor

Is the reason for this rewrite to support an optional datatype? If so, I think it would make sense to introduce a new type that reflects this instead of reusing the Unspecified datatype, which has a different meaning, e.g. have this function return:

struct PartitionedByColumnDef { name: Ident, data_type: Option<DataType> }

wdyt?

Contributor Author

Since DataType::Unspecified already exists and is used by SQLite for similar "type-less" scenarios, I thought reusing it would keep things simpler without adding another AST type. Do the two cases have different meanings?

Contributor

Oh I see, they're indeed the same case. It would seem that ColumnDef should have an optional datatype instead of us introducing a special datatype, but that issue is unrelated to this PR.

For this PR though, this new function seems to duplicate parse_column_def (minus the aforementioned optional datatype case), so it would be good to reuse that instead. I can imagine something like:

pub fn parse_column_def(&mut self) -> Result<ColumnDef, ParserError> {
    self.parse_column_def_inner(false)
}

fn parse_column_def_inner(&mut self, optional_data_type: bool) -> Result<ColumnDef, ParserError> {
    // ...
    let data_type = if self.is_column_type_sqlite_unspecified() {
        DataType::Unspecified
    } else if optional_data_type {
        self.maybe_parse(|parser| parser.parse_data_type())?
            .unwrap_or(DataType::Unspecified)
    } else {
        self.parse_data_type()?
    };
    // ...
}

Contributor Author

Makes sense! I've extracted the shared logic into parse_column_def_inner(optional_data_type: bool) so that parse_column_def delegates to it with false (type required) and the PARTITIONED BY path calls it with true (type optional via maybe_parse). This removes the duplicate function while reusing the full column def parsing logic including options.

Contributor Author

@iffyio could you please review? 👀

let name = self.parse_identifier()?;

// Check if the next token indicates there's no type specified
// (comma or closing paren means end of this column definition)
let data_type = match self.peek_token().token {
Token::Comma | Token::RParen => DataType::Unspecified,
_ => self.parse_data_type()?,
};
Contributor

I think we can instead do something like

let data_type = self.maybe_parse(|parser| parser.parse_data_type())?.unwrap_or(DataType::Unspecified);


Ok(ColumnDef {
name,
data_type,
options: vec![],
})
}
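
A small usage sketch of the new behavior, assuming the dialect's CREATE TABLE path reaches parse_hive_distribution (as GenericDialect's does); the table and column names are illustrative:

```rust
// Sketch: a partition column may now omit its type when it is already
// defined in the table's column list; it parses as DataType::Unspecified.
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "CREATE TABLE t (col1 STRING, col2 INT) PARTITIONED BY (col1)";
    let stmts = Parser::parse_sql(&GenericDialect {}, sql).expect("should parse");
    println!("{}", stmts[0]);
}
```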

/// Parse Hive formats.
pub fn parse_hive_formats(&mut self) -> Result<Option<HiveFormat>, ParserError> {
let mut hive_format: Option<HiveFormat> = None;
@@ -11781,7 +11820,8 @@ impl<'a> Parser<'a> {
let field_defs = self.parse_duckdb_struct_type_def()?;
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
}
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
{
self.prev_token();
let (field_defs, _trailing_bracket) =
self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -18204,13 +18244,24 @@
}
}

/// ClickHouse:
/// ```sql
/// OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]]
/// ```
/// [ClickHouse](https://clickhouse.com/docs/en/sql-reference/statements/optimize)
///
/// Databricks:
/// ```sql
/// OPTIMIZE table_name [WHERE predicate] [ZORDER BY (col_name1 [, ...])]
/// ```
/// [Databricks](https://docs.databricks.com/en/sql/language-manual/delta-optimize.html)
pub fn parse_optimize_table(&mut self) -> Result<Statement, ParserError> {
self.expect_keyword_is(Keyword::TABLE)?;
// Check for TABLE keyword (ClickHouse uses it, Databricks does not)
Contributor

Suggested change
// Check for TABLE keyword (ClickHouse uses it, Databricks does not)

thinking we can skip the comment so that it doesn't become stale/incomplete if other dialects support the feature

let has_table_keyword = self.parse_keyword(Keyword::TABLE);

let name = self.parse_object_name(false)?;

// ClickHouse-specific options
let on_cluster = self.parse_optional_on_cluster()?;

let partition = if self.parse_keyword(Keyword::PARTITION) {
Expand All @@ -18224,6 +18275,7 @@ impl<'a> Parser<'a> {
};

let include_final = self.parse_keyword(Keyword::FINAL);

let deduplicate = if self.parse_keyword(Keyword::DEDUPLICATE) {
if self.parse_keyword(Keyword::BY) {
Some(Deduplicate::ByExpression(self.parse_expr()?))
Expand All @@ -18234,12 +18286,31 @@ impl<'a> Parser<'a> {
None
};

// Databricks-specific options
let predicate = if self.parse_keyword(Keyword::WHERE) {
Some(self.parse_expr()?)
} else {
None
};

let zorder = if self.parse_keywords(&[Keyword::ZORDER, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
let columns = self.parse_comma_separated(|p| p.parse_expr())?;
self.expect_token(&Token::RParen)?;
Some(columns)
} else {
None
};

Ok(Statement::OptimizeTable {
name,
has_table_keyword,
on_cluster,
partition,
include_final,
deduplicate,
predicate,
zorder,
})
}
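
A round-trip sketch covering both dialect forms, written against the crate's public API; the statements themselves are illustrative:

```rust
// Sketch: both OPTIMIZE variants parse and display back to their input.
use sqlparser::dialect::{ClickHouseDialect, DatabricksDialect};
use sqlparser::parser::Parser;

fn main() {
    // ClickHouse form: TABLE keyword plus ClickHouse-only options.
    let ch = "OPTIMIZE TABLE t ON CLUSTER c FINAL DEDUPLICATE";
    let stmt = &Parser::parse_sql(&ClickHouseDialect {}, ch).unwrap()[0];
    assert_eq!(stmt.to_string(), ch);

    // Databricks form: no TABLE keyword, WHERE predicate, ZORDER BY.
    let dbx = "OPTIMIZE t WHERE c1 > 10 ZORDER BY (c1, c2)";
    let stmt = &Parser::parse_sql(&DatabricksDialect {}, dbx).unwrap()[0];
    assert_eq!(stmt.to_string(), dbx);
}
```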
