Skip to content

Commit eea0d0b

Browse files
committed
Databricks: Add support for STRUCT type with colon syntax
Add support for Databricks/Hive-style STRUCT field syntax using colons: `STRUCT<field_name: field_type, ...>`

Changes:
- Add DatabricksDialect to STRUCT type parsing (alongside BigQuery/Generic).
- Modify parse_struct_field_def to handle an optional colon separator between field name and type, supporting both:
  - BigQuery style: `STRUCT<field_name field_type>`
  - Databricks/Hive style: `STRUCT<field_name: field_type>`

This enables parsing complex nested types like: `ARRAY<STRUCT<finish_flag: STRING, survive_flag: STRING, score: INT>>`
1 parent cb1cb49 commit eea0d0b

2 files changed

Lines changed: 83 additions & 6 deletions

File tree

src/parser/mod.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3334,25 +3334,35 @@ impl<'a> Parser<'a> {
33343334
/// Syntax:
33353335
///
33363336
/// ```sql
3337+
/// -- BigQuery style
33373338
/// [field_name] field_type
3339+
/// -- Databricks/Hive style (colon separator)
3340+
/// field_name: field_type
33383341
/// ```
33393342
///
33403343
/// [struct]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#declaring_a_struct_type
33413344
/// [tuple]: https://clickhouse.com/docs/en/sql-reference/data-types/tuple
3345+
/// [databricks]: https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
33423346
fn parse_struct_field_def(
33433347
&mut self,
33443348
) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
33453349
// Look beyond the next item to infer whether both field name
33463350
// and type are specified.
3347-
let is_anonymous_field = !matches!(
3351+
// Supports both:
3352+
// - `field_name field_type` (BigQuery style)
3353+
// - `field_name: field_type` (Databricks/Hive style)
3354+
let is_named_field = matches!(
33483355
(self.peek_nth_token(0).token, self.peek_nth_token(1).token),
3349-
(Token::Word(_), Token::Word(_))
3356+
(Token::Word(_), Token::Word(_)) | (Token::Word(_), Token::Colon)
33503357
);
33513358

3352-
let field_name = if is_anonymous_field {
3353-
None
3359+
let field_name = if is_named_field {
3360+
let name = self.parse_identifier()?;
3361+
// Consume optional colon separator (Databricks/Hive style)
3362+
let _ = self.consume_token(&Token::Colon);
3363+
Some(name)
33543364
} else {
3355-
Some(self.parse_identifier()?)
3365+
None
33563366
};
33573367

33583368
let (field_type, trailing_bracket) = self.parse_data_type_helper()?;
@@ -11810,7 +11820,7 @@ impl<'a> Parser<'a> {
1181011820
let field_defs = self.parse_duckdb_struct_type_def()?;
1181111821
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
1181211822
}
11813-
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | GenericDialect) => {
11823+
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) => {
1181411824
self.prev_token();
1181511825
let (field_defs, _trailing_bracket) =
1181611826
self.parse_struct_type_def(Self::parse_struct_field_def)?;

tests/sqlparser_databricks.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -522,3 +522,70 @@ fn parse_create_table_partitioned_by() {
522522
_ => unreachable!(),
523523
}
524524
}
525+
526+
#[test]
527+
fn parse_databricks_struct_type() {
528+
// Databricks uses colon-separated struct field syntax (colon is optional)
529+
// https://docs.databricks.com/en/sql/language-manual/data-types/struct-type.html
530+
531+
// Basic struct with colon syntax - parses to canonical form without colons
532+
databricks().one_statement_parses_to(
533+
"CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
534+
"CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
535+
);
536+
537+
// Nested array of struct (the original issue case)
538+
databricks().one_statement_parses_to(
539+
"CREATE TABLE t (col1 ARRAY<STRUCT<finish_flag: STRING, survive_flag: STRING, score: INT>>)",
540+
"CREATE TABLE t (col1 ARRAY<STRUCT<finish_flag STRING, survive_flag STRING, score INT>>)",
541+
);
542+
543+
// Multiple struct columns
544+
databricks().one_statement_parses_to(
545+
"CREATE TABLE t (col1 STRUCT<a: INT, b: STRING>, col2 STRUCT<x: DOUBLE>)",
546+
"CREATE TABLE t (col1 STRUCT<a INT, b STRING>, col2 STRUCT<x DOUBLE>)",
547+
);
548+
549+
// Deeply nested structs
550+
databricks().one_statement_parses_to(
551+
"CREATE TABLE t (col1 STRUCT<outer: STRUCT<inner: STRING>>)",
552+
"CREATE TABLE t (col1 STRUCT<outer STRUCT<inner STRING>>)",
553+
);
554+
555+
// Struct with array field
556+
databricks().one_statement_parses_to(
557+
"CREATE TABLE t (col1 STRUCT<items: ARRAY<INT>, name: STRING>)",
558+
"CREATE TABLE t (col1 STRUCT<items ARRAY<INT>, name STRING>)",
559+
);
560+
561+
// Syntax without colons should also work (BigQuery compatible)
562+
databricks().verified_stmt("CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)");
563+
564+
// Verify AST structure
565+
match databricks().one_statement_parses_to(
566+
"CREATE TABLE t (col1 STRUCT<field1: STRING, field2: INT>)",
567+
"CREATE TABLE t (col1 STRUCT<field1 STRING, field2 INT>)",
568+
) {
569+
Statement::CreateTable(CreateTable { columns, .. }) => {
570+
assert_eq!(columns.len(), 1);
571+
assert_eq!(columns[0].name.to_string(), "col1");
572+
match &columns[0].data_type {
573+
DataType::Struct(fields, StructBracketKind::AngleBrackets) => {
574+
assert_eq!(fields.len(), 2);
575+
assert_eq!(
576+
fields[0].field_name.as_ref().map(|i| i.to_string()),
577+
Some("field1".to_string())
578+
);
579+
assert_eq!(fields[0].field_type, DataType::String(None));
580+
assert_eq!(
581+
fields[1].field_name.as_ref().map(|i| i.to_string()),
582+
Some("field2".to_string())
583+
);
584+
assert_eq!(fields[1].field_type, DataType::Int(None));
585+
}
586+
_ => unreachable!(),
587+
}
588+
}
589+
_ => unreachable!(),
590+
}
591+
}

0 commit comments

Comments
 (0)