Skip to content

Commit d23053c

Browse files
andygrove authored and claude committed
Spark SQL: Add SparkSqlDialect with support for USING, lambdas, MAP/STRUCT types
Adds a new `SparkSqlDialect` with the following features:

- `CREATE TABLE ... USING <format>` via new `HiveIOFormat::Using` AST variant
- `MAP<K, V>` angle-bracket type syntax (`supports_map_literal_with_angle_brackets`)
- `STRUCT<field: type>` type parsing now driven by `supports_struct_literal()` trait method
- `LONG` as an alias for `BIGINT` (`supports_long_type_as_bigint`)
- Lambda functions, `DIV` integer division, aggregate `FILTER`, `SELECT * EXCEPT`, struct literals, nested comments, `!` as NOT, CTE without AS, multi-column aliases

Also adds `tests/sqlparser_spark.rs` with 16 tests including integration with the Apache DataFusion Comet SQL test files (1,152 statements, all passing).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 9f04ebe commit d23053c

File tree

6 files changed

+546
-2
lines changed

6 files changed

+546
-2
lines changed

src/ast/ddl.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3203,6 +3203,7 @@ impl fmt::Display for CreateTable {
32033203
Some(HiveIOFormat::FileFormat { format }) if !self.external => {
32043204
write!(f, " STORED AS {format}")?
32053205
}
3206+
Some(HiveIOFormat::Using { format }) => write!(f, " USING {format}")?,
32063207
_ => (),
32073208
}
32083209
if let Some(serde_properties) = serde_properties.as_ref() {

src/ast/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8658,6 +8658,15 @@ pub enum HiveIOFormat {
86588658
/// The file format used for storage.
86598659
format: FileFormat,
86608660
},
8661+
/// `USING <format>` syntax used by Spark SQL.
8662+
///
8663+
/// Example: `CREATE TABLE t (i INT) USING PARQUET`
8664+
///
8665+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-create-table-datasource.html>
8666+
Using {
8667+
/// The data source or format name, e.g. `parquet`, `delta`, `csv`.
8668+
format: Ident,
8669+
},
86618670
}
86628671

86638672
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Default)]

src/dialect/mod.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ mod oracle;
2828
mod postgresql;
2929
mod redshift;
3030
mod snowflake;
31+
mod spark;
3132
mod sqlite;
3233

3334
use core::any::{Any, TypeId};
@@ -51,6 +52,7 @@ pub use self::postgresql::PostgreSqlDialect;
5152
pub use self::redshift::RedshiftSqlDialect;
5253
pub use self::snowflake::parse_snowflake_stage_name;
5354
pub use self::snowflake::SnowflakeDialect;
55+
pub use self::spark::SparkSqlDialect;
5456
pub use self::sqlite::SQLiteDialect;
5557

5658
/// Macro for streamlining the creation of derived `Dialect` objects.
@@ -1727,6 +1729,42 @@ pub trait Dialect: Debug + Any {
17271729
fn supports_xml_expressions(&self) -> bool {
17281730
false
17291731
}
1732+
1733+
/// Returns true if the dialect supports `USING <format>` in `CREATE TABLE`.
1734+
///
1735+
/// Example:
1736+
/// ```sql
1737+
/// CREATE TABLE t (i INT) USING PARQUET
1738+
/// ```
1739+
///
1740+
/// [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-create-table-datasource.html)
1741+
fn supports_create_table_using(&self) -> bool {
1742+
false
1743+
}
1744+
1745+
/// Returns true if the dialect treats `LONG` as an alias for `BIGINT`.
1746+
///
1747+
/// Example:
1748+
/// ```sql
1749+
/// CREATE TABLE t (id LONG)
1750+
/// ```
1751+
///
1752+
/// [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-datatypes.html)
1753+
fn supports_long_type_as_bigint(&self) -> bool {
1754+
false
1755+
}
1756+
1757+
/// Returns true if the dialect supports `MAP<K, V>` angle-bracket syntax for the MAP data type.
1758+
///
1759+
/// Example:
1760+
/// ```sql
1761+
/// CREATE TABLE t (m MAP<STRING, INT>)
1762+
/// ```
1763+
///
1764+
/// [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-datatypes.html)
1765+
fn supports_map_literal_with_angle_brackets(&self) -> bool {
1766+
false
1767+
}
17301768
}
17311769

17321770
/// Operators for which precedence must be defined.
@@ -1801,6 +1839,7 @@ pub fn dialect_from_str(dialect_name: impl AsRef<str>) -> Option<Box<dyn Dialect
18011839
"ansi" => Some(Box::new(AnsiDialect {})),
18021840
"duckdb" => Some(Box::new(DuckDbDialect {})),
18031841
"databricks" => Some(Box::new(DatabricksDialect {})),
1842+
"spark" | "sparksql" => Some(Box::new(SparkSqlDialect {})),
18041843
"oracle" => Some(Box::new(OracleDialect {})),
18051844
_ => None,
18061845
}

src/dialect/spark.rs

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::ast::{BinaryOperator, Expr};
19+
use crate::dialect::Dialect;
20+
use crate::keywords::Keyword;
21+
use crate::parser::{Parser, ParserError};
22+
23+
/// Dialect for [Apache Spark SQL](https://spark.apache.org/docs/latest/sql-ref.html).
///
/// A zero-sized marker type; all behavior lives in its [`Dialect`] impl.
///
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax.html>.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SparkSqlDialect;
29+
30+
impl Dialect for SparkSqlDialect {
31+
// See https://spark.apache.org/docs/latest/sql-ref-identifier.html
32+
fn is_delimited_identifier_start(&self, ch: char) -> bool {
33+
matches!(ch, '`')
34+
}
35+
36+
fn is_identifier_start(&self, ch: char) -> bool {
37+
matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
38+
}
39+
40+
fn is_identifier_part(&self, ch: char) -> bool {
41+
matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
42+
}
43+
44+
/// See <https://spark.apache.org/docs/latest/sql-ref-functions-builtin-agg.html>
45+
fn supports_filter_during_aggregation(&self) -> bool {
46+
true
47+
}
48+
49+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-groupby.html>
50+
fn supports_group_by_expr(&self) -> bool {
51+
true
52+
}
53+
54+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-groupby.html>
55+
fn supports_group_by_with_modifier(&self) -> bool {
56+
true
57+
}
58+
59+
/// See <https://spark.apache.org/docs/latest/sql-ref-functions-builtin-higher-order-func.html>
60+
fn supports_lambda_functions(&self) -> bool {
61+
true
62+
}
63+
64+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select.html>
65+
fn supports_select_wildcard_except(&self) -> bool {
66+
true
67+
}
68+
69+
/// See <https://spark.apache.org/docs/latest/sql-ref-datatypes.html>
70+
fn supports_struct_literal(&self) -> bool {
71+
true
72+
}
73+
74+
fn supports_nested_comments(&self) -> bool {
75+
true
76+
}
77+
78+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-create-table-datasource.html>
79+
fn supports_create_table_using(&self) -> bool {
80+
true
81+
}
82+
83+
/// `LONG` is an alias for `BIGINT` in Spark SQL.
84+
///
85+
/// See <https://spark.apache.org/docs/latest/sql-ref-datatypes.html>
86+
fn supports_long_type_as_bigint(&self) -> bool {
87+
true
88+
}
89+
90+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select.html>
91+
fn supports_values_as_table_factor(&self) -> bool {
92+
true
93+
}
94+
95+
fn require_interval_qualifier(&self) -> bool {
96+
true
97+
}
98+
99+
fn supports_bang_not_operator(&self) -> bool {
100+
true
101+
}
102+
103+
fn supports_select_item_multi_column_alias(&self) -> bool {
104+
true
105+
}
106+
107+
fn supports_cte_without_as(&self) -> bool {
108+
true
109+
}
110+
111+
/// See <https://spark.apache.org/docs/latest/sql-ref-datatypes.html>
112+
fn supports_map_literal_with_angle_brackets(&self) -> bool {
113+
true
114+
}
115+
116+
/// Parse the `DIV` keyword as integer division.
117+
///
118+
/// Example: `SELECT 10 DIV 3` returns `3`.
119+
///
120+
/// See <https://spark.apache.org/docs/latest/sql-ref-functions-builtin-math.html>
121+
fn parse_infix(
122+
&self,
123+
parser: &mut Parser,
124+
expr: &Expr,
125+
_precedence: u8,
126+
) -> Option<Result<Expr, ParserError>> {
127+
if parser.parse_keyword(Keyword::DIV) {
128+
let left = Box::new(expr.clone());
129+
let right = Box::new(match parser.parse_expr() {
130+
Ok(expr) => expr,
131+
Err(e) => return Some(Err(e)),
132+
});
133+
Some(Ok(Expr::BinaryOp {
134+
left,
135+
op: BinaryOperator::MyIntegerDivide,
136+
right,
137+
}))
138+
} else {
139+
None
140+
}
141+
}
142+
}

src/parser/mod.rs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8311,6 +8311,7 @@ impl<'a> Parser<'a> {
83118311
Keyword::STORED,
83128312
Keyword::LOCATION,
83138313
Keyword::WITH,
8314+
Keyword::USING,
83148315
]) {
83158316
Some(Keyword::ROW) => {
83168317
hive_format
@@ -8350,6 +8351,16 @@ impl<'a> Parser<'a> {
83508351
break;
83518352
}
83528353
}
8354+
Some(Keyword::USING) if self.dialect.supports_create_table_using() => {
8355+
let format = self.parse_identifier()?;
8356+
hive_format.get_or_insert_with(HiveFormat::default).storage =
8357+
Some(HiveIOFormat::Using { format });
8358+
}
8359+
Some(Keyword::USING) => {
8360+
// USING is not a table format keyword in this dialect; put it back
8361+
self.prev_token();
8362+
break;
8363+
}
83538364
None => break,
83548365
_ => break,
83558366
}
@@ -12475,6 +12486,9 @@ impl<'a> Parser<'a> {
1247512486
Keyword::TINYBLOB => Ok(DataType::TinyBlob),
1247612487
Keyword::MEDIUMBLOB => Ok(DataType::MediumBlob),
1247712488
Keyword::LONGBLOB => Ok(DataType::LongBlob),
12489+
Keyword::LONG if self.dialect.supports_long_type_as_bigint() => {
12490+
Ok(DataType::BigInt(None))
12491+
}
1247812492
Keyword::BYTES => Ok(DataType::Bytes(self.parse_optional_precision()?)),
1247912493
Keyword::BIT => {
1248012494
if self.parse_keyword(Keyword::VARYING) {
@@ -12609,8 +12623,7 @@ impl<'a> Parser<'a> {
1260912623
let field_defs = self.parse_duckdb_struct_type_def()?;
1261012624
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
1261112625
}
12612-
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
12613-
{
12626+
Keyword::STRUCT if self.dialect.supports_struct_literal() => {
1261412627
self.prev_token();
1261512628
let (field_defs, _trailing_bracket) =
1261612629
self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -12631,6 +12644,17 @@ impl<'a> Parser<'a> {
1263112644
Keyword::LOWCARDINALITY if dialect_is!(dialect is ClickHouseDialect | GenericDialect) => {
1263212645
Ok(self.parse_sub_type(DataType::LowCardinality)?)
1263312646
}
12647+
Keyword::MAP if self.dialect.supports_map_literal_with_angle_brackets() => {
12648+
self.expect_token(&Token::Lt)?;
12649+
let key_data_type = self.parse_data_type()?;
12650+
self.expect_token(&Token::Comma)?;
12651+
let (value_data_type, _trailing_bracket) = self.parse_data_type_helper()?;
12652+
trailing_bracket = self.expect_closing_angle_bracket(_trailing_bracket)?;
12653+
Ok(DataType::Map(
12654+
Box::new(key_data_type),
12655+
Box::new(value_data_type),
12656+
))
12657+
}
1263412658
Keyword::MAP if dialect_is!(dialect is ClickHouseDialect | GenericDialect) => {
1263512659
self.prev_token();
1263612660
let (key_data_type, value_data_type) = self.parse_click_house_map_def()?;

0 commit comments

Comments
 (0)