Skip to content

Commit d23053c

Browse files
andygrove authored and claude committed
Spark SQL: Add SparkSqlDialect with support for USING, lambdas, MAP/STRUCT types
Adds a new `SparkSqlDialect` with the following features:

- `CREATE TABLE ... USING <format>` via new `HiveIOFormat::Using` AST variant
- `MAP<K, V>` angle-bracket type syntax (`supports_map_literal_with_angle_brackets`)
- `STRUCT<field: type>` type parsing now driven by `supports_struct_literal()` trait method
- `LONG` as an alias for `BIGINT` (`supports_long_type_as_bigint`)
- Lambda functions, `DIV` integer division, aggregate `FILTER`, `SELECT * EXCEPT`, struct literals, nested comments, `!` as NOT, CTE without AS, multi-column aliases

Also adds `tests/sqlparser_spark.rs` with 16 tests including integration with the Apache DataFusion Comet SQL test files (1,152 statements, all passing).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 9f04ebe commit d23053c

File tree

6 files changed

+546
-2
lines changed

6 files changed

+546
-2
lines changed

src/ast/ddl.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3203,6 +3203,7 @@ impl fmt::Display for CreateTable {
32033203
Some(HiveIOFormat::FileFormat { format }) if !self.external => {
32043204
write!(f, " STORED AS {format}")?
32053205
}
3206+
Some(HiveIOFormat::Using { format }) => write!(f, " USING {format}")?,
32063207
_ => (),
32073208
}
32083209
if let Some(serde_properties) = serde_properties.as_ref() {

src/ast/mod.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8658,6 +8658,15 @@ pub enum HiveIOFormat {
86588658
/// The file format used for storage.
86598659
format: FileFormat,
86608660
},
8661+
/// `USING <format>` syntax used by Spark SQL.
8662+
///
8663+
/// Example: `CREATE TABLE t (i INT) USING PARQUET`
8664+
///
8665+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-create-table-datasource.html>
8666+
Using {
8667+
/// The data source or format name, e.g. `parquet`, `delta`, `csv`.
8668+
format: Ident,
8669+
},
86618670
}
86628671

86638672
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Default)]

src/dialect/mod.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ mod oracle;
2828
mod postgresql;
2929
mod redshift;
3030
mod snowflake;
31+
mod spark;
3132
mod sqlite;
3233

3334
use core::any::{Any, TypeId};
@@ -51,6 +52,7 @@ pub use self::postgresql::PostgreSqlDialect;
5152
pub use self::redshift::RedshiftSqlDialect;
5253
pub use self::snowflake::parse_snowflake_stage_name;
5354
pub use self::snowflake::SnowflakeDialect;
55+
pub use self::spark::SparkSqlDialect;
5456
pub use self::sqlite::SQLiteDialect;
5557

5658
/// Macro for streamlining the creation of derived `Dialect` objects.
@@ -1727,6 +1729,42 @@ pub trait Dialect: Debug + Any {
17271729
fn supports_xml_expressions(&self) -> bool {
17281730
false
17291731
}
1732+
1733+
/// Returns true if the dialect supports `USING <format>` in `CREATE TABLE`.
1734+
///
1735+
/// Example:
1736+
/// ```sql
1737+
/// CREATE TABLE t (i INT) USING PARQUET
1738+
/// ```
1739+
///
1740+
/// [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-create-table-datasource.html)
1741+
fn supports_create_table_using(&self) -> bool {
1742+
false
1743+
}
1744+
1745+
/// Returns true if the dialect treats `LONG` as an alias for `BIGINT`.
1746+
///
1747+
/// Example:
1748+
/// ```sql
1749+
/// CREATE TABLE t (id LONG)
1750+
/// ```
1751+
///
1752+
/// [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-datatypes.html)
1753+
fn supports_long_type_as_bigint(&self) -> bool {
1754+
false
1755+
}
1756+
1757+
/// Returns true if the dialect supports `MAP<K, V>` angle-bracket syntax for the MAP data type.
1758+
///
1759+
/// Example:
1760+
/// ```sql
1761+
/// CREATE TABLE t (m MAP<STRING, INT>)
1762+
/// ```
1763+
///
1764+
/// [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-datatypes.html)
1765+
fn supports_map_literal_with_angle_brackets(&self) -> bool {
1766+
false
1767+
}
17301768
}
17311769

17321770
/// Operators for which precedence must be defined.
@@ -1801,6 +1839,7 @@ pub fn dialect_from_str(dialect_name: impl AsRef<str>) -> Option<Box<dyn Dialect
18011839
"ansi" => Some(Box::new(AnsiDialect {})),
18021840
"duckdb" => Some(Box::new(DuckDbDialect {})),
18031841
"databricks" => Some(Box::new(DatabricksDialect {})),
1842+
"spark" | "sparksql" => Some(Box::new(SparkSqlDialect {})),
18041843
"oracle" => Some(Box::new(OracleDialect {})),
18051844
_ => None,
18061845
}

src/dialect/spark.rs

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use crate::ast::{BinaryOperator, Expr};
19+
use crate::dialect::Dialect;
20+
use crate::keywords::Keyword;
21+
use crate::parser::{Parser, ParserError};
22+
23+
/// Dialect for [Apache Spark SQL](https://spark.apache.org/docs/latest/sql-ref.html).
///
/// A zero-sized marker type; all behavior lives in its [`Dialect`] impl.
///
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax.html>.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SparkSqlDialect;
29+
30+
impl Dialect for SparkSqlDialect {
31+
// See https://spark.apache.org/docs/latest/sql-ref-identifier.html
32+
fn is_delimited_identifier_start(&self, ch: char) -> bool {
33+
matches!(ch, '`')
34+
}
35+
36+
fn is_identifier_start(&self, ch: char) -> bool {
37+
matches!(ch, 'a'..='z' | 'A'..='Z' | '_')
38+
}
39+
40+
fn is_identifier_part(&self, ch: char) -> bool {
41+
matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '_')
42+
}
43+
44+
/// See <https://spark.apache.org/docs/latest/sql-ref-functions-builtin-agg.html>
45+
fn supports_filter_during_aggregation(&self) -> bool {
46+
true
47+
}
48+
49+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-groupby.html>
50+
fn supports_group_by_expr(&self) -> bool {
51+
true
52+
}
53+
54+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select-groupby.html>
55+
fn supports_group_by_with_modifier(&self) -> bool {
56+
true
57+
}
58+
59+
/// See <https://spark.apache.org/docs/latest/sql-ref-functions-builtin-higher-order-func.html>
60+
fn supports_lambda_functions(&self) -> bool {
61+
true
62+
}
63+
64+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select.html>
65+
fn supports_select_wildcard_except(&self) -> bool {
66+
true
67+
}
68+
69+
/// See <https://spark.apache.org/docs/latest/sql-ref-datatypes.html>
70+
fn supports_struct_literal(&self) -> bool {
71+
true
72+
}
73+
74+
fn supports_nested_comments(&self) -> bool {
75+
true
76+
}
77+
78+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-ddl-create-table-datasource.html>
79+
fn supports_create_table_using(&self) -> bool {
80+
true
81+
}
82+
83+
/// `LONG` is an alias for `BIGINT` in Spark SQL.
84+
///
85+
/// See <https://spark.apache.org/docs/latest/sql-ref-datatypes.html>
86+
fn supports_long_type_as_bigint(&self) -> bool {
87+
true
88+
}
89+
90+
/// See <https://spark.apache.org/docs/latest/sql-ref-syntax-qry-select.html>
91+
fn supports_values_as_table_factor(&self) -> bool {
92+
true
93+
}
94+
95+
fn require_interval_qualifier(&self) -> bool {
96+
true
97+
}
98+
99+
fn supports_bang_not_operator(&self) -> bool {
100+
true
101+
}
102+
103+
fn supports_select_item_multi_column_alias(&self) -> bool {
104+
true
105+
}
106+
107+
fn supports_cte_without_as(&self) -> bool {
108+
true
109+
}
110+
111+
/// See <https://spark.apache.org/docs/latest/sql-ref-datatypes.html>
112+
fn supports_map_literal_with_angle_brackets(&self) -> bool {
113+
true
114+
}
115+
116+
/// Parse the `DIV` keyword as integer division.
117+
///
118+
/// Example: `SELECT 10 DIV 3` returns `3`.
119+
///
120+
/// See <https://spark.apache.org/docs/latest/sql-ref-functions-builtin-math.html>
121+
fn parse_infix(
122+
&self,
123+
parser: &mut Parser,
124+
expr: &Expr,
125+
_precedence: u8,
126+
) -> Option<Result<Expr, ParserError>> {
127+
if parser.parse_keyword(Keyword::DIV) {
128+
let left = Box::new(expr.clone());
129+
let right = Box::new(match parser.parse_expr() {
130+
Ok(expr) => expr,
131+
Err(e) => return Some(Err(e)),
132+
});
133+
Some(Ok(Expr::BinaryOp {
134+
left,
135+
op: BinaryOperator::MyIntegerDivide,
136+
right,
137+
}))
138+
} else {
139+
None
140+
}
141+
}
142+
}

src/parser/mod.rs

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8311,6 +8311,7 @@ impl<'a> Parser<'a> {
83118311
Keyword::STORED,
83128312
Keyword::LOCATION,
83138313
Keyword::WITH,
8314+
Keyword::USING,
83148315
]) {
83158316
Some(Keyword::ROW) => {
83168317
hive_format
@@ -8350,6 +8351,16 @@ impl<'a> Parser<'a> {
83508351
break;
83518352
}
83528353
}
8354+
Some(Keyword::USING) if self.dialect.supports_create_table_using() => {
8355+
let format = self.parse_identifier()?;
8356+
hive_format.get_or_insert_with(HiveFormat::default).storage =
8357+
Some(HiveIOFormat::Using { format });
8358+
}
8359+
Some(Keyword::USING) => {
8360+
// USING is not a table format keyword in this dialect; put it back
8361+
self.prev_token();
8362+
break;
8363+
}
83538364
None => break,
83548365
_ => break,
83558366
}
@@ -12475,6 +12486,9 @@ impl<'a> Parser<'a> {
1247512486
Keyword::TINYBLOB => Ok(DataType::TinyBlob),
1247612487
Keyword::MEDIUMBLOB => Ok(DataType::MediumBlob),
1247712488
Keyword::LONGBLOB => Ok(DataType::LongBlob),
12489+
Keyword::LONG if self.dialect.supports_long_type_as_bigint() => {
12490+
Ok(DataType::BigInt(None))
12491+
}
1247812492
Keyword::BYTES => Ok(DataType::Bytes(self.parse_optional_precision()?)),
1247912493
Keyword::BIT => {
1248012494
if self.parse_keyword(Keyword::VARYING) {
@@ -12609,8 +12623,7 @@ impl<'a> Parser<'a> {
1260912623
let field_defs = self.parse_duckdb_struct_type_def()?;
1261012624
Ok(DataType::Struct(field_defs, StructBracketKind::Parentheses))
1261112625
}
12612-
Keyword::STRUCT if dialect_is!(dialect is BigQueryDialect | DatabricksDialect | GenericDialect) =>
12613-
{
12626+
Keyword::STRUCT if self.dialect.supports_struct_literal() => {
1261412627
self.prev_token();
1261512628
let (field_defs, _trailing_bracket) =
1261612629
self.parse_struct_type_def(Self::parse_struct_field_def)?;
@@ -12631,6 +12644,17 @@ impl<'a> Parser<'a> {
1263112644
Keyword::LOWCARDINALITY if dialect_is!(dialect is ClickHouseDialect | GenericDialect) => {
1263212645
Ok(self.parse_sub_type(DataType::LowCardinality)?)
1263312646
}
12647+
Keyword::MAP if self.dialect.supports_map_literal_with_angle_brackets() => {
12648+
self.expect_token(&Token::Lt)?;
12649+
let key_data_type = self.parse_data_type()?;
12650+
self.expect_token(&Token::Comma)?;
12651+
let (value_data_type, _trailing_bracket) = self.parse_data_type_helper()?;
12652+
trailing_bracket = self.expect_closing_angle_bracket(_trailing_bracket)?;
12653+
Ok(DataType::Map(
12654+
Box::new(key_data_type),
12655+
Box::new(value_data_type),
12656+
))
12657+
}
1263412658
Keyword::MAP if dialect_is!(dialect is ClickHouseDialect | GenericDialect) => {
1263512659
self.prev_token();
1263612660
let (key_data_type, value_data_type) = self.parse_click_house_map_def()?;

0 commit comments

Comments
 (0)