sigmacomputing
diff --git a/‎src/ast/mod.rs‎
Lines changed: 3 additions & 3 deletions b/‎src/ast/mod.rs‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/dialect/bigquery.rs‎
Lines changed: 5 additions & 0 deletions b/‎src/dialect/bigquery.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/dialect/clickhouse.rs‎
Lines changed: 4 additions & 0 deletions b/‎src/dialect/clickhouse.rs‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/dialect/mod.rs‎
Lines changed: 21 additions & 0 deletions b/‎src/dialect/mod.rs‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎src/dialect/mysql.rs‎
Lines changed: 5 additions & 0 deletions b/‎src/dialect/mysql.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/dialect/snowflake.rs‎
Lines changed: 5 additions & 0 deletions b/‎src/dialect/snowflake.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/parser/mod.rs‎
Lines changed: 2 additions & 2 deletions b/‎src/parser/mod.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/tokenizer.rs‎
Lines changed: 94 additions & 36 deletions b/‎src/tokenizer.rs‎
Lines changed: 94 additions & 36 deletions
@@ -512,21 +512,21 @@ pub enum Expr {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// `ILIKE` (case-insensitive `LIKE`)
     ILike {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// SIMILAR TO regex
     SimilarTo {
         negated: bool,
         expr: Box<Expr>,
         pattern: Box<Expr>,
-        escape_char: Option<char>,
+        escape_char: Option<String>,
     },
     /// MySQL: RLIKE regex or REGEXP regex
     RLike {
 
@@ -29,4 +29,9 @@ impl Dialect for BigQueryDialect {
     fn is_identifier_part(&self, ch: char) -> bool {
         ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
     }
+
+    // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
 }
@@ -25,4 +25,8 @@ impl Dialect for ClickHouseDialect {
     fn is_identifier_part(&self, ch: char) -> bool {
         self.is_identifier_start(ch) || ch.is_ascii_digit()
     }
+
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
 }
@@ -120,6 +120,23 @@ pub trait Dialect: Debug + Any {
     fn is_identifier_start(&self, ch: char) -> bool;
     /// Determine if a character is a valid unquoted identifier character
     fn is_identifier_part(&self, ch: char) -> bool;
+    /// Determine if the dialect supports escaping characters via '\' in string literals.
+    ///
+    /// Some dialects, like BigQuery and Snowflake, support this while others,
+    /// like Postgres, do not. As a result, the following statement is accepted
+    /// by the former but rejected by the latter.
+    /// ```sql
+    /// SELECT 'ab\'cd';
+    /// ```
+    ///
+    /// Conversely, dialects that support backslash escapes reject the following
+    /// statement, which would otherwise be valid in the other dialects.
+    /// ```sql
+    /// SELECT '\';
+    /// ```
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        false
+    }
     /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
     fn supports_filter_during_aggregation(&self) -> bool {
         false
@@ -306,6 +323,10 @@ mod tests {
                 self.0.identifier_quote_style(identifier)
             }
 
+            fn supports_string_literal_backslash_escape(&self) -> bool {
+                self.0.supports_string_literal_backslash_escape()
+            }
+
             fn is_proper_identifier_inside_quotes(
                 &self,
                 chars: std::iter::Peekable<std::str::Chars<'_>>,
 
@@ -48,6 +48,11 @@ impl Dialect for MySqlDialect {
         Some('`')
     }
 
+    // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
 
@@ -46,6 +46,11 @@ impl Dialect for SnowflakeDialect {
             || ch == '_'
     }
 
+    // See https://docs.snowflake.com/en/sql-reference/data-types-text#escape-sequences-in-single-quoted-string-constants
+    fn supports_string_literal_backslash_escape(&self) -> bool {
+        true
+    }
+
     fn supports_within_after_array_aggregation(&self) -> bool {
         true
     }
 
@@ -2560,9 +2560,9 @@ impl<'a> Parser<'a> {
     }
 
     /// parse the ESCAPE CHAR portion of LIKE, ILIKE, and SIMILAR TO
-    pub fn parse_escape_char(&mut self) -> Result<Option<char>, ParserError> {
+    pub fn parse_escape_char(&mut self) -> Result<Option<String>, ParserError> {
         if self.parse_keyword(Keyword::ESCAPE) {
-            Ok(Some(self.parse_literal_char()?))
+            Ok(Some(self.parse_literal_string()?))
         } else {
             Ok(None)
         }
 
@@ -627,11 +627,11 @@ impl<'a> Tokenizer<'a> {
                     chars.next(); // consume
                     match chars.peek() {
                         Some('\'') => {
-                            let s = self.tokenize_quoted_string(chars, '\'')?;
+                            let s = self.tokenize_quoted_string(chars, '\'', false)?;
                             Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                         }
                         Some('\"') => {
-                            let s = self.tokenize_quoted_string(chars, '\"')?;
+                            let s = self.tokenize_quoted_string(chars, '\"', false)?;
                             Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                         }
                         _ => {
@@ -646,11 +646,11 @@ impl<'a> Tokenizer<'a> {
                     chars.next(); // consume
                     match chars.peek() {
                         Some('\'') => {
-                            let s = self.tokenize_quoted_string(chars, '\'')?;
+                            let s = self.tokenize_quoted_string(chars, '\'', false)?;
                             Ok(Some(Token::RawStringLiteral(s)))
                         }
                         Some('\"') => {
-                            let s = self.tokenize_quoted_string(chars, '\"')?;
+                            let s = self.tokenize_quoted_string(chars, '\"', false)?;
                             Ok(Some(Token::RawStringLiteral(s)))
                         }
                         _ => {
@@ -666,7 +666,7 @@ impl<'a> Tokenizer<'a> {
                     match chars.peek() {
                         Some('\'') => {
                             // N'...' - a <national character string literal>
-                            let s = self.tokenize_quoted_string(chars, '\'')?;
+                            let s = self.tokenize_quoted_string(chars, '\'', true)?;
                             Ok(Some(Token::NationalStringLiteral(s)))
                         }
                         _ => {
@@ -700,7 +700,7 @@ impl<'a> Tokenizer<'a> {
                     match chars.peek() {
                         Some('\'') => {
                             // X'...' - a <binary string literal>
-                            let s = self.tokenize_quoted_string(chars, '\'')?;
+                            let s = self.tokenize_quoted_string(chars, '\'', true)?;
                             Ok(Some(Token::HexStringLiteral(s)))
                         }
                         _ => {
@@ -712,15 +712,23 @@ impl<'a> Tokenizer<'a> {
                 }
                 // single quoted string
                 '\'' => {
-                    let s = self.tokenize_quoted_string(chars, '\'')?;
+                    let s = self.tokenize_quoted_string(
+                        chars,
+                        '\'',
+                        self.dialect.supports_string_literal_backslash_escape(),
+                    )?;
 
                     Ok(Some(Token::SingleQuotedString(s)))
                 }
                 // double quoted string
                 '\"' if !self.dialect.is_delimited_identifier_start(ch)
                     && !self.dialect.is_identifier_start(ch) =>
                 {
-                    let s = self.tokenize_quoted_string(chars, '"')?;
+                    let s = self.tokenize_quoted_string(
+                        chars,
+                        '"',
+                        self.dialect.supports_string_literal_backslash_escape(),
+                    )?;
 
                     Ok(Some(Token::DoubleQuotedString(s)))
                 }
@@ -1222,6 +1230,7 @@ impl<'a> Tokenizer<'a> {
         &self,
         chars: &mut State,
         quote_style: char,
+        allow_escape: bool,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
         let error_loc = chars.location();
@@ -1243,35 +1252,31 @@ impl<'a> Tokenizer<'a> {
                         return Ok(s);
                     }
                 }
-                '\\' => {
-                    // consume
+                '\\' if allow_escape => {
+                    // consume backslash
                     chars.next();
-                    // slash escaping is specific to MySQL dialect.
-                    if dialect_of!(self is MySqlDialect) {
-                        if let Some(next) = chars.peek() {
-                            if !self.unescape {
-                                // In no-escape mode, the given query has to be saved completely including backslashes.
-                                s.push(ch);
-                                s.push(*next);
-                                chars.next(); // consume next
-                            } else {
-                                // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
-                                let n = match next {
-                                    '\'' | '\"' | '\\' | '%' | '_' => *next,
-                                    '0' => '\0',
-                                    'b' => '\u{8}',
-                                    'n' => '\n',
-                                    'r' => '\r',
-                                    't' => '\t',
-                                    'Z' => '\u{1a}',
-                                    _ => *next,
-                                };
-                                s.push(n);
-                                chars.next(); // consume next
-                            }
+
+                    if let Some(next) = chars.peek() {
+                        if !self.unescape {
+                            // In no-escape mode, the given query has to be saved completely including backslashes.
+                            s.push(ch);
+                            s.push(*next);
+                            chars.next(); // consume next
+                        } else {
+                            let n = match next {
+                                '0' => '\0',
+                                'a' => '\u{7}',
+                                'b' => '\u{8}',
+                                'f' => '\u{c}',
+                                'n' => '\n',
+                                'r' => '\r',
+                                't' => '\t',
+                                'Z' => '\u{1a}',
+                                _ => *next,
+                            };
+                            s.push(n);
+                            chars.next(); // consume next
                         }
-                    } else {
-                        s.push(ch);
                     }
                 }
                 _ => {
@@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::dialect::{ClickHouseDialect, MsSqlDialect};
+    use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};
 
     #[test]
     fn tokenizer_error_impl() {
@@ -2386,4 +2391,57 @@ mod tests {
         check_unescape(r"Hello\0", None);
         check_unescape(r"Hello\xCADRust", None);
     }
+
+    #[test]
+    fn tokenize_quoted_string_escape() {
+        for (sql, expected, expected_unescaped) in [
+            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
+            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
+            (r#"'\\'"#, r#"\\"#, r#"\"#),
+            (
+                r#"'\0\a\b\f\n\r\t\Z'"#,
+                r#"\0\a\b\f\n\r\t\Z"#,
+                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
+            ),
+            (r#"'\"'"#, r#"\""#, "\""),
+            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
+            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
+            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
+        ] {
+            let dialect = BigQueryDialect {};
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(false)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+            compare(expected, tokens);
+
+            let tokens = Tokenizer::new(&dialect, sql)
+                .with_unescape(true)
+                .tokenize()
+                .unwrap();
+            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
+            compare(expected, tokens);
+        }
+
+        for sql in [r#"'\'"#, r#"'ab\'"#] {
+            let dialect = BigQueryDialect {};
+            let mut tokenizer = Tokenizer::new(&dialect, sql);
+            assert_eq!(
+                "Unterminated string literal",
+                tokenizer.tokenize().unwrap_err().message.as_str(),
+            );
+        }
+
+        // Non-escape dialect
+        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
+            let dialect = GenericDialect {};
+            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
+
+            let expected = vec![Token::SingleQuotedString(expected.to_string())];
+
+            compare(expected, tokens);
+        }
+    }
 }
Original file line number	Diff line number	Diff line change
`@@ -29,4 +29,9 @@ impl Dialect for BigQueryDialect {`
`29`	`29`	`fn is_identifier_part(&self, ch: char) -> bool {`
`30`	`30`	`ch.is_ascii_lowercase() \|\| ch.is_ascii_uppercase() \|\| ch.is_ascii_digit() \|\| ch == '_'`
`31`	`31`	`}`
	`32`	`+`
	`33`	`+ // See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences`
	`34`	`+ fn supports_string_literal_backslash_escape(&self) -> bool {`
	`35`	`+ true`
	`36`	`+ }`
`32`	`37`	`}`
Original file line number	Diff line number	Diff line change
`@@ -25,4 +25,8 @@ impl Dialect for ClickHouseDialect {`
`25`	`25`	`fn is_identifier_part(&self, ch: char) -> bool {`
`26`	`26`	`self.is_identifier_start(ch) \|\| ch.is_ascii_digit()`
`27`	`27`	`}`
	`28`	`+`
	`29`	`+ fn supports_string_literal_backslash_escape(&self) -> bool {`
	`30`	`+ true`
	`31`	`+ }`
`28`	`32`	`}`
Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,11 @@ impl Dialect for SnowflakeDialect {`
`46`	`46`	`\|\| ch == '_'`
`47`	`47`	`}`
`48`	`48`
	`49`	`+ // See https://docs.snowflake.com/en/sql-reference/data-types-text#escape-sequences-in-single-quoted-string-constants`
	`50`	`+ fn supports_string_literal_backslash_escape(&self) -> bool {`
	`51`	`+ true`
	`52`	`+ }`
	`53`	`+`
`49`	`54`	`fn supports_within_after_array_aggregation(&self) -> bool {`
`50`	`55`	`true`
`51`	`56`	`}`
Original file line number	Diff line number	Diff line change
`@@ -2560,9 +2560,9 @@ impl<'a> Parser<'a> {`
`2560`	`2560`	`}`
`2561`	`2561`
`2562`	`2562`	`/// parse the ESCAPE CHAR portion of LIKE, ILIKE, and SIMILAR TO`
`2563`		`- pub fn parse_escape_char(&mut self) -> Result<Option<char>, ParserError> {`
	`2563`	`+ pub fn parse_escape_char(&mut self) -> Result<Option<String>, ParserError> {`
`2564`	`2564`	`if self.parse_keyword(Keyword::ESCAPE) {`
`2565`		`- Ok(Some(self.parse_literal_char()?))`
	`2565`	`+ Ok(Some(self.parse_literal_string()?))`
`2566`	`2566`	`} else {`
`2567`	`2567`	`Ok(None)`
`2568`	`2568`	`}`