@@ -627,11 +627,11 @@ impl<'a> Tokenizer<'a> {
627627 chars. next ( ) ; // consume
628628 match chars. peek ( ) {
629629 Some ( '\'' ) => {
630- let s = self . tokenize_quoted_string ( chars, '\'' ) ?;
630+ let s = self . tokenize_quoted_string ( chars, '\'' , false ) ?;
631631 Ok ( Some ( Token :: SingleQuotedByteStringLiteral ( s) ) )
632632 }
633633 Some ( '\"' ) => {
634- let s = self . tokenize_quoted_string ( chars, '\"' ) ?;
634+ let s = self . tokenize_quoted_string ( chars, '\"' , false ) ?;
635635 Ok ( Some ( Token :: DoubleQuotedByteStringLiteral ( s) ) )
636636 }
637637 _ => {
@@ -646,11 +646,11 @@ impl<'a> Tokenizer<'a> {
646646 chars. next ( ) ; // consume
647647 match chars. peek ( ) {
648648 Some ( '\'' ) => {
649- let s = self . tokenize_quoted_string ( chars, '\'' ) ?;
649+ let s = self . tokenize_quoted_string ( chars, '\'' , false ) ?;
650650 Ok ( Some ( Token :: RawStringLiteral ( s) ) )
651651 }
652652 Some ( '\"' ) => {
653- let s = self . tokenize_quoted_string ( chars, '\"' ) ?;
653+ let s = self . tokenize_quoted_string ( chars, '\"' , false ) ?;
654654 Ok ( Some ( Token :: RawStringLiteral ( s) ) )
655655 }
656656 _ => {
@@ -666,7 +666,7 @@ impl<'a> Tokenizer<'a> {
666666 match chars. peek ( ) {
667667 Some ( '\'' ) => {
668668 // N'...' - a <national character string literal>
669- let s = self . tokenize_quoted_string ( chars, '\'' ) ?;
669+ let s = self . tokenize_quoted_string ( chars, '\'' , true ) ?;
670670 Ok ( Some ( Token :: NationalStringLiteral ( s) ) )
671671 }
672672 _ => {
@@ -700,7 +700,7 @@ impl<'a> Tokenizer<'a> {
700700 match chars. peek ( ) {
701701 Some ( '\'' ) => {
702702 // X'...' - a <binary string literal>
703- let s = self . tokenize_quoted_string ( chars, '\'' ) ?;
703+ let s = self . tokenize_quoted_string ( chars, '\'' , true ) ?;
704704 Ok ( Some ( Token :: HexStringLiteral ( s) ) )
705705 }
706706 _ => {
@@ -712,15 +712,23 @@ impl<'a> Tokenizer<'a> {
712712 }
713713 // single quoted string
714714 '\'' => {
715- let s = self . tokenize_quoted_string ( chars, '\'' ) ?;
715+ let s = self . tokenize_quoted_string (
716+ chars,
717+ '\'' ,
718+ self . dialect . supports_string_literal_backslash_escape ( ) ,
719+ ) ?;
716720
717721 Ok ( Some ( Token :: SingleQuotedString ( s) ) )
718722 }
719723 // double quoted string
720724 '\"' if !self . dialect . is_delimited_identifier_start ( ch)
721725 && !self . dialect . is_identifier_start ( ch) =>
722726 {
723- let s = self . tokenize_quoted_string ( chars, '"' ) ?;
727+ let s = self . tokenize_quoted_string (
728+ chars,
729+ '"' ,
730+ self . dialect . supports_string_literal_backslash_escape ( ) ,
731+ ) ?;
724732
725733 Ok ( Some ( Token :: DoubleQuotedString ( s) ) )
726734 }
@@ -1222,6 +1230,7 @@ impl<'a> Tokenizer<'a> {
12221230 & self ,
12231231 chars : & mut State ,
12241232 quote_style : char ,
1233+ allow_escape : bool ,
12251234 ) -> Result < String , TokenizerError > {
12261235 let mut s = String :: new ( ) ;
12271236 let error_loc = chars. location ( ) ;
@@ -1243,35 +1252,31 @@ impl<'a> Tokenizer<'a> {
12431252 return Ok ( s) ;
12441253 }
12451254 }
1246- '\\' => {
1247- // consume
1255+ '\\' if allow_escape => {
1256+ // consume backslash
12481257 chars. next ( ) ;
1249- // slash escaping is specific to MySQL dialect.
1250- if dialect_of ! ( self is MySqlDialect ) {
1251- if let Some ( next) = chars. peek ( ) {
1252- if !self . unescape {
1253- // In no-escape mode, the given query has to be saved completely including backslashes.
1254- s. push ( ch) ;
1255- s. push ( * next) ;
1256- chars. next ( ) ; // consume next
1257- } else {
1258- // See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
1259- let n = match next {
1260- '\'' | '\"' | '\\' | '%' | '_' => * next,
1261- '0' => '\0' ,
1262- 'b' => '\u{8}' ,
1263- 'n' => '\n' ,
1264- 'r' => '\r' ,
1265- 't' => '\t' ,
1266- 'Z' => '\u{1a}' ,
1267- _ => * next,
1268- } ;
1269- s. push ( n) ;
1270- chars. next ( ) ; // consume next
1271- }
1258+
1259+ if let Some ( next) = chars. peek ( ) {
1260+ if !self . unescape {
1261+ // In no-escape mode, the given query has to be saved completely including backslashes.
1262+ s. push ( ch) ;
1263+ s. push ( * next) ;
1264+ chars. next ( ) ; // consume next
1265+ } else {
1266+ let n = match next {
1267+ '0' => '\0' ,
1268+ 'a' => '\u{7}' ,
1269+ 'b' => '\u{8}' ,
1270+ 'f' => '\u{c}' ,
1271+ 'n' => '\n' ,
1272+ 'r' => '\r' ,
1273+ 't' => '\t' ,
1274+ 'Z' => '\u{1a}' ,
1275+ _ => * next,
1276+ } ;
1277+ s. push ( n) ;
1278+ chars. next ( ) ; // consume next
12721279 }
1273- } else {
1274- s. push ( ch) ;
12751280 }
12761281 }
12771282 _ => {
@@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
15171522#[ cfg( test) ]
15181523mod tests {
15191524 use super :: * ;
1520- use crate :: dialect:: { ClickHouseDialect , MsSqlDialect } ;
1525+ use crate :: dialect:: { BigQueryDialect , ClickHouseDialect , MsSqlDialect } ;
15211526
15221527 #[ test]
15231528 fn tokenizer_error_impl ( ) {
@@ -2386,4 +2391,57 @@ mod tests {
23862391 check_unescape ( r"Hello\0" , None ) ;
23872392 check_unescape ( r"Hello\xCADRust" , None ) ;
23882393 }
2394+
/// Exercises backslash handling inside single-quoted string literals.
///
/// Three scenarios are covered:
/// 1. Under `BigQueryDialect`, each literal is tokenized twice: once with
///    `with_unescape(false)`, where the token text must keep every backslash
///    and doubled quote verbatim, and once with `with_unescape(true)`, where
///    escape sequences must be decoded.
/// 2. Under `BigQueryDialect`, a literal whose closing quote is consumed by a
///    preceding backslash must fail with "Unterminated string literal".
/// 3. Under `GenericDialect`, the same inputs must tokenize successfully with
///    the backslash kept as an ordinary character.
#[test]
fn tokenize_quoted_string_escape() {
    // Each row: (sql, token text with unescape off, token text with unescape on).
    for (sql, expected, expected_unescaped) in [
        (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
        (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
        (r#"'\\'"#, r#"\\"#, r#"\"#),
        (
            r#"'\0\a\b\f\n\r\t\Z'"#,
            r#"\0\a\b\f\n\r\t\Z"#,
            // One decoded character per escape, with nothing in between.
            "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
        ),
        // An escaped double quote decodes to exactly one `"`.
        (r#"'\"'"#, r#"\""#, "\""),
        (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
        (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
        (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
    ] {
        let dialect = BigQueryDialect {};

        // Unescape disabled: the literal's inner text is preserved as written.
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(false)
            .tokenize()
            .unwrap();
        let expected = vec![Token::SingleQuotedString(expected.to_string())];
        compare(expected, tokens);

        // Unescape enabled: escape sequences are decoded.
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
        compare(expected, tokens);
    }

    // The backslash escapes the final quote, so no closing quote remains.
    for sql in [r#"'\'"#, r#"'ab\'"#] {
        let dialect = BigQueryDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        assert_eq!(
            "Unterminated string literal",
            tokenizer.tokenize().unwrap_err().message.as_str(),
        );
    }

    // Non-escape dialect: the backslash is a plain character and the quote
    // that follows it terminates the literal.
    for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

        let expected = vec![Token::SingleQuotedString(expected.to_string())];

        compare(expected, tokens);
    }
}
23892447}
0 commit comments