@@ -1199,61 +1199,10 @@ impl<'a> Tokenizer<'a> {
11991199 starting_loc : Location ,
12001200 chars : & mut State ,
12011201 ) -> Result < String , TokenizerError > {
1202- let mut s = String :: new ( ) ;
1203-
1204- // This case is a bit tricky
1205-
1206- chars. next ( ) ; // consume the opening quote
1207-
1208- // slash escaping
1209- let mut is_escaped = false ;
1210- while let Some ( & ch) = chars. peek ( ) {
1211- macro_rules! escape_control_character {
1212- ( $ESCAPED: expr) => { {
1213- if is_escaped {
1214- s. push( $ESCAPED) ;
1215- is_escaped = false ;
1216- } else {
1217- s. push( ch) ;
1218- }
1219-
1220- chars. next( ) ;
1221- } } ;
1222- }
1223-
1224- match ch {
1225- '\'' => {
1226- chars. next ( ) ; // consume
1227- if is_escaped {
1228- s. push ( ch) ;
1229- is_escaped = false ;
1230- } else if chars. peek ( ) . map ( |c| * c == '\'' ) . unwrap_or ( false ) {
1231- s. push ( ch) ;
1232- chars. next ( ) ;
1233- } else {
1234- return Ok ( s) ;
1235- }
1236- }
1237- '\\' => {
1238- if is_escaped {
1239- s. push ( '\\' ) ;
1240- is_escaped = false ;
1241- } else {
1242- is_escaped = true ;
1243- }
1244-
1245- chars. next ( ) ;
1246- }
1247- 'r' => escape_control_character ! ( '\r' ) ,
1248- 'n' => escape_control_character ! ( '\n' ) ,
1249- 't' => escape_control_character ! ( '\t' ) ,
1250- _ => {
1251- is_escaped = false ;
1252- chars. next ( ) ; // consume
1253- s. push ( ch) ;
1254- }
1255- }
1202+ if let Some ( s) = unescape_single_quoted_string ( chars) {
1203+ return Ok ( s) ;
12561204 }
1205+
12571206 self . tokenizer_error ( starting_loc, "Unterminated encoded string literal" )
12581207 }
12591208
@@ -1406,6 +1355,154 @@ fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool
14061355 s
14071356}
14081357
1358+ fn unescape_single_quoted_string ( chars : & mut State < ' _ > ) -> Option < String > {
1359+ Unescape :: new ( chars) . unescape ( )
1360+ }
1361+
1362+ struct Unescape < ' a : ' b , ' b > {
1363+ chars : & ' b mut State < ' a > ,
1364+ }
1365+
1366+ impl < ' a : ' b , ' b > Unescape < ' a , ' b > {
1367+ fn new ( chars : & ' b mut State < ' a > ) -> Self {
1368+ Self { chars }
1369+ }
1370+ fn unescape ( mut self ) -> Option < String > {
1371+ let mut unescaped = String :: new ( ) ;
1372+
1373+ self . chars . next ( ) ;
1374+
1375+ while let Some ( c) = self . chars . next ( ) {
1376+ if c == '\'' {
1377+ // case: ''''
1378+ if self . chars . peek ( ) . map ( |c| * c == '\'' ) . unwrap_or ( false ) {
1379+ self . chars . next ( ) ;
1380+ unescaped. push ( '\'' ) ;
1381+ continue ;
1382+ }
1383+ return Some ( unescaped) ;
1384+ }
1385+
1386+ if c != '\\' {
1387+ unescaped. push ( c) ;
1388+ continue ;
1389+ }
1390+
1391+ let c = match self . chars . next ( ) ? {
1392+ 'b' => '\u{0008}' ,
1393+ 'f' => '\u{000C}' ,
1394+ 'n' => '\n' ,
1395+ 'r' => '\r' ,
1396+ 't' => '\t' ,
1397+ 'u' => self . unescape_unicode_16 ( ) ?,
1398+ 'U' => self . unescape_unicode_32 ( ) ?,
1399+ 'x' => self . unescape_hex ( ) ?,
1400+ c if c. is_digit ( 8 ) => self . unescape_octal ( c) ?,
1401+ c => c,
1402+ } ;
1403+
1404+ unescaped. push ( Self :: check_null ( c) ?) ;
1405+ }
1406+
1407+ None
1408+ }
1409+
1410+ #[ inline]
1411+ fn check_null ( c : char ) -> Option < char > {
1412+ if c == '\0' {
1413+ None
1414+ } else {
1415+ Some ( c)
1416+ }
1417+ }
1418+
1419+ #[ inline]
1420+ fn byte_to_char < const RADIX : u32 > ( s : & str ) -> Option < char > {
1421+ // u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
1422+ match u32:: from_str_radix ( s, RADIX ) {
1423+ Err ( _) => None ,
1424+ Ok ( n) => {
1425+ let n = n & 0xFF ;
1426+ if n <= 127 {
1427+ char:: from_u32 ( n)
1428+ } else {
1429+ None
1430+ }
1431+ }
1432+ }
1433+ }
1434+
1435+ // Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
1436+ fn unescape_hex ( & mut self ) -> Option < char > {
1437+ let mut s = String :: new ( ) ;
1438+
1439+ for _ in 0 ..2 {
1440+ match self . next_hex_digit ( ) {
1441+ Some ( c) => s. push ( c) ,
1442+ None => break ,
1443+ }
1444+ }
1445+
1446+ if s. is_empty ( ) {
1447+ return Some ( 'x' ) ;
1448+ }
1449+
1450+ Self :: byte_to_char :: < 16 > ( & s)
1451+ }
1452+
1453+ #[ inline]
1454+ fn next_hex_digit ( & mut self ) -> Option < char > {
1455+ match self . chars . peek ( ) {
1456+ Some ( c) if c. is_ascii_hexdigit ( ) => self . chars . next ( ) ,
1457+ _ => None ,
1458+ }
1459+ }
1460+
1461+ // Octal byte value. \o, \oo, \ooo (o = 0–7)
1462+ fn unescape_octal ( & mut self , c : char ) -> Option < char > {
1463+ let mut s = String :: new ( ) ;
1464+
1465+ s. push ( c) ;
1466+ for _ in 0 ..2 {
1467+ match self . next_octal_digest ( ) {
1468+ Some ( c) => s. push ( c) ,
1469+ None => break ,
1470+ }
1471+ }
1472+
1473+ Self :: byte_to_char :: < 8 > ( & s)
1474+ }
1475+
1476+ #[ inline]
1477+ fn next_octal_digest ( & mut self ) -> Option < char > {
1478+ match self . chars . peek ( ) {
1479+ Some ( c) if c. is_digit ( 8 ) => self . chars . next ( ) ,
1480+ _ => None ,
1481+ }
1482+ }
1483+
1484+ // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
1485+ fn unescape_unicode_16 ( & mut self ) -> Option < char > {
1486+ self . unescape_unicode :: < 4 > ( )
1487+ }
1488+
1489+ // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
1490+ fn unescape_unicode_32 ( & mut self ) -> Option < char > {
1491+ self . unescape_unicode :: < 8 > ( )
1492+ }
1493+
1494+ fn unescape_unicode < const NUM : usize > ( & mut self ) -> Option < char > {
1495+ let mut s = String :: new ( ) ;
1496+ for _ in 0 ..NUM {
1497+ s. push ( self . chars . next ( ) ?) ;
1498+ }
1499+ match u32:: from_str_radix ( & s, 16 ) {
1500+ Err ( _) => None ,
1501+ Ok ( n) => char:: from_u32 ( n) ,
1502+ }
1503+ }
1504+ }
1505+
14091506#[ cfg( test) ]
14101507mod tests {
14111508 use super :: * ;
@@ -2139,4 +2236,74 @@ mod tests {
21392236 //println!("------------------------------");
21402237 assert_eq ! ( expected, actual) ;
21412238 }
2239+
2240+ fn check_unescape ( s : & str , expected : Option < & str > ) {
2241+ let s = format ! ( "'{}'" , s) ;
2242+ let mut state = State {
2243+ peekable : s. chars ( ) . peekable ( ) ,
2244+ line : 0 ,
2245+ col : 0 ,
2246+ } ;
2247+
2248+ assert_eq ! (
2249+ unescape_single_quoted_string( & mut state) ,
2250+ expected. map( |s| s. to_string( ) )
2251+ ) ;
2252+ }
2253+
/// Exercises every escape form handled by `unescape_single_quoted_string`:
/// simple escapes, Unicode escapes, hex and octal byte escapes, and fallbacks.
#[test]
fn test_unescape() {
    // simple single-character escapes
    check_unescape(r"\b", Some("\u{0008}"));
    check_unescape(r"\f", Some("\u{000C}"));
    check_unescape(r"\t", Some("\t"));
    check_unescape(r"\r\n", Some("\r\n"));
    check_unescape(r"\/", Some("/"));
    check_unescape(r"/", Some("/"));
    check_unescape(r"\\", Some("\\"));

    // 16 and 32-bit hexadecimal Unicode character value
    check_unescape(r"\u0001", Some("\u{0001}"));
    check_unescape(r"\u4c91", Some("\u{4c91}"));
    check_unescape(r"\u4c916", Some("\u{4c91}6"));
    check_unescape(r"\u4c", None);
    check_unescape(r"\u0000", None);
    check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
    check_unescape(r"\U00110000", None);
    check_unescape(r"\U00000000", None);
    check_unescape(r"\u", None);
    check_unescape(r"\U", None);
    check_unescape(r"\U1010FFFF", None);

    // hexadecimal byte value
    check_unescape(r"\x4B", Some("\u{004b}"));
    check_unescape(r"\x4", Some("\u{0004}"));
    check_unescape(r"\x4L", Some("\u{0004}L"));
    check_unescape(r"\x", Some("x"));
    check_unescape(r"\xP", Some("xP"));
    check_unescape(r"\x0", None);
    check_unescape(r"\xCAD", None);
    check_unescape(r"\xA9", None);

    // octal byte value
    check_unescape(r"\1", Some("\u{0001}"));
    check_unescape(r"\12", Some("\u{000a}"));
    check_unescape(r"\123", Some("\u{0053}"));
    check_unescape(r"\1232", Some("\u{0053}2"));
    check_unescape(r"\4", Some("\u{0004}"));
    check_unescape(r"\45", Some("\u{0025}"));
    // Over-long octal values wrap to their low byte: 0o450 & 0xFF == 0x28.
    check_unescape(r"\450", Some("\u{0028}"));
    check_unescape(r"\603", None);
    check_unescape(r"\0", None);
    check_unescape(r"\080", None);

    // others
    check_unescape(r"\9", Some("9"));
    check_unescape(r"''", Some("'"));
    check_unescape(
        r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
        Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
    );
    check_unescape(r"Hello\0", None);
    check_unescape(r"Hello\xCADRust", None);
}
21422309}
0 commit comments