@@ -1189,8 +1189,8 @@ impl<'a> Tokenizer<'a> {
11891189
11901190 Ok ( Some ( Token :: make_word ( & word. concat ( ) , Some ( quote_start) ) ) )
11911191 }
1192- // numbers and period
1193- '0' ..='9' | '.' => {
1192+ // Numbers
1193+ '0' ..='9' => {
11941194 // Some dialects support underscore as number separator
11951195 // There can only be one at a time and it must be followed by another digit
11961196 let is_number_separator = |ch : char , next_char : Option < char > | {
@@ -1199,11 +1199,12 @@ impl<'a> Tokenizer<'a> {
11991199 && next_char. is_some_and ( |next_ch| next_ch. is_ascii_hexdigit ( ) )
12001200 } ;
12011201
1202+ // Start with number or potential separator
12021203 let mut s = peeking_next_take_while ( chars, |ch, next_ch| {
12031204 ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
12041205 } ) ;
12051206
1206- // match binary literal that starts with 0x
1207+ // Match hexadecimal literal that starts with 0x (yields HexStringLiteral)
12071208 if s == "0" && chars. peek ( ) == Some ( & 'x' ) {
12081209 chars. next ( ) ;
12091210 let s2 = peeking_next_take_while ( chars, |ch, next_ch| {
@@ -1212,60 +1213,41 @@ impl<'a> Tokenizer<'a> {
12121213 return Ok ( Some ( Token :: HexStringLiteral ( s2) ) ) ;
12131214 }
12141215
1215- // match one period
1216+ // Match fractional part after a dot
12161217 if let Some ( '.' ) = chars. peek ( ) {
12171218 s. push ( '.' ) ;
12181219 chars. next ( ) ;
12191220 }
12201221
1221- // If the dialect supports identifiers that start with a numeric prefix
1222- // and we have now consumed a dot, check if the previous token was a Word.
1223- // If so, what follows is definitely not part of a decimal number and
1224- // we should yield the dot as a dedicated token so compound identifiers
1225- // starting with digits can be parsed correctly.
1226- if s == "." && self . dialect . supports_numeric_prefix ( ) {
1227- if let Some ( Token :: Word ( _) ) = prev_token {
1228- return Ok ( Some ( Token :: Period ) ) ;
1229- }
1230- }
1231-
1232- // Consume fractional digits.
1222+ // Consume fractional digits
12331223 s += & peeking_next_take_while ( chars, |ch, next_ch| {
12341224 ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
12351225 } ) ;
12361226
1237- // No fraction -> Token::Period
1238- if s == "." {
1239- return Ok ( Some ( Token :: Period ) ) ;
1240- }
1241-
1242- // Parse exponent as number
1227+ // Parse exponent part (e.g., e+10 or E-5)
12431228 let mut exponent_part = String :: new ( ) ;
12441229 if chars. peek ( ) == Some ( & 'e' ) || chars. peek ( ) == Some ( & 'E' ) {
12451230 let mut char_clone = chars. peekable . clone ( ) ;
1246- exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ;
1231+ exponent_part. push ( char_clone. next ( ) . unwrap ( ) ) ; // consume 'e' or 'E'
12471232
12481233 // Optional sign
1249- match char_clone. peek ( ) {
1250- Some ( & c ) if matches ! ( c , '+' | '-' ) => {
1234+ if let Some ( & c ) = char_clone. peek ( ) {
1235+ if c == '+' || c == '-' {
12511236 exponent_part. push ( c) ;
12521237 char_clone. next ( ) ;
12531238 }
1254- _ => ( ) ,
12551239 }
12561240
1257- match char_clone . peek ( ) {
1258- // Definitely an exponent, get original iterator up to speed and use it
1259- Some ( & c ) if c. is_ascii_digit ( ) => {
1241+ // Parse digits after the exponent
1242+ if let Some ( & c ) = char_clone . peek ( ) {
1243+ if c. is_ascii_digit ( ) {
12601244 for _ in 0 ..exponent_part. len ( ) {
12611245 chars. next ( ) ;
12621246 }
12631247 exponent_part +=
12641248 & peeking_take_while ( chars, |ch| ch. is_ascii_digit ( ) ) ;
12651249 s += exponent_part. as_str ( ) ;
12661250 }
1267- // Not an exponent, discard the work done
1268- _ => ( ) ,
12691251 }
12701252 }
12711253
@@ -1274,8 +1256,7 @@ impl<'a> Tokenizer<'a> {
12741256 // be tokenized as a word.
12751257 if self . dialect . supports_numeric_prefix ( ) {
12761258 if exponent_part. is_empty ( ) {
1277- // If it is not a number with an exponent, it may be
1278- // an identifier starting with digits.
1259+ // Handle as potential word if no exponent part
12791260 let word =
12801261 peeking_take_while ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
12811262
@@ -1284,20 +1265,84 @@ impl<'a> Tokenizer<'a> {
12841265 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
12851266 }
12861267 } else if prev_token == Some ( & Token :: Period ) {
1287- // If the previous token was a period, thus not belonging to a number,
1288- // the value we have is part of an identifier.
1268+ // Handle as word if it follows a period
12891269 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
12901270 }
12911271 }
12921272
1273+ // Handle "L" suffix for long numbers
12931274 let long = if chars. peek ( ) == Some ( & 'L' ) {
12941275 chars. next ( ) ;
12951276 true
12961277 } else {
12971278 false
12981279 } ;
1280+
1281+ // Return the final token for the number
12991282 Ok ( Some ( Token :: Number ( s, long) ) )
13001283 }
1284+
1285+ // Period (`.`) handling
1286+ '.' => {
1287+ chars. next ( ) ; // consume the dot
1288+
1289+ match chars. peek ( ) {
1290+ Some ( '_' ) => {
1291+ // Handle "._" case as a period (special token) followed by identifier
1292+ Ok ( Some ( Token :: Period ) )
1293+ }
1294+ Some ( ch)
1295+ // Hive and mysql dialects allow numeric prefixes for identifiers
1296+ if ch. is_ascii_digit ( )
1297+ && self . dialect . supports_numeric_prefix ( )
1298+ && matches ! ( prev_token, Some ( Token :: Word ( _) ) ) =>
1299+ {
1300+ Ok ( Some ( Token :: Period ) )
1301+ }
1302+ Some ( ch) if ch. is_ascii_digit ( ) => {
1303+ // Handle numbers starting with a dot (e.g., ".123")
1304+ let mut s = String :: from ( "." ) ;
1305+ let is_number_separator = |ch : char , next_char : Option < char > | {
1306+ self . dialect . supports_numeric_literal_underscores ( )
1307+ && ch == '_'
1308+ && next_char. is_some_and ( |c| c. is_ascii_digit ( ) )
1309+ } ;
1310+
1311+ s += & peeking_next_take_while ( chars, |ch, next_ch| {
1312+ ch. is_ascii_digit ( ) || is_number_separator ( ch, next_ch)
1313+ } ) ;
1314+
1315+ // Handle exponent part
1316+ if matches ! ( chars. peek( ) , Some ( 'e' | 'E' ) ) {
1317+ let mut exp = String :: new ( ) ;
1318+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1319+
1320+ if matches ! ( chars. peek( ) , Some ( '+' | '-' ) ) {
1321+ exp. push ( chars. next ( ) . unwrap ( ) ) ;
1322+ }
1323+
1324+ if matches ! ( chars. peek( ) , Some ( c) if c. is_ascii_digit( ) ) {
1325+ exp += & peeking_take_while ( chars, |c| c. is_ascii_digit ( ) ) ;
1326+ s += & exp;
1327+ }
1328+ }
1329+
1330+ // Handle "L" suffix for long numbers
1331+ let long = if chars. peek ( ) == Some ( & 'L' ) {
1332+ chars. next ( ) ;
1333+ true
1334+ } else {
1335+ false
1336+ } ;
1337+
1338+ Ok ( Some ( Token :: Number ( s, long) ) )
1339+ }
1340+ _ => {
1341+ // Just a plain period
1342+ Ok ( Some ( Token :: Period ) )
1343+ }
1344+ }
1345+ }
13011346 // punctuation
13021347 '(' => self . consume_and_return ( chars, Token :: LParen ) ,
13031348 ')' => self . consume_and_return ( chars, Token :: RParen ) ,
@@ -2435,6 +2480,32 @@ mod tests {
24352480 compare ( expected, tokens) ;
24362481 }
24372482
#[test]
fn tokenize_period_underscore() {
    // A compound identifier whose second segment begins with an underscore
    // (`table._col`) must come out as Word, Period, Word — the `._` must not
    // be swallowed by numeric-literal handling, even in a dialect that allows
    // underscores inside numeric literals.
    let sql = "SELECT table._col";
    let dialect = PostgreSqlDialect {};
    let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Word(Word {
            value: "table".to_string(),
            quote_style: None,
            keyword: Keyword::TABLE,
        }),
        Token::Period,
        Token::Word(Word {
            value: "_col".to_string(),
            quote_style: None,
            keyword: Keyword::NoKeyword,
        }),
    ];

    compare(expected, tokens);
}
2508+
24382509 #[ test]
24392510 fn tokenize_select_float ( ) {
24402511 let sql = String :: from ( "SELECT .1" ) ;
0 commit comments