@@ -856,6 +856,26 @@ pub struct Tokenizer<'a> {
856856 unescape : bool ,
857857}
858858
/// Passed into [`Tokenizer::next_token`] as in some situations tokenization
/// is context dependent. The separate enum is used to be able to not clone
/// the previous token during [`TokenWithSpanIter`] iteration.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PrevTokenKind {
    /// The previous token was a [`Token::Word`].
    Word,
    /// The previous token was a [`Token::Period`].
    Period,
    /// Any other token kind; the distinction does not matter to the tokenizer.
    Other,
}
868+
869+ impl From < & Token > for PrevTokenKind {
870+ fn from ( value : & Token ) -> Self {
871+ match value {
872+ Token :: Word ( _) => Self :: Word ,
873+ Token :: Period => Self :: Period ,
874+ _ => Self :: Other ,
875+ }
876+ }
877+ }
878+
859879impl < ' a > Tokenizer < ' a > {
860880 /// Create a new SQL tokenizer for the specified SQL statement
861881 ///
@@ -916,6 +936,23 @@ impl<'a> Tokenizer<'a> {
916936 self
917937 }
918938
939+ /// Return an iterator over tokens
940+ pub fn iter ( & mut self ) -> TokenWithSpanIter < ' a , ' _ > {
941+ let state = State {
942+ peekable : self . query . chars ( ) . peekable ( ) ,
943+ line : 1 ,
944+ col : 1 ,
945+ } ;
946+
947+ let location = state. location ( ) ;
948+ TokenWithSpanIter {
949+ state,
950+ location,
951+ tokenizer : self ,
952+ prev_token_kind : None ,
953+ }
954+ }
955+
919956 /// Tokenize the statement and produce a vector of tokens
920957 pub fn tokenize ( & mut self ) -> Result < Vec < Token > , TokenizerError > {
921958 let twl = self . tokenize_with_location ( ) ?;
@@ -935,19 +972,8 @@ impl<'a> Tokenizer<'a> {
935972 & mut self ,
936973 buf : & mut Vec < TokenWithSpan > ,
937974 ) -> Result < ( ) , TokenizerError > {
938- let mut state = State {
939- peekable : self . query . chars ( ) . peekable ( ) ,
940- line : 1 ,
941- col : 1 ,
942- } ;
943-
944- let mut location = state. location ( ) ;
945- while let Some ( token) = self . next_token ( & mut state, buf. last ( ) . map ( |t| & t. token ) ) ? {
946- let span = location. span_to ( state. location ( ) ) ;
947-
948- buf. push ( TokenWithSpan { token, span } ) ;
949-
950- location = state. location ( ) ;
975+ for token in self . iter ( ) {
976+ buf. push ( token?) ;
951977 }
952978 Ok ( ( ) )
953979 }
@@ -982,7 +1008,7 @@ impl<'a> Tokenizer<'a> {
9821008 fn next_token (
9831009 & self ,
9841010 chars : & mut State ,
985- prev_token : Option < & Token > ,
1011+ prev_token_kind : Option < PrevTokenKind > ,
9861012 ) -> Result < Option < Token > , TokenizerError > {
9871013 match chars. peek ( ) {
9881014 Some ( & ch) => match ch {
@@ -1262,7 +1288,7 @@ impl<'a> Tokenizer<'a> {
12621288 // if the prev token is not a word, then this is not a valid sql
12631289 // word or number.
12641290 if ch == '.' && chars. peekable . clone ( ) . nth ( 1 ) == Some ( '_' ) {
1265- if let Some ( Token :: Word ( _ ) ) = prev_token {
1291+ if let Some ( PrevTokenKind :: Word ) = prev_token_kind {
12661292 chars. next ( ) ;
12671293 return Ok ( Some ( Token :: Period ) ) ;
12681294 }
@@ -1306,7 +1332,7 @@ impl<'a> Tokenizer<'a> {
13061332 // we should yield the dot as a dedicated token so compound identifiers
13071333 // starting with digits can be parsed correctly.
13081334 if s == "." && self . dialect . supports_numeric_prefix ( ) {
1309- if let Some ( Token :: Word ( _ ) ) = prev_token {
1335+ if let Some ( PrevTokenKind :: Word ) = prev_token_kind {
13101336 return Ok ( Some ( Token :: Period ) ) ;
13111337 }
13121338 }
@@ -1365,7 +1391,7 @@ impl<'a> Tokenizer<'a> {
13651391 s += word. as_str ( ) ;
13661392 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
13671393 }
1368- } else if prev_token == Some ( & Token :: Period ) {
1394+ } else if prev_token_kind == Some ( PrevTokenKind :: Period ) {
13691395 // If the previous token was a period, thus not belonging to a number,
13701396 // the value we have is part of an identifier.
13711397 return Ok ( Some ( Token :: make_word ( s. as_str ( ) , None ) ) ) ;
@@ -2298,6 +2324,34 @@ impl<'a> Tokenizer<'a> {
22982324 }
22992325}
23002326
/// Iterator over tokens.
///
/// Created by [`Tokenizer::iter`]; yields `Result<TokenWithSpan, TokenizerError>`
/// so tokenization errors are reported during iteration.
pub struct TokenWithSpanIter<'a, 'b> {
    // Character-level scan state (peekable char stream plus line/col tracking).
    state: State<'a>,
    // Start location of the next token to be produced; advanced after every
    // `next()` so each token's span can be computed incrementally.
    location: Location,
    // Borrowed tokenizer supplying dialect-specific tokenization behavior.
    tokenizer: &'b mut Tokenizer<'a>,
    // Kind of the previously yielded token (`None` before the first token);
    // some tokenization decisions are context dependent on it.
    prev_token_kind: Option<PrevTokenKind>,
}
2334+
2335+ impl Iterator for TokenWithSpanIter < ' _ , ' _ > {
2336+ type Item = Result < TokenWithSpan , TokenizerError > ;
2337+
2338+ fn next ( & mut self ) -> Option < Self :: Item > {
2339+ let token = match self
2340+ . tokenizer
2341+ . next_token ( & mut self . state , self . prev_token_kind )
2342+ . transpose ( ) ?
2343+ {
2344+ Err ( err) => return Some ( Err ( err) ) ,
2345+ Ok ( token) => token,
2346+ } ;
2347+ self . prev_token_kind = Some ( PrevTokenKind :: from ( & token) ) ;
2348+ let span = self . location . span_to ( self . state . location ( ) ) ;
2349+ self . location = self . state . location ( ) ;
2350+ let token = TokenWithSpan { token, span } ;
2351+ Some ( Ok ( token) )
2352+ }
2353+ }
2354+
23012355/// Read from `chars` until `predicate` returns `false` or EOF is hit.
23022356/// Return the characters read as String, and keep the first non-matching
23032357/// char available as `chars.next()`.
@@ -2575,6 +2629,39 @@ mod tests {
25752629 compare ( expected, tokens) ;
25762630 }
25772631
2632+ #[ test]
2633+ fn tokenize_iterator_map ( ) {
2634+ let sql = String :: from ( "SELECT ?" ) ;
2635+ let dialect = GenericDialect { } ;
2636+ let mut param_num = 1 ;
2637+
2638+ let tokens = Tokenizer :: new ( & dialect, & sql)
2639+ . iter ( )
2640+ . map ( |token| {
2641+ let token = token?;
2642+ Ok ( match token. token {
2643+ Token :: Placeholder ( n) => Token :: Placeholder ( if n == "?" {
2644+ let ret = format ! ( "${}" , param_num) ;
2645+ param_num += 1 ;
2646+ ret
2647+ } else {
2648+ n
2649+ } ) ,
2650+ _ => token. token ,
2651+ } )
2652+ } )
2653+ . collect :: < Result < Vec < _ > , TokenizerError > > ( )
2654+ . unwrap ( ) ;
2655+
2656+ let expected = vec ! [
2657+ Token :: make_keyword( "SELECT" ) ,
2658+ Token :: Whitespace ( Whitespace :: Space ) ,
2659+ Token :: Placeholder ( "$1" . to_string( ) ) ,
2660+ ] ;
2661+
2662+ compare ( expected, tokens) ;
2663+ }
2664+
25782665 #[ test]
25792666 fn tokenize_select_float ( ) {
25802667 let sql = String :: from ( "SELECT .1" ) ;
0 commit comments