Skip to content

Commit 141363b

Browse files
Альберт Скальт (askalt)
authored and committed
add iterator over tokens in Tokenizer
This patch adds the ability to iterate over recognized tokens by converting a `Tokenizer` into an iterator. This allows token mapping to be performed in a single pass, instead of mapping the resulting vector in an additional loop.
1 parent 3ac5670 commit 141363b

1 file changed

Lines changed: 104 additions & 17 deletions

File tree

src/tokenizer.rs

Lines changed: 104 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,26 @@ pub struct Tokenizer<'a> {
856856
unescape: bool,
857857
}
858858

859+
/// Coarse classification of the previously produced token.
///
/// Passed into [`Tokenizer::next_token`] because in some situations
/// tokenization is context dependent. Carrying only this lightweight,
/// `Copy` kind — rather than a clone of the whole previous [`Token`] —
/// keeps iteration via [`TokenWithSpanIter`] cheap.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PrevTokenKind {
    /// The previous token was a word / identifier.
    Word,
    /// The previous token was a `.`.
    Period,
    /// Any other token kind; no context-sensitive rule applies.
    Other,
}
868+
869+
impl From<&Token> for PrevTokenKind {
870+
fn from(value: &Token) -> Self {
871+
match value {
872+
Token::Word(_) => Self::Word,
873+
Token::Period => Self::Period,
874+
_ => Self::Other,
875+
}
876+
}
877+
}
878+
859879
impl<'a> Tokenizer<'a> {
860880
/// Create a new SQL tokenizer for the specified SQL statement
861881
///
@@ -916,6 +936,23 @@ impl<'a> Tokenizer<'a> {
916936
self
917937
}
918938

939+
/// Return an iterator over tokens
940+
pub fn iter(&mut self) -> TokenWithSpanIter<'a, '_> {
941+
let state = State {
942+
peekable: self.query.chars().peekable(),
943+
line: 1,
944+
col: 1,
945+
};
946+
947+
let location = state.location();
948+
TokenWithSpanIter {
949+
state,
950+
location,
951+
tokenizer: self,
952+
prev_token_kind: None,
953+
}
954+
}
955+
919956
/// Tokenize the statement and produce a vector of tokens
920957
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
921958
let twl = self.tokenize_with_location()?;
@@ -935,19 +972,8 @@ impl<'a> Tokenizer<'a> {
935972
&mut self,
936973
buf: &mut Vec<TokenWithSpan>,
937974
) -> Result<(), TokenizerError> {
938-
let mut state = State {
939-
peekable: self.query.chars().peekable(),
940-
line: 1,
941-
col: 1,
942-
};
943-
944-
let mut location = state.location();
945-
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
946-
let span = location.span_to(state.location());
947-
948-
buf.push(TokenWithSpan { token, span });
949-
950-
location = state.location();
975+
for token in self.iter() {
976+
buf.push(token?);
951977
}
952978
Ok(())
953979
}
@@ -982,7 +1008,7 @@ impl<'a> Tokenizer<'a> {
9821008
fn next_token(
9831009
&self,
9841010
chars: &mut State,
985-
prev_token: Option<&Token>,
1011+
prev_token_kind: Option<PrevTokenKind>,
9861012
) -> Result<Option<Token>, TokenizerError> {
9871013
match chars.peek() {
9881014
Some(&ch) => match ch {
@@ -1262,7 +1288,7 @@ impl<'a> Tokenizer<'a> {
12621288
// if the prev token is not a word, then this is not a valid sql
12631289
// word or number.
12641290
if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
1265-
if let Some(Token::Word(_)) = prev_token {
1291+
if let Some(PrevTokenKind::Word) = prev_token_kind {
12661292
chars.next();
12671293
return Ok(Some(Token::Period));
12681294
}
@@ -1306,7 +1332,7 @@ impl<'a> Tokenizer<'a> {
13061332
// we should yield the dot as a dedicated token so compound identifiers
13071333
// starting with digits can be parsed correctly.
13081334
if s == "." && self.dialect.supports_numeric_prefix() {
1309-
if let Some(Token::Word(_)) = prev_token {
1335+
if let Some(PrevTokenKind::Word) = prev_token_kind {
13101336
return Ok(Some(Token::Period));
13111337
}
13121338
}
@@ -1365,7 +1391,7 @@ impl<'a> Tokenizer<'a> {
13651391
s += word.as_str();
13661392
return Ok(Some(Token::make_word(s.as_str(), None)));
13671393
}
1368-
} else if prev_token == Some(&Token::Period) {
1394+
} else if prev_token_kind == Some(PrevTokenKind::Period) {
13691395
// If the previous token was a period, thus not belonging to a number,
13701396
// the value we have is part of an identifier.
13711397
return Ok(Some(Token::make_word(s.as_str(), None)));
@@ -2298,6 +2324,34 @@ impl<'a> Tokenizer<'a> {
22982324
}
22992325
}
23002326

2327+
/// Iterator over tokens.
2328+
pub struct TokenWithSpanIter<'a, 'b> {
2329+
state: State<'a>,
2330+
location: Location,
2331+
tokenizer: &'b mut Tokenizer<'a>,
2332+
prev_token_kind: Option<PrevTokenKind>,
2333+
}
2334+
2335+
impl Iterator for TokenWithSpanIter<'_, '_> {
2336+
type Item = Result<TokenWithSpan, TokenizerError>;
2337+
2338+
fn next(&mut self) -> Option<Self::Item> {
2339+
let token = match self
2340+
.tokenizer
2341+
.next_token(&mut self.state, self.prev_token_kind)
2342+
.transpose()?
2343+
{
2344+
Err(err) => return Some(Err(err)),
2345+
Ok(token) => token,
2346+
};
2347+
self.prev_token_kind = Some(PrevTokenKind::from(&token));
2348+
let span = self.location.span_to(self.state.location());
2349+
self.location = self.state.location();
2350+
let token = TokenWithSpan { token, span };
2351+
Some(Ok(token))
2352+
}
2353+
}
2354+
23012355
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
23022356
/// Return the characters read as String, and keep the first non-matching
23032357
/// char available as `chars.next()`.
@@ -2575,6 +2629,39 @@ mod tests {
25752629
compare(expected, tokens);
25762630
}
25772631

2632+
#[test]
2633+
fn tokenize_iterator_map() {
2634+
let sql = String::from("SELECT ?");
2635+
let dialect = GenericDialect {};
2636+
let mut param_num = 1;
2637+
2638+
let tokens = Tokenizer::new(&dialect, &sql)
2639+
.iter()
2640+
.map(|token| {
2641+
let token = token?;
2642+
Ok(match token.token {
2643+
Token::Placeholder(n) => Token::Placeholder(if n == "?" {
2644+
let ret = format!("${}", param_num);
2645+
param_num += 1;
2646+
ret
2647+
} else {
2648+
n
2649+
}),
2650+
_ => token.token,
2651+
})
2652+
})
2653+
.collect::<Result<Vec<_>, TokenizerError>>()
2654+
.unwrap();
2655+
2656+
let expected = vec![
2657+
Token::make_keyword("SELECT"),
2658+
Token::Whitespace(Whitespace::Space),
2659+
Token::Placeholder("$1".to_string()),
2660+
];
2661+
2662+
compare(expected, tokens);
2663+
}
2664+
25782665
#[test]
25792666
fn tokenize_select_float() {
25802667
let sql = String::from("SELECT .1");

0 commit comments

Comments
 (0)