Skip to content

Commit 343cb64

Browse files
committed
bq: parse regex literals (consider merging with snowflake str parsing)
1 parent c375ae2 commit 343cb64

3 files changed

Lines changed: 57 additions & 6 deletions

File tree

src/ast/value.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ pub enum Value {
2727
Number(BigDecimal),
2828
/// 'string value'
2929
SingleQuotedString(String),
30+
/// r'string value'
31+
RegexLiteral { value: String, quote: char },
3032
/// N'string value'
3133
NationalStringLiteral(String),
3234
/// X'hex value'
@@ -62,6 +64,7 @@ impl fmt::Display for Value {
6264
match self {
6365
Value::Number(v) => write!(f, "{}", v),
6466
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
67+
Value::RegexLiteral { ref value, quote } => write!(f, "r{}{}{}", quote, value, quote),
6568
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
6669
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
6770
Value::Boolean(v) => write!(f, "{}", v),

src/parser.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,9 @@ impl<'a> Parser<'a> {
277277
value: w.clone(),
278278
quote_style: Some('`'),
279279
}),
280+
Token::BqRegexQuotedString { value, quote } => {
281+
Ok(Expr::Value(Value::RegexLiteral { value, quote }))
282+
}
280283
Token::Mult => Ok(Expr::Wildcard),
281284
tok @ Token::Minus | tok @ Token::Plus => {
282285
let op = if tok == Token::Plus {

src/tokenizer.rs

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ pub enum Token {
4646
SingleQuotedString(String),
4747
/// Backtick quoted string: i.e: `string`
4848
BacktickQuotedString(String),
49+
BqRegexQuotedString {
50+
value: String,
51+
quote: char,
52+
},
4953
/// "National" string literal: i.e: N'string'
5054
NationalStringLiteral(String),
5155
/// Hexadecimal string literal: i.e.: X'deadbeef'
@@ -163,6 +167,9 @@ impl fmt::Display for Token {
163167
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
164168
Token::BacktickQuotedString(ref s) => write!(f, "`{}`", s),
165169
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
170+
Token::BqRegexQuotedString { ref value, quote } => {
171+
write!(f, "r{}{}{}", quote, value, quote)
172+
}
166173
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
167174
Token::Comma => f.write_str(","),
168175
Token::Whitespace(ws) => write!(f, "{}", ws),
@@ -372,6 +379,26 @@ impl<'a> Tokenizer<'a> {
372379
}
373380
Ok(Some(Token::Whitespace(Whitespace::Newline)))
374381
}
382+
r @ 'r' | r @ 'R' if dialect_of!(self is BigQueryDialect) => {
383+
chars.next(); // consume, to check the next char
384+
match chars.peek() {
385+
Some('\'') => {
386+
// r'...' - a regex literal
387+
let value = self.tokenize_single_quoted_string(chars)?;
388+
Ok(Some(Token::BqRegexQuotedString { value, quote: '\'' }))
389+
}
390+
Some('"') => {
391+
// r"..." - a regex literal
392+
let value = self.tokenize_double_quoted_string(chars)?;
393+
Ok(Some(Token::BqRegexQuotedString { value, quote: '"' }))
394+
}
395+
_ => {
396+
// regular identifier starting with an "r" or "R"
397+
let s = self.tokenize_word(r, chars);
398+
Ok(Some(Token::make_word(&s, None)))
399+
}
400+
}
401+
}
375402
'N' => {
376403
chars.next(); // consume, to check the next char
377404
match chars.peek() {
@@ -664,11 +691,29 @@ impl<'a> Tokenizer<'a> {
664691
fn tokenize_single_quoted_string(
665692
&self,
666693
chars: &mut Peekable<Chars<'_>>,
694+
) -> Result<String, TokenizerError> {
695+
self.tokenize_quoted_string(chars, '\'')
696+
}
697+
698+
/// Read a double quoted string, starting with the opening quote.
699+
fn tokenize_double_quoted_string(
700+
&self,
701+
chars: &mut Peekable<Chars<'_>>,
702+
) -> Result<String, TokenizerError> {
703+
self.tokenize_quoted_string(chars, '"')
704+
}
705+
706+
/// Read a quoted string (quoted by any character, typically ' or "),
707+
/// starting with the opening quote.
708+
fn tokenize_quoted_string(
709+
&self,
710+
chars: &mut Peekable<Chars<'_>>,
711+
quote_ch: char,
667712
) -> Result<String, TokenizerError> {
668713
let mut s = String::new();
669714
chars.next(); // consume the opening quote
670715
while let Some(ch) = chars.next() {
671-
let next_char_is_quote = chars.peek().map(|c| *c == '\'').unwrap_or(false);
716+
let next_char_is_quote = chars.peek().map(|c| *c == quote_ch).unwrap_or(false);
672717
match ch {
673718
// allow backslash to escape the next character, whatever it is
674719
'\\' => {
@@ -680,14 +725,14 @@ impl<'a> Tokenizer<'a> {
680725
// bq allows escaping only with backslash; other warehouses
681726
// allow escaping the quote character by repeating it
682727
_ if !dialect_of!(self is BigQueryDialect)
683-
&& ch == '\''
728+
&& ch == quote_ch
684729
&& next_char_is_quote =>
685730
{
686-
s.push('\'');
687-
s.push('\'');
688-
chars.next(); // consume '
731+
s.push(quote_ch);
732+
s.push(quote_ch);
733+
chars.next(); // consume quote_ch
689734
}
690-
'\'' => return Ok(s),
735+
ch if ch == quote_ch => return Ok(s),
691736
_ => s.push(ch),
692737
}
693738
}

0 commit comments

Comments
 (0)