Add support for comment hints

altmannmarcelo · altmannmarcelo · commit f8df3336c9d0 · 2026-02-03T21:03:01.000+05:30
This commit adds support for the comment hints supported by MySQL.
It parses and consumes the optional version number after the `!`
character and returns all tokens inside a comment hint.
diff --git a/src/dialect/generic.rs b/src/dialect/generic.rs
@@ -156,6 +156,10 @@ impl Dialect for GenericDialect {
         true
     }
 
+    fn supports_multiline_comment_hints(&self) -> bool {
+        true
+    }
+
     fn supports_user_host_grantee(&self) -> bool {
         true
     }
diff --git a/src/dialect/mod.rs b/src/dialect/mod.rs
@@ -898,6 +898,12 @@ pub trait Dialect: Debug + Any {
         false
     }
 
+    /// Returns true if the dialect supports comment hints (`/*! ... */`), whose contents
+    /// are tokenized as regular SQL, e.g. `/*!50110 KEY_BLOCK_SIZE = 1024*/`
+    fn supports_multiline_comment_hints(&self) -> bool {
+        false
+    }
+
     /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
     /// as an alias assignment operator, rather than a boolean expression.
     /// For example: the following statements are equivalent for such a dialect:
diff --git a/src/dialect/mysql.rs b/src/dialect/mysql.rs
@@ -84,6 +84,11 @@ impl Dialect for MySqlDialect {
         true
     }
 
+    /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
+    fn supports_multiline_comment_hints(&self) -> bool {
+        true
+    }
+
     fn parse_infix(
         &self,
         parser: &mut crate::parser::Parser,
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
@@ -901,13 +901,69 @@ impl<'a> Tokenizer<'a> {
         while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
             let span = location.span_to(state.location());
 
-            buf.push(TokenWithSpan { token, span });
+            // Check if this is a multiline comment hint that should be expanded
+            match &token {
+                Token::Whitespace(Whitespace::MultiLineComment(comment))
+                    if self.dialect.supports_multiline_comment_hints()
+                        && comment.starts_with('!') =>
+                {
+                    // Re-tokenize the hints and add them to the buffer
+                    self.tokenize_comment_hints(comment, span, buf)?;
+                }
+                _ => {
+                    buf.push(TokenWithSpan { token, span });
+                }
+            }
 
             location = state.location();
         }
         Ok(())
     }
 
+    /// Re-tokenize the contents of a comment hint (`/*! ... */`) and add the tokens to the buffer.
+    /// For example, `/*!50110 KEY_BLOCK_SIZE = 1024*/` becomes tokens for `KEY_BLOCK_SIZE = 1024`
+    fn tokenize_comment_hints(
+        &self,
+        comment: &str,
+        span: Span,
+        buf: &mut Vec<TokenWithSpan>,
+    ) -> Result<(), TokenizerError> {
+        // Strip the leading '!' and any version digits (e.g., "50110")
+        let hint_content = comment
+            .strip_prefix('!')
+            .unwrap_or(comment)
+            .trim_start_matches(|c: char| c.is_ascii_digit())
+            .trim();
+
+        // If there's no content after stripping, nothing to tokenize
+        if hint_content.is_empty() {
+            return Ok(());
+        }
+
+        // Create a new tokenizer for the hint content
+        let mut inner = Tokenizer::new(self.dialect, hint_content).with_unescape(self.unescape);
+
+        // Create a state for tracking position within the hint
+        let mut state = State {
+            peekable: hint_content.chars().peekable(),
+            line: span.start.line,
+            col: span.start.column,
+        };
+
+        // Tokenize the hint content and add tokens to the buffer
+        let mut location = state.location();
+        while let Some(token) = inner.next_token(&mut state, buf.last().map(|t| &t.token))? {
+            let token_span = location.span_to(state.location());
+            buf.push(TokenWithSpan {
+                token,
+                span: token_span,
+            });
+            location = state.location();
+        }
+
+        Ok(())
+    }
+
     // Tokenize the identifier or keywords in `ch`
     fn tokenize_identifier_or_keyword(
         &self,
@@ -936,7 +992,7 @@ impl<'a> Tokenizer<'a> {
 
     /// Get the next token or return None
     fn next_token(
-        &self,
+        &mut self,
         chars: &mut State,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
@@ -2102,13 +2158,12 @@ impl<'a> Tokenizer<'a> {
     }
 
     fn tokenize_multiline_comment(
-        &self,
+        &mut self,
         chars: &mut State,
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;
         let supports_nested_comments = self.dialect.supports_nested_comments();
-
         loop {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@@ -4070,4 +4125,44 @@ mod tests {
             panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
         }
     }
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment() {
+        let sql = String::from("0/*! word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment_and_version() {
+        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "KEY_BLOCK_SIZE".to_string(),
+                quote_style: None,
+                keyword: Keyword::KEY_BLOCK_SIZE,
+            }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Eq,
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1024".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
 }

Original file line number	Diff line number	Diff line change
`@@ -156,6 +156,10 @@ impl Dialect for GenericDialect {`
`156`	`156`	`true`
`157`	`157`	`}`
`158`	`158`
	`159`	`+ fn supports_multiline_comment_hints(&self) -> bool {`
	`160`	`+ true`
	`161`	`+ }`
	`162`	`+`
`159`	`163`	`fn supports_user_host_grantee(&self) -> bool {`
`160`	`164`	`true`
`161`	`165`	`}`