Add start and end index to tokens (#273)

alxflam · RubenVerborgh · web-flow · commit 5797c067af25 · 2022-03-22T19:14:21.000Z
Co-authored-by: Ruben Verborgh <ruben@verborgh.org>
diff --git a/src/N3Lexer.js b/src/N3Lexer.js
@@ -78,16 +78,17 @@ export default class N3Lexer {
   _tokenizeToEnd(callback, inputFinished) {
     // Continue parsing as far as possible; the loop will return eventually
     let input = this._input;
-    const outputComments = this._comments;
+    let currentLineLength = input.length;
     while (true) {
       // Count and skip whitespace lines
       let whiteSpaceMatch, comment;
       while (whiteSpaceMatch = this._newline.exec(input)) {
         // Try to find a comment
-        if (outputComments && (comment = this._comment.exec(whiteSpaceMatch[0])))
-          callback(null, { line: this._line, type: 'comment', value: comment[1], prefix: '' });
+        if (this._comments && (comment = this._comment.exec(whiteSpaceMatch[0])))
+          emitToken('comment', comment[1], '', this._line, whiteSpaceMatch[0].length);
         // Advance the input
         input = input.substr(whiteSpaceMatch[0].length, input.length);
+        currentLineLength = input.length;
         this._line++;
       }
       // Skip whitespace on current line
@@ -99,9 +100,10 @@ export default class N3Lexer {
         // If the input is finished, emit EOF
         if (inputFinished) {
           // Try to find a final comment
-          if (outputComments && (comment = this._comment.exec(input)))
-            callback(null, { line: this._line, type: 'comment', value: comment[1], prefix: '' });
-          callback(input = null, { line: this._line, type: 'eof', value: '', prefix: '' });
+          if (this._comments && (comment = this._comment.exec(input)))
+            emitToken('comment', comment[1], '', this._line, input.length);
+          input = null;
+          emitToken('eof', '', '', this._line, 0);
         }
         return this._input = input;
       }
@@ -345,14 +347,23 @@ export default class N3Lexer {
       }
 
       // Emit the parsed token
-      const token = { line: line, type: type, value: value, prefix: prefix };
-      callback(null, token);
+      const length = matchLength || match[0].length;
+      const token = emitToken(type, value, prefix, line, length);
       this.previousToken = token;
       this._previousMarker = type;
+
       // Advance to next part to tokenize
-      input = input.substr(matchLength || match[0].length, input.length);
+      input = input.substr(length, input.length);
     }
 
+    // Emits the token through the callback; start/end count from where currentLineLength was last reset (start of this call or the last consumed newline)
+    function emitToken(type, value, prefix, line, length) {
+      const start = input ? currentLineLength - input.length : currentLineLength; // characters consumed since the reset; a falsy input (null at EOF, or '') counts as fully consumed
+      const end = start + length;
+      const token = { type, value, prefix, line, start, end };
+      callback(null, token);
+      return token;
+    }
     // Signals the syntax error through the callback
     function reportSyntaxError(self) { callback(self._syntaxError(/^\S*/.exec(input)[0])); }
   }
diff --git a/test/N3Lexer-test.js b/test/N3Lexer-test.js
@@ -1090,6 +1090,57 @@ describe('Lexer', () => {
         { type: '.', line: 1 },
         { type: 'eof', line: 1 }));
 
+    it('returns start and end index for every token', () => {
+      const tokens = new Lexer().tokenize('<a:a> <b:c> "lit"@EN.');
+      tokens.should.deep.equal([
+        { line: 1, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 6 },
+        { line: 1, prefix: '', type: 'IRI', value: 'b:c', start: 6, end: 12 },
+        { line: 1, prefix: '', type: 'literal', value: 'lit', start: 12, end: 17 },
+        { line: 1, prefix: '', type: 'langcode', value: 'EN', start: 17, end: 20 },
+        { line: 1, prefix: '', type: '.', value: '', start: 20, end: 21 },
+        { line: 1, prefix: '', type: 'eof', value: '', start: 21, end: 21 },
+      ]);
+    });
+
+    it('returns start and end index relative to line', () => {
+      const tokens = new Lexer().tokenize('<a:a> <b:c> "lit"@EN ; \n <b:d> <d:e> .');
+      tokens.should.deep.equal([
+        { line: 1, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 6 },
+        { line: 1, prefix: '', type: 'IRI', value: 'b:c', start: 6, end: 12 },
+        { line: 1, prefix: '', type: 'literal', value: 'lit', start: 12, end: 17 },
+        { line: 1, prefix: '', type: 'langcode', value: 'EN', start: 17, end: 20 },
+        { line: 1, prefix: '', type: ';', value: '', start: 21, end: 22 },
+        { line: 2, prefix: '', type: 'IRI', value: 'b:d', start: 0, end: 6 },
+        { line: 2, prefix: '', type: 'IRI', value: 'd:e', start: 6, end: 12 },
+        { line: 2, prefix: '', type: '.', value: '', start: 12, end: 13 },
+        { line: 2, prefix: '', type: 'eof', value: '', start: 13, end: 13 },
+      ]);
+    });
+
+    it('returns index including whitespaces', () => {
+      const tokens = new Lexer().tokenize('<a:a>   <b:c>    <d:e>  .');
+      tokens.should.deep.equal([
+        { line: 1, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 8 },
+        { line: 1, prefix: '', type: 'IRI', value: 'b:c', start: 8, end: 17 },
+        { line: 1, prefix: '', type: 'IRI', value: 'd:e', start: 17, end: 24 },
+        { line: 1, prefix: '', type: '.', value: '', start: 24, end: 25 },
+        { line: 1, prefix: '', type: 'eof', value: '', start: 25, end: 25 },
+      ]);
+    });
+
+    it('returns index for comments and eof', () => {
+      const tokens = new Lexer({ comments: true }).tokenize('# some\n<a:a> <b:b> <c:c> . # trailing comment\n# thing');
+      tokens.should.deep.equal([
+        { line: 1, prefix: '', type: 'comment', value: ' some', start: 0, end: 7 },
+        { line: 2, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 6 },
+        { line: 2, prefix: '', type: 'IRI', value: 'b:b', start: 6, end: 12 },
+        { line: 2, prefix: '', type: 'IRI', value: 'c:c', start: 12, end: 18 },
+        { line: 2, prefix: '', type: '.', value: '', start: 18, end: 19 },
+        { line: 2, prefix: '', type: 'comment', value: ' trailing comment', start: 19, end: 39 },
+        { line: 3, prefix: '', type: 'comment', value: ' thing', start: 0, end: 7 },
+        { line: 3, prefix: '', type: 'eof', value: '', start: 7, end: 7 },
+      ]);
+    });
 
     describe('passing data after the stream has been finished', () => {
       const tokens = [];
@@ -1171,11 +1222,11 @@ describe('Lexer', () => {
 
       it('returns all tokens synchronously', () => {
         tokens.should.deep.equal([
-          { line: 1, type: 'IRI', value: 'a', prefix: '' },
-          { line: 1, type: 'IRI', value: 'b', prefix: '' },
-          { line: 1, type: 'IRI', value: 'c', prefix: '' },
-          { line: 1, type: '.',   value: '',  prefix: '' },
-          { line: 1, type: 'eof', value: '',  prefix: '' },
+          { line: 1, type: 'IRI', value: 'a', prefix: '', start: 0,  end:  4 },
+          { line: 1, type: 'IRI', value: 'b', prefix: '', start: 4,  end:  8 },
+          { line: 1, type: 'IRI', value: 'c', prefix: '', start: 8,  end: 11 },
+          { line: 1, type: '.',   value: '',  prefix: '', start: 11, end: 12 },
+          { line: 1, type: 'eof', value: '',  prefix: '', start: 12, end: 12 },
         ]);
       });
     });
@@ -1240,6 +1291,8 @@ describe('A Lexer instance with the comment option set to true', () => {
 
 function shouldTokenize(lexer, input) {
   const expected = Array.prototype.slice.call(arguments, 1);
+  const ignoredAttributes = { start: true, end: true };
+
   // Shift parameters as necessary
   if (lexer instanceof Lexer)
     expected.shift();
@@ -1256,7 +1309,8 @@ function shouldTokenize(lexer, input) {
       const expectedItem = expected[result.length];
       if (expectedItem)
         for (const attribute in token)
-          if (token[attribute] === '' && expectedItem[attribute] !== '')
+          if (typeof expectedItem[attribute] === 'undefined' &&
+              (token[attribute] === '' || (ignoredAttributes[attribute])))
             delete token[attribute];
       result.push(token);
       if (token.type === 'eof') {
diff --git a/test/N3Parser-test.js b/test/N3Parser-test.js
@@ -89,13 +89,17 @@ describe('Parser', () => {
                          type: 'type',
                          value: 'z',
                          prefix: 'x',
+                         start: 18,
+                         end: 21,
                        },
                        line: 1,
                        previousToken: {
                          line: 1,
                          type: 'literal',
                          value: 'string',
                          prefix: '',
+                         start: 8,
+                         end: 16,
                        },
                      }));
 
@@ -132,6 +136,8 @@ describe('Parser', () => {
                          type: '@PREFIX',
                          value: '',
                          prefix: '',
+                         start: 0,
+                         end: 7,
                        },
                        previousToken: undefined,
                        line: 1,
@@ -232,13 +238,17 @@ describe('Parser', () => {
                          type: '[',
                          value: '',
                          prefix: '',
+                         start: 4,
+                         end: 5,
                        },
                        line: 2,
                        previousToken: {
                          line: 2,
                          type: 'IRI',
                          value: 'a',
                          prefix: '',
+                         start: 0,
+                         end: 4,
                        },
                      }));
 
@@ -250,13 +260,17 @@ describe('Parser', () => {
                          type: 'blank',
                          value: 'b',
                          prefix: '_',
+                         start: 4,
+                         end: 8,
                        },
                        line: 2,
                        previousToken: {
                          line: 2,
                          type: 'IRI',
                          value: 'a',
                          prefix: '',
+                         start: 0,
+                         end: 4,
                        },
                      }));