Skip to content

Commit 5797c06

Browse files
Add start and end index to tokens (#273)
Co-authored-by: Ruben Verborgh <ruben@verborgh.org>
1 parent b7e9a84 commit 5797c06

3 files changed

Lines changed: 94 additions & 15 deletions

File tree

src/N3Lexer.js

Lines changed: 20 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -78,16 +78,17 @@ export default class N3Lexer {
7878
_tokenizeToEnd(callback, inputFinished) {
7979
// Continue parsing as far as possible; the loop will return eventually
8080
let input = this._input;
81-
const outputComments = this._comments;
81+
let currentLineLength = input.length;
8282
while (true) {
8383
// Count and skip whitespace lines
8484
let whiteSpaceMatch, comment;
8585
while (whiteSpaceMatch = this._newline.exec(input)) {
8686
// Try to find a comment
87-
if (outputComments && (comment = this._comment.exec(whiteSpaceMatch[0])))
88-
callback(null, { line: this._line, type: 'comment', value: comment[1], prefix: '' });
87+
if (this._comments && (comment = this._comment.exec(whiteSpaceMatch[0])))
88+
emitToken('comment', comment[1], '', this._line, whiteSpaceMatch[0].length);
8989
// Advance the input
9090
input = input.substr(whiteSpaceMatch[0].length, input.length);
91+
currentLineLength = input.length;
9192
this._line++;
9293
}
9394
// Skip whitespace on current line
@@ -99,9 +100,10 @@ export default class N3Lexer {
99100
// If the input is finished, emit EOF
100101
if (inputFinished) {
101102
// Try to find a final comment
102-
if (outputComments && (comment = this._comment.exec(input)))
103-
callback(null, { line: this._line, type: 'comment', value: comment[1], prefix: '' });
104-
callback(input = null, { line: this._line, type: 'eof', value: '', prefix: '' });
103+
if (this._comments && (comment = this._comment.exec(input)))
104+
emitToken('comment', comment[1], '', this._line, input.length);
105+
input = null;
106+
emitToken('eof', '', '', this._line, 0);
105107
}
106108
return this._input = input;
107109
}
@@ -345,14 +347,23 @@ export default class N3Lexer {
345347
}
346348

347349
// Emit the parsed token
348-
const token = { line: line, type: type, value: value, prefix: prefix };
349-
callback(null, token);
350+
const length = matchLength || match[0].length;
351+
const token = emitToken(type, value, prefix, line, length);
350352
this.previousToken = token;
351353
this._previousMarker = type;
354+
352355
// Advance to next part to tokenize
353-
input = input.substr(matchLength || match[0].length, input.length);
356+
input = input.substr(length, input.length);
354357
}
355358

359+
// Emits the token through the callback
360+
function emitToken(type, value, prefix, line, length) {
361+
const start = input ? currentLineLength - input.length : currentLineLength;
362+
const end = start + length;
363+
const token = { type, value, prefix, line, start, end };
364+
callback(null, token);
365+
return token;
366+
}
356367
// Signals the syntax error through the callback
357368
function reportSyntaxError(self) { callback(self._syntaxError(/^\S*/.exec(input)[0])); }
358369
}

test/N3Lexer-test.js

Lines changed: 60 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1090,6 +1090,57 @@ describe('Lexer', () => {
10901090
{ type: '.', line: 1 },
10911091
{ type: 'eof', line: 1 }));
10921092

1093+
it('returns start and end index for every token', () => {
1094+
const tokens = new Lexer().tokenize('<a:a> <b:c> "lit"@EN.');
1095+
tokens.should.deep.equal([
1096+
{ line: 1, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 6 },
1097+
{ line: 1, prefix: '', type: 'IRI', value: 'b:c', start: 6, end: 12 },
1098+
{ line: 1, prefix: '', type: 'literal', value: 'lit', start: 12, end: 17 },
1099+
{ line: 1, prefix: '', type: 'langcode', value: 'EN', start: 17, end: 20 },
1100+
{ line: 1, prefix: '', type: '.', value: '', start: 20, end: 21 },
1101+
{ line: 1, prefix: '', type: 'eof', value: '', start: 21, end: 21 },
1102+
]);
1103+
});
1104+
1105+
it('returns start and end index relative to line', () => {
1106+
const tokens = new Lexer().tokenize('<a:a> <b:c> "lit"@EN ; \n <b:d> <d:e> .');
1107+
tokens.should.deep.equal([
1108+
{ line: 1, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 6 },
1109+
{ line: 1, prefix: '', type: 'IRI', value: 'b:c', start: 6, end: 12 },
1110+
{ line: 1, prefix: '', type: 'literal', value: 'lit', start: 12, end: 17 },
1111+
{ line: 1, prefix: '', type: 'langcode', value: 'EN', start: 17, end: 20 },
1112+
{ line: 1, prefix: '', type: ';', value: '', start: 21, end: 22 },
1113+
{ line: 2, prefix: '', type: 'IRI', value: 'b:d', start: 0, end: 6 },
1114+
{ line: 2, prefix: '', type: 'IRI', value: 'd:e', start: 6, end: 12 },
1115+
{ line: 2, prefix: '', type: '.', value: '', start: 12, end: 13 },
1116+
{ line: 2, prefix: '', type: 'eof', value: '', start: 13, end: 13 },
1117+
]);
1118+
});
1119+
1120+
it('returns index including whitespaces', () => {
1121+
const tokens = new Lexer().tokenize('<a:a> <b:c> <d:e> .');
1122+
tokens.should.deep.equal([
1123+
{ line: 1, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 8 },
1124+
{ line: 1, prefix: '', type: 'IRI', value: 'b:c', start: 8, end: 17 },
1125+
{ line: 1, prefix: '', type: 'IRI', value: 'd:e', start: 17, end: 24 },
1126+
{ line: 1, prefix: '', type: '.', value: '', start: 24, end: 25 },
1127+
{ line: 1, prefix: '', type: 'eof', value: '', start: 25, end: 25 },
1128+
]);
1129+
});
1130+
1131+
it('returns index for comments and eof', () => {
1132+
const tokens = new Lexer({ comments: true }).tokenize('# some\n<a:a> <b:b> <c:c> . # trailing comment\n# thing');
1133+
tokens.should.deep.equal([
1134+
{ line: 1, prefix: '', type: 'comment', value: ' some', start: 0, end: 7 },
1135+
{ line: 2, prefix: '', type: 'IRI', value: 'a:a', start: 0, end: 6 },
1136+
{ line: 2, prefix: '', type: 'IRI', value: 'b:b', start: 6, end: 12 },
1137+
{ line: 2, prefix: '', type: 'IRI', value: 'c:c', start: 12, end: 18 },
1138+
{ line: 2, prefix: '', type: '.', value: '', start: 18, end: 19 },
1139+
{ line: 2, prefix: '', type: 'comment', value: ' trailing comment', start: 19, end: 39 },
1140+
{ line: 3, prefix: '', type: 'comment', value: ' thing', start: 0, end: 7 },
1141+
{ line: 3, prefix: '', type: 'eof', value: '', start: 7, end: 7 },
1142+
]);
1143+
});
10931144

10941145
describe('passing data after the stream has been finished', () => {
10951146
const tokens = [];
@@ -1171,11 +1222,11 @@ describe('Lexer', () => {
11711222

11721223
it('returns all tokens synchronously', () => {
11731224
tokens.should.deep.equal([
1174-
{ line: 1, type: 'IRI', value: 'a', prefix: '' },
1175-
{ line: 1, type: 'IRI', value: 'b', prefix: '' },
1176-
{ line: 1, type: 'IRI', value: 'c', prefix: '' },
1177-
{ line: 1, type: '.', value: '', prefix: '' },
1178-
{ line: 1, type: 'eof', value: '', prefix: '' },
1225+
{ line: 1, type: 'IRI', value: 'a', prefix: '', start: 0, end: 4 },
1226+
{ line: 1, type: 'IRI', value: 'b', prefix: '', start: 4, end: 8 },
1227+
{ line: 1, type: 'IRI', value: 'c', prefix: '', start: 8, end: 11 },
1228+
{ line: 1, type: '.', value: '', prefix: '', start: 11, end: 12 },
1229+
{ line: 1, type: 'eof', value: '', prefix: '', start: 12, end: 12 },
11791230
]);
11801231
});
11811232
});
@@ -1240,6 +1291,8 @@ describe('A Lexer instance with the comment option set to true', () => {
12401291

12411292
function shouldTokenize(lexer, input) {
12421293
const expected = Array.prototype.slice.call(arguments, 1);
1294+
const ignoredAttributes = { start: true, end: true };
1295+
12431296
// Shift parameters as necessary
12441297
if (lexer instanceof Lexer)
12451298
expected.shift();
@@ -1256,7 +1309,8 @@ function shouldTokenize(lexer, input) {
12561309
const expectedItem = expected[result.length];
12571310
if (expectedItem)
12581311
for (const attribute in token)
1259-
if (token[attribute] === '' && expectedItem[attribute] !== '')
1312+
if (typeof expectedItem[attribute] === 'undefined' &&
1313+
(token[attribute] === '' || (ignoredAttributes[attribute])))
12601314
delete token[attribute];
12611315
result.push(token);
12621316
if (token.type === 'eof') {

test/N3Parser-test.js

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -89,13 +89,17 @@ describe('Parser', () => {
8989
type: 'type',
9090
value: 'z',
9191
prefix: 'x',
92+
start: 18,
93+
end: 21,
9294
},
9395
line: 1,
9496
previousToken: {
9597
line: 1,
9698
type: 'literal',
9799
value: 'string',
98100
prefix: '',
101+
start: 8,
102+
end: 16,
99103
},
100104
}));
101105

@@ -132,6 +136,8 @@ describe('Parser', () => {
132136
type: '@PREFIX',
133137
value: '',
134138
prefix: '',
139+
start: 0,
140+
end: 7,
135141
},
136142
previousToken: undefined,
137143
line: 1,
@@ -232,13 +238,17 @@ describe('Parser', () => {
232238
type: '[',
233239
value: '',
234240
prefix: '',
241+
start: 4,
242+
end: 5,
235243
},
236244
line: 2,
237245
previousToken: {
238246
line: 2,
239247
type: 'IRI',
240248
value: 'a',
241249
prefix: '',
250+
start: 0,
251+
end: 4,
242252
},
243253
}));
244254

@@ -250,13 +260,17 @@ describe('Parser', () => {
250260
type: 'blank',
251261
value: 'b',
252262
prefix: '_',
263+
start: 4,
264+
end: 8,
253265
},
254266
line: 2,
255267
previousToken: {
256268
line: 2,
257269
type: 'IRI',
258270
value: 'a',
259271
prefix: '',
272+
start: 0,
273+
end: 4,
260274
},
261275
}));
262276

0 commit comments

Comments
 (0)