// CommonJS shim: when `exports` exists, provide a stand-in `window` object and export the Tokenizer constructor.
1 if (typeof exports !== 'undefined') {
\r
// `window.Unicode` is read by the constructor and by the static setup further down, so it is loaded from ./unicodecategories here.
2 var window = {Unicode: require('./unicodecategories').Unicode};
\r
// Tokenizer is a function declaration, so the name is already bound at this point through hoisting.
3 exports.Tokenizer = Tokenizer;
\r
7 * Tokenizer for JavaScript / ECMAScript 5
\r
8 * (c) Peter van der Zee, qfox.nl
\r
12 * @param {string} inp Source text to tokenize
\r
// Constructor. Builds a newline-normalized copy of the input (shadowInp), copies the shared
// Tokenizer.* regexes and lookup hashes onto the instance, and resets the counters and error state.
// `inp` may be omitted or falsy; the shadow copy then falls back to the empty string.
// NOTE(review): other instance fields read by the prototype methods (this.inp, this.wtree, this.btree,
// this.line, this.column) are not assigned in the lines visible here — confirm where they are set.
14 function Tokenizer(inp){
\r
16 // replace all other line terminators with \n (leave \r\n in tact though). we should probably remove the shadowInp when finished...
\r
17 // only replace \r if it is not followed by a \n else \r\n would become \n\n causing a double newline where it is just a single
\r
18 this.shadowInp = (inp||'').replace(Tokenizer.regexNormalizeNewlines, '\n');
\r
// tokens with name 14 (error) are pushed here by the scanner as they are produced
24 this.errorStack = [];
\r
// instance-level aliases of the static regexes/hashes (the commented-out ones are no longer copied)
29 // this.regexWhiteSpace = Tokenizer.regexWhiteSpace;
\r
30 this.regexLineTerminator = Tokenizer.regexLineTerminator; // used in fallback
\r
31 this.regexAsciiIdentifier = Tokenizer.regexAsciiIdentifier;
\r
32 this.hashAsciiIdentifier = Tokenizer.hashAsciiIdentifier;
\r
33 // this.regexHex = Tokenizer.regexHex;
\r
34 this.hashHex = Tokenizer.hashHex
\r
35 this.regexUnicodeEscape = Tokenizer.regexUnicodeEscape;
\r
36 this.regexIdentifierStop = Tokenizer.regexIdentifierStop;
\r
37 this.hashIdentifierStop = Tokenizer.hashIdentifierStop;
\r
38 // this.regexPunctuators = Tokenizer.regexPunctuators;
\r
39 this.regexNumber = Tokenizer.regexNumber;
\r
40 this.regexNewline = Tokenizer.regexNewline;
\r
42 this.regexBig = Tokenizer.regexBig;
\r
43 this.regexBigAlt = Tokenizer.regexBigAlt;
\r
// tokenCountNoWhite is incremented by storeCurrentAndFetchNextToken after its whitespace-skipping loop
45 this.tokenCount = 0;
\r
46 this.tokenCountNoWhite = 0;
\r
// Unicode category regexes used by the identifier scanner (it treats them as optional)
48 this.Unicode = window.Unicode;
\r
50 // if the Parser throws an error. it will set this property to the next match
\r
51 // at the time of the error (which was not what it was expecting at that point)
\r
52 // and pass on an "error" match. the error should be scooped on the stack and
\r
53 // this property should be returned, without looking at the input...
\r
54 this.errorEscape = null;
\r
// Shared prototype. The null-valued entries declare per-instance fields; the regex/hash ones are
// overwritten in the constructor with the corresponding Tokenizer.* statics.
57 Tokenizer.prototype = {
\r
// token streams: every produced token is pushed on wtree, non-white tokens additionally on btree
66 wtree: null, // contains whitespace (spaces, comments, newlines)
\r
67 btree: null, // does not contain any whitespace tokens.
\r
69 regexLineTerminator:null,
\r
70 regexAsciiIdentifier:null,
\r
71 hashAsciiIdentifier:null,
\r
73 regexUnicodeEscape:null,
\r
74 regexIdentifierStop:null,
\r
75 hashIdentifierStop:null,
\r
81 tokenCountNoWhite:null,
\r
// Main scanner: produces the next token object ({start, stop, name, ...}) from the input.
//   noRegex     - when truthy, a leading `/` is never scanned as a regular expression literal (it falls through to the punctuator branch)
//   returnValue - the previously returned token; it is pushed onto `stack` below
//   stack       - token collection; when truthy the scan loop keeps going until a non-white token is produced
//   _dontStore  - reset to false below; NOTE(review): the surrounding condition is not visible here
// Token `name` codes used in this method: 1 regex, 2 identifier, 3 hex number, 4 decimal number,
// 5 single-quoted string, 6 double-quoted string, 7 single line comment, 8 multi line comment,
// 9 whitespace, 10 line terminator, 11 punctuator, 12 EOF, 14 error.
// Error tokens are also pushed onto this.errorStack.
85 // storeCurrentAndFetchNextToken(bool, false, false true) to get just one token
\r
86 storeCurrentAndFetchNextToken: function(noRegex, returnValue, stack, _dontStore){
\r
87 var regex = !noRegex; // TOFIX :)
\r
// shadowInp is the newline-normalized copy; it is indexed with the same offsets as inp
90 var shadowInp = this.shadowInp;
\r
91 var matchedNewline = false;
\r
95 stack.push(returnValue);
\r
96 // did the parent Parser throw up?
\r
97 if (this.errorEscape) {
\r
98 returnValue = this.errorEscape;
\r
99 this.errorEscape = null;
\r
100 return returnValue;
\r
103 _dontStore = false;
\r
// at or past the end of the input: produce the zero-width EOF token
105 if (pos >= inp.length) {
\r
106 returnValue = {start:inp.length,stop:inp.length,name:12/*EOF*/};
\r
109 var returnValue = null;
\r
112 var chr = inp[pos];
\r
// classify the upcoming token by running regexBig over the next four characters;
// the capture group that matched selects the branch below
114 // 1 ws 2 lt 3 scmt 4 mcmt 5/6 str 7 nr 8 rx 9 punc
\r
116 // substring method (I think this is faster..)
\r
117 var part2 = inp.substring(pos,pos+4);
\r
118 var part = this.regexBig.exec(part2);
\r
120 // // non-substring method (lastIndex)
\r
121 // // this method does not need a substring to apply it
\r
122 // this.regexBigAlt.lastIndex = pos;
\r
123 // var part = this.regexBigAlt.exec(inp);
\r
126 if (part[1]) { //this.regexWhiteSpace.test(chr)) { // SP, TAB, VT, FF, NBSP, BOM (, TOFIX: USP)
\r
128 returnValue = {start:start,stop:pos,name:9/*WHITE_SPACE*/,line:this.line,col:this.column,isWhite:true};
\r
130 } else if (part[2]) { //this.regexLineTerminator.test(chr)) { // LF, CR, LS, PS
\r
132 if (chr=='\r' && inp[pos+1] == '\n') ++end; // support crlf=>lf
\r
133 returnValue = {start:pos,stop:end,name:10/*LINETERMINATOR*/,line:this.line,col:this.column,isWhite:true};
\r
135 // mark newlines for ASI
\r
136 matchedNewline = true;
\r
139 returnValue.hasNewline = 1;
\r
140 } else if (part[3]) { //chr == '/' && inp[pos+1] == '/') {
\r
// single line comment runs up to (not including) the next \n of the shadow input, or to the end of input
141 pos = shadowInp.indexOf('\n',pos);
\r
142 if (pos == -1) pos = inp.length;
\r
143 returnValue = {start:start,stop:pos,name:7/*COMMENT_SINGLE*/,line:this.line,col:this.column,isComment:true,isWhite:true};
\r
144 this.column = returnValue.stop;
\r
145 } else if (part[4]) { //chr == '/' && inp[pos+1] == '*') {
\r
146 var newpos = inp.indexOf('*/',pos);
\r
// no closing */ found: an error token is produced instead of a comment
147 if (newpos == -1) {
\r
148 newpos = shadowInp.indexOf('\n', pos);
\r
149 if (newpos < 0) pos += 2;
\r
151 returnValue = {start:start,stop:pos,name:14/*error*/,value:inp.substring(start, pos),line:this.line,col:this.column,isComment:true,isWhite:true,tokenError:true,error:Tokenizer.Error.UnterminatedMultiLineComment};
\r
152 this.errorStack.push(returnValue);
\r
155 returnValue = {start:start,stop:pos,name:8/*COMMENT_MULTI*/,value:inp.substring(start, pos),line:this.line,col:this.column,isComment:true,isWhite:true};
\r
157 // multi line comments are also reason for asi, but only if they contain at least one newline (use shadow input, because all line terminators would be valid...)
\r
158 var shadowValue = shadowInp.substring(start, pos);
\r
159 var i = 0, hasNewline = 0;
\r
// the loop condition stops as soon as indexOf returns -1 (it is then no longer greater than the previous i)
160 while (i < (i = shadowValue.indexOf('\n', i+1))) {
\r
164 matchedNewline = true;
\r
165 returnValue.hasNewline = hasNewline;
\r
166 this.line += hasNewline;
\r
169 this.column = returnValue.stop;
\r
// NOTE(review): this single-quote branch reports the UnterminatedDoubleString* constants while the
// double-quote branch below reports UnterminatedSingleString* — the two look swapped; confirm.
172 } else if (part[5]) { //chr == "'") {
\r
174 //console.log("old method");
\r
176 var hasNewline = 0;
\r
178 // process escaped characters
\r
179 while (pos < inp.length && inp[++pos] == '\\') {
\r
// a backslash followed by a newline is counted in hasNewline, which is later added to this.line
180 if (shadowInp[pos+1] == '\n') ++hasNewline;
\r
183 if (this.regexLineTerminator.test(inp[pos])) {
\r
184 returnValue = {start:start,stop:pos,name:14/*error*/,value:inp.substring(start, pos),isString:true,tokenError:true,error:Tokenizer.Error.UnterminatedDoubleStringNewline};
\r
185 this.errorStack.push(returnValue);
\r
188 } while (pos < inp.length && inp[pos] != "'");
\r
189 if (returnValue) {} // error
\r
190 else if (inp[pos] != "'") {
\r
191 returnValue = {start:start,stop:pos,name:14/*error*/,value:inp.substring(start, pos),isString:true,tokenError:true,error:Tokenizer.Error.UnterminatedDoubleStringOther};
\r
192 this.errorStack.push(returnValue);
\r
195 returnValue = {start:start,stop:pos,name:5/*STRING_SINGLE*/,isPrimitive:true,isString:true};
\r
197 returnValue.hasNewline = hasNewline;
\r
198 this.line += hasNewline;
\r
201 this.column += (pos-start);
\r
// double-quoted string: same scan as the single-quote branch, with `"` as the delimiter
204 } else if (part[6]) { //chr == '"') {
\r
205 var hasNewline = 0;
\r
206 // TODO: something like this: var regexmatch = /([^\']|$)+/.match();
\r
208 // process escaped chars
\r
209 while (pos < inp.length && inp[++pos] == '\\') {
\r
210 if (shadowInp[pos+1] == '\n') ++hasNewline;
\r
213 if (this.regexLineTerminator.test(inp[pos])) {
\r
214 returnValue = {start:start,stop:pos,name:14/*error*/,value:inp.substring(start, pos),isString:true,tokenError:true,error:Tokenizer.Error.UnterminatedSingleStringNewline};
\r
215 this.errorStack.push(returnValue);
\r
218 } while (pos < inp.length && inp[pos] != '"');
\r
219 if (returnValue) {}
\r
220 else if (inp[pos] != '"') {
\r
221 returnValue = {start:start,stop:pos,name:14/*error*/,value:inp.substring(start, pos),isString:true,tokenError:true,error:Tokenizer.Error.UnterminatedSingleStringOther};
\r
222 this.errorStack.push(returnValue);
\r
225 returnValue = {start:start,stop:pos,name:6/*STRING_DOUBLE*/,isPrimitive:true,isString:true};
\r
227 returnValue.hasNewline = hasNewline;
\r
228 this.line += hasNewline;
\r
231 this.column += (pos-start);
\r
// numeric literal: regexNumber is applied to a 30 character window; group 1 is hex, group 2 is decimal
234 } else if (part[7]) { //(chr >= '0' && chr <= '9') || (chr == '.' && inp[pos+1] >= '0' && inp[pos+1] <= '9')) {
\r
235 var nextPart = inp.substring(pos, pos+30);
\r
236 var match = nextPart.match(this.regexNumber);
\r
237 if (match[2]) { // decimal
\r
238 var value = match[2];
\r
// a leading 0 followed by anything other than e, E or . is reported as an (illegal) octal literal
239 var parsingOctal = value[0] == '0' && value[1] && value[1] != 'e' && value[1] != 'E' && value[1] != '.';
\r
240 if (parsingOctal) {
\r
241 returnValue = {start:start,stop:pos,name:14/*error*/,isNumber:true,isOctal:true,tokenError:true,error:Tokenizer.Error.IllegalOctalEscape,value:value};
\r
242 this.errorStack.push(returnValue);
\r
244 returnValue = {start:start,stop:start+value.length,name:4/*NUMERIC_DEC*/,isPrimitive:true,isNumber:true,value:value};
\r
246 } else if (match[1]) { // hex
\r
247 var value = match[1];
\r
248 returnValue = {start:start,stop:start+value.length,name:3/*NUMERIC_HEX*/,isPrimitive:true,isNumber:true,value:value};
\r
250 throw 'unexpected parser errror... regex fail :(';
\r
253 if (value.length < 300) {
\r
254 pos += value.length;
\r
256 // old method of parsing numbers. only used for extremely long number literals (300+ chars).
\r
257 // this method does not require substringing... just memory :)
\r
// oldNumberParser returns a [newPos, token] pair
258 var tmpReturnValue = this.oldNumberParser(pos, chr, inp, returnValue, start, Tokenizer);
\r
259 pos = tmpReturnValue[0];
\r
260 returnValue = tmpReturnValue[1];
\r
// regular expression literal; only attempted when the caller allowed it (regex is !noRegex)
262 } else if (regex && part[8]) { //chr == '/') { // regex cannot start with /* (would be multiline comment, and not make sense anyways). but if it was /* then an earlier if would have eated it. so we only check for /
\r
// twinfo maps the relative offset of each opening bracket/paren/curly to its closing one and back
263 var twinfo = []; // matching {[( info
\r
// nonLethalError remembers a problem while scanning continues; it is turned into an error token after the flags are read
266 var nonLethalError = null;
\r
267 while (++pos < inp.length) {
\r
268 chr = shadowInp[pos];
\r
269 // parse RegularExpressionChar
\r
271 returnValue = {start:start,stop:pos,name:14/*error*/,tokenError:true,errorHasContent:true,error:Tokenizer.Error.UnterminatedRegularExpressionNewline};
\r
272 this.errorStack.push(returnValue);
\r
274 } else if (chr == '/') {
\r
277 } else if (chr == '?' || chr == '*' || chr == '+') {
\r
278 nonLethalError = Tokenizer.Error.NothingToRepeat;
\r
279 } else if (chr == '^') {
\r
281 inp[pos-1] != '/' &&
\r
282 inp[pos-1] != '|' &&
\r
283 inp[pos-1] != '(' &&
\r
284 !(inp[pos-3] == '(' && inp[pos-2] == '?' && (inp[pos-1] == ':' || inp[pos-1] == '!' || inp[pos-1] == '='))
\r
286 nonLethalError = Tokenizer.Error.StartOfMatchShouldBeAtStart;
\r
288 } else if (chr == '$') {
\r
289 if (inp[pos+1] != '/' && inp[pos+1] != '|' && inp[pos+1] != ')') nonLethalError = Tokenizer.Error.DollarShouldBeEnd;
\r
290 } else if (chr == '}') {
\r
291 nonLethalError = Tokenizer.Error.MissingOpeningCurly;
\r
292 } else { // it's a "character" (can be group or class), something to match
\r
293 // match parenthesis
\r
295 parens.push(pos-start);
\r
296 } else if (chr == ')') {
\r
297 if (parens.length == 0) {
\r
// NOTE(review): here nonLethalError receives a whole token object, while every other assignment stores a
// Tokenizer.Error.* constant and the value is later used as the `error` field — confirm this is intended.
298 nonLethalError = {start:start,stop:pos,name:14/*error*/,tokenError:true,error:Tokenizer.Error.RegexNoOpenGroups};
\r
300 var twin = parens.pop();
\r
301 var now = pos-start;
\r
302 twinfo[twin] = now;
\r
303 twinfo[now] = twin;
\r
306 // first process character class
\r
308 var before = pos-start;
\r
309 while (++pos < inp.length && shadowInp[pos] != '\n' && inp[pos] != ']') {
\r
310 // only newline is not allowed in class range
\r
311 // anything else can be escaped, most of it does not have to be escaped...
\r
312 if (inp[pos] == '\\') {
\r
313 if (shadowInp[pos+1] == '\n') break;
\r
314 else ++pos; // skip next char. (mainly prohibits ] to be picked up as closing the group...)
\r
317 if (inp[pos] != ']') {
\r
318 returnValue = {start:start,stop:pos,name:14/*error*/,tokenError:true,error:Tokenizer.Error.ClosingClassRangeNotFound};
\r
319 this.errorStack.push(returnValue);
\r
322 var after = pos-start;
\r
323 twinfo[before] = after;
\r
324 twinfo[after] = before;
\r
326 } else if (chr == '\\' && shadowInp[pos+1] != '\n') {
\r
327 // is ok anywhere in the regex (match next char literally, regardless of its otherwise special meaning)
\r
331 // now process repeaters (+, ? and *)
\r
333 // non-collecting group (?:...) and positive (?=...) or negative (?!...) lookahead
\r
335 if (inp[pos+1] == '?' && (inp[pos+2] == ':' || inp[pos+2] == '=' || inp[pos+2] == '!')) {
\r
340 else if (inp[pos+1] == '?') ++pos;
\r
341 else if (inp[pos+1] == '*' || inp[pos+1] == '+') {
\r
343 if (inp[pos+1] == '?') ++pos; // non-greedy match
\r
// {n}, {n,} or {n,m} quantifier
344 } else if (inp[pos+1] == '{') {
\r
346 var before = pos-start;
\r
351 if (!/[0-9]/.test(inp[pos+1])) {
\r
352 nonLethalError = Tokenizer.Error.QuantifierRequiresNumber;
\r
354 while (++pos < inp.length && /[0-9]/.test(inp[pos+1]));
\r
355 if (inp[pos+1] == ',') {
\r
357 while (pos < inp.length && /[0-9]/.test(inp[pos+1])) ++pos;
\r
359 if (inp[pos+1] != '}') {
\r
360 nonLethalError = Tokenizer.Error.QuantifierRequiresClosingCurly;
\r
363 var after = pos-start;
\r
364 twinfo[before] = after;
\r
365 twinfo[after] = before;
\r
366 if (inp[pos+1] == '?') ++pos; // non-greedy match
\r
371 // if found=false, fail right now. otherwise try to parse an identifiername (that's all RegularExpressionFlags is..., but it's constructed in a stupid fashion)
\r
372 if (!found || returnValue) {
\r
373 if (!returnValue) {
\r
374 returnValue = {start:start,stop:pos,name:14/*error*/,tokenError:true,error:Tokenizer.Error.UnterminatedRegularExpressionOther};
\r
375 this.errorStack.push(returnValue);
\r
378 // this is the identifier scanner, for now
\r
380 while (pos < inp.length && this.hashAsciiIdentifier[inp[pos]]); /*this.regexAsciiIdentifier.test(inp[pos])*/
\r
382 if (parens.length) {
\r
383 // nope, this is still an error, there was at least one paren that did not have a matching twin
\r
384 if (parens.length > 0) returnValue = {start:start,stop:pos,name:14/*error*/,tokenError:true,error:Tokenizer.Error.RegexOpenGroup};
\r
385 this.errorStack.push(returnValue);
\r
386 } else if (nonLethalError) {
\r
387 returnValue = {start:start,stop:pos,name:14/*error*/,errorHasContent:true,tokenError:true,error:nonLethalError};
\r
388 this.errorStack.push(returnValue);
\r
390 returnValue = {start:start,stop:pos,name:1/*REG_EX*/,isPrimitive:true};
\r
393 returnValue.twinfo = twinfo;
\r
395 // note: operators need to be ordered from longest to smallest. regex will take care of the rest.
\r
396 // no need to worry about div vs regex. if looking for regex, earlier if will have eaten it
\r
397 //var result = this.regexPunctuators.exec(inp.substring(pos,pos+4));
\r
399 // note: due to the regex, the single forward slash might be caught by an earlier part of the regex. so check for that.
\r
// punctuator text comes from capture 8 (a `/` that was not scanned as a regex) or capture 9
400 var result = part[8] || part[9];
\r
402 //result = result[1];
\r
403 returnValue = {start:pos,stop:pos+=result.length,name:11/*PUNCTUATOR*/,value:result};
\r
406 // identifiers cannot start with a number. but if the leading string would be a number, another if would have eaten it already for numeric literal :)
\r
// identifier scan: ascii identifier characters first, then \uXXXX escapes, then (if loaded) the Unicode category regexes
407 while (pos < inp.length) {
\r
410 if (this.hashAsciiIdentifier[c]) ++pos; //if (this.regexAsciiIdentifier.test(c)) ++pos;
\r
411 else if (c == '\\' && this.regexUnicodeEscape.test(inp.substring(pos,pos+6))) pos += 6; // this is like a \uxxxx
\r
412 // ok, now test unicode ranges...
\r
413 // basically this hardly ever happens so there's little risk of this hitting performance
\r
414 // however, if you do happen to have used them, it's not a problem. the parser will support it :)
\r
415 else if (this.Unicode) { // the unicode is optional.
\r
416 // these chars may not be part of identifier. i want to try to prevent running the unicode regexes here...
\r
417 if (this.hashIdentifierStop[c] /*this.regexIdentifierStop.test(c)*/) break;
\r
418 // for most scripts, the code wont reach here. which is good, because this is going to be relatively slow :)
\r
419 var Unicode = this.Unicode; // cache
\r
421 // these may all occur in an identifier... (pure a specification compliance thing :)
\r
422 Unicode.Lu.test(c) || Unicode.Ll.test(c) || Unicode.Lt.test(c) || Unicode.Lm.test(c) ||
\r
423 Unicode.Lo.test(c) || Unicode.Nl.test(c) || Unicode.Mn.test(c) || Unicode.Mc.test(c) ||
\r
424 Unicode.Nd.test(c) || Unicode.Pc.test(c) || Unicode.sp.test(c)
\r
425 )) break; // end of match.
\r
426 // passed, next char
\r
428 } else break; // end of match.
\r
434 returnValue = {start:start,stop:pos,name:2/*IDENTIFIER*/,value:inp.substring(start,pos)};
\r
// the literal words undefined, null, true and false are flagged as primitives
435 if (returnValue.value == 'undefined' || returnValue.value == 'null' || returnValue.value == 'true' || returnValue.value == 'false') returnValue.isPrimitive = true;
\r
// nothing matched: report a backtick, a stray backslash / broken unicode escape, or otherwise unknown input
437 if (inp[pos] == '`') {
\r
438 returnValue = {start:start,stop:pos+1,name:14/*error*/,tokenError:true,error:Tokenizer.Error.BacktickNotSupported};
\r
439 this.errorStack.push(returnValue);
\r
440 } else if (inp[pos] == '\\') {
\r
441 if (inp[pos+1] == 'u') {
\r
442 returnValue = {start:start,stop:pos+1,name:14/*error*/,tokenError:true,error:Tokenizer.Error.InvalidUnicodeEscape};
\r
443 this.errorStack.push(returnValue);
\r
445 returnValue = {start:start,stop:pos+1,name:14/*error*/,tokenError:true,error:Tokenizer.Error.InvalidBackslash};
\r
446 this.errorStack.push(returnValue);
\r
449 returnValue = {start:start,stop:pos+1,name:14/*error*/,tokenError:true,error:Tokenizer.Error.Unknown,value:c};
\r
450 this.errorStack.push(returnValue);
\r
451 // try to skip this char. it's not going anywhere.
\r
459 // note that ASI's are slipstreamed in here from the parser since the tokenizer cant determine that
\r
460 // if this part ever changes, make sure you change that too :)
\r
// record the token in the streams: tokposw/tokposb are its indexes in wtree/btree
461 returnValue.tokposw = this.wtree.length;
\r
462 this.wtree.push(returnValue);
\r
463 if (!returnValue.isWhite) {
\r
464 returnValue.tokposb = this.btree.length;
\r
465 this.btree.push(returnValue);
\r
// keep scanning past white tokens, but only when a stack was supplied
470 } while (stack && returnValue && returnValue.isWhite); // WHITE_SPACE LINETERMINATOR COMMENT_SINGLE COMMENT_MULTI
\r
471 ++this.tokenCountNoWhite;
\r
// tell the caller that a newline was seen while reaching this token (relevant for ASI, see above)
475 if (matchedNewline) returnValue.newline = true;
\r
476 return returnValue;
\r
// Places `token` in the token streams in front of `match`.
//   token - the token to insert; its tokposw/tokposb are set here
//   match - the token it is inserted before; its positions are shifted by one in the else-path
// NOTE(review): the branch below is labelled `asi` although name 12 is labelled EOF elsewhere in this file — confirm the label.
478 addTokenToStreamBefore: function(token, match){
\r
479 var wtree = this.wtree;
\r
480 var btree = this.btree;
\r
481 if (match.name == 12/*asi*/) {
\r
// the token gets the next free index of each stream
482 token.tokposw = wtree.length;
\r
484 token.tokposb = btree.length;
\r
// otherwise the token takes over match's slot in wtree and match is written one slot further
487 token.tokposw = match.tokposw;
\r
488 wtree[token.tokposw] = token;
\r
489 match.tokposw += 1;
\r
490 wtree[match.tokposw] = match;
\r
// same shuffle for btree; NOTE(review): this truthiness test is also false when match.tokposb is 0 — confirm that case is intended to be skipped
492 if (match.tokposb) {
\r
493 token.tokposb = match.tokposb;
\r
494 btree[token.tokposb] = token;
\r
495 match.tokposb += 1;
\r
496 btree[match.tokposb] = match;
\r
// Character-by-character number scanner (no regex, no substring); the caller describes it as the
// fallback for very long numeric literals.
//   pos         - current offset into inp
//   chr         - the first character of the literal
//   inp         - the input string
//   returnValue - token slot, overwritten with the token produced here
//   start       - offset where the literal starts
//   Tokenizer   - the constructor (used for Tokenizer.Error); this parameter shadows the outer name
// Returns a two element array: [new pos, token]. Error tokens are also pushed onto this.errorStack.
500 oldNumberParser: function(pos, chr, inp, returnValue, start, Tokenizer){
\r
502 // either: 0x 0X 0 .3
\r
503 if (chr == '0' && (inp[pos] == 'x' || inp[pos] == 'X')) {
\r
// hex literal: consume characters while they are present in the hashHex lookup
505 while (++pos < inp.length && this.hashHex[inp[pos]]); // this.regexHex.test(inp[pos]));
\r
506 returnValue = {start:start,stop:pos,name:3/*NUMERIC_HEX*/,isPrimitive:true,isNumber:true};
\r
// a leading 0 directly followed by another digit is treated as an octal literal (reported as an error below)
508 var parsingOctal = chr == '0' && inp[pos] >= '0' && inp[pos] <= '9';
\r
510 if (chr != '.') { // integer part
\r
511 while (pos < inp.length && inp[pos] >= '0' && inp[pos] <= '9') ++pos;
\r
512 if (inp[pos] == '.') ++pos;
\r
// fraction digits
515 while (pos < inp.length && inp[pos] >= '0' && inp[pos] <= '9') ++pos;
\r
// optional exponent: e or E, optional sign, then at least one digit
517 if (inp[pos] == 'e' || inp[pos] == 'E') {
\r
518 if (inp[++pos] == '+' || inp[pos] == '-') ++pos;
\r
519 var expPosBak = pos;
\r
520 while (pos < inp.length && inp[pos] >= '0' && inp[pos] <= '9') ++pos;
\r
// pos did not move, so no exponent digits were found
521 if (expPosBak == pos) {
\r
522 returnValue = {start:start,stop:pos,name:14/*error*/,tokenError:true,error:Tokenizer.Error.NumberExponentRequiresDigits};
\r
523 this.errorStack.push(returnValue);
\r
526 if (returnValue.name != 14/*error*/) {
\r
527 if (parsingOctal) {
\r
528 returnValue = {start:start,stop:pos,name:14/*error*/,isNumber:true,isOctal:true,tokenError:true,error:Tokenizer.Error.IllegalOctalEscape};
\r
529 this.errorStack.push(returnValue);
\r
532 returnValue = {start:start,stop:pos,name:4/*NUMERIC_DEC*/,isPrimitive:true,isNumber:true};
\r
536 return [pos, returnValue];
\r
// Collects tokens by calling storeCurrentAndFetchNextToken repeatedly until the EOF token (name 12).
//   arrx - per-token hints, indexed by token number: a truthy entry allows a regular expression
//          literal at that position (the negated entry is passed as noRegex)
// NOTE(review): the lines that declare `stack`, `n` and `last` (and any null handling for arrx) are not visible here.
538 tokens: function(arrx){
\r
543 while ((last = this.storeCurrentAndFetchNextToken(!arrx[n++], false, false, true)) && last.name != 12/*EOF*/) stack.push(last);
\r
// Fills in the `value` of every token in wtree that has no (truthy) value yet, using the
// start/stop offsets of the token to slice the text out of the input.
546 fixValues: function(){
\r
547 this.wtree.forEach(function(t){
\r
// NOTE(review): `this.inp` inside this callback relies on the thisArg passed to forEach, which is not visible here — confirm it is supplied
548 if (!t.value) t.value = this.inp.substring(t.start, t.stop);
\r
553 //#ifdef TEST_SUITE
\r
/**
 * Makes a piece of source text safe and readable for display as HTML (the
 * test suite assigns the result to innerHTML).
 *
 * Newlines and tabs become the two-character sequences \n and \t, the HTML
 * special characters &, < and > become entities, U+FFFF becomes the literal
 * text \uFFFF and every remaining whitespace character (including a plain
 * space) becomes a four digit \uXXXX escape.
 *
 * @param {string} s Raw text.
 * @returns {string} The escaped text.
 */
Tokenizer.escape = function(s){
	return s
		.replace(/\n/g, '\\n')
		.replace(/\t/g, '\\t')
		// `&` goes first, otherwise the entities produced for < and > would be escaped a second time
		.replace(/&/g, '&amp;')
		.replace(/</g, '&lt;')
		.replace(/>/g, '&gt;')
		.replace(/\uFFFF/g, '\\uFFFF')
		.replace(/\s/g, function(ws){
			// left-pad the hex code unit to four digits
			var ord = ws.charCodeAt(0).toString(16);
			return '\\u' + ('0000' + ord).slice(-4);
		});
};
\r
// Browser-only test runner: tokenizes each test input and compares the number of produced
// tokens with the expected count, writing the results into a <pre> appended to document.body.
//   arr - array of tests; as read below each test is [?, input, expectedTokenCount, desc]
//         or [?, input, expectedTokenCount, regexHints, desc] (index 0 is not read in the visible lines)
566 Tokenizer.testSuite = function(arr){
\r
567 var out = document.createElement('pre');
\r
568 document.body.appendChild(out);
\r
// debug(...) appends its arguments, joined by spaces, as one line of HTML and returns the first argument
569 var debug = function(){
\r
570 var f = document.createElement('div');
\r
571 f.innerHTML = Array.prototype.slice.call(arguments).join(' ');
\r
572 out.appendChild(f);
\r
573 return arguments[0];
\r
576 debug("Running test suite...",arr.length,"tests");
\r
578 var start = +new Date;
\r
581 for (var i=0; i<arr.length; ++i) {
\r
582 var test = arr[i], result;
\r
583 var input = test[1];
\r
584 var outputLen = test[2];
\r
585 var regexHints = test[4] ? test[3] : null; // if flags, then len=4
\r
586 var desc = test[4] || test[3];
\r
588 var result = new Tokenizer(input).tokens(regexHints); // regexHints can be null, that's ok
\r
589 if (result.length == outputLen) {
\r
590 debug('<span class="green">Test '+i+' ok:</span>',desc);
\r
// NOTE(review): the markup below opens a <b> but closes a </span>
593 debug('<b class="red">Test failed:</span>',desc,'(found',result.length,'expected',outputLen+')'),console.log(desc, result);
\r
// the input is escaped before it is inserted as HTML
596 debug('<b>'+Tokenizer.escape(input)+'</b>');
\r
599 debug("Tokenizer test suite finished ("+(+new Date - start)+' ms). ok:'+ok+', fail:'+fail);
\r
// Static character tables shared by all instances (the constructor copies most of them onto `this`).
// Several come in pairs: a regex and a plain-object hash listing the same characters, the hash
// being what the scanner indexes directly (see the commented-out regex calls at the use sites).
// single whitespace character: space, tab, VT, FF, NBSP and U+FFFF
603 Tokenizer.regexWhiteSpace = /[ \t\u000B\u000C\u00A0\uFFFF]/;
\r
// single line terminator: LF, CR, LS, PS
604 Tokenizer.regexLineTerminator = /[\u000A\u000D\u2028\u2029]/;
\r
// ascii identifier characters: letters, digits, $ and _
605 Tokenizer.regexAsciiIdentifier = /[a-zA-Z0-9\$_]/;
\r
606 Tokenizer.hashAsciiIdentifier = {_:1,$:1,a:1,b:1,c:1,d:1,e:1,f:1,g:1,h:1,i:1,j:1,k:1,l:1,m:1,n:1,o:1,p:1,q:1,r:1,s:1,t:1,u:1,v:1,w:1,x:1,y:1,z:1,A:1,B:1,C:1,D:1,E:1,F:1,G:1,H:1,I:1,J:1,K:1,L:1,M:1,N:1,O:1,P:1,Q:1,R:1,S:1,T:1,U:1,V:1,W:1,X:1,Y:1,Z:1,0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:1,8:1,9:1};
\r
// hexadecimal digits
607 Tokenizer.regexHex = /[0-9A-Fa-f]/;
\r
608 Tokenizer.hashHex = {0:1,1:1,2:1,3:1,4:1,5:1,6:1,7:1,8:1,9:1,a:1,b:1,c:1,d:1,e:1,f:1,A:1,B:1,C:1,D:1,E:1,F:1};
\r
609 Tokenizer.regexUnicodeEscape = /u[0-9A-Fa-f]{4}/; // the \ is already checked at usage...
\r
// characters that end an identifier; checked before the (slow) Unicode category regexes are tried
610 Tokenizer.regexIdentifierStop = /[\>\=\!\|\<\+\-\&\*\%\^\/\{\}\(\)\[\]\.\;\,\~\?\:\ \t\n\\\'\"]/;
\r
611 Tokenizer.hashIdentifierStop = {'>':1,'=':1,'!':1,'|':1,'<':1,'+':1,'-':1,'&':1,'*':1,'%':1,'^':1,'/':1,'{':1,'}':1,'(':1,')':1,'[':1,']':1,'.':1,';':1,',':1,'~':1,'?':1,':':1,'\\':1,'\'':1,'"':1,' ':1,'\t':1,'\n':1};
\r
// NOTE: global flag, so .test/.exec on this regex are stateful through lastIndex
612 Tokenizer.regexNewline = /\n/g;
\r
613 //Tokenizer.regexPunctuators = /^(>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|\&\&|\|\||\+=|-=|\*=|%=|\&=|\|=|\^=|\/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|\*|%|\||\&|\||\^|!|~|\?|:|=|\/)/;
\r
// NOTE(review): `Unidocde` looks like a misspelling of `Unicode`; the constructor reads window.Unicode directly, so this property is not read in the visible code — confirm before renaming
614 Tokenizer.Unidocde = window.Unicode;
\r
// Numeric literal at the start of a string: group 1 captures a hex literal (0x...), group 2 a
// decimal literal (integer and/or fraction, with an optional exponent).
Tokenizer.regexNumber = /^(?:(0[xX][0-9A-Fa-f]+)|((?:(?:(?:(?:[0-9]+)(?:\.[0-9]*)?))|(?:\.[0-9]+))(?:[eE][-+]?[0-9]{1,})?))/;
// Line terminators that are rewritten to \n when building the shadow input: a lone CR (one that
// is not followed by LF), LS and PS. CRLF pairs are left untouched.
//  - global, so every occurrence is normalized instead of only the first one
//  - the LF check is a lookahead, so the character after a lone CR is not swallowed, a CR at the
//    very end of the input is matched too, and the shadow input keeps exactly the same length
//    (and therefore the same offsets) as the original input
Tokenizer.regexNormalizeNewlines = /\u000D(?!\u000A)|[\u2028\u2029]/g;

// Token classifier; each optional capture group identifies one kind of token start:
// 1 ws 2 lt 3 scmt 4 mcmt 5/6 str 7 nr 8 rx 9 punc
Tokenizer.regexBig = /^([ \t\u000B\u000C\u00A0\uFFFF])?([\u000A\u000D\u2028\u2029])?(\/\/)?(\/\*)?(')?(")?(\.?[0-9])?(?:(\/)[^=])?(>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|\&\&|\|\||\+=|-=|\*=|%=|\&=|\|=|\^=|\/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|\*|%|\||\&|\||\^|!|~|\?|:|=|\/)?/;
// Same pattern without the ^ anchor and with the global flag, for use through lastIndex
Tokenizer.regexBigAlt = /([ \t\u000B\u000C\u00A0\uFFFF])?([\u000A\u000D\u2028\u2029])?(\/\/)?(\/\*)?(')?(")?(\.?[0-9])?(?:(\/)[^=])?(>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|\&\&|\|\||\+=|-=|\*=|%=|\&=|\|=|\^=|\/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|\*|%|\||\&|\||\^|!|~|\?|:|=|\/)?/g;
\r
// Catalogue of tokenizer error descriptors. Error tokens (name 14) carry one of these objects in
// their `error` field; each descriptor has a human readable `msg`.
622 Tokenizer.Error = {
\r
// NOTE(review): the scanner attaches the *Double* string constants in its single-quote branch and the *Single* ones in its double-quote branch — confirm which side is intended
623 UnterminatedSingleStringNewline: {msg:'Newlines are not allowed in string literals'},
\r
624 UnterminatedSingleStringOther: {msg:'Unterminated single string'},
\r
625 UnterminatedDoubleStringNewline: {msg:'Newlines are not allowed in string literals'},
\r
626 UnterminatedDoubleStringOther: {msg:'Unterminated double string'},
\r
627 UnterminatedRegularExpressionNewline: {msg:'Newlines are not allowed in regular expressions'},
\r
628 NothingToRepeat: {msg:'Used a repeat character (*?+) in a regex without something prior to it to match'},
\r
629 ClosingClassRangeNotFound: {msg: 'Unable to find ] for class range'},
\r
630 RegexOpenGroup: {msg: 'Open group did not find closing parenthesis'},
\r
631 RegexNoOpenGroups: {msg: 'Closing parenthesis found but no group open'},
\r
632 UnterminatedRegularExpressionOther: {msg:'Unterminated regular expression'},
\r
633 UnterminatedMultiLineComment: {msg:'Unterminated multi line comment'},
\r
634 UnexpectedIdentifier: {msg:'Unexpected identifier'},
\r
635 IllegalOctalEscape: {msg:'Octal escapes are not valid'},
\r
636 Unknown: {msg:'Unknown input'}, // if this happens, my parser is bad :(
\r
637 NumberExponentRequiresDigits: {msg:'Numbers with exponents require at least one digit after the `e`'},
\r
638 BacktickNotSupported: {msg:'The backtick is not used in js, maybe you copy/pasted from a fancy site/doc?'},
\r
639 InvalidUnicodeEscape: {msg:'Encountered an invalid unicode escape, must be followed by exactly four hex numbers'},
\r
640 InvalidBackslash: {msg:'Encountered a backslash where it not allowed'},
\r
641 StartOfMatchShouldBeAtStart: {msg: 'The ^ signifies the start of match but was not found at a start'},
\r
642 DollarShouldBeEnd: {msg: 'The $ signifies the stop of match but was not found at a stop'},
\r
643 QuantifierRequiresNumber: {msg:'Quantifier curly requires at least one digit before the comma'},
\r
644 QuantifierRequiresClosingCurly: {msg:'Quantifier curly requires to be closed'},
\r
645 MissingOpeningCurly: {msg:'Encountered closing quantifier curly without seeing an opening curly'}
\r