# -*- coding: utf-8 -*-
"""
    babel.messages.jslexer
    ~~~~~~~~~~~~~~~~~~~~~~

    A simple JavaScript 1.5 lexer which is used for the JavaScript
    extractor.

    :copyright: (c) 2013 by the Babel Team.
    :license: BSD, see LICENSE for more details.
"""
from collections import namedtuple
import re

from babel._compat import unichr
operators = sorted([
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
], key=len, reverse=True)
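# The operators are sorted longest-first so that the alternation built from
# them in `_rules` below tries e.g. '>>>=' before '>>' and '>', making the
# operator rule always take the longest possible match.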
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
Token = namedtuple('Token', 'type value lineno')
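# Each emitted token carries the rule name, the raw source slice and the
# line it starts on, e.g. Token(type='string', value="'Hello'", lineno=1).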
_rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    ('number', re.compile(r'''(?x)(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''')),
    ('jsx_tag', re.compile(r'<(?:/?)\w+.+?>', re.I)),  # removed in `get_rules` when jsx is off
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''')),
]
def get_rules(jsx, dotted, template_string):
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.
    """
    rules = []
    for token_type, rule in _rules:
        if not jsx and token_type and 'jsx' in token_type:
            continue
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            token_type = 'name'
        rules.append((token_type, rule))
    return rules
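# For example, get_rules(jsx=False, dotted=False, template_string=False)
# drops the 'jsx_tag' and 'template_string' rules and skips 'dotted_name',
# leaving the plain 'name' rule to match identifiers one part at a time.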
def indicates_division(token):
    """Helper for the tokenizer: decide whether the current token may be
    followed by a division operator.
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')
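# A '/' in JavaScript is ambiguous: after something that yields a value it
# is division (the 'x' in `x / 2`), otherwise it starts a regexp literal
# (`/\d+/.test(x)`); `tokenize` consults this helper to pick a reading.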
def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start
    and end with string delimiters (``'``, ``"`` or the back-tick/grave
    accent for template strings).
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0

    while 1:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four hexadecimal
        # characters and interpret them as a unicode code point.  If
        # there is no such code point, put all the consumed characters
        # back into the string unchanged.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)

        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return u''.join(result)
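# An illustrative, hand-checked example (not part of the original module):
#     unquote_string(r'"a\tb\u00e9"') == u'a\tb\xe9'
# the '\t' comes from the `escapes` table and '\u00e9' from the four-digit
# unicode escape branch above.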
def tokenize(source, jsx=True, dotted=True, template_string=True):
    """
    Tokenize JavaScript/JSX source.  Returns a generator of tokens.

    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as a single name token.
    :param template_string: Support ES6 template strings.
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        else:
            # if we don't have a match we don't give up yet, but check for
            # division operators or regular expression literals, based on
            # the status of `may_divide` which is determined by the last
            # processed non-whitespace token using `indicates_division`.
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if not match:
                # whoops, invalid syntax: jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
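# Minimal usage sketch (illustrative only; the sample source and expected
# output are assumptions added here, not part of the original module):
if __name__ == '__main__':
    for tok in tokenize(u"msg = gettext('Hello');\n"):
        print(tok)
    # Expected output, roughly:
    #   Token(type='name', value='msg', lineno=1)
    #   Token(type='operator', value='=', lineno=1)
    #   Token(type='name', value='gettext', lineno=1)
    #   Token(type='operator', value='(', lineno=1)
    #   Token(type='string', value="'Hello'", lineno=1)
    #   Token(type='operator', value=')', lineno=1)
    #   Token(type='operator', value=';', lineno=1)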