# -*- coding: utf-8 -*-
"""
    babel.messages.jslexer
    ~~~~~~~~~~~~~~~~~~~~~~

    A simple JavaScript 1.5 lexer which is used for the JavaScript
    extractor.

    :copyright: (c) 2013 by the Babel Team.
    :license: BSD, see LICENSE for more details.
"""
from collections import namedtuple
import re

from babel._compat import unichr
operators = sorted([
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
], key=len, reverse=True)
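# The operators are sorted longest-first so that the alternation built from
# them in `_rules` below tries e.g. '>>>=' before '>>' and '>', making the
# operator rule always take the longest possible match.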
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')
Token = namedtuple('Token', 'type value lineno')
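# Each emitted token carries the rule name, the raw source slice and the
# line it starts on, e.g. Token(type='string', value="'Hello'", lineno=1).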
_rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    ('number', re.compile(r'''(?x)(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''')),
    ('jsx_tag', re.compile(r'<(?:/?)\w+.+?>', re.I)),  # removed in `get_rules` when jsx is off
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''')),
]
def get_rules(jsx, dotted, template_string):
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.
    """
    rules = []
    for token_type, rule in _rules:
        if not jsx and token_type and 'jsx' in token_type:
            continue
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            token_type = 'name'
        rules.append((token_type, rule))
    return rules
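# For example, get_rules(jsx=False, dotted=False, template_string=False)
# drops the 'jsx_tag' and 'template_string' rules and skips 'dotted_name',
# leaving the plain 'name' rule to match identifiers one part at a time.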
def indicates_division(token):
    """Helper for the tokenizer: decide whether the current token may be
    followed by a division operator.
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')
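# A '/' in JavaScript is ambiguous: after something that yields a value it
# is division (the 'x' in `x / 2`), otherwise it starts a regexp literal
# (`/\d+/.test(x)`); `tokenize` consults this helper to pick a reading.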
def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start
    and end with string delimiters (``'``, ``"`` or the back-tick/grave
    accent for template strings).
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0

    while 1:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four hexadecimal
        # characters and interpret them as a unicode code point.  If
        # there is no such code point, put all the consumed characters
        # back into the string unchanged.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)

        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return u''.join(result)
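# An illustrative, hand-checked example (not part of the original module):
#     unquote_string(r'"a\tb\u00e9"') == u'a\tb\xe9'
# the '\t' comes from the `escapes` table and '\u00e9' from the four-digit
# unicode escape branch above.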
def tokenize(source, jsx=True, dotted=True, template_string=True):
    """
    Tokenize JavaScript/JSX source.  Returns a generator of tokens.

    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as a single name token.
    :param template_string: Support ES6 template strings.
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        else:
            # if we don't have a match we don't give up yet, but check for
            # division operators or regular expression literals, based on
            # the status of `may_divide` which is determined by the last
            # processed non-whitespace token using `indicates_division`.
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if not match:
                # whoops, invalid syntax: jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
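# Minimal usage sketch (illustrative only; the sample source and expected
# output are assumptions added here, not part of the original module):
if __name__ == '__main__':
    for tok in tokenize(u"msg = gettext('Hello');\n"):
        print(tok)
    # Expected output, roughly:
    #   Token(type='name', value='msg', lineno=1)
    #   Token(type='operator', value='=', lineno=1)
    #   Token(type='name', value='gettext', lineno=1)
    #   Token(type='operator', value='(', lineno=1)
    #   Token(type='string', value="'Hello'", lineno=1)
    #   Token(type='operator', value=')', lineno=1)
    #   Token(type='operator', value=';', lineno=1)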