gerrit.onap Code Review - sdc/sdc-distribution-client.git/blob

   1 # -*- coding: utf-8 -*-
   2 """
   3     babel.messages.extract
   4     ~~~~~~~~~~~~~~~~~~~~~~
   5
   6     Basic infrastructure for extracting localizable messages from source files.
   7
   8     This module defines an extensible system for collecting localizable message
   9     strings from a variety of sources. A native extractor for Python source
  10     files is builtin, extractors for other sources can be added using very
  11     simple plugins.
  12
  13     The main entry points into the extraction functionality are the functions
  14     `extract_from_dir` and `extract_from_file`.
  15
  16     :copyright: (c) 2013 by the Babel Team.
  17     :license: BSD, see LICENSE for more details.
  18 """
  19
  20 import os
  21 from os.path import relpath
  22 import sys
  23 from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
  24
  25 from babel.util import parse_encoding, pathmatch
  26 from babel._compat import PY2, text_type
  27 from textwrap import dedent
  28
  29
  30 GROUP_NAME = 'babel.extractors'
  31
  32 DEFAULT_KEYWORDS = {
  33     '_': None,
  34     'gettext': None,
  35     'ngettext': (1, 2),
  36     'ugettext': None,
  37     'ungettext': (1, 2),
  38     'dgettext': (2,),
  39     'dngettext': (2, 3),
  40     'N_': None,
  41     'pgettext': ((1, 'c'), 2),
  42     'npgettext': ((1, 'c'), 2, 3)
  43 }
  44
  45 DEFAULT_MAPPING = [('**.py', 'python')]
  46
  47 empty_msgid_warning = (
  48     '%s: warning: Empty msgid.  It is reserved by GNU gettext: gettext("") '
  49     'returns the header entry with meta information, not the empty string.')
  50
  51
  52 def _strip_comment_tags(comments, tags):
  53     """Helper function for `extract` that strips comment tags from strings
  54     in a list of comment lines.  This functions operates in-place.
  55     """
  56     def _strip(line):
  57         for tag in tags:
  58             if line.startswith(tag):
  59                 return line[len(tag):].strip()
  60         return line
  61     comments[:] = map(_strip, comments)
  62
  63
  64 def extract_from_dir(dirname=None, method_map=DEFAULT_MAPPING,
  65                      options_map=None, keywords=DEFAULT_KEYWORDS,
  66                      comment_tags=(), callback=None, strip_comment_tags=False):
  67     """Extract messages from any source files found in the given directory.
  68
  69     This function generates tuples of the form ``(filename, lineno, message,
  70     comments, context)``.
  71
  72     Which extraction method is used per file is determined by the `method_map`
  73     parameter, which maps extended glob patterns to extraction method names.
  74     For example, the following is the default mapping:
  75
  76     >>> method_map = [
  77     ...     ('**.py', 'python')
  78     ... ]
  79
  80     This basically says that files with the filename extension ".py" at any
  81     level inside the directory should be processed by the "python" extraction
  82     method. Files that don't match any of the mapping patterns are ignored. See
  83     the documentation of the `pathmatch` function for details on the pattern
  84     syntax.
  85
  86     The following extended mapping would also use the "genshi" extraction
  87     method on any file in "templates" subdirectory:
  88
  89     >>> method_map = [
  90     ...     ('**/templates/**.*', 'genshi'),
  91     ...     ('**.py', 'python')
  92     ... ]
  93
  94     The dictionary provided by the optional `options_map` parameter augments
  95     these mappings. It uses extended glob patterns as keys, and the values are
  96     dictionaries mapping options names to option values (both strings).
  97
  98     The glob patterns of the `options_map` do not necessarily need to be the
  99     same as those used in the method mapping. For example, while all files in
 100     the ``templates`` folders in an application may be Genshi applications, the
 101     options for those files may differ based on extension:
 102
 103     >>> options_map = {
 104     ...     '**/templates/**.txt': {
 105     ...         'template_class': 'genshi.template:TextTemplate',
 106     ...         'encoding': 'latin-1'
 107     ...     },
 108     ...     '**/templates/**.html': {
 109     ...         'include_attrs': ''
 110     ...     }
 111     ... }
 112
 113     :param dirname: the path to the directory to extract messages from.  If
 114                     not given the current working directory is used.
 115     :param method_map: a list of ``(pattern, method)`` tuples that maps of
 116                        extraction method names to extended glob patterns
 117     :param options_map: a dictionary of additional options (optional)
 118     :param keywords: a dictionary mapping keywords (i.e. names of functions
 119                      that should be recognized as translation functions) to
 120                      tuples that specify which of their arguments contain
 121                      localizable strings
 122     :param comment_tags: a list of tags of translator comments to search for
 123                          and include in the results
 124     :param callback: a function that is called for every file that message are
 125                      extracted from, just before the extraction itself is
 126                      performed; the function is passed the filename, the name
 127                      of the extraction method and and the options dictionary as
 128                      positional arguments, in that order
 129     :param strip_comment_tags: a flag that if set to `True` causes all comment
 130                                tags to be removed from the collected comments.
 131     :see: `pathmatch`
 132     """
 133     if dirname is None:
 134         dirname = os.getcwd()
 135     if options_map is None:
 136         options_map = {}
 137
 138     absname = os.path.abspath(dirname)
 139     for root, dirnames, filenames in os.walk(absname):
 140         for subdir in dirnames:
 141             if subdir.startswith('.') or subdir.startswith('_'):
 142                 dirnames.remove(subdir)
 143         dirnames.sort()
 144         filenames.sort()
 145         for filename in filenames:
 146             filepath = os.path.join(root, filename).replace(os.sep, '/')
 147
 148             for message_tuple in check_and_call_extract_file(
 149                 filepath,
 150                 method_map,
 151                 options_map,
 152                 callback,
 153                 keywords,
 154                 comment_tags,
 155                 strip_comment_tags,
 156                 dirpath=absname,
 157             ):
 158                 yield message_tuple
 159
 160
 161 def check_and_call_extract_file(filepath, method_map, options_map,
 162                                 callback, keywords, comment_tags,
 163                                 strip_comment_tags, dirpath=None):
 164     """Checks if the given file matches an extraction method mapping, and if so, calls extract_from_file.
 165
 166     Note that the extraction method mappings are based relative to dirpath.
 167     So, given an absolute path to a file `filepath`, we want to check using
 168     just the relative path from `dirpath` to `filepath`.
 169
 170     :param filepath: An absolute path to a file that exists.
 171     :param method_map: a list of ``(pattern, method)`` tuples that maps of
 172                        extraction method names to extended glob patterns
 173     :param options_map: a dictionary of additional options (optional)
 174     :param callback: a function that is called for every file that message are
 175                      extracted from, just before the extraction itself is
 176                      performed; the function is passed the filename, the name
 177                      of the extraction method and and the options dictionary as
 178                      positional arguments, in that order
 179     :param keywords: a dictionary mapping keywords (i.e. names of functions
 180                      that should be recognized as translation functions) to
 181                      tuples that specify which of their arguments contain
 182                      localizable strings
 183     :param comment_tags: a list of tags of translator comments to search for
 184                          and include in the results
 185     :param strip_comment_tags: a flag that if set to `True` causes all comment
 186                                tags to be removed from the collected comments.
 187     :param dirpath: the path to the directory to extract messages from.
 188     """
 189     # filename is the relative path from dirpath to the actual file
 190     filename = relpath(filepath, dirpath)
 191
 192     for pattern, method in method_map:
 193         if not pathmatch(pattern, filename):
 194             continue
 195
 196         options = {}
 197         for opattern, odict in options_map.items():
 198             if pathmatch(opattern, filename):
 199                 options = odict
 200         if callback:
 201             callback(filename, method, options)
 202         for message_tuple in extract_from_file(
 203             method, filepath,
 204             keywords=keywords,
 205             comment_tags=comment_tags,
 206             options=options,
 207             strip_comment_tags=strip_comment_tags
 208         ):
 209             yield (filename, ) + message_tuple
 210
 211         break
 212
 213
 214 def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
 215                       comment_tags=(), options=None, strip_comment_tags=False):
 216     """Extract messages from a specific file.
 217
 218     This function returns a list of tuples of the form ``(lineno, funcname,
 219     message)``.
 220
 221     :param filename: the path to the file to extract messages from
 222     :param method: a string specifying the extraction method (.e.g. "python")
 223     :param keywords: a dictionary mapping keywords (i.e. names of functions
 224                      that should be recognized as translation functions) to
 225                      tuples that specify which of their arguments contain
 226                      localizable strings
 227     :param comment_tags: a list of translator tags to search for and include
 228                          in the results
 229     :param strip_comment_tags: a flag that if set to `True` causes all comment
 230                                tags to be removed from the collected comments.
 231     :param options: a dictionary of additional options (optional)
 232     """
 233     fileobj = open(filename, 'rb')
 234     try:
 235         return list(extract(method, fileobj, keywords, comment_tags, options,
 236                             strip_comment_tags))
 237     finally:
 238         fileobj.close()
 239
 240
 241 def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
 242             options=None, strip_comment_tags=False):
 243     """Extract messages from the given file-like object using the specified
 244     extraction method.
 245
 246     This function returns tuples of the form ``(lineno, message, comments)``.
 247
 248     The implementation dispatches the actual extraction to plugins, based on the
 249     value of the ``method`` parameter.
 250
 251     >>> source = b'''# foo module
 252     ... def run(argv):
 253     ...    print(_('Hello, world!'))
 254     ... '''
 255
 256     >>> from babel._compat import BytesIO
 257     >>> for message in extract('python', BytesIO(source)):
 258     ...     print(message)
 259     (3, u'Hello, world!', [], None)
 260
 261     :param method: an extraction method (a callable), or
 262                    a string specifying the extraction method (.e.g. "python");
 263                    if this is a simple name, the extraction function will be
 264                    looked up by entry point; if it is an explicit reference
 265                    to a function (of the form ``package.module:funcname`` or
 266                    ``package.module.funcname``), the corresponding function
 267                    will be imported and used
 268     :param fileobj: the file-like object the messages should be extracted from
 269     :param keywords: a dictionary mapping keywords (i.e. names of functions
 270                      that should be recognized as translation functions) to
 271                      tuples that specify which of their arguments contain
 272                      localizable strings
 273     :param comment_tags: a list of translator tags to search for and include
 274                          in the results
 275     :param options: a dictionary of additional options (optional)
 276     :param strip_comment_tags: a flag that if set to `True` causes all comment
 277                                tags to be removed from the collected comments.
 278     :raise ValueError: if the extraction method is not registered
 279     """
 280     func = None
 281     if callable(method):
 282         func = method
 283     elif ':' in method or '.' in method:
 284         if ':' not in method:
 285             lastdot = method.rfind('.')
 286             module, attrname = method[:lastdot], method[lastdot + 1:]
 287         else:
 288             module, attrname = method.split(':', 1)
 289         func = getattr(__import__(module, {}, {}, [attrname]), attrname)
 290     else:
 291         try:
 292             from pkg_resources import working_set
 293         except ImportError:
 294             pass
 295         else:
 296             for entry_point in working_set.iter_entry_points(GROUP_NAME,
 297                                                              method):
 298                 func = entry_point.load(require=True)
 299                 break
 300         if func is None:
 301             # if pkg_resources is not available or no usable egg-info was found
 302             # (see #230), we resort to looking up the builtin extractors
 303             # directly
 304             builtin = {
 305                 'ignore': extract_nothing,
 306                 'python': extract_python,
 307                 'javascript': extract_javascript
 308             }
 309             func = builtin.get(method)
 310
 311     if func is None:
 312         raise ValueError('Unknown extraction method %r' % method)
 313
 314     results = func(fileobj, keywords.keys(), comment_tags,
 315                    options=options or {})
 316
 317     for lineno, funcname, messages, comments in results:
 318         if funcname:
 319             spec = keywords[funcname] or (1,)
 320         else:
 321             spec = (1,)
 322         if not isinstance(messages, (list, tuple)):
 323             messages = [messages]
 324         if not messages:
 325             continue
 326
 327         # Validate the messages against the keyword's specification
 328         context = None
 329         msgs = []
 330         invalid = False
 331         # last_index is 1 based like the keyword spec
 332         last_index = len(messages)
 333         for index in spec:
 334             if isinstance(index, tuple):
 335                 context = messages[index[0] - 1]
 336                 continue
 337             if last_index < index:
 338                 # Not enough arguments
 339                 invalid = True
 340                 break
 341             message = messages[index - 1]
 342             if message is None:
 343                 invalid = True
 344                 break
 345             msgs.append(message)
 346         if invalid:
 347             continue
 348
 349         # keyword spec indexes are 1 based, therefore '-1'
 350         if isinstance(spec[0], tuple):
 351             # context-aware *gettext method
 352             first_msg_index = spec[1] - 1
 353         else:
 354             first_msg_index = spec[0] - 1
 355         if not messages[first_msg_index]:
 356             # An empty string msgid isn't valid, emit a warning
 357             where = '%s:%i' % (hasattr(fileobj, 'name') and
 358                                fileobj.name or '(unknown)', lineno)
 359             sys.stderr.write((empty_msgid_warning % where) + '\n')
 360             continue
 361
 362         messages = tuple(msgs)
 363         if len(messages) == 1:
 364             messages = messages[0]
 365
 366         if strip_comment_tags:
 367             _strip_comment_tags(comments, comment_tags)
 368         yield lineno, messages, comments, context
 369
 370
 371 def extract_nothing(fileobj, keywords, comment_tags, options):
 372     """Pseudo extractor that does not actually extract anything, but simply
 373     returns an empty list.
 374     """
 375     return []
 376
 377
def extract_python(fileobj, keywords, comment_tags, options):
    """Extract messages from Python source code.

    The source is tokenized and scanned for calls to any of the `keywords`.
    It returns an iterator yielding tuples in the following form ``(lineno,
    funcname, message, comments)``, where ``lineno`` is the line of the call's
    opening parenthesis, ``message`` is either a single argument or a tuple
    of arguments, and ``comments`` is the list of translator comment lines
    collected just before the call.  An argument that contains no string
    literal at the top level of the call is reported as ``None``.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional); the
                    ``encoding`` entry is used when `parse_encoding` returns
                    nothing for the file, with ``iso-8859-1`` as the final
                    fallback
    :rtype: ``iterator``
    """
    funcname = lineno = message_lineno = None
    # -1: not inside a keyword call; 0: directly inside the keyword call's
    # parentheses; > 0: inside nested parentheses within that call.
    call_stack = -1
    # String literal pieces of the argument currently being read.
    buf = []
    # Arguments of the current call collected so far.
    messages = []
    # ``(lineno, text)`` pairs of pending translator comments.
    translator_comments = []
    in_def = in_translator_comments = False
    comment_tag = None

    # NOTE(review): parse_encoding presumably reads the file's coding
    # declaration -- confirm against babel.util.
    encoding = parse_encoding(fileobj) or options.get('encoding', 'iso-8859-1')

    # The tokenizer is fed text lines; on Python 3 the bytes read from the
    # file have to be decoded first.
    if PY2:
        next_line = fileobj.readline
    else:
        next_line = lambda: fileobj.readline().decode(encoding)

    tokens = generate_tokens(next_line)
    # NOTE: the order of the branches below matters; each token is handled by
    # the first branch whose condition holds.
    for tok, value, (lineno, _), _, _ in tokens:
        if call_stack == -1 and tok == NAME and value in ('def', 'class'):
            in_def = True
        elif tok == OP and value == '(':
            if in_def:
                # Avoid false positives for declarations such as:
                # def gettext(arg='message'):
                in_def = False
                continue
            if funcname:
                # Opening parenthesis after a keyword (or a nested one inside
                # its call): remember the line and go one level deeper.
                message_lineno = lineno
                call_stack += 1
        elif in_def and tok == OP and value == ':':
            # End of a class definition without parens
            in_def = False
            continue
        elif call_stack == -1 and tok == COMMENT:
            # Strip the comment token from the line
            if PY2:
                value = value.decode(encoding)
            value = value[1:].strip()
            if in_translator_comments and \
                    translator_comments[-1][0] == lineno - 1:
                # We're already inside a translator comment, continue appending
                translator_comments.append((lineno, value))
                continue
            # If execution reaches this point, let's see if comment line
            # starts with one of the comment tags
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    in_translator_comments = True
                    translator_comments.append((lineno, value))
                    break
        elif funcname and call_stack == 0:
            if tok == OP and value == ')':
                # End of the keyword call: flush the last argument.
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)

                if len(messages) > 1:
                    messages = tuple(messages)
                else:
                    messages = messages[0]
                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                        translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                yield (message_lineno, funcname, messages,
                       [comment[1] for comment in translator_comments])

                # Reset all per-call state before scanning for the next call.
                funcname = lineno = message_lineno = None
                call_stack = -1
                messages = []
                translator_comments = []
                in_translator_comments = False
            elif tok == STRING:
                # Unwrap quotes in a safe manner, maintaining the string's
                # encoding
                # https://sourceforge.net/tracker/?func=detail&atid=355470&
                # aid=617979&group_id=5470
                # NOTE(review): this eval()s a token taken from the scanned
                # file.  Builtins are removed and only STRING tokens reach
                # this point, but the input file should still be treated as
                # trusted -- confirm this is acceptable for all callers.
                value = eval('# coding=%s\n%s' % (str(encoding), value),
                             {'__builtins__': {}}, {})
                if PY2 and not isinstance(value, text_type):
                    value = value.decode(encoding)
                # Adjacent string literals accumulate and are joined into one
                # argument at the next ',' or ')'.
                buf.append(value)
            elif tok == OP and value == ',':
                # End of one argument.
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)
                if translator_comments:
                    # We have translator comments, and since we're on a
                    # comma(,) user is allowed to break into a new line
                    # Let's increase the last comment's lineno in order
                    # for the comment to still be a valid one
                    old_lineno, old_comment = translator_comments.pop()
                    translator_comments.append((old_lineno + 1, old_comment))
        elif call_stack > 0 and tok == OP and value == ')':
            # Closing a nested parenthesis inside the keyword call.
            call_stack -= 1
        elif funcname and call_stack == -1:
            # The keyword name was not followed by '(': not a call.
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value
 496             funcname = value
 497
 498
def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    The whole file is read, decoded and tokenized, and the token stream is
    scanned for calls to any of the `keywords`.  This is a generator yielding
    tuples of the form ``(lineno, funcname, messages, comments)``, where
    ``messages`` is a single argument or a tuple of arguments.  Calls without
    any arguments yield nothing.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
                    Supported options are:
                    * `encoding` -- the encoding used to decode the file;
                                    defaults to ``utf-8``.
                    * `jsx` -- set to false to disable JSX/E4X support.
                    * `template_string` -- set to false to disable ES6
                                           template string support.
    """
    from babel.messages.jslexer import Token, tokenize, unquote_string
    funcname = message_lineno = None
    # Arguments of the current call collected so far.
    messages = []
    # String value of the argument currently being read, if any.
    last_argument = None
    # ``(lineno, text)`` pairs of pending translator comments.
    translator_comments = []
    # True right after a '+', so that the next string is appended to
    # `last_argument` instead of replacing it.
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    # -1: not inside a keyword call; 0: directly inside the keyword call's
    # parentheses; > 0: inside nested parentheses within that call.
    call_stack = -1
    # NOTE(review): presumably makes the lexer keep dotted names such as
    # ``i18n.gettext`` as a single name token -- confirm against jslexer.
    dotted = any('.' in kw for kw in keywords)

    # NOTE: the order of the branches below matters; each token is handled by
    # the first branch whose condition holds.
    for token in tokenize(
        fileobj.read().decode(encoding),
        jsx=options.get("jsx", True),
        template_string=options.get("template_string", True),
        dotted=dotted
    ):
        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
            funcname and  # have a keyword...
            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
            token.type == 'template_string'  # this is a template string
        ):
            # Record the template string as the only argument and replace the
            # token with a synthetic ')' so that the call is closed below.
            message_lineno = token.lineno
            messages = [unquote_string(token.value)]
            call_stack = 0
            token = Token('operator', ')', token.lineno)

        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            # Drop the leading two characters of the comment token.
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                # Continuation of a translator comment from the previous line.
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            # Drop the two-character opening and closing delimiters.
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        # The first line is stripped on its own; the remaining
                        # lines lose only their common indentation.
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append((token.lineno + offset,
                                                        line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                # End of the keyword call: flush the last argument.
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                # Reset all per-call state before scanning for the next call.
                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type in ('string', 'template_string'):
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

            elif token.type == 'operator':
                if token.value == ',':
                    # End of one argument; arguments without a string value
                    # are recorded as None.
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            # Closing a nested parenthesis inside the keyword call.
            call_stack -= 1

        elif funcname and call_stack == -1:
            # The keyword name was not followed by '(': not a call.
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
            token.value in keywords and \
            (last_token is None or last_token.type != 'name' or
             last_token.value != 'function'):
            # A keyword name that is not directly preceded by ``function``,
            # i.e. not the declaration of a function with that name.
            funcname = token.value

        last_token = token