1 # -*- coding: utf-8 -*-
6 Basic infrastructure for extracting localizable messages from source files.
8 This module defines an extensible system for collecting localizable message
9 strings from a variety of sources. A native extractor for Python source
10 files is built in; extractors for other sources can be added using very
13 The main entry points into the extraction functionality are the functions
14 `extract_from_dir` and `extract_from_file`.
16 :copyright: (c) 2013 by the Babel Team.
17 :license: BSD, see LICENSE for more details.
21 from os.path import relpath
23 from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
25 from babel.util import parse_encoding, pathmatch
26 from babel._compat import PY2, text_type
27 from textwrap import dedent
30 GROUP_NAME = 'babel.extractors'
41 'pgettext': ((1, 'c'), 2),
42 'npgettext': ((1, 'c'), 2, 3)
45 DEFAULT_MAPPING = [('**.py', 'python')]
47 empty_msgid_warning = (
48 '%s: warning: Empty msgid. It is reserved by GNU gettext: gettext("") '
49 'returns the header entry with meta information, not the empty string.')
52 def _strip_comment_tags(comments, tags):
53 """Helper function for `extract` that strips comment tags from strings
54 in a list of comment lines. This functions operates in-place.
58 if line.startswith(tag):
59 return line[len(tag):].strip()
61 comments[:] = map(_strip, comments)
64 def extract_from_dir(dirname=None, method_map=DEFAULT_MAPPING,
65 options_map=None, keywords=DEFAULT_KEYWORDS,
66 comment_tags=(), callback=None, strip_comment_tags=False):
67 """Extract messages from any source files found in the given directory.
69 This function generates tuples of the form ``(filename, lineno, message,
72 Which extraction method is used per file is determined by the `method_map`
73 parameter, which maps extended glob patterns to extraction method names.
74 For example, the following is the default mapping:
77 ... ('**.py', 'python')
80 This basically says that files with the filename extension ".py" at any
81 level inside the directory should be processed by the "python" extraction
82 method. Files that don't match any of the mapping patterns are ignored. See
83 the documentation of the `pathmatch` function for details on the pattern
86 The following extended mapping would also use the "genshi" extraction
87 method on any file in a "templates" subdirectory:
90 ... ('**/templates/**.*', 'genshi'),
91 ... ('**.py', 'python')
94 The dictionary provided by the optional `options_map` parameter augments
95 these mappings. It uses extended glob patterns as keys, and the values are
96 dictionaries mapping option names to option values (both strings).
98 The glob patterns of the `options_map` do not necessarily need to be the
99 same as those used in the method mapping. For example, while all files in
100 the ``templates`` folders in an application may be Genshi applications, the
101 options for those files may differ based on extension:
104 ... '**/templates/**.txt': {
105 ... 'template_class': 'genshi.template:TextTemplate',
106 ... 'encoding': 'latin-1'
108 ... '**/templates/**.html': {
109 ... 'include_attrs': ''
113 :param dirname: the path to the directory to extract messages from. If
114 not given the current working directory is used.
115 :param method_map: a list of ``(pattern, method)`` tuples that map
116 extended glob patterns to extraction method names
117 :param options_map: a dictionary of additional options (optional)
118 :param keywords: a dictionary mapping keywords (i.e. names of functions
119 that should be recognized as translation functions) to
120 tuples that specify which of their arguments contain
122 :param comment_tags: a list of tags of translator comments to search for
123 and include in the results
124 :param callback: a function that is called for every file that messages are
125 extracted from, just before the extraction itself is
126 performed; the function is passed the filename, the name
127 of the extraction method and the options dictionary as
128 positional arguments, in that order
129 :param strip_comment_tags: a flag that if set to `True` causes all comment
130 tags to be removed from the collected comments.
134 dirname = os.getcwd()
135 if options_map is None:
138 absname = os.path.abspath(dirname)
139 for root, dirnames, filenames in os.walk(absname):
140 for subdir in dirnames:
141 if subdir.startswith('.') or subdir.startswith('_'):
142 dirnames.remove(subdir)
145 for filename in filenames:
146 filepath = os.path.join(root, filename).replace(os.sep, '/')
148 for message_tuple in check_and_call_extract_file(
161 def check_and_call_extract_file(filepath, method_map, options_map,
162 callback, keywords, comment_tags,
163 strip_comment_tags, dirpath=None):
164 """Checks if the given file matches an extraction method mapping, and if so, calls extract_from_file.
166 Note that the extraction method mappings are based relative to dirpath.
167 So, given an absolute path to a file `filepath`, we want to check using
168 just the relative path from `dirpath` to `filepath`.
170 :param filepath: An absolute path to a file that exists.
171 :param method_map: a list of ``(pattern, method)`` tuples that map
172 extended glob patterns to extraction method names
173 :param options_map: a dictionary of additional options (optional)
174 :param callback: a function that is called for every file that messages are
175 extracted from, just before the extraction itself is
176 performed; the function is passed the filename, the name
177 of the extraction method and the options dictionary as
178 positional arguments, in that order
179 :param keywords: a dictionary mapping keywords (i.e. names of functions
180 that should be recognized as translation functions) to
181 tuples that specify which of their arguments contain
183 :param comment_tags: a list of tags of translator comments to search for
184 and include in the results
185 :param strip_comment_tags: a flag that if set to `True` causes all comment
186 tags to be removed from the collected comments.
187 :param dirpath: the path to the directory to extract messages from.
189 # filename is the relative path from dirpath to the actual file
190 filename = relpath(filepath, dirpath)
192 for pattern, method in method_map:
193 if not pathmatch(pattern, filename):
197 for opattern, odict in options_map.items():
198 if pathmatch(opattern, filename):
201 callback(filename, method, options)
202 for message_tuple in extract_from_file(
205 comment_tags=comment_tags,
207 strip_comment_tags=strip_comment_tags
209 yield (filename, ) + message_tuple
def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
                      comment_tags=(), options=None, strip_comment_tags=False):
    """Extract messages from a specific file.

    The file is opened in binary mode and handed to `extract`; the tuples
    that `extract` yields (``(lineno, messages, comments, context)``) are
    collected into a list, and the file is closed before returning, also
    when extraction raises.

    :param method: a string specifying the extraction method (e.g. "python")
    :param filename: the path to the file to extract messages from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     the message strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :return: the list of tuples produced by `extract` for this file
    """
    # The context manager guarantees the handle is released even if the
    # extractor fails part-way through the file.
    with open(filename, 'rb') as fileobj:
        return list(extract(method, fileobj, keywords, comment_tags, options,
                            strip_comment_tags))
241 def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
242 options=None, strip_comment_tags=False):
243 """Extract messages from the given file-like object using the specified
246 This function returns tuples of the form ``(lineno, message, comments)``.
248 The implementation dispatches the actual extraction to plugins, based on the
249 value of the ``method`` parameter.
251 >>> source = b'''# foo module
253 ... print(_('Hello, world!'))
256 >>> from babel._compat import BytesIO
257 >>> for message in extract('python', BytesIO(source)):
259 (3, u'Hello, world!', [], None)
261 :param method: an extraction method (a callable), or
262 a string specifying the extraction method (e.g. "python");
263 if this is a simple name, the extraction function will be
264 looked up by entry point; if it is an explicit reference
265 to a function (of the form ``package.module:funcname`` or
266 ``package.module.funcname``), the corresponding function
267 will be imported and used
268 :param fileobj: the file-like object the messages should be extracted from
269 :param keywords: a dictionary mapping keywords (i.e. names of functions
270 that should be recognized as translation functions) to
271 tuples that specify which of their arguments contain
273 :param comment_tags: a list of translator tags to search for and include
275 :param options: a dictionary of additional options (optional)
276 :param strip_comment_tags: a flag that if set to `True` causes all comment
277 tags to be removed from the collected comments.
278 :raise ValueError: if the extraction method is not registered
283 elif ':' in method or '.' in method:
284 if ':' not in method:
285 lastdot = method.rfind('.')
286 module, attrname = method[:lastdot], method[lastdot + 1:]
288 module, attrname = method.split(':', 1)
289 func = getattr(__import__(module, {}, {}, [attrname]), attrname)
292 from pkg_resources import working_set
296 for entry_point in working_set.iter_entry_points(GROUP_NAME,
298 func = entry_point.load(require=True)
301 # if pkg_resources is not available or no usable egg-info was found
302 # (see #230), we resort to looking up the builtin extractors
305 'ignore': extract_nothing,
306 'python': extract_python,
307 'javascript': extract_javascript
309 func = builtin.get(method)
312 raise ValueError('Unknown extraction method %r' % method)
314 results = func(fileobj, keywords.keys(), comment_tags,
315 options=options or {})
317 for lineno, funcname, messages, comments in results:
319 spec = keywords[funcname] or (1,)
322 if not isinstance(messages, (list, tuple)):
323 messages = [messages]
327 # Validate the messages against the keyword's specification
331 # last_index is 1 based like the keyword spec
332 last_index = len(messages)
334 if isinstance(index, tuple):
335 context = messages[index[0] - 1]
337 if last_index < index:
338 # Not enough arguments
341 message = messages[index - 1]
349 # keyword spec indexes are 1 based, therefore '-1'
350 if isinstance(spec[0], tuple):
351 # context-aware *gettext method
352 first_msg_index = spec[1] - 1
354 first_msg_index = spec[0] - 1
355 if not messages[first_msg_index]:
356 # An empty string msgid isn't valid, emit a warning
357 where = '%s:%i' % (hasattr(fileobj, 'name') and
358 fileobj.name or '(unknown)', lineno)
359 sys.stderr.write((empty_msgid_warning % where) + '\n')
362 messages = tuple(msgs)
363 if len(messages) == 1:
364 messages = messages[0]
366 if strip_comment_tags:
367 _strip_comment_tags(comments, comment_tags)
368 yield lineno, messages, comments, context
def extract_nothing(fileobj, keywords, comment_tags, options):
    """Pseudo extractor that does not actually extract anything, but simply
    returns an empty list.

    The parameters exist only so that this function can be called the same
    way as the other extractors; none of them is read.

    :param fileobj: ignored
    :param keywords: ignored
    :param comment_tags: ignored
    :param options: ignored
    :return: a new empty list
    """
    return []
378 def extract_python(fileobj, keywords, comment_tags, options):
379 """Extract messages from Python source code.
381 It returns an iterator yielding tuples in the following form ``(lineno,
382 funcname, message, comments)``.
384 :param fileobj: the seekable, file-like object the messages should be
386 :param keywords: a list of keywords (i.e. function names) that should be
387 recognized as translation functions
388 :param comment_tags: a list of translator tags to search for and include
390 :param options: a dictionary of additional options (optional)
393 funcname = lineno = message_lineno = None
397 translator_comments = []
398 in_def = in_translator_comments = False
401 encoding = parse_encoding(fileobj) or options.get('encoding', 'iso-8859-1')
404 next_line = fileobj.readline
406 next_line = lambda: fileobj.readline().decode(encoding)
408 tokens = generate_tokens(next_line)
409 for tok, value, (lineno, _), _, _ in tokens:
410 if call_stack == -1 and tok == NAME and value in ('def', 'class'):
412 elif tok == OP and value == '(':
414 # Avoid false positives for declarations such as:
415 # def gettext(arg='message'):
419 message_lineno = lineno
421 elif in_def and tok == OP and value == ':':
422 # End of a class definition without parens
425 elif call_stack == -1 and tok == COMMENT:
426 # Strip the comment token from the line
428 value = value.decode(encoding)
429 value = value[1:].strip()
430 if in_translator_comments and \
431 translator_comments[-1][0] == lineno - 1:
432 # We're already inside a translator comment, continue appending
433 translator_comments.append((lineno, value))
435 # If execution reaches this point, let's see if comment line
436 # starts with one of the comment tags
437 for comment_tag in comment_tags:
438 if value.startswith(comment_tag):
439 in_translator_comments = True
440 translator_comments.append((lineno, value))
442 elif funcname and call_stack == 0:
443 if tok == OP and value == ')':
445 messages.append(''.join(buf))
448 messages.append(None)
450 if len(messages) > 1:
451 messages = tuple(messages)
453 messages = messages[0]
454 # Comments don't apply unless they immediately precede the
456 if translator_comments and \
457 translator_comments[-1][0] < message_lineno - 1:
458 translator_comments = []
460 yield (message_lineno, funcname, messages,
461 [comment[1] for comment in translator_comments])
463 funcname = lineno = message_lineno = None
466 translator_comments = []
467 in_translator_comments = False
469 # Unwrap quotes in a safe manner, maintaining the string's
471 # https://sourceforge.net/tracker/?func=detail&atid=355470&
472 # aid=617979&group_id=5470
473 value = eval('# coding=%s\n%s' % (str(encoding), value),
474 {'__builtins__': {}}, {})
475 if PY2 and not isinstance(value, text_type):
476 value = value.decode(encoding)
478 elif tok == OP and value == ',':
480 messages.append(''.join(buf))
483 messages.append(None)
484 if translator_comments:
485 # We have translator comments, and since we're on a
486 # comma(,) user is allowed to break into a new line
487 # Let's increase the last comment's lineno in order
488 # for the comment to still be a valid one
489 old_lineno, old_comment = translator_comments.pop()
490 translator_comments.append((old_lineno + 1, old_comment))
491 elif call_stack > 0 and tok == OP and value == ')':
493 elif funcname and call_stack == -1:
495 elif tok == NAME and value in keywords:
499 def extract_javascript(fileobj, keywords, comment_tags, options):
500 """Extract messages from JavaScript source code.
502 :param fileobj: the seekable, file-like object the messages should be
504 :param keywords: a list of keywords (i.e. function names) that should be
505 recognized as translation functions
506 :param comment_tags: a list of translator tags to search for and include
508 :param options: a dictionary of additional options (optional)
509 Supported options are:
510 * `jsx` -- set to false to disable JSX/E4X support.
511 * `template_string` -- set to false to disable ES6
512 template string support.
514 from babel.messages.jslexer import Token, tokenize, unquote_string
515 funcname = message_lineno = None
518 translator_comments = []
519 concatenate_next = False
520 encoding = options.get('encoding', 'utf-8')
523 dotted = any('.' in kw for kw in keywords)
525 for token in tokenize(
526 fileobj.read().decode(encoding),
527 jsx=options.get("jsx", True),
528 template_string=options.get("template_string", True),
531 if ( # Turn keyword`foo` expressions into keyword("foo") calls:
532 funcname and # have a keyword...
533 (last_token and last_token.type == 'name') and # we've seen nothing after the keyword...
534 token.type == 'template_string' # this is a template string
536 message_lineno = token.lineno
537 messages = [unquote_string(token.value)]
539 token = Token('operator', ')', token.lineno)
541 if token.type == 'operator' and token.value == '(':
543 message_lineno = token.lineno
546 elif call_stack == -1 and token.type == 'linecomment':
547 value = token.value[2:].strip()
548 if translator_comments and \
549 translator_comments[-1][0] == token.lineno - 1:
550 translator_comments.append((token.lineno, value))
553 for comment_tag in comment_tags:
554 if value.startswith(comment_tag):
555 translator_comments.append((token.lineno, value.strip()))
558 elif token.type == 'multilinecomment':
559 # only one multi-line comment may precede a translation
560 translator_comments = []
561 value = token.value[2:-2].strip()
562 for comment_tag in comment_tags:
563 if value.startswith(comment_tag):
564 lines = value.splitlines()
566 lines[0] = lines[0].strip()
567 lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
568 for offset, line in enumerate(lines):
569 translator_comments.append((token.lineno + offset,
573 elif funcname and call_stack == 0:
574 if token.type == 'operator' and token.value == ')':
575 if last_argument is not None:
576 messages.append(last_argument)
577 if len(messages) > 1:
578 messages = tuple(messages)
580 messages = messages[0]
584 # Comments don't apply unless they immediately precede the
586 if translator_comments and \
587 translator_comments[-1][0] < message_lineno - 1:
588 translator_comments = []
590 if messages is not None:
591 yield (message_lineno, funcname, messages,
592 [comment[1] for comment in translator_comments])
594 funcname = message_lineno = last_argument = None
595 concatenate_next = False
596 translator_comments = []
600 elif token.type in ('string', 'template_string'):
601 new_value = unquote_string(token.value)
603 last_argument = (last_argument or '') + new_value
604 concatenate_next = False
606 last_argument = new_value
608 elif token.type == 'operator':
609 if token.value == ',':
610 if last_argument is not None:
611 messages.append(last_argument)
614 messages.append(None)
615 concatenate_next = False
616 elif token.value == '+':
617 concatenate_next = True
619 elif call_stack > 0 and token.type == 'operator' \
620 and token.value == ')':
623 elif funcname and call_stack == -1:
626 elif call_stack == -1 and token.type == 'name' and \
627 token.value in keywords and \
628 (last_token is None or last_token.type != 'name' or
629 last_token.value != 'function'):
630 funcname = token.value