# -*- coding: utf-8 -*-
#
# Copyright (C) 2007-2011 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""Basic infrastructure for extracting localizable messages from source files.

This module defines an extensible system for collecting localizable message
strings from a variety of sources. A native extractor for Python source files
is built in; extractors for other sources can be added using very simple
plugins.

The main entry points into the extraction functionality are the functions
`extract_from_dir` and `extract_from_file`.
"""

import os
import sys
from textwrap import dedent
from tokenize import generate_tokens, COMMENT, NAME, OP, STRING

from babel.util import parse_encoding, pathmatch, relpath

__all__ = ['extract', 'extract_from_dir', 'extract_from_file']
__docformat__ = 'restructuredtext en'

GROUP_NAME = 'babel.extractors'

DEFAULT_KEYWORDS = {
    '_': None,
    'gettext': None,
    'ngettext': (1, 2),
    'ugettext': None,
    'ungettext': (1, 2),
    'dgettext': (2,),
    'dngettext': (2, 3),
    'N_': None
}

DEFAULT_MAPPING = [('**.py', 'python')]

empty_msgid_warning = (
    '%s: warning: Empty msgid. It is reserved by GNU gettext: gettext("") '
    'returns the header entry with meta information, not the empty string.')


def _strip_comment_tags(comments, tags):
    """Helper function for `extract` that strips comment tags from strings
    in a list of comment lines.  This function operates in-place.
    """
    def _strip(line):
        for tag in tags:
            if line.startswith(tag):
                return line[len(tag):].strip()
        return line
    comments[:] = map(_strip, comments)


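# Illustrative sketch only (not part of the module's API): shows the in-place
# behaviour of `_strip_comment_tags` for a hypothetical list of collected
# comments and the tag 'NOTE:'.
def _strip_comment_tags_example():
    comments = [u'NOTE: The name of the current user', u'shown in the header']
    _strip_comment_tags(comments, [u'NOTE:'])
    # comments is now [u'The name of the current user', u'shown in the header']
    return comments

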
def extract_from_dir(dirname=os.getcwd(), method_map=DEFAULT_MAPPING,
                     options_map=None, keywords=DEFAULT_KEYWORDS,
                     comment_tags=(), callback=None, strip_comment_tags=False):
    """Extract messages from any source files found in the given directory.

    This function generates tuples of the form:

    ``(filename, lineno, message, comments)``

    Which extraction method is used per file is determined by the `method_map`
    parameter, which maps extended glob patterns to extraction method names.
    For example, the following is the default mapping:

    >>> method_map = [
    ...     ('**.py', 'python')
    ... ]

    This basically says that files with the filename extension ".py" at any
    level inside the directory should be processed by the "python" extraction
    method. Files that don't match any of the mapping patterns are ignored. See
    the documentation of the `pathmatch` function for details on the pattern
    syntax.

    The following extended mapping would also use the "genshi" extraction
    method on any file in a "templates" subdirectory:

    >>> method_map = [
    ...     ('**/templates/**.*', 'genshi'),
    ...     ('**.py', 'python')
    ... ]

    The dictionary provided by the optional `options_map` parameter augments
    these mappings. It uses extended glob patterns as keys, and the values are
    dictionaries mapping option names to option values (both strings).

    The glob patterns of the `options_map` do not necessarily need to be the
    same as those used in the method mapping. For example, while all files in
    the ``templates`` folders in an application may be Genshi templates, the
    options for those files may differ based on extension:

    >>> options_map = {
    ...     '**/templates/**.txt': {
    ...         'template_class': 'genshi.template:TextTemplate',
    ...         'encoding': 'latin-1'
    ...     },
    ...     '**/templates/**.html': {
    ...         'include_attrs': ''
    ...     }
    ... }

    :param dirname: the path to the directory to extract messages from
    :param method_map: a list of ``(pattern, method)`` tuples that maps
                       extended glob patterns to extraction method names
    :param options_map: a dictionary of additional options (optional)
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of tags of translator comments to search for
                         and include in the results
    :param callback: a function that is called for every file that messages
                     are extracted from, just before the extraction itself is
                     performed; the function is passed the filename, the name
                     of the extraction method and the options dictionary as
                     positional arguments, in that order
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :return: an iterator over ``(filename, lineno, message, comments)`` tuples
    :rtype: ``iterator``
    :see: `pathmatch`
    """
    if options_map is None:
        options_map = {}

    absname = os.path.abspath(dirname)
    for root, dirnames, filenames in os.walk(absname):
        for subdir in dirnames:
            if subdir.startswith('.') or subdir.startswith('_'):
                dirnames.remove(subdir)
        dirnames.sort()
        filenames.sort()
        for filename in filenames:
            filename = relpath(
                os.path.join(root, filename).replace(os.sep, '/'),
                dirname
            )
            for pattern, method in method_map:
                if pathmatch(pattern, filename):
                    filepath = os.path.join(absname, filename)
                    options = {}
                    for opattern, odict in options_map.items():
                        if pathmatch(opattern, filename):
                            options = odict
                    if callback:
                        callback(filename, method, options)
                    for lineno, message, comments in \
                            extract_from_file(method, filepath,
                                              keywords=keywords,
                                              comment_tags=comment_tags,
                                              options=options,
                                              strip_comment_tags=
                                                  strip_comment_tags):
                        yield filename, lineno, message, comments
                    break


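# Illustrative sketch only: wires the mapping examples from the docstring
# above into an actual call.  The "myproject" directory name and the 'NOTE:'
# comment tag are hypothetical.
def _extract_from_dir_example():
    method_map = [
        ('**/templates/**.html', 'genshi'),
        ('**.py', 'python')
    ]
    options_map = {
        '**/templates/**.html': {'include_attrs': ''}
    }
    extracted = extract_from_dir('myproject', method_map, options_map,
                                 comment_tags=('NOTE:',))
    for filename, lineno, message, comments in extracted:
        print '%s:%d: %s' % (filename, lineno, message)

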
def extract_from_file(method, filename, keywords=DEFAULT_KEYWORDS,
                      comment_tags=(), options=None, strip_comment_tags=False):
    """Extract messages from a specific file.

    This function returns a list of tuples of the form:

    ``(lineno, message, comments)``

    :param filename: the path to the file to extract messages from
    :param method: a string specifying the extraction method (e.g. "python")
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :param options: a dictionary of additional options (optional)
    :return: the list of extracted messages
    :rtype: `list`
    """
    fileobj = open(filename, 'U')
    try:
        return list(extract(method, fileobj, keywords, comment_tags, options,
                            strip_comment_tags))
    finally:
        fileobj.close()


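# Illustrative sketch only: extracting from a single file with one extra
# keyword.  The file path and the "_T" keyword are hypothetical.
def _extract_from_file_example():
    keywords = DEFAULT_KEYWORDS.copy()
    keywords['_T'] = None
    return extract_from_file('python', 'myproject/main.py',
                             keywords=keywords, comment_tags=('NOTE:',))

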
def extract(method, fileobj, keywords=DEFAULT_KEYWORDS, comment_tags=(),
            options=None, strip_comment_tags=False):
    """Extract messages from the given file-like object using the specified
    extraction method.

    This function returns tuples of the form:

    ``(lineno, message, comments)``

    The implementation dispatches the actual extraction to plugins based on
    the value of the ``method`` parameter.

    >>> source = '''# foo module
    ... def run(argv):
    ...     print _('Hello, world!')
    ... '''

    >>> from StringIO import StringIO
    >>> for message in extract('python', StringIO(source)):
    ...     print message
    (3, u'Hello, world!', [])

    :param method: a string specifying the extraction method (e.g. "python");
                   if this is a simple name, the extraction function will be
                   looked up by entry point; if it is an explicit reference
                   to a function (of the form ``package.module:funcname`` or
                   ``package.module.funcname``), the corresponding function
                   will be imported and used
    :param fileobj: the file-like object the messages should be extracted from
    :param keywords: a dictionary mapping keywords (i.e. names of functions
                     that should be recognized as translation functions) to
                     tuples that specify which of their arguments contain
                     localizable strings
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :param strip_comment_tags: a flag that if set to `True` causes all comment
                               tags to be removed from the collected comments.
    :return: an iterator over ``(lineno, message, comments)`` tuples
    :rtype: `iterator`
    :raise ValueError: if the extraction method is not registered
    """
    func = None
    if ':' in method or '.' in method:
        if ':' not in method:
            lastdot = method.rfind('.')
            module, attrname = method[:lastdot], method[lastdot + 1:]
        else:
            module, attrname = method.split(':', 1)
        func = getattr(__import__(module, {}, {}, [attrname]), attrname)
    else:
        try:
            from pkg_resources import working_set
        except ImportError:
            # pkg_resources is not available, so we resort to looking up the
            # builtin extractors directly
            builtin = {'ignore': extract_nothing, 'python': extract_python}
            func = builtin.get(method)
        else:
            for entry_point in working_set.iter_entry_points(GROUP_NAME,
                                                             method):
                func = entry_point.load(require=True)
                break
    if func is None:
        raise ValueError('Unknown extraction method %r' % method)

    results = func(fileobj, keywords.keys(), comment_tags,
                   options=options or {})

    for lineno, funcname, messages, comments in results:
        if funcname:
            spec = keywords[funcname] or (1,)
        else:
            spec = (1,)
        if not isinstance(messages, (list, tuple)):
            messages = [messages]
        if not messages:
            continue

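        # For example (hypothetical input): with the default spec for
        # 'ngettext' -- (1, 2) -- a call such as
        #     ngettext('%(num)d apple', '%(num)d apples', num)
        # reaches this point as messages == ['%(num)d apple',
        # '%(num)d apples', None]; the validation below picks out the first
        # and second arguments and skips the call if either is missing.
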
        # Validate the messages against the keyword's specification
        msgs = []
        invalid = False
        # last_index is 1 based like the keyword spec
        last_index = len(messages)
        for index in spec:
            if last_index < index:
                # Not enough arguments
                invalid = True
                break
            message = messages[index - 1]
            if message is None:
                invalid = True
                break
            msgs.append(message)
        if invalid:
            continue

        first_msg_index = spec[0] - 1
        if not messages[first_msg_index]:
            # An empty string msgid isn't valid, emit a warning
            where = '%s:%i' % (hasattr(fileobj, 'name') and \
                               fileobj.name or '(unknown)', lineno)
            print >> sys.stderr, empty_msgid_warning % where
            continue

        messages = tuple(msgs)
        if len(messages) == 1:
            messages = messages[0]

        if strip_comment_tags:
            _strip_comment_tags(comments, comment_tags)
        yield lineno, messages, comments


def extract_nothing(fileobj, keywords, comment_tags, options):
    """Pseudo extractor that does not actually extract anything, but simply
    returns an empty list.
    """
    return []


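# Illustrative sketch only: a custom extraction method uses the same interface
# as `extract_nothing` above and can be registered under the 'babel.extractors'
# entry point group or passed to `extract` as an explicit reference.  This
# hypothetical extractor reports every non-blank line of a file as a message.
def _extract_lines_example(fileobj, keywords, comment_tags, options):
    encoding = options.get('encoding', 'utf-8')
    for lineno, line in enumerate(fileobj):
        line = line.strip()
        if line:
            # (lineno, funcname, message, comments); funcname is None because
            # the messages are not tied to any translation function call
            yield lineno + 1, None, line.decode(encoding), []

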
def extract_python(fileobj, keywords, comment_tags, options):
    """Extract messages from Python source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    funcname = lineno = message_lineno = None
    call_stack = -1
    buf = []
    messages = []
    translator_comments = []
    in_def = in_translator_comments = False
    comment_tag = None

    encoding = parse_encoding(fileobj) or options.get('encoding', 'iso-8859-1')

    tokens = generate_tokens(fileobj.readline)
    for tok, value, (lineno, _), _, _ in tokens:
        if call_stack == -1 and tok == NAME and value in ('def', 'class'):
            in_def = True
        elif tok == OP and value == '(':
            if in_def:
                # Avoid false positives for declarations such as:
                # def gettext(arg='message'):
                in_def = False
                continue
            if funcname:
                message_lineno = lineno
                call_stack += 1
        elif in_def and tok == OP and value == ':':
            # End of a class definition without parens
            in_def = False
            continue
        elif call_stack == -1 and tok == COMMENT:
            # Strip the comment token from the line
            value = value.decode(encoding)[1:].strip()
            if in_translator_comments and \
                    translator_comments[-1][0] == lineno - 1:
                # We're already inside a translator comment, continue appending
                translator_comments.append((lineno, value))
                continue
            # If execution reaches this point, let's see if comment line
            # starts with one of the comment tags
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    in_translator_comments = True
                    translator_comments.append((lineno, value))
                    break
        elif funcname and call_stack == 0:
            if tok == OP and value == ')':
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)

                if len(messages) > 1:
                    messages = tuple(messages)
                else:
                    messages = messages[0]
                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                        translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                yield (message_lineno, funcname, messages,
                       [comment[1] for comment in translator_comments])

                funcname = lineno = message_lineno = None
                call_stack = -1
                messages = []
                translator_comments = []
                in_translator_comments = False
            elif tok == STRING:
                # Unwrap quotes in a safe manner, maintaining the string's
                # encoding
                # https://sourceforge.net/tracker/?func=detail&atid=355470&
                # aid=617979&group_id=5470
                value = eval('# coding=%s\n%s' % (encoding, value),
                             {'__builtins__':{}}, {})
                if isinstance(value, str):
                    value = value.decode(encoding)
                buf.append(value)
            elif tok == OP and value == ',':
                if buf:
                    messages.append(''.join(buf))
                    del buf[:]
                else:
                    messages.append(None)
                if translator_comments:
                    # We have translator comments, and since we're on a
                    # comma (,) the user is allowed to break the call into a
                    # new line.  Bump the last comment's line number so that
                    # it still counts as immediately preceding the message.
                    old_lineno, old_comment = translator_comments.pop()
                    translator_comments.append((old_lineno+1, old_comment))
        elif call_stack > 0 and tok == OP and value == ')':
            call_stack -= 1
        elif funcname and call_stack == -1:
            funcname = None
        elif tok == NAME and value in keywords:
            funcname = value


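# Illustrative sketch only: how a translator comment is reported by
# `extract_python` for a hypothetical snippet, assuming 'NOTE:' is one of the
# tags passed in ``comment_tags``.
def _extract_python_comments_example():
    from StringIO import StringIO
    source = "# NOTE: The name of the current user\nname = _('Name')\n"
    return list(extract_python(StringIO(source),
                               DEFAULT_KEYWORDS.keys(), ['NOTE:'], {}))
    # -> [(2, '_', u'Name', [u'NOTE: The name of the current user'])]

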
def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional)
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    from babel.messages.jslexer import tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    concatenate_next = False
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    call_stack = -1

    for token in tokenize(fileobj.read().decode(encoding)):
        if token.type == 'operator' and token.value == '(':
            if funcname:
                message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
                    translator_comments[-1][0] == token.lineno - 1:
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value.strip()))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append((token.lineno + offset,
                                                        line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                        translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                concatenate_next = False
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type == 'string':
                new_value = unquote_string(token.value)
                if concatenate_next:
                    last_argument = (last_argument or '') + new_value
                    concatenate_next = False
                else:
                    last_argument = new_value

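            # For example (hypothetical source): gettext('Hello, ' + 'world!')
            # produces two string tokens joined by a '+' operator; the
            # concatenate_next flag set in the operator branch below makes the
            # second token extend last_argument instead of starting a new one.
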
            elif token.type == 'operator':
                if token.value == ',':
                    if last_argument is not None:
                        messages.append(last_argument)
                        last_argument = None
                    else:
                        messages.append(None)
                    concatenate_next = False
                elif token.value == '+':
                    concatenate_next = True

        elif call_stack > 0 and token.type == 'operator' \
                and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
                token.value in keywords and \
                (last_token is None or last_token.type != 'name' or
                 last_token.value != 'function'):
            funcname = token.value

        last_token = token


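# Illustrative sketch only: running the JavaScript extractor directly on an
# in-memory file.  The source snippet and keyword list are hypothetical; the
# expected result is shown for orientation.
def _extract_javascript_example():
    from StringIO import StringIO
    source = "var msg = gettext('Hello, world!');\n"
    return list(extract_javascript(StringIO(source), ['gettext'], [], {}))
    # -> [(1, 'gettext', u'Hello, world!', [])]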