Mercurial > babel > old > mirror
changeset 341:672b6b8e945d
Added !JavaScript extractor
author | aronacher |
---|---|
date | Thu, 12 Jun 2008 16:26:52 +0000 |
parents | 292c639506a3 |
children | 603192024857 |
files | ChangeLog babel/messages/extract.py babel/messages/jslexer.py babel/messages/tests/extract.py setup.py |
diffstat | 5 files changed, 373 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/ChangeLog +++ b/ChangeLog @@ -5,7 +5,8 @@ * Fixed invalid message extraction methods causing an UnboundLocalError. * The stripping of the comment tags in comments is optional now and is done for each line in a comment. - + * a JavaScript extractor was added. + Version 0.9.2 http://svn.edgewall.org/repos/babel/tags/0.9.2/
# File: babel/messages/extract.py -- diff hunk @@ -428,3 +428,109 @@
# Unchanged diff context preceding the addition (tail of a definition that
# starts outside this hunk):
#   funcname = None elif tok == NAME and value in keywords: funcname = value


def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from; its content is read completely and
                    decoded with the configured encoding
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options; the ``encoding``
                    key (default ``'utf-8'``) selects the source encoding
    :return: an iterator over ``(lineno, funcname, message, comments)``
             tuples.  ``message`` is a single string for one string
             argument and a tuple for several arguments, with ``None``
             standing in for every non-string argument.
    :rtype: ``iterator``
    """
    from textwrap import dedent
    from babel.messages.jslexer import tokenize, unquote_string

    funcname = message_lineno = last_argument = last_token = None
    messages = []
    translator_comments = []
    encoding = options.get('encoding', 'utf-8')
    # -1: not inside a translation call; 0: directly inside its argument
    # list; > 0: inside parentheses nested within that argument list.
    call_stack = -1

    for token in tokenize(fileobj.read().decode(encoding)):
        if token.type == 'operator' and token.value == '(':
            if funcname:
                # Only the parenthesis that opens the translation call
                # determines the reported line.  Nested parentheses on later
                # lines must not move it, otherwise the line number is wrong
                # and adjacent translator comments get discarded.
                if call_stack == -1:
                    message_lineno = token.lineno
                call_stack += 1

        elif call_stack == -1 and token.type == 'linecomment':
            value = token.value[2:].strip()
            if translator_comments and \
               translator_comments[-1][0] == token.lineno - 1:
                # a line comment directly below an already collected
                # comment continues it, with or without a tag
                translator_comments.append((token.lineno, value))
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comments.append((token.lineno, value))
                    break

        elif token.type == 'multilinecomment':
            # only one multi-line comment may precede a translation
            translator_comments = []
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value.splitlines()
                    if lines:
                        # the first line was already stripped together with
                        # the comment markers; the remaining lines lose
                        # their common indentation only
                        lines[0] = lines[0].strip()
                        lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                        for offset, line in enumerate(lines):
                            translator_comments.append(
                                (token.lineno + offset, line))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                # end of the translation call: flush the pending argument
                # and normalize the collected arguments
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                # Comments don't apply unless they immediately precede the
                # message
                if translator_comments and \
                   translator_comments[-1][0] < message_lineno - 1:
                    translator_comments = []

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                translator_comments = []
                messages = []
                call_stack = -1

            elif token.type == 'string':
                last_argument = unquote_string(token.value)

            elif token.type == 'operator' and token.value == ',':
                if last_argument is not None:
                    messages.append(last_argument)
                    last_argument = None
                else:
                    # placeholder for an argument that is not a string
                    messages.append(None)

        elif call_stack > 0 and token.type == 'operator' \
             and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            # a keyword that is not followed by ``(`` is not a call
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
             token.value in keywords and \
             (last_token is None or last_token.type != 'name' or
              last_token.value != 'function'):
            # ``function gettext(...)`` defines the function and is ignored
            funcname = token.value

        last_token = token
# File: babel/messages/jslexer.py (new file in this changeset)
# -*- coding: utf-8 -*-
#
# Copyright (C) 2008 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""A simple JavaScript 1.5 lexer which is used for the JavaScript
extractor.
"""

import re
from operator import itemgetter

try:
    unichr
except NameError:
    # Python 3 has no ``unichr``; ``chr`` does the same job there.
    unichr = chr


operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', '?', ':'
]
# Longest operators first so that the regex alternation built from this
# list prefers ``>>>=`` over ``>>`` and so on.
operators.sort(key=len, reverse=True)

# single character escapes that map to a control character
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# ``(token type, pattern)`` pairs, tried in order.  A type of ``None``
# means the matched text is skipped without yielding a token.  Flags are
# passed explicitly because inline flags that are not at the start of a
# pattern are rejected by recent Python versions.
rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)', re.UNICODE)),
    # The hexadecimal alternative has to come first: otherwise the leading
    # ``0`` of ``0x1F`` already satisfies the decimal alternative.
    ('number', re.compile(r'''(
        (0x[a-fA-F0-9]+) |
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''', re.VERBOSE)),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL))
]

division_re = re.compile(r'/=?')
regex_re = re.compile(r'/.+?/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


class TokenError(ValueError):
    """Raised if the tokenizer stumbled upon invalid tokens."""


class Token(tuple):
    """Represents a token as returned by `tokenize`.

    A token is a ``(type, value, lineno)`` tuple whose items are also
    available as the attributes `type`, `value` and `lineno`.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        return tuple.__new__(cls, (type, value, lineno))

    type = property(itemgetter(0))
    value = property(itemgetter(1))
    lineno = property(itemgetter(2))


def indicates_division(token):
    """A helper function that helps the tokenizer to decide if the current
    token may be followed by a division operator.

    :param token: the `Token` that was processed last
    :return: `True` if a following ``/`` is a division operator, `False`
             if it starts a regular expression literal
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')


def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start with
    string delimiters (``'`` or ``"``.)

    The escapes from `escapes` are translated, a ``\\u`` followed by four
    hexadecimal digits becomes the corresponding character, and any other
    escaped character is kept without its backslash.

    :param string: the string literal including its delimiters
    :return: a string
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0

    while 1:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # A lone backslash at the very end escapes nothing.  Keep it
        # literally (it is emitted with the remaining text below) instead
        # of failing with an IndexError.
        if escape_pos + 1 >= len(string):
            pos = escape_pos
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  Try to consume up to four hexadecimal
        # characters and try to interpret them as a unicode character
        # point.  If there is no such character point, put all the
        # consumed characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return u''.join(result)


def tokenize(source):
    """Tokenize a JavaScript source.

    :param source: the JavaScript source as a string
    :return: generator of `Token` objects; whitespace and ``<!--`` lines
             are skipped, comments are yielded
    :raise TokenError: if the source contains text no rule can handle
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed significant token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                raise TokenError('invalid syntax around line %d' % lineno)

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            # Comments are as insignificant as whitespace for deciding
            # whether the next ``/`` divides: ``a /* c */ / b`` is a
            # division.
            if token_type not in ('linecomment', 'multilinecomment'):
                may_divide = indicates_division(token)
            yield token
        # tokens are reported with the line they start on
        lineno += len(line_re.findall(token_value))
        pos = match.end()
# File: babel/messages/tests/extract.py -- diff hunk @@ -321,6 +321,96 @@
# Unchanged diff context preceding the addition (tail of a test method that
# starts outside this hunk):
#   self.assertEqual([u'This is a multiline comment with', u'a prefix too'],
#                    messages[1][2])


class ExtractJavaScriptTestCase(unittest.TestCase):
    """Tests for message extraction from JavaScript sources."""

    def test_simple_extract(self):
        # plain calls, looked up through the generic ``extract`` front end
        source = StringIO("""\
msg1 = _('simple')
msg2 = gettext('simple')
msg3 = ngettext('s', 'p', 42)
        """)
        found = list(extract.extract('javascript', source,
                                     extract.DEFAULT_KEYWORDS, [], {}))

        expected = [
            (1, 'simple', []),
            (2, 'simple', []),
            (3, ('s', 'p'), []),
        ]
        self.assertEqual(expected, found)

    def test_various_calls(self):
        # only calls whose message arguments are string literals are kept
        source = StringIO("""\
msg1 = _(i18n_arg.replace(/"/, '"'))
msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
msg7 = _(hello.there)
msg8 = gettext('Rabbit')
msg9 = dgettext('wiki', model.addPage())
msg10 = dngettext(domain, 'Page', 'Pages', 3)
""")
        found = list(extract.extract('javascript', source,
                                     extract.DEFAULT_KEYWORDS, [], {}))
        expected = [
            (5, (u'bunny', u'bunnies'), []),
            (8, u'Rabbit', []),
            (10, (u'Page', u'Pages'), []),
        ]
        self.assertEqual(expected, found)

    def test_message_with_line_comment(self):
        # a tagged ``//`` comment directly above the call is attached
        source = StringIO("""\
// NOTE: hello
msg = _('Bonjour à tous')
""")
        found = list(
            extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        first = found[0]
        self.assertEqual(u'Bonjour à tous', first[2])
        self.assertEqual([u'NOTE: hello'], first[3])

    def test_message_with_multiline_comment(self):
        # continuation lines lose only their common indentation
        source = StringIO("""\
/* NOTE: hello
   and bonjour
     and servus */
msg = _('Bonjour à tous')
""")
        found = list(
            extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        first = found[0]
        self.assertEqual(u'Bonjour à tous', first[2])
        self.assertEqual([u'NOTE: hello', 'and bonjour', '  and servus'],
                         first[3])

    def test_ignore_function_definitions(self):
        # ``function gettext(...)`` defines the function, it is not a call
        source = StringIO("""\
function gettext(value) {
    return translations[language][value] || value;
}""")

        found = list(
            extract.extract_javascript(source, ('gettext',), [], {}))
        self.assertEqual(found, [])

    def test_misplaced_comments(self):
        # comments only count when they sit directly above the message
        source = StringIO("""\
/* NOTE: this won't show up */
foo()

/* NOTE: this will */
msg = _('Something')

// NOTE: this will show up
// too.
msg = _('Something else')

// NOTE: but this won't
bar()

_('no comment here')
""")
        found = list(
            extract.extract_javascript(source, ('_',), ['NOTE:'], {}))
        first, second, third = found[0], found[1], found[2]
        self.assertEqual(u'Something', first[2])
        self.assertEqual([u'NOTE: this will'], first[3])
        self.assertEqual(u'Something else', second[2])
        self.assertEqual([u'NOTE: this will show up', 'too.'], second[3])
        self.assertEqual(u'no comment here', third[2])
        self.assertEqual([], third[3])


# Unchanged diff context following the addition (definitions that continue
# outside this hunk):
#   class ExtractTestCase(unittest.TestCase):
#       def test_invalid_filter(self):
#
# Diff hunk @@ -382,6 +472,7 @@ registers the new test case inside the
# module's suite builder (only partly visible here):
#   suite = unittest.TestSuite()
#   suite.addTest(doctest.DocTestSuite(extract))
#   suite.addTest(unittest.makeSuite(ExtractPythonTestCase))
# + suite.addTest(unittest.makeSuite(ExtractJavaScriptTestCase))
#   suite.addTest(unittest.makeSuite(ExtractTestCase))
#   return suite