# HG changeset patch
# User pjenvey
# Date 1182472734 0
# Node ID 84a9e5f97658f53aab04205e993656b921abdb8b
# Parent  f2c78a271159129dec1faf57572910e8fe9aae25
made the python extractor detect source file encodings from the magic
encoding comment (or default to ascii) and convert message strings and
comments to unicode

fixes #23

diff --git a/babel/messages/extract.py b/babel/messages/extract.py
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -29,7 +29,7 @@
 import sys
 from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
 
-from babel.util import pathmatch, relpath
+from babel.util import parse_encoding, pathmatch, relpath
 
 __all__ = ['extract', 'extract_from_dir', 'extract_from_file']
 __docformat__ = 'restructuredtext en'
@@ -195,7 +195,7 @@
     >>> from StringIO import StringIO
     >>> for message in extract('python', StringIO(source)):
     ...     print message
-    (3, 'Hello, world!', [])
+    (3, u'Hello, world!', [])
 
     :param method: a string specifying the extraction method (.e.g. "python")
     :param fileobj: the file-like object the messages should be extracted from
@@ -238,7 +238,8 @@
 def extract_python(fileobj, keywords, comment_tags, options):
     """Extract messages from Python source code.
 
-    :param fileobj: the file-like object the messages should be extracted from
+    :param fileobj: the seekable, file-like object the messages should be
+                    extracted from
     :param keywords: a list of keywords (i.e. function names) that should be
                      recognized as translation functions
     :param comment_tags: a list of translator tags to search for and include
@@ -255,13 +256,15 @@
     in_args = False
     in_translator_comments = False
 
+    encoding = parse_encoding(fileobj) or options.get('encoding', 'ascii')
+
     tokens = generate_tokens(fileobj.readline)
     for tok, value, (lineno, _), _, _ in tokens:
         if funcname and tok == OP and value == '(':
             in_args = True
         elif tok == COMMENT:
             # Strip the comment token from the line
-            value = value[1:].strip()
+            value = value.decode(encoding)[1:].strip()
             if in_translator_comments and \
                     translator_comments[-1][0] == lineno - 1:
                 # We're already inside a translator comment, continue appending
@@ -300,8 +303,14 @@
                 messages = []
                 translator_comments = []
             elif tok == STRING:
-                # Unwrap quotes in a safe manner
-                buf.append(eval(value, {'__builtins__':{}}, {}))
+                # Unwrap quotes in a safe manner, maintaining the string's
+                # encoding
+                # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+                value = eval('# coding=%s\n%s' % (encoding, value),
+                             {'__builtins__':{}}, {})
+                if isinstance(value, str):
+                    value = value.decode(encoding)
+                buf.append(value)
             elif tok == OP and value == ',':
                 messages.append(''.join(buf))
                 del buf[:]
diff --git a/babel/messages/tests/extract.py b/babel/messages/tests/extract.py
--- a/babel/messages/tests/extract.py
+++ b/babel/messages/tests/extract.py
@@ -11,6 +11,7 @@
 # individuals. For the exact contribution history, see the revision
 # history and logs, available at http://babel.edgewall.org/log/.
 
+import codecs
 import doctest
 from StringIO import StringIO
 import unittest
@@ -23,7 +24,7 @@
     def test_unicode_string_arg(self):
         buf = StringIO("msg = _(u'Foo Bar')")
         messages = list(extract.extract_python(buf, ('_',), [], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
+        self.assertEqual(u'Foo Bar', messages[0][2])
 
     def test_comment_tag(self):
         buf = StringIO("""
@@ -31,8 +32,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment'], messages[0][3])
 
     def test_comment_tag_multiline(self):
         buf = StringIO("""
@@ -41,8 +42,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment', 'with a second line'],
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment', u'with a second line'],
                          messages[0][3])
 
     def test_translator_comments_with_previous_non_translator_comments(self):
@@ -54,8 +55,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment', 'with a second line'],
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment', u'with a second line'],
                          messages[0][3])
 
     def test_comment_tags_not_on_start_of_comment(self):
@@ -67,8 +68,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['This one will be'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'This one will be'], messages[0][3])
 
     def test_multiple_comment_tags(self):
         buf = StringIO("""
@@ -81,11 +82,11 @@
 """)
         messages = list(extract.extract_python(buf, ('_',),
                                                ['NOTE1:', 'NOTE2:'], {}))
-        self.assertEqual('Foo Bar1', messages[0][2])
-        self.assertEqual(['A translation comment for tag1',
-                          'with a second line'], messages[0][3])
-        self.assertEqual('Foo Bar2', messages[1][2])
-        self.assertEqual(['A translation comment for tag2'], messages[1][3])
+        self.assertEqual(u'Foo Bar1', messages[0][2])
+        self.assertEqual([u'A translation comment for tag1',
+                          u'with a second line'], messages[0][3])
+        self.assertEqual(u'Foo Bar2', messages[1][2])
+        self.assertEqual([u'A translation comment for tag2'], messages[1][3])
 
     def test_two_succeeding_comments(self):
         buf = StringIO("""
@@ -94,8 +95,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['one', 'NOTE: two'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'one', u'NOTE: two'], messages[0][3])
 
     def test_invalid_translator_comments(self):
         buf = StringIO("""
@@ -105,7 +106,7 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
+        self.assertEqual(u'Foo Bar', messages[0][2])
         self.assertEqual([], messages[0][3])
 
     def test_invalid_translator_comments2(self):
@@ -120,9 +121,9 @@
 hello = _('Hello')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Hi there!', messages[0][2])
-        self.assertEqual(['Hi!'], messages[0][3])
-        self.assertEqual('Hello', messages[1][2])
+        self.assertEqual(u'Hi there!', messages[0][2])
+        self.assertEqual([u'Hi!'], messages[0][3])
+        self.assertEqual(u'Hello', messages[1][2])
         self.assertEqual([], messages[1][3])
 
     def test_invalid_translator_comments3(self):
@@ -133,9 +134,46 @@
 hithere = _('Hi there!')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Hi there!', messages[0][2])
+        self.assertEqual(u'Hi there!', messages[0][2])
         self.assertEqual([], messages[0][3])
 
+    def test_utf8_message(self):
+        buf = StringIO("""
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'],
+                                               {'encoding': 'utf-8'}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_message_with_magic_comment(self):
+        buf = StringIO("""# -*- coding: utf-8 -*-
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_message_with_utf8_bom(self):
+        buf = StringIO(codecs.BOM_UTF8 + """
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_raw_strings_match_unicode_strings(self):
+        buf = StringIO(codecs.BOM_UTF8 + """
+msg = _('Bonjour à tous')
+msgu = _(u'Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual(messages[0][2], messages[1][2])
+
 def suite():
     suite = unittest.TestSuite()
     suite.addTest(doctest.DocTestSuite(extract))
diff --git a/babel/util.py b/babel/util.py
--- a/babel/util.py
+++ b/babel/util.py
@@ -13,14 +13,65 @@
 """Various utility classes and functions."""
 
+import codecs
 from datetime import timedelta, tzinfo
 import os
+import parser
 import re
 import time
 
 __all__ = ['pathmatch', 'relpath', 'UTC', 'LOCALTZ']
 __docformat__ = 'restructuredtext en'
 
+# Regexp to match python magic encoding line
+PYTHON_MAGIC_COMMENT_re = re.compile(
+    r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', re.VERBOSE)
+def parse_encoding(fp):
+    """Deduce the encoding of a source file from magic comment.
+
+    It does this in the same way as the `Python interpreter`__
+
+    .. __: http://docs.python.org/ref/encodings.html
+
+    The ``fp`` argument should be a seekable file object.
+
+    (From Jeff Dairiki)
+    """
+    pos = fp.tell()
+    fp.seek(0)
+    try:
+        line1 = fp.readline()
+        has_bom = line1.startswith(codecs.BOM_UTF8)
+        if has_bom:
+            line1 = line1[len(codecs.BOM_UTF8):]
+
+        m = PYTHON_MAGIC_COMMENT_re.match(line1)
+        if not m:
+            try:
+                parser.suite(line1)
+            except SyntaxError:
+                # Either it's a real syntax error, in which case the source is
+                # not valid python source, or line2 is a continuation of line1,
+                # in which case we don't want to scan line2 for a magic
+                # comment.
+                pass
+            else:
+                line2 = fp.readline()
+                m = PYTHON_MAGIC_COMMENT_re.match(line2)
+
+        if has_bom:
+            if m:
+                raise SyntaxError(
+                    "python refuses to compile code with both a UTF8 "
+                    "byte-order-mark and a magic encoding comment")
+            return 'utf_8'
+        elif m:
+            return m.group(1)
+        else:
+            return None
+    finally:
+        fp.seek(pos)
+
 def pathmatch(pattern, filename):
     """Extended pathname pattern matching.
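Example usage (not part of the changeset): a minimal sketch of the behaviour this
patch is expected to produce once applied, mirroring the new tests above. The
sample module and the 'Bonjour à tous' strings below are illustrative only.

# -*- coding: utf-8 -*-
from StringIO import StringIO

from babel.messages.extract import extract_python
from babel.util import parse_encoding

# A byte-string "file" with a PEP 263 magic encoding comment, as the
# extractor would read it from disk.
source = StringIO("""# -*- coding: utf-8 -*-
# NOTE: a translator comment
msg = _('Bonjour à tous')
""")

# parse_encoding() looks at the magic comment (or a UTF-8 BOM) and restores
# the file position before returning.
print parse_encoding(source)        # utf-8

# With the patch applied, extract_python() decodes both the message and the
# translator comment using the detected encoding (falling back to the
# 'encoding' option, then to ascii), so both come back as unicode.
results = list(extract_python(source, ('_',), ['NOTE:'], {}))
print repr(results[0][2])           # u'Bonjour \xe0 tous'
print results[0][3]                 # [u'a translator comment']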