changeset 167:40563335820a stable-0.8.x

Ported [165], [167], and [169] to 0.8.x branch.
author cmlenz
date Fri, 22 Jun 2007 11:13:10 +0000
parents 7bb1fb5d4ab6
children f089d49cbb2f
files ChangeLog babel/messages/extract.py babel/messages/tests/extract.py babel/util.py
diffstat 4 files changed, 129 insertions(+), 28 deletions(-)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,4 @@
+
 Version 0.8.1
 http://svn.edgewall.org/repos/babel/tags/0.8.1/
 (?, from branches/stable/0.8.x)
@@ -15,6 +16,8 @@
    the order those messages are found when walking the source tree, is no
    longer subject to differences between platforms; directory and file names
    are now always sorted alphabetically.
+ * The Python message extractor now respects the magic encoding comment, so
+   files containing non-ASCII characters are handled correctly (ticket #23).
 
 
 Version 0.8
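
For context on the new ChangeLog entry above: the encoding declaration that the extractor now honors is the standard magic comment defined by PEP 263, placed on the first or second line of a source file, for example:

    # -*- coding: utf-8 -*-

A leading UTF-8 byte-order mark is recognized as well. As the extract.py hunk below shows, files without either declaration fall back to the 'encoding' extraction option and, failing that, to ASCII.
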
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -27,9 +27,9 @@
 except NameError:
     from sets import Set as set
 import sys
-from tokenize import generate_tokens, NAME, OP, STRING, COMMENT
+from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
 
-from babel.util import pathmatch, relpath
+from babel.util import parse_encoding, pathmatch, relpath
 
 __all__ = ['extract', 'extract_from_dir', 'extract_from_file']
 __docformat__ = 'restructuredtext en'
@@ -195,7 +195,7 @@
     >>> from StringIO import StringIO
     >>> for message in extract('python', StringIO(source)):
     ...     print message
-    (3, 'Hello, world!', [])
+    (3, u'Hello, world!', [])
     
     :param method: a string specifying the extraction method (e.g. "python")
     :param fileobj: the file-like object the messages should be extracted from
@@ -238,7 +238,8 @@
 def extract_python(fileobj, keywords, comment_tags, options):
     """Extract messages from Python source code.
     
-    :param fileobj: the file-like object the messages should be extracted from
+    :param fileobj: the seekable, file-like object the messages should be
+                    extracted from
     :param keywords: a list of keywords (i.e. function names) that should be
                      recognized as translation functions
     :param comment_tags: a list of translator tags to search for and include
@@ -255,13 +256,15 @@
     in_args = False
     in_translator_comments = False
 
+    encoding = parse_encoding(fileobj) or options.get('encoding', 'ascii')
+
     tokens = generate_tokens(fileobj.readline)
     for tok, value, (lineno, _), _, _ in tokens:
         if funcname and tok == OP and value == '(':
             in_args = True
         elif tok == COMMENT:
             # Strip the comment token from the line
-            value = value[1:].strip()
+            value = value.decode(encoding)[1:].strip()
             if in_translator_comments and \
                     translator_comments[-1][0] == lineno - 1:
                 # We're already inside a translator comment, continue appending
@@ -300,8 +303,14 @@
                 messages = []
                 translator_comments = []
             elif tok == STRING:
-                # Unwrap quotes in a safe manner
-                buf.append(eval(value, {'__builtins__':{}}, {}))
+                # Unwrap quotes in a safe manner, maintaining the string's
+                # encoding
+                # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+                value = eval('# coding=%s\n%s' % (encoding, value),
+                             {'__builtins__':{}}, {})
+                if isinstance(value, str):
+                    value = value.decode(encoding)
+                buf.append(value)
             elif tok == OP and value == ',':
                 messages.append(''.join(buf))
                 del buf[:]
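
A minimal usage sketch of the changed extract_python() code path above; the sample source string, keyword tuple, and comment tag are illustrative only, not part of the changeset:

    from StringIO import StringIO
    from babel.messages.extract import extract_python

    source = ("# -*- coding: utf-8 -*-\n"
              "# NOTE: a comment for translators\n"
              "msg = _('Bonjour \xc3\xa0 tous')\n")
    for lineno, funcname, message, comments in extract_python(
            StringIO(source), ('_',), ['NOTE:'], {}):
        # With the magic comment respected, both the message and the
        # translator comments are now unicode objects.
        print lineno, funcname, repr(message), comments

As the new tests below demonstrate, a byte-string literal and a u''-literal containing the same text now extract to equal unicode messages.
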
--- a/babel/messages/tests/extract.py
+++ b/babel/messages/tests/extract.py
@@ -11,6 +11,7 @@
 # individuals. For the exact contribution history, see the revision
 # history and logs, available at http://babel.edgewall.org/log/.
 
+import codecs
 import doctest
 from StringIO import StringIO
 import unittest
@@ -23,7 +24,7 @@
     def test_unicode_string_arg(self):
         buf = StringIO("msg = _(u'Foo Bar')")
         messages = list(extract.extract_python(buf, ('_',), [], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
+        self.assertEqual(u'Foo Bar', messages[0][2])
 
     def test_comment_tag(self):
         buf = StringIO("""
@@ -31,8 +32,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment'], messages[0][3])
 
     def test_comment_tag_multiline(self):
         buf = StringIO("""
@@ -41,8 +42,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment', 'with a second line'],
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment', u'with a second line'],
                          messages[0][3])
         
     def test_translator_comments_with_previous_non_translator_comments(self):
@@ -54,8 +55,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['A translation comment', 'with a second line'],
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'A translation comment', u'with a second line'],
                          messages[0][3])
 
     def test_comment_tags_not_on_start_of_comment(self):
@@ -67,8 +68,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['This one will be'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'This one will be'], messages[0][3])
 
     def test_multiple_comment_tags(self):
         buf = StringIO("""
@@ -81,11 +82,11 @@
 """)
         messages = list(extract.extract_python(buf, ('_',),
                                                ['NOTE1:', 'NOTE2:'], {}))
-        self.assertEqual('Foo Bar1', messages[0][2])
-        self.assertEqual(['A translation comment for tag1',
-                          'with a second line'], messages[0][3])
-        self.assertEqual('Foo Bar2', messages[1][2])
-        self.assertEqual(['A translation comment for tag2'], messages[1][3])
+        self.assertEqual(u'Foo Bar1', messages[0][2])
+        self.assertEqual([u'A translation comment for tag1',
+                          u'with a second line'], messages[0][3])
+        self.assertEqual(u'Foo Bar2', messages[1][2])
+        self.assertEqual([u'A translation comment for tag2'], messages[1][3])
 
     def test_two_succeeding_comments(self):
         buf = StringIO("""
@@ -94,8 +95,8 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
-        self.assertEqual(['one', 'NOTE: two'], messages[0][3])
+        self.assertEqual(u'Foo Bar', messages[0][2])
+        self.assertEqual([u'one', u'NOTE: two'], messages[0][3])
         
     def test_invalid_translator_comments(self):
         buf = StringIO("""
@@ -105,7 +106,7 @@
 msg = _(u'Foo Bar')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Foo Bar', messages[0][2])
+        self.assertEqual(u'Foo Bar', messages[0][2])
         self.assertEqual([], messages[0][3])
 
     def test_invalid_translator_comments2(self):
@@ -120,9 +121,9 @@
 hello = _('Hello')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Hi there!', messages[0][2])
-        self.assertEqual(['Hi!'], messages[0][3])
-        self.assertEqual('Hello', messages[1][2])
+        self.assertEqual(u'Hi there!', messages[0][2])
+        self.assertEqual([u'Hi!'], messages[0][3])
+        self.assertEqual(u'Hello', messages[1][2])
         self.assertEqual([], messages[1][3])
 
     def test_invalid_translator_comments3(self):
@@ -133,9 +134,46 @@
 hithere = _('Hi there!')
 """)
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
-        self.assertEqual('Hi there!', messages[0][2])
+        self.assertEqual(u'Hi there!', messages[0][2])
         self.assertEqual([], messages[0][3])
 
+    def test_utf8_message(self):
+        buf = StringIO("""
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'],
+                                               {'encoding': 'utf-8'}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_message_with_magic_comment(self):
+        buf = StringIO("""# -*- coding: utf-8 -*-
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_message_with_utf8_bom(self):
+        buf = StringIO(codecs.BOM_UTF8 + """
+# NOTE: hello
+msg = _('Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual([u'hello'], messages[0][3])
+
+    def test_utf8_raw_strings_match_unicode_strings(self):
+        buf = StringIO(codecs.BOM_UTF8 + """
+msg = _('Bonjour à tous')
+msgu = _(u'Bonjour à tous')
+""")
+        messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
+        self.assertEqual(u'Bonjour à tous', messages[0][2])
+        self.assertEqual(messages[0][2], messages[1][2])
+
 def suite():
     suite = unittest.TestSuite()
     suite.addTest(doctest.DocTestSuite(extract))
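
The new encoding tests above can be run on their own through the module's suite() helper; a minimal sketch (the verbosity level is arbitrary):

    import unittest
    from babel.messages.tests import extract as extract_tests

    unittest.TextTestRunner(verbosity=2).run(extract_tests.suite())
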
--- a/babel/util.py
+++ b/babel/util.py
@@ -13,14 +13,65 @@
 
 """Various utility classes and functions."""
 
+import codecs
 from datetime import timedelta, tzinfo
 import os
+import parser
 import re
 import time
 
 __all__ = ['pathmatch', 'relpath', 'UTC', 'LOCALTZ']
 __docformat__ = 'restructuredtext en'
 
+# Regexp to match python magic encoding line
+PYTHON_MAGIC_COMMENT_re = re.compile(
+    r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', re.VERBOSE)
+def parse_encoding(fp):
+    """Deduce the encoding of a source file from magic comment.
+
+    It does this in the same way as the `Python interpreter`__
+
+    .. __: http://docs.python.org/ref/encodings.html
+
+    The ``fp`` argument should be a seekable file object.
+
+    (From Jeff Dairiki)
+    """
+    pos = fp.tell()
+    fp.seek(0)
+    try:
+        line1 = fp.readline()
+        has_bom = line1.startswith(codecs.BOM_UTF8)
+        if has_bom:
+            line1 = line1[len(codecs.BOM_UTF8):]
+
+        m = PYTHON_MAGIC_COMMENT_re.match(line1)
+        if not m:
+            try:
+                parser.suite(line1)
+            except SyntaxError:
+                # Either it's a real syntax error, in which case the source is
+                # not valid python source, or line2 is a continuation of line1,
+                # in which case we don't want to scan line2 for a magic
+                # comment.
+                pass
+            else:
+                line2 = fp.readline()
+                m = PYTHON_MAGIC_COMMENT_re.match(line2)
+
+        if has_bom:
+            if m:
+                raise SyntaxError(
+                    "python refuses to compile code with both a UTF8 "
+                    "byte-order-mark and a magic encoding comment")
+            return 'utf_8'
+        elif m:
+            return m.group(1)
+        else:
+            return None
+    finally:
+        fp.seek(pos)
+
 def pathmatch(pattern, filename):
     """Extended pathname pattern matching.
     
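
The new parse_encoding() helper from the hunk above can also be exercised on its own; a minimal sketch (the sample sources are illustrative only):

    from StringIO import StringIO
    from babel.util import parse_encoding

    # Magic comment on the first line: its encoding name is returned.
    print parse_encoding(StringIO("# -*- coding: latin-1 -*-\nx = 1\n"))  # latin-1
    # No declaration at all: None, letting callers fall back to a default.
    print parse_encoding(StringIO("x = 1\n"))                             # None

Because the function seeks back to the original file position before returning, it can be called on the same file object that is subsequently passed to generate_tokens(), as extract_python() does above.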