Mercurial > babel > mirror

diff babel/messages/extract.py @ 164:e1199c0fb3bf trunk
Made the Python extractor detect source file encodings from the magic encoding comment (or default to ASCII) and convert message strings and comments to unicode. Fixes #23.
author: pjenvey
date: Fri, 22 Jun 2007 00:38:54 +0000
parents: 32be08ab2440
children: 31beb381d62f
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -29,7 +29,7 @@
 import sys
 from tokenize import generate_tokens, COMMENT, NAME, OP, STRING
 
-from babel.util import pathmatch, relpath
+from babel.util import parse_encoding, pathmatch, relpath
 
 __all__ = ['extract', 'extract_from_dir', 'extract_from_file']
 __docformat__ = 'restructuredtext en'
@@ -195,7 +195,7 @@
     >>> from StringIO import StringIO
     >>> for message in extract('python', StringIO(source)):
     ...     print message
-    (3, 'Hello, world!', [])
+    (3, u'Hello, world!', [])
     
     :param method: a string specifying the extraction method (.e.g. "python")
     :param fileobj: the file-like object the messages should be extracted from
@@ -238,7 +238,8 @@
 def extract_python(fileobj, keywords, comment_tags, options):
     """Extract messages from Python source code.
     
-    :param fileobj: the file-like object the messages should be extracted from
+    :param fileobj: the seekable, file-like object the messages should be
+                    extracted from
     :param keywords: a list of keywords (i.e. function names) that should be
                      recognized as translation functions
     :param comment_tags: a list of translator tags to search for and include
@@ -255,13 +256,15 @@
     in_args = False
     in_translator_comments = False
 
+    encoding = parse_encoding(fileobj) or options.get('encoding', 'ascii')
+
     tokens = generate_tokens(fileobj.readline)
     for tok, value, (lineno, _), _, _ in tokens:
         if funcname and tok == OP and value == '(':
             in_args = True
         elif tok == COMMENT:
             # Strip the comment token from the line
-            value = value[1:].strip()
+            value = value.decode(encoding)[1:].strip()
             if in_translator_comments and \
                     translator_comments[-1][0] == lineno - 1:
                 # We're already inside a translator comment, continue appending
@@ -300,8 +303,14 @@
                 messages = []
                 translator_comments = []
             elif tok == STRING:
-                # Unwrap quotes in a safe manner
-                buf.append(eval(value, {'__builtins__':{}}, {}))
+                # Unwrap quotes in a safe manner, maintaining the string's
+                # encoding
+                # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+                value = eval('# coding=%s\n%s' % (encoding, value),
+                             {'__builtins__':{}}, {})
+                if isinstance(value, str):
+                    value = value.decode(encoding)
+                buf.append(value)
             elif tok == OP and value == ',':
                 messages.append(''.join(buf))
                 del buf[:]
author	pjenvey
date	Fri, 22 Jun 2007 00:38:54 +0000
parents	32be08ab2440
children	31beb381d62f