# HG changeset patch
# User pjenvey
# Date 1184718544 0
# Node ID 9d0a19b4518b7d16837bfa9d599cb9603f536cee
# Parent  49b089453f81534fd39287df26408cdaa85db77d
o extract_python fixes:
 - now returns None for non-string arguments
 - no longer extracts strings from nested function calls
 refs #38
 - use the correct starting line number in multi-line gettext function
 calls
 - avoid falsely extracting string keyword argument defaults from
 function definitions whose name matches a keyword, e.g.:
 def gettext(foo='bar')
 - avoid capturing translator comments embedded within a gettext
   function call
 - default the file encoding to iso-8859-1 instead of ascii when
 missing a magic encoding comment, to emulate pre-Python 2.5
 behavior. Python warns about 'non-ascii' chars when there is no magic
 encoding comment, but < 2.5 actually treats them as iso-8859-1 for
 backwards compat (PEP 263), whereas >= 2.5 treats them as strict ascii
o extract fixes:
 - filter out messages that don't contain strings where the keyword
 specification calls for them
 fixes #39
 - filter out empty string messages and emit a warning about them,
  like xgettext

diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -12,6 +12,13 @@
  * The number formatting functions now also work with numbers represented by
    Python `Decimal` objects (ticket #53).
  * Added extensible infrastructure for validating translation catalogs.
+ * Fixed the extractor not filtering out messages that didn't validate against
+   the keyword's specification (ticket #39).
+ * Fixed the extractor raising an exception when encountering an empty string
+   msgid. It now emits a warning to stderr.
+ * Numerous Python message extractor fixes: it now handles nested function
+   calls within a gettext function call correctly, uses the correct line number
+   for multi-line function calls, and other small fixes (tickets #38 and #39).
 
 
 Version 0.8.1
diff --git a/babel/messages/extract.py b/babel/messages/extract.py
--- a/babel/messages/extract.py
+++ b/babel/messages/extract.py
@@ -49,6 +49,10 @@
 
 DEFAULT_MAPPING = [('**.py', 'python')]
 
+empty_msgid_warning = (
+'%s: warning: Empty msgid.  It is reserved by GNU gettext: gettext("") '
+'returns the header entry with meta information, not the empty string.')
+
 def extract_from_dir(dirname=os.getcwd(), method_map=DEFAULT_MAPPING,
                      options_map=None, keywords=DEFAULT_KEYWORDS,
                      comment_tags=(), callback=None):
@@ -218,13 +222,33 @@
         results = func(fileobj, keywords.keys(), comment_tags,
                        options=options or {})
         for lineno, funcname, messages, comments in results:
-            if isinstance(messages, (list, tuple)):
-                msgs = []
-                for index in keywords[funcname] or (1,):
-                    msgs.append(messages[index - 1])
-                messages = tuple(msgs)
-                if len(messages) == 1:
-                    messages = messages[0]
+            spec = keywords[funcname] or (1,)
+            if not isinstance(messages, (list, tuple)):
+                messages = [messages]
+
+            msgs = []
+            # Validate the messages against the keyword's specification
+            invalid = False
+            for index in spec:
+                message = messages[index - 1]
+                if message is None:
+                    invalid = True
+                    break
+                msgs.append(message)
+            if invalid:
+                continue
+
+            first_msg_index = spec[0] - 1
+            if not messages[first_msg_index]:
+                # An empty string msgid isn't valid, emit a warning
+                where = '%s:%i' % (hasattr(fileobj, 'name') and \
+                                       fileobj.name or '(unknown)', lineno)
+                print >> sys.stderr, empty_msgid_warning % where
+                continue
+
+            messages = tuple(msgs)
+            if len(messages) == 1:
+                messages = messages[0]
             yield lineno, messages, comments
         return
 
@@ -249,21 +273,29 @@
     :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
     :rtype: ``iterator``
     """
-    funcname = None
-    lineno = None
+    funcname = lineno = message_lineno = None
+    call_stack = -1
     buf = []
     messages = []
     translator_comments = []
-    in_args = False
-    in_translator_comments = False
+    in_def = in_translator_comments = False
 
-    encoding = parse_encoding(fileobj) or options.get('encoding', 'ascii')
+    encoding = parse_encoding(fileobj) or options.get('encoding', 'iso-8859-1')
 
     tokens = generate_tokens(fileobj.readline)
     for tok, value, (lineno, _), _, _ in tokens:
-        if funcname and tok == OP and value == '(':
-            in_args = True
-        elif tok == COMMENT:
+        if call_stack == -1 and tok == NAME and value in ('def', 'class'):
+            in_def = True
+        elif tok == OP and value == '(':
+            if in_def:
+                # Avoid false positives for declarations such as:
+                # def gettext(arg='message'):
+                in_def = False
+                continue
+            if funcname:
+                message_lineno = lineno
+                call_stack += 1
+        elif call_stack == -1 and tok == COMMENT:
             # Strip the comment token from the line
             value = value.decode(encoding)[1:].strip()
             if in_translator_comments and \
@@ -281,41 +313,51 @@
                     comment = value[len(comment_tag):].strip()
                     translator_comments.append((lineno, comment))
                     break
-        elif funcname and in_args:
+        elif funcname and call_stack == 0:
             if tok == OP and value == ')':
-                in_args = in_translator_comments = False
                 if buf:
                     messages.append(''.join(buf))
                     del buf[:]
-                if filter(None, messages):
-                    if len(messages) > 1:
-                        messages = tuple(messages)
-                    else:
-                        messages = messages[0]
-                    # Comments don't apply unless they immediately preceed the
-                    # message
-                    if translator_comments and \
-                            translator_comments[-1][0] < lineno - 1:
-                        translator_comments = []
+                else:
+                    messages.append(None)
 
-                    yield (lineno, funcname, messages,
-                           [comment[1] for comment in translator_comments])
-                funcname = lineno = None
+                if len(messages) > 1:
+                    messages = tuple(messages)
+                else:
+                    messages = messages[0]
+                # Comments don't apply unless they immediately precede the
+                # message
+                if translator_comments and \
+                        translator_comments[-1][0] < message_lineno - 1:
+                    translator_comments = []
+
+                yield (message_lineno, funcname, messages,
+                       [comment[1] for comment in translator_comments])
+
+                funcname = lineno = message_lineno = None
+                call_stack = -1
                 messages = []
                 translator_comments = []
+                in_translator_comments = False
             elif tok == STRING:
                 # Unwrap quotes in a safe manner, maintaining the string's
                 # encoding
-                # https://sourceforge.net/tracker/?func=detail&atid=355470&aid=617979&group_id=5470
+                # https://sourceforge.net/tracker/?func=detail&atid=355470&
+                # aid=617979&group_id=5470
                 value = eval('# coding=%s\n%s' % (encoding, value),
                              {'__builtins__':{}}, {})
                 if isinstance(value, str):
                     value = value.decode(encoding)
                 buf.append(value)
             elif tok == OP and value == ',':
-                messages.append(''.join(buf))
-                del buf[:]
-        elif funcname:
+                if buf:
+                    messages.append(''.join(buf))
+                    del buf[:]
+                else:
+                    messages.append(None)
+        elif call_stack > 0 and tok == OP and value == ')':
+            call_stack -= 1
+        elif funcname and call_stack == -1:
             funcname = None
         elif tok == NAME and value in keywords:
             funcname = value
diff --git a/babel/messages/tests/extract.py b/babel/messages/tests/extract.py
--- a/babel/messages/tests/extract.py
+++ b/babel/messages/tests/extract.py
@@ -14,6 +14,7 @@
 import codecs
 import doctest
 from StringIO import StringIO
+import sys
 import unittest
 
 from babel.messages import extract
@@ -21,6 +22,105 @@
 
 class ExtractPythonTestCase(unittest.TestCase):
 
+    def test_nested_calls(self):
+        buf = StringIO("""\
+msg1 = _(i18n_arg.replace(r'\"', '"'))
+msg2 = ungettext(i18n_arg.replace(r'\"', '"'), multi_arg.replace(r'\"', '"'), 2)
+msg3 = ungettext("Babel", multi_arg.replace(r'\"', '"'), 2)
+msg4 = ungettext(i18n_arg.replace(r'\"', '"'), "Babels", 2)
+msg5 = ungettext('bunny', 'bunnies', random.randint(1, 2))
+msg6 = ungettext(arg0, 'bunnies', random.randint(1, 2))
+msg7 = _(hello.there)
+msg8 = gettext('Rabbit')
+msg9 = dgettext('wiki', model.addPage())
+msg10 = dngettext(getDomain(), 'Page', 'Pages', 3)
+""")
+        messages = list(extract.extract_python(buf,
+                                               extract.DEFAULT_KEYWORDS.keys(),
+                                               [], {}))
+        self.assertEqual([
+                (1, '_', None, []),
+                (2, 'ungettext', (None, None, None), []),
+                (3, 'ungettext', (u'Babel', None, None), []),
+                (4, 'ungettext', (None, u'Babels', None), []),
+                (5, 'ungettext', (u'bunny', u'bunnies', None), []),
+                (6, 'ungettext', (None, u'bunnies', None), []),
+                (7, '_', None, []),
+                (8, 'gettext', u'Rabbit', []),
+                (9, 'dgettext', (u'wiki', None), []),
+                (10, 'dngettext', (None, u'Page', u'Pages', None), [])],
+                         messages)
+
+    def test_nested_comments(self):
+        buf = StringIO("""\
+msg = ngettext('pylon',  # TRANSLATORS: shouldn't be
+               'pylons', # TRANSLATORS: seeing this
+               count)
+""")
+        messages = list(extract.extract_python(buf, ('ngettext',),
+                                               ['TRANSLATORS:'], {}))
+        self.assertEqual([(1, 'ngettext', (u'pylon', u'pylons', None), [])],
+                         messages)
+
+    def test_declarations(self):
+        buf = StringIO("""\
+class gettext(object):
+    pass
+def render_body(context,x,y=_('Page arg 1'),z=_('Page arg 2'),**pageargs):
+    pass
+def ngettext(y='arg 1',z='arg 2',**pageargs):
+    pass
+""")
+        messages = list(extract.extract_python(buf,
+                                               extract.DEFAULT_KEYWORDS.keys(),
+                                               [], {}))
+        self.assertEqual([(3, '_', u'Page arg 1', []),
+                          (3, '_', u'Page arg 2', [])],
+                         messages)
+
+    def test_multiline(self):
+        buf = StringIO("""\
+msg1 = ngettext('pylon',
+                'pylons', count)
+msg2 = ngettext('elvis',
+                'elvises',
+                 count)
+""")
+        messages = list(extract.extract_python(buf, ('ngettext',), [], {}))
+        self.assertEqual([(1, 'ngettext', (u'pylon', u'pylons', None), []),
+                          (3, 'ngettext', (u'elvis', u'elvises', None), [])],
+                         messages)
+
+    def test_triple_quoted_strings(self):
+        buf = StringIO("""\
+msg1 = _('''pylons''')
+msg2 = ngettext(r'''elvis''', \"\"\"elvises\"\"\", count)
+msg2 = ngettext(\"\"\"elvis\"\"\", 'elvises', count)
+""")
+        messages = list(extract.extract_python(buf,
+                                               extract.DEFAULT_KEYWORDS.keys(),
+                                               [], {}))
+        self.assertEqual([(1, '_', (u'pylons'), []),
+                          (2, 'ngettext', (u'elvis', u'elvises', None), []),
+                          (3, 'ngettext', (u'elvis', u'elvises', None), [])],
+                         messages)
+
+    def test_multiline_strings(self):
+        buf = StringIO("""\
+_('''This module provides internationalization and localization
+support for your Python programs by providing an interface to the GNU
+gettext message catalog library.''')
+""")
+        messages = list(extract.extract_python(buf,
+                                               extract.DEFAULT_KEYWORDS.keys(),
+                                               [], {}))
+        self.assertEqual(
+            [(1, '_',
+              u'This module provides internationalization and localization\n'
+              'support for your Python programs by providing an interface to '
+              'the GNU\ngettext message catalog library.', [])],
+            messages)
+
     def test_unicode_string_arg(self):
         buf = StringIO("msg = _(u'Foo Bar')")
         messages = list(extract.extract_python(buf, ('_',), [], {}))
@@ -45,7 +145,7 @@
         self.assertEqual(u'Foo Bar', messages[0][2])
         self.assertEqual([u'A translation comment', u'with a second line'],
                          messages[0][3])
-        
+
     def test_translator_comments_with_previous_non_translator_comments(self):
         buf = StringIO("""
 # This shouldn't be in the output
@@ -97,7 +197,7 @@
         messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
         self.assertEqual(u'Foo Bar', messages[0][2])
         self.assertEqual([u'one', u'NOTE: two'], messages[0][3])
-        
+
     def test_invalid_translator_comments(self):
         buf = StringIO("""
 # NOTE: this shouldn't apply to any messages
@@ -174,10 +274,48 @@
         self.assertEqual(u'Bonjour à tous', messages[0][2])
         self.assertEqual(messages[0][2], messages[1][2])
 
+class ExtractTestCase(unittest.TestCase):
+
+    def test_invalid_filter(self):
+        buf = StringIO("""\
+msg1 = _(i18n_arg.replace(r'\"', '"'))
+msg2 = ungettext(i18n_arg.replace(r'\"', '"'), multi_arg.replace(r'\"', '"'), 2)
+msg3 = ungettext("Babel", multi_arg.replace(r'\"', '"'), 2)
+msg4 = ungettext(i18n_arg.replace(r'\"', '"'), "Babels", 2)
+msg5 = ungettext('bunny', 'bunnies', random.randint(1, 2))
+msg6 = ungettext(arg0, 'bunnies', random.randint(1, 2))
+msg7 = _(hello.there)
+msg8 = gettext('Rabbit')
+msg9 = dgettext('wiki', model.addPage())
+msg10 = dngettext(domain, 'Page', 'Pages', 3)
+""")
+        messages = \
+            list(extract.extract('python', buf, extract.DEFAULT_KEYWORDS, [],
+                                 {}))
+        self.assertEqual([(5, (u'bunny', u'bunnies'), []),
+                          (8, u'Rabbit', []),
+                          (10, (u'Page', u'Pages'), [])], messages)
+
+    def test_empty_string_msgid(self):
+        buf = StringIO("""\
+msg = _('')
+""")
+        stderr = sys.stderr
+        sys.stderr = StringIO()
+        try:
+            messages = \
+                list(extract.extract('python', buf, extract.DEFAULT_KEYWORDS,
+                                     [], {}))
+            self.assertEqual([], messages)
+            assert 'warning: Empty msgid.' in sys.stderr.getvalue()
+        finally:
+            sys.stderr = stderr
+
 def suite():
     suite = unittest.TestSuite()
     suite.addTest(doctest.DocTestSuite(extract))
     suite.addTest(unittest.makeSuite(ExtractPythonTestCase))
+    suite.addTest(unittest.makeSuite(ExtractTestCase))
     return suite
 
 if __name__ == '__main__':