diff genshi/filters/i18n.py @ 820:1837f39efd6f experimental-inline

Sync (old) experimental inline branch with trunk@1027.
author cmlenz
date Wed, 11 Mar 2009 17:51:06 +0000
parents 0742f421caba
children 09cc3627654c
line wrap: on
line diff
--- a/genshi/filters/i18n.py
+++ b/genshi/filters/i18n.py
@@ -1,24 +1,74 @@
-"""Utilities for internationalization and localization of templates."""
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2007 Edgewall Software
+# All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://genshi.edgewall.org/wiki/License.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For the exact contribution history, see the revision
+# history and logs, available at http://genshi.edgewall.org/log/.
 
-try:
-    frozenset
-except NameError:
-    from sets import ImmutableSet as frozenset
-from gettext import gettext
-from opcode import opmap
+"""Utilities for internationalization and localization of templates.
+
+:since: version 0.4
+"""
+
+from gettext import NullTranslations
 import re
+from types import FunctionType
 
-from genshi.core import Attrs, Namespace, QName, START, END, TEXT, _ensure
-from genshi.template.base import Template, EXPR, SUB
-from genshi.template.markup import EXEC
+from genshi.core import Attrs, Namespace, QName, START, END, TEXT, START_NS, \
+                        END_NS, XML_NAMESPACE, _ensure
+from genshi.template.eval import _ast
+from genshi.template.base import DirectiveFactory, EXPR, SUB, _apply_directives
+from genshi.template.directives import Directive
+from genshi.template.markup import MarkupTemplate, EXEC
 
-_LOAD_NAME = chr(opmap['LOAD_NAME'])
-_LOAD_CONST = chr(opmap['LOAD_CONST'])
-_CALL_FUNCTION = chr(opmap['CALL_FUNCTION'])
-_BINARY_ADD = chr(opmap['BINARY_ADD'])
+__all__ = ['Translator', 'extract']
+__docformat__ = 'restructuredtext en'
+
+I18N_NAMESPACE = Namespace('http://genshi.edgewall.org/i18n')
 
 
-class Translator(object):
+class CommentDirective(Directive):
+
+    __slots__ = []
+
+    @classmethod
+    def attach(cls, template, stream, value, namespaces, pos):
+        return None, stream
+
+
+class MsgDirective(Directive):
+
+    __slots__ = ['params']
+
+    def __init__(self, value, template, hints=None, namespaces=None,
+                 lineno=-1, offset=-1):
+        Directive.__init__(self, None, template, namespaces, lineno, offset)
+        self.params = [name.strip() for name in value.split(',')]
+
+    def __call__(self, stream, directives, ctxt, **vars):
+        msgbuf = MessageBuffer(self.params)
+
+        stream = iter(stream)
+        yield stream.next() # the outer start tag
+        previous = stream.next()
+        for event in stream:
+            msgbuf.append(*previous)
+            previous = event
+
+        gettext = ctxt.get('_i18n.gettext')
+        for event in msgbuf.translate(gettext(msgbuf.format())):
+            yield event
+
+        yield previous # the outer end tag
+
+
+class Translator(DirectiveFactory):
     """Can extract and translate localizable strings from markup streams and
     templates.
     
@@ -65,27 +115,45 @@
         <p>Hallo, Hans</p>
       </body>
     </html>
+
+    Note that elements defining ``xml:lang`` attributes that do not contain
+    variable expressions are ignored by this filter. That can be used to
+    exclude specific parts of a template from being extracted and translated.
     """
 
+    directives = [
+        ('comment', CommentDirective),
+        ('msg', MsgDirective)
+    ]
+
     IGNORE_TAGS = frozenset([
         QName('script'), QName('http://www.w3.org/1999/xhtml}script'),
         QName('style'), QName('http://www.w3.org/1999/xhtml}style')
     ])
     INCLUDE_ATTRS = frozenset(['abbr', 'alt', 'label', 'prompt', 'standby',
                                'summary', 'title'])
+    NAMESPACE = I18N_NAMESPACE
 
-    def __init__(self, translate=gettext, ignore_tags=IGNORE_TAGS,
-                 include_attrs=INCLUDE_ATTRS):
+    def __init__(self, translate=NullTranslations(), ignore_tags=IGNORE_TAGS,
+                 include_attrs=INCLUDE_ATTRS, extract_text=True):
         """Initialize the translator.
         
         :param translate: the translation function, for example ``gettext`` or
                           ``ugettext``.
         :param ignore_tags: a set of tag names that should not be localized
         :param include_attrs: a set of attribute names should be localized
+        :param extract_text: whether the content of text nodes should be
+                             extracted, or only text in explicit ``gettext``
+                             function calls
+
+        :note: Changed in 0.6: the `translate` parameter can now be either
+               a ``gettext``-style function, or an object compatible with the
+               ``NullTranslations`` or ``GNUTranslations`` interface
         """
         self.translate = translate
         self.ignore_tags = ignore_tags
         self.include_attrs = include_attrs
+        self.extract_text = extract_text
 
     def __call__(self, stream, ctxt=None, search_text=True):
         """Translate any localizable strings in the given stream.
@@ -104,27 +172,36 @@
         """
         ignore_tags = self.ignore_tags
         include_attrs = self.include_attrs
-        translate = self.translate
         skip = 0
+        xml_lang = XML_NAMESPACE['lang']
+
+        if type(self.translate) is FunctionType:
+            gettext = self.translate
+        else:
+            gettext = self.translate.ugettext
+        if ctxt:
+            ctxt['_i18n.gettext'] = gettext
+
+        extract_text = self.extract_text
+        if not extract_text:
+            search_text = False
 
         for kind, data, pos in stream:
 
             # skip chunks that should not be localized
             if skip:
                 if kind is START:
-                    tag, attrs = data
-                    if tag in ignore_tags:
-                        skip += 1
+                    skip += 1
                 elif kind is END:
-                    if tag in ignore_tags:
-                        skip -= 1
+                    skip -= 1
                 yield kind, data, pos
                 continue
 
             # handle different events that can be localized
             if kind is START:
                 tag, attrs = data
-                if tag in ignore_tags:
+                if tag in self.ignore_tags or \
+                        isinstance(attrs.get(xml_lang), basestring):
                     skip += 1
                     yield kind, data, pos
                     continue
@@ -133,32 +210,37 @@
                 changed = False
                 for name, value in attrs:
                     newval = value
-                    if isinstance(value, basestring):
+                    if extract_text and isinstance(value, basestring):
                         if name in include_attrs:
-                            newval = self.translate(value)
+                            newval = gettext(value)
                     else:
                         newval = list(self(_ensure(value), ctxt,
-                            search_text=name in include_attrs)
+                            search_text=False)
                         )
                     if newval != value:
                         value = newval
                         changed = True
                     new_attrs.append((name, value))
                 if changed:
-                    attrs = new_attrs
+                    attrs = Attrs(new_attrs)
 
                 yield kind, (tag, attrs), pos
 
             elif search_text and kind is TEXT:
                 text = data.strip()
                 if text:
-                    data = data.replace(text, translate(text))
+                    data = data.replace(text, unicode(gettext(text)))
                 yield kind, data, pos
 
             elif kind is SUB:
-                subkind, substream = data
-                new_substream = list(self(substream, ctxt))
-                yield kind, (subkind, new_substream), pos
+                directives, substream = data
+                # If this is an i18n:msg directive, no need to translate text
+                # nodes here
+                is_msg = filter(None, [isinstance(d, MsgDirective)
+                                       for d in directives])
+                substream = list(self(substream, ctxt,
+                                      search_text=not is_msg))
+                yield kind, (directives, substream), pos
 
             else:
                 yield kind, data, pos
@@ -167,17 +249,20 @@
                          'ugettext', 'ungettext')
 
     def extract(self, stream, gettext_functions=GETTEXT_FUNCTIONS,
-                search_text=True):
+                search_text=True, msgbuf=None):
         """Extract localizable strings from the given template stream.
         
         For every string found, this function yields a ``(lineno, function,
-        message)`` tuple, where:
+        message, comments)`` tuple, where:
         
         * ``lineno`` is the number of the line on which the string was found,
         * ``function`` is the name of the ``gettext`` function used (if the
           string was extracted from embedded Python code), and
         *  ``message`` is the string itself (a ``unicode`` object, or a tuple
-           of ``unicode`` objects for functions with multiple string arguments).
+           of ``unicode`` objects for functions with multiple string
+           arguments).
+        *  ``comments`` is a list of comments related to the message, extracted
+           from ``i18n:comment`` attributes found in the markup
         
         >>> from genshi.template import MarkupTemplate
         >>> 
@@ -192,12 +277,12 @@
         ...   </body>
         ... </html>''', filename='example.html')
         >>> 
-        >>> for lineno, funcname, message in Translator().extract(tmpl.stream):
-        ...    print "%d, %r, %r" % (lineno, funcname, message)
+        >>> for line, func, msg, comments in Translator().extract(tmpl.stream):
+        ...    print "%d, %r, %r" % (line, func, msg)
         3, None, u'Example'
         6, None, u'Example'
         7, '_', u'Hello, %(name)s'
-        8, 'ngettext', (u'You have %d item', u'You have %d items')
+        8, 'ngettext', (u'You have %d item', u'You have %d items', None)
         
         :param stream: the event stream to extract strings from; can be a
                        regular stream or a template stream
@@ -210,77 +295,302 @@
         :note: Changed in 0.4.1: For a function with multiple string arguments
                (such as ``ngettext``), a single item with a tuple of strings is
                yielded, instead an item for each string argument.
+        :note: Changed in 0.6: The returned tuples now include a 4th element,
+               which is a list of comments for the translator
         """
-        tagname = None
+        if not self.extract_text:
+            search_text = False
         skip = 0
+        i18n_comment = I18N_NAMESPACE['comment']
+        i18n_msg = I18N_NAMESPACE['msg']
+        xml_lang = XML_NAMESPACE['lang']
 
         for kind, data, pos in stream:
+
             if skip:
                 if kind is START:
-                    tag, attrs = data
-                    if tag in self.ignore_tags:
-                        skip += 1
+                    skip += 1
                 if kind is END:
-                    tag = data
-                    if tag in self.ignore_tags:
-                        skip -= 1
-                continue
+                    skip -= 1
 
-            if kind is START:
+            if kind is START and not skip:
                 tag, attrs = data
-                if tag in self.ignore_tags:
+
+                if tag in self.ignore_tags or \
+                        isinstance(attrs.get(xml_lang), basestring):
                     skip += 1
                     continue
 
                 for name, value in attrs:
-                    if isinstance(value, basestring):
+                    if search_text and isinstance(value, basestring):
                         if name in self.include_attrs:
                             text = value.strip()
                             if text:
-                                yield pos[1], None, text
+                                yield pos[1], None, text, []
                     else:
-                        for lineno, funcname, text in self.extract(
+                        for lineno, funcname, text, comments in self.extract(
                                 _ensure(value), gettext_functions,
-                                search_text=name in self.include_attrs):
-                            yield lineno, funcname, text
+                                search_text=False):
+                            yield lineno, funcname, text, comments
 
-            elif search_text and kind is TEXT:
-                text = data.strip()
-                if text and filter(None, [ch.isalpha() for ch in text]):
-                    yield pos[1], None, text
+                if msgbuf:
+                    msgbuf.append(kind, data, pos)
+                else:
+                    msg_params = attrs.get(i18n_msg)
+                    if msg_params is not None:
+                        if type(msg_params) is list: # event tuple
+                            msg_params = msg_params[0][1]
+                        msgbuf = MessageBuffer(
+                            msg_params, attrs.get(i18n_comment), pos[1]
+                        )
+
+            elif not skip and search_text and kind is TEXT:
+                if not msgbuf:
+                    text = data.strip()
+                    if text and filter(None, [ch.isalpha() for ch in text]):
+                        yield pos[1], None, text, []
+                else:
+                    msgbuf.append(kind, data, pos)
+
+            elif not skip and msgbuf and kind is END:
+                msgbuf.append(kind, data, pos)
+                if not msgbuf.depth:
+                    yield msgbuf.lineno, None, msgbuf.format(), \
+                          filter(None, [msgbuf.comment])
+                    msgbuf = None
 
             elif kind is EXPR or kind is EXEC:
-                consts = dict([(n, chr(i) + '\x00') for i, n in
-                               enumerate(data.code.co_consts)])
-                gettext_locs = [consts[n] for n in gettext_functions
-                                if n in consts]
-                ops = [
-                    _LOAD_CONST, '(', '|'.join(gettext_locs), ')',
-                    _CALL_FUNCTION, '.\x00',
-                    '((?:', _BINARY_ADD, '|', _LOAD_CONST, '.\x00)+)'
-                ]
-                for loc, opcodes in re.findall(''.join(ops), data.code.co_code):
-                    funcname = data.code.co_consts[ord(loc[0])]
-                    strings = []
-                    opcodes = iter(opcodes)
-                    for opcode in opcodes:
-                        if opcode == _BINARY_ADD:
-                            arg = strings.pop()
-                            strings[-1] += arg
-                        else:
-                            arg = data.code.co_consts[ord(opcodes.next())]
-                            opcodes.next() # skip second byte
-                            if not isinstance(arg, basestring):
-                                break
-                            strings.append(unicode(arg))
-                    if len(strings) == 1:
-                        strings = strings[0]
-                    else:
-                        strings = tuple(strings)
-                    yield pos[1], funcname, strings
+                if msgbuf:
+                    msgbuf.append(kind, data, pos)
+                for funcname, strings in extract_from_code(data,
+                                                           gettext_functions):
+                    yield pos[1], funcname, strings, []
 
             elif kind is SUB:
                 subkind, substream = data
-                for lineno, funcname, text in self.extract(substream,
-                                                           gettext_functions):
-                    yield lineno, funcname, text
+                messages = self.extract(substream, gettext_functions,
+                                        search_text=search_text and not skip,
+                                        msgbuf=msgbuf)
+                for lineno, funcname, text, comments in messages:
+                    yield lineno, funcname, text, comments
+
+
+class MessageBuffer(object):
+    """Helper class for managing internationalized mixed content.
+    
+    :since: version 0.5
+    """
+
+    def __init__(self, params=u'', comment=None, lineno=-1):
+        """Initialize the message buffer.
+        
+        :param params: comma-separated list of parameter names
+        :type params: `basestring` or `list` of parameter names
+        :param lineno: the line number on which the first stream event
+                       belonging to the message was found
+        """
+        if isinstance(params, basestring):
+            params = [name.strip() for name in params.split(',')]
+        self.params = params
+        self.comment = comment
+        self.lineno = lineno
+        self.string = []
+        self.events = {}
+        self.values = {}
+        self.depth = 1
+        self.order = 1
+        self.stack = [0]
+
+    def append(self, kind, data, pos):
+        """Append a stream event to the buffer.
+        
+        :param kind: the stream event kind
+        :param data: the event data
+        :param pos: the position of the event in the source
+        """
+        if kind is TEXT:
+            self.string.append(data)
+            self.events.setdefault(self.stack[-1], []).append(None)
+        elif kind is EXPR:
+            param = self.params.pop(0)
+            self.string.append('%%(%s)s' % param)
+            self.events.setdefault(self.stack[-1], []).append(None)
+            self.values[param] = (kind, data, pos)
+        else:
+            if kind is START:
+                self.string.append(u'[%d:' % self.order)
+                self.events.setdefault(self.order, []).append((kind, data, pos))
+                self.stack.append(self.order)
+                self.depth += 1
+                self.order += 1
+            elif kind is END:
+                self.depth -= 1
+                if self.depth:
+                    self.events[self.stack[-1]].append((kind, data, pos))
+                    self.string.append(u']')
+                    self.stack.pop()
+
+    def format(self):
+        """Return a message identifier representing the content in the
+        buffer.
+        """
+        return u''.join(self.string).strip()
+
+    def translate(self, string, regex=re.compile(r'%\((\w+)\)s')):
+        """Interpolate the given message translation with the events in the
+        buffer and return the translated stream.
+        
+        :param string: the translated message string
+        """
+        parts = parse_msg(string)
+        for order, string in parts:
+            events = self.events[order]
+            while events:
+                event = events.pop(0)
+                if event:
+                    yield event
+                else:
+                    if not string:
+                        break
+                    for idx, part in enumerate(regex.split(string)):
+                        if idx % 2:
+                            yield self.values[part]
+                        elif part:
+                            yield TEXT, part, (None, -1, -1)
+                    if not self.events[order] or not self.events[order][0]:
+                        break
+
+
+def parse_msg(string, regex=re.compile(r'(?:\[(\d+)\:)|\]')):
+    """Parse a translated message using Genshi mixed content message
+    formatting.
+
+    >>> parse_msg("See [1:Help].")
+    [(0, 'See '), (1, 'Help'), (0, '.')]
+
+    >>> parse_msg("See [1:our [2:Help] page] for details.")
+    [(0, 'See '), (1, 'our '), (2, 'Help'), (1, ' page'), (0, ' for details.')]
+
+    >>> parse_msg("[2:Details] finden Sie in [1:Hilfe].")
+    [(2, 'Details'), (0, ' finden Sie in '), (1, 'Hilfe'), (0, '.')]
+
+    >>> parse_msg("[1:] Bilder pro Seite anzeigen.")
+    [(1, ''), (0, ' Bilder pro Seite anzeigen.')]
+
+    :param string: the translated message string
+    :return: a list of ``(order, string)`` tuples
+    :rtype: `list`
+    """
+    parts = []
+    stack = [0]
+    while True:
+        mo = regex.search(string)
+        if not mo:
+            break
+
+        if mo.start() or stack[-1]:
+            parts.append((stack[-1], string[:mo.start()]))
+        string = string[mo.end():]
+
+        orderno = mo.group(1)
+        if orderno is not None:
+            stack.append(int(orderno))
+        else:
+            stack.pop()
+        if not stack:
+            break
+
+    if string:
+        parts.append((stack[-1], string))
+
+    return parts
+
+
+def extract_from_code(code, gettext_functions):
+    """Extract strings from Python code by walking its AST.
+    
+    >>> from genshi.template.eval import Expression
+    
+    >>> expr = Expression('_("Hello")')
+    >>> list(extract_from_code(expr, Translator.GETTEXT_FUNCTIONS))
+    [('_', u'Hello')]
+
+    >>> expr = Expression('ngettext("You have %(num)s item", '
+    ...                            '"You have %(num)s items", num)')
+    >>> list(extract_from_code(expr, Translator.GETTEXT_FUNCTIONS))
+    [('ngettext', (u'You have %(num)s item', u'You have %(num)s items', None))]
+    
+    :param code: the `Code` object
+    :type code: `genshi.template.eval.Code`
+    :param gettext_functions: a sequence of function names
+    :since: version 0.5
+    """
+    def _walk(node):
+        if isinstance(node, _ast.Call) and isinstance(node.func, _ast.Name) \
+                and node.func.id in gettext_functions:
+            strings = []
+            def _add(arg):
+                if isinstance(arg, _ast.Str) and isinstance(arg.s, basestring):
+                    strings.append(unicode(arg.s, 'utf-8'))
+                elif arg:
+                    strings.append(None)
+            [_add(arg) for arg in node.args]
+            _add(node.starargs)
+            _add(node.kwargs)
+            if len(strings) == 1:
+                strings = strings[0]
+            else:
+                strings = tuple(strings)
+            yield node.func.id, strings
+        elif node._fields:
+            children = []
+            for field in node._fields:
+                child = getattr(node, field, None)
+                if isinstance(child, list):
+                    for elem in child:
+                        children.append(elem)
+                elif isinstance(child, _ast.AST):
+                    children.append(child)
+            for child in children:
+                for funcname, strings in _walk(child):
+                    yield funcname, strings
+    return _walk(code.ast)
+
+
+def extract(fileobj, keywords, comment_tags, options):
+    """Babel extraction method for Genshi templates.
+    
+    :param fileobj: the file-like object the messages should be extracted from
+    :param keywords: a list of keywords (i.e. function names) that should be
+                     recognized as translation functions
+    :param comment_tags: a list of translator tags to search for and include
+                         in the results
+    :param options: a dictionary of additional options (optional)
+    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
+    :rtype: ``iterator``
+    """
+    template_class = options.get('template_class', MarkupTemplate)
+    if isinstance(template_class, basestring):
+        module, clsname = template_class.split(':', 1)
+        template_class = getattr(__import__(module, {}, {}, [clsname]), clsname)
+    encoding = options.get('encoding', None)
+
+    extract_text = options.get('extract_text', True)
+    if isinstance(extract_text, basestring):
+        extract_text = extract_text.lower() in ('1', 'on', 'yes', 'true')
+
+    ignore_tags = options.get('ignore_tags', Translator.IGNORE_TAGS)
+    if isinstance(ignore_tags, basestring):
+        ignore_tags = ignore_tags.split()
+    ignore_tags = [QName(tag) for tag in ignore_tags]
+
+    include_attrs = options.get('include_attrs', Translator.INCLUDE_ATTRS)
+    if isinstance(include_attrs, basestring):
+        include_attrs = include_attrs.split()
+    include_attrs = [QName(attr) for attr in include_attrs]
+
+    tmpl = template_class(fileobj, filename=getattr(fileobj, 'name', None),
+                          encoding=encoding)
+    translator = Translator(None, ignore_tags, include_attrs, extract_text)
+    for message in translator.extract(tmpl.stream, gettext_functions=keywords):
+        yield message
Copyright (C) 2012-2017 Edgewall Software