view genshi/filters/i18n.py @ 450:94601511cd68 trunk

Extend the I18n extraction to also yield function names if applicable.
author cmlenz
date Fri, 13 Apr 2007 20:58:48 +0000
parents 1154f2aadb6c
children 4b6dc4978691
line wrap: on
line source
"""Utilities for internationalization and localization of templates."""

try:
    frozenset
except NameError:
    from sets import ImmutableSet as frozenset
from gettext import gettext
from opcode import opmap
import re

from genshi.core import Attrs, Namespace, QName, START, END, TEXT
from genshi.template.base import Template, EXPR, SUB
from genshi.template.markup import EXEC

_LOAD_NAME = chr(opmap['LOAD_NAME'])
_LOAD_CONST = chr(opmap['LOAD_CONST'])
_CALL_FUNCTION = chr(opmap['CALL_FUNCTION'])
_BINARY_ADD = chr(opmap['BINARY_ADD'])


class Translator(object):
    """Can extract and translate localizable strings from markup streams and
    templates.
    
    For example, assume the followng template:
    
    >>> from genshi.template import MarkupTemplate
    >>> 
    >>> tmpl = MarkupTemplate('''<html xmlns:py="http://genshi.edgewall.org/">
    ...   <head>
    ...     <title>Example</title>
    ...   </head>
    ...   <body>
    ...     <h1>Example</h1>
    ...     <p>${_("Hello, %(name)s") % dict(name=username)}</p>
    ...   </body>
    ... </html>''', filename='example.html')
    
    For demonstration, we define a dummy ``gettext``-style function with a
    hard-coded translation table, and pass that to the `Translator` initializer:
    
    >>> def pseudo_gettext(string):
    ...     return {
    ...         'Example': 'Beispiel',
    ...         'Hello, %(name)s': 'Hallo, %(name)s'
    ...     }[string]
    >>> 
    >>> translator = Translator(pseudo_gettext)
    
    Next, the translator needs to be prepended to any already defined filters
    on the template:
    
    >>> tmpl.filters.insert(0, translator)
    
    When generating the template output, our hard-coded translations should be
    applied as expected:
    
    >>> print tmpl.generate(username='Hans', _=pseudo_gettext)
    <html>
      <head>
        <title>Beispiel</title>
      </head>
      <body>
        <h1>Beispiel</h1>
        <p>Hallo, Hans</p>
      </body>
    </html>
    """

    IGNORE_TAGS = frozenset([
        QName('script'), QName('http://www.w3.org/1999/xhtml}script'),
        QName('style'), QName('http://www.w3.org/1999/xhtml}style')
    ])
    INCLUDE_ATTRS = frozenset(['title', 'alt'])

    def __init__(self, translate=gettext, ignore_tags=IGNORE_TAGS,
                 include_attrs=INCLUDE_ATTRS):
        """Initialize the translator.
        
        :param translate: the translation function, for example ``gettext`` or
                          ``ugettext``.
        :param ignore_tags: a set of tag names that should not be localized
        :param include_attrs: a set of attribute names should be localized
        """
        self.translate = translate
        self.ignore_tags = ignore_tags
        self.include_attrs = include_attrs

    def __call__(self, stream, ctxt=None, search_text=True):
        """Translate any localizable strings in the given stream.
        
        This function shouldn't be called directly. Instead, an instance of
        the `Translator` class should be registered as a filter with the
        `Template` or the `TemplateLoader`, or applied as a regular stream
        filter. If used as a template filter, it should be inserted in front of
        all the default filters.
        
        :param stream: the markup event stream
        :param ctxt: the template context (not used)
        :param search_text: whether text nodes should be translated (used
                            internally)
        :return: the localized stream
        """
        ignore_tags = self.ignore_tags
        include_attrs = self.include_attrs
        translate = self.translate
        skip = 0

        for kind, data, pos in stream:

            # skip chunks that should not be localized
            if skip:
                if kind is START:
                    tag, attrs = data
                    if tag in ignore_tags:
                        skip += 1
                elif kind is END:
                    if tag in ignore_tags:
                        skip -= 1
                yield kind, data, pos
                continue

            # handle different events that can be localized
            if kind is START:
                tag, attrs = data
                if tag in ignore_tags:
                    skip += 1
                    yield kind, data, pos
                    continue

                new_attrs = list(attrs)
                changed = False
                for name, value in attrs:
                    if name in include_attrs:
                        if isinstance(value, basestring):
                            newval = ugettext(value)
                        else:
                            newval = list(self(value, ctxt, search_text=name in include_attrs))
                        if newval != value:
                            value = new_val
                            changed = True
                    new_attrs.append((name, value))
                if changed:
                    attrs = new_attrs

                yield kind, (tag, attrs), pos

            elif kind is TEXT:
                text = data.strip()
                if text:
                    data = data.replace(text, translate(text))
                yield kind, data, pos

            elif kind is SUB:
                subkind, substream = data
                new_substream = list(self(substream, ctxt))
                yield kind, (subkind, new_substream), pos

            else:
                yield kind, data, pos

    GETTEXT_FUNCTIONS = ('_', 'gettext', 'ngettext', 'dgettext', 'dngettext',
                         'ugettext', 'ungettext')

    def extract(self, stream, gettext_functions=GETTEXT_FUNCTIONS):
        """Extract localizable strings from the given template stream.
        
        For every string found, this function yields a ``(lineno, function,
        message)`` tuple, where:
        
        * ``lineno`` is the number of the line on which the string was found,
        * ``function`` is the name of the ``gettext`` function used (if the
          string was extracted from embedded Python code), and
        *  ``message`` is the string itself (a ``unicode`` object).
        
        >>> from genshi.template import MarkupTemplate
        >>> 
        >>> tmpl = MarkupTemplate('''<html xmlns:py="http://genshi.edgewall.org/">
        ...   <head>
        ...     <title>Example</title>
        ...   </head>
        ...   <body>
        ...     <h1>Example</h1>
        ...     <p>${_("Hello, %(name)s") % dict(name=username)}</p>
        ...   </body>
        ... </html>''', filename='example.html')
        >>> 
        >>> for lineno, funcname, message in Translator().extract(tmpl.stream):
        ...    print "%d, %r, %r" % (lineno, funcname, message)
        3, None, u'Example'
        6, None, u'Example'
        7, '_', u'Hello, %(name)s'

        :param stream: the event stream to extract strings from; can be a
                       regular stream or a template stream
        :param gettext_functions: a sequence of function names that should be
                                  treated as gettext-style localization
                                  functions
        """
        tagname = None
        skip = 0

        for kind, data, pos in stream:
            if skip:
                if kind is START:
                    tag, attrs = data
                    if tag in self.ignore_tags:
                        skip += 1
                if kind is END:
                    tag = data
                    if tag in self.ignore_tags:
                        skip -= 1
                continue

            if kind is START:
                tag, attrs = data
                if tag in self.ignore_tags:
                    skip += 1
                    continue

                for name, value in attrs:
                    if name in self.include_attrs:
                        if isinstance(value, basestring):
                            text = value.strip()
                            if text:
                                yield pos[1], None, text
                        else:
                            for lineno, funcname, text in harvest(value):
                                yield lineno, funcname, text

            elif kind is TEXT:
                text = data.strip()
                if text and filter(None, [ch.isalpha() for ch in text]):
                    yield pos[1], None, text

            elif kind is EXPR or kind is EXEC:
                consts = dict([(n, chr(i) + '\x00') for i, n in
                               enumerate(data.code.co_consts)])
                gettext_locs = [consts[n] for n in gettext_functions
                                if n in consts]
                ops = [
                    _LOAD_CONST, '(', '|'.join(gettext_locs), ')',
                    _CALL_FUNCTION, '.\x00',
                    '((?:', _BINARY_ADD, '|', _LOAD_CONST, '.\x00)+)'
                ]
                for loc, opcodes in re.findall(''.join(ops), data.code.co_code):
                    funcname = data.code.co_consts[ord(loc[0])]
                    strings = []
                    opcodes = iter(opcodes)
                    for opcode in opcodes:
                        if opcode == _BINARY_ADD:
                            arg = strings.pop()
                            strings[-1] += arg
                        else:
                            arg = data.code.co_consts[ord(opcodes.next())]
                            opcodes.next() # skip second byte
                            if not isinstance(arg, basestring):
                                break
                            strings.append(unicode(arg))
                    for string in strings:
                        yield pos[1], funcname, string

            elif kind is SUB:
                subkind, substream = data
                for lineno, funcname, text in self.harvest(substream):
                    yield lineno, funcname, text
Copyright (C) 2012-2017 Edgewall Software