Mercurial > genshi > genshi-test
diff genshi/filters/i18n.py @ 820:1837f39efd6f experimental-inline
Sync (old) experimental inline branch with trunk@1027.
author | cmlenz |
---|---|
date | Wed, 11 Mar 2009 17:51:06 +0000 |
parents | 0742f421caba |
children | 09cc3627654c |
line wrap: on
line diff
--- a/genshi/filters/i18n.py +++ b/genshi/filters/i18n.py @@ -1,24 +1,74 @@ -"""Utilities for internationalization and localization of templates.""" +# -*- coding: utf-8 -*- +# +# Copyright (C) 2007 Edgewall Software +# All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://genshi.edgewall.org/wiki/License. +# +# This software consists of voluntary contributions made by many +# individuals. For the exact contribution history, see the revision +# history and logs, available at http://genshi.edgewall.org/log/. -try: - frozenset -except NameError: - from sets import ImmutableSet as frozenset -from gettext import gettext -from opcode import opmap +"""Utilities for internationalization and localization of templates. + +:since: version 0.4 +""" + +from gettext import NullTranslations import re +from types import FunctionType -from genshi.core import Attrs, Namespace, QName, START, END, TEXT, _ensure -from genshi.template.base import Template, EXPR, SUB -from genshi.template.markup import EXEC +from genshi.core import Attrs, Namespace, QName, START, END, TEXT, START_NS, \ + END_NS, XML_NAMESPACE, _ensure +from genshi.template.eval import _ast +from genshi.template.base import DirectiveFactory, EXPR, SUB, _apply_directives +from genshi.template.directives import Directive +from genshi.template.markup import MarkupTemplate, EXEC -_LOAD_NAME = chr(opmap['LOAD_NAME']) -_LOAD_CONST = chr(opmap['LOAD_CONST']) -_CALL_FUNCTION = chr(opmap['CALL_FUNCTION']) -_BINARY_ADD = chr(opmap['BINARY_ADD']) +__all__ = ['Translator', 'extract'] +__docformat__ = 'restructuredtext en' + +I18N_NAMESPACE = Namespace('http://genshi.edgewall.org/i18n') -class Translator(object): +class CommentDirective(Directive): + + __slots__ = [] + + @classmethod + def attach(cls, template, stream, value, namespaces, pos): + return None, stream + + +class MsgDirective(Directive): + + __slots__ = ['params'] + + def __init__(self, value, template, hints=None, namespaces=None, + lineno=-1, offset=-1): + Directive.__init__(self, None, template, namespaces, lineno, offset) + self.params = [name.strip() for name in value.split(',')] + + def __call__(self, stream, directives, ctxt, **vars): + msgbuf = MessageBuffer(self.params) + + stream = iter(stream) + yield stream.next() # the outer start tag + previous = stream.next() + for event in stream: + msgbuf.append(*previous) + previous = event + + gettext = ctxt.get('_i18n.gettext') + for event in msgbuf.translate(gettext(msgbuf.format())): + yield event + + yield previous # the outer end tag + + +class Translator(DirectiveFactory): """Can extract and translate localizable strings from markup streams and templates. @@ -65,27 +115,45 @@ <p>Hallo, Hans</p> </body> </html> + + Note that elements defining ``xml:lang`` attributes that do not contain + variable expressions are ignored by this filter. That can be used to + exclude specific parts of a template from being extracted and translated. """ + directives = [ + ('comment', CommentDirective), + ('msg', MsgDirective) + ] + IGNORE_TAGS = frozenset([ QName('script'), QName('http://www.w3.org/1999/xhtml}script'), QName('style'), QName('http://www.w3.org/1999/xhtml}style') ]) INCLUDE_ATTRS = frozenset(['abbr', 'alt', 'label', 'prompt', 'standby', 'summary', 'title']) + NAMESPACE = I18N_NAMESPACE - def __init__(self, translate=gettext, ignore_tags=IGNORE_TAGS, - include_attrs=INCLUDE_ATTRS): + def __init__(self, translate=NullTranslations(), ignore_tags=IGNORE_TAGS, + include_attrs=INCLUDE_ATTRS, extract_text=True): """Initialize the translator. :param translate: the translation function, for example ``gettext`` or ``ugettext``. :param ignore_tags: a set of tag names that should not be localized :param include_attrs: a set of attribute names should be localized + :param extract_text: whether the content of text nodes should be + extracted, or only text in explicit ``gettext`` + function calls + + :note: Changed in 0.6: the `translate` parameter can now be either + a ``gettext``-style function, or an object compatible with the + ``NullTransalations`` or ``GNUTranslations`` interface """ self.translate = translate self.ignore_tags = ignore_tags self.include_attrs = include_attrs + self.extract_text = extract_text def __call__(self, stream, ctxt=None, search_text=True): """Translate any localizable strings in the given stream. @@ -104,27 +172,36 @@ """ ignore_tags = self.ignore_tags include_attrs = self.include_attrs - translate = self.translate skip = 0 + xml_lang = XML_NAMESPACE['lang'] + + if type(self.translate) is FunctionType: + gettext = self.translate + else: + gettext = self.translate.ugettext + if ctxt: + ctxt['_i18n.gettext'] = gettext + + extract_text = self.extract_text + if not extract_text: + search_text = False for kind, data, pos in stream: # skip chunks that should not be localized if skip: if kind is START: - tag, attrs = data - if tag in ignore_tags: - skip += 1 + skip += 1 elif kind is END: - if tag in ignore_tags: - skip -= 1 + skip -= 1 yield kind, data, pos continue # handle different events that can be localized if kind is START: tag, attrs = data - if tag in ignore_tags: + if tag in self.ignore_tags or \ + isinstance(attrs.get(xml_lang), basestring): skip += 1 yield kind, data, pos continue @@ -133,32 +210,37 @@ changed = False for name, value in attrs: newval = value - if isinstance(value, basestring): + if extract_text and isinstance(value, basestring): if name in include_attrs: - newval = self.translate(value) + newval = gettext(value) else: newval = list(self(_ensure(value), ctxt, - search_text=name in include_attrs) + search_text=False) ) if newval != value: value = newval changed = True new_attrs.append((name, value)) if changed: - attrs = new_attrs + attrs = Attrs(new_attrs) yield kind, (tag, attrs), pos elif search_text and kind is TEXT: text = data.strip() if text: - data = data.replace(text, translate(text)) + data = data.replace(text, unicode(gettext(text))) yield kind, data, pos elif kind is SUB: - subkind, substream = data - new_substream = list(self(substream, ctxt)) - yield kind, (subkind, new_substream), pos + directives, substream = data + # If this is an i18n:msg directive, no need to translate text + # nodes here + is_msg = filter(None, [isinstance(d, MsgDirective) + for d in directives]) + substream = list(self(substream, ctxt, + search_text=not is_msg)) + yield kind, (directives, substream), pos else: yield kind, data, pos @@ -167,17 +249,20 @@ 'ugettext', 'ungettext') def extract(self, stream, gettext_functions=GETTEXT_FUNCTIONS, - search_text=True): + search_text=True, msgbuf=None): """Extract localizable strings from the given template stream. For every string found, this function yields a ``(lineno, function, - message)`` tuple, where: + message, comments)`` tuple, where: * ``lineno`` is the number of the line on which the string was found, * ``function`` is the name of the ``gettext`` function used (if the string was extracted from embedded Python code), and * ``message`` is the string itself (a ``unicode`` object, or a tuple - of ``unicode`` objects for functions with multiple string arguments). + of ``unicode`` objects for functions with multiple string + arguments). + * ``comments`` is a list of comments related to the message, extracted + from ``i18n:comment`` attributes found in the markup >>> from genshi.template import MarkupTemplate >>> @@ -192,12 +277,12 @@ ... </body> ... </html>''', filename='example.html') >>> - >>> for lineno, funcname, message in Translator().extract(tmpl.stream): - ... print "%d, %r, %r" % (lineno, funcname, message) + >>> for line, func, msg, comments in Translator().extract(tmpl.stream): + ... print "%d, %r, %r" % (line, func, msg) 3, None, u'Example' 6, None, u'Example' 7, '_', u'Hello, %(name)s' - 8, 'ngettext', (u'You have %d item', u'You have %d items') + 8, 'ngettext', (u'You have %d item', u'You have %d items', None) :param stream: the event stream to extract strings from; can be a regular stream or a template stream @@ -210,77 +295,302 @@ :note: Changed in 0.4.1: For a function with multiple string arguments (such as ``ngettext``), a single item with a tuple of strings is yielded, instead an item for each string argument. + :note: Changed in 0.6: The returned tuples now include a 4th element, + which is a list of comments for the translator """ - tagname = None + if not self.extract_text: + search_text = False skip = 0 + i18n_comment = I18N_NAMESPACE['comment'] + i18n_msg = I18N_NAMESPACE['msg'] + xml_lang = XML_NAMESPACE['lang'] for kind, data, pos in stream: + if skip: if kind is START: - tag, attrs = data - if tag in self.ignore_tags: - skip += 1 + skip += 1 if kind is END: - tag = data - if tag in self.ignore_tags: - skip -= 1 - continue + skip -= 1 - if kind is START: + if kind is START and not skip: tag, attrs = data - if tag in self.ignore_tags: + + if tag in self.ignore_tags or \ + isinstance(attrs.get(xml_lang), basestring): skip += 1 continue for name, value in attrs: - if isinstance(value, basestring): + if search_text and isinstance(value, basestring): if name in self.include_attrs: text = value.strip() if text: - yield pos[1], None, text + yield pos[1], None, text, [] else: - for lineno, funcname, text in self.extract( + for lineno, funcname, text, comments in self.extract( _ensure(value), gettext_functions, - search_text=name in self.include_attrs): - yield lineno, funcname, text + search_text=False): + yield lineno, funcname, text, comments - elif search_text and kind is TEXT: - text = data.strip() - if text and filter(None, [ch.isalpha() for ch in text]): - yield pos[1], None, text + if msgbuf: + msgbuf.append(kind, data, pos) + else: + msg_params = attrs.get(i18n_msg) + if msg_params is not None: + if type(msg_params) is list: # event tuple + msg_params = msg_params[0][1] + msgbuf = MessageBuffer( + msg_params, attrs.get(i18n_comment), pos[1] + ) + + elif not skip and search_text and kind is TEXT: + if not msgbuf: + text = data.strip() + if text and filter(None, [ch.isalpha() for ch in text]): + yield pos[1], None, text, [] + else: + msgbuf.append(kind, data, pos) + + elif not skip and msgbuf and kind is END: + msgbuf.append(kind, data, pos) + if not msgbuf.depth: + yield msgbuf.lineno, None, msgbuf.format(), \ + filter(None, [msgbuf.comment]) + msgbuf = None elif kind is EXPR or kind is EXEC: - consts = dict([(n, chr(i) + '\x00') for i, n in - enumerate(data.code.co_consts)]) - gettext_locs = [consts[n] for n in gettext_functions - if n in consts] - ops = [ - _LOAD_CONST, '(', '|'.join(gettext_locs), ')', - _CALL_FUNCTION, '.\x00', - '((?:', _BINARY_ADD, '|', _LOAD_CONST, '.\x00)+)' - ] - for loc, opcodes in re.findall(''.join(ops), data.code.co_code): - funcname = data.code.co_consts[ord(loc[0])] - strings = [] - opcodes = iter(opcodes) - for opcode in opcodes: - if opcode == _BINARY_ADD: - arg = strings.pop() - strings[-1] += arg - else: - arg = data.code.co_consts[ord(opcodes.next())] - opcodes.next() # skip second byte - if not isinstance(arg, basestring): - break - strings.append(unicode(arg)) - if len(strings) == 1: - strings = strings[0] - else: - strings = tuple(strings) - yield pos[1], funcname, strings + if msgbuf: + msgbuf.append(kind, data, pos) + for funcname, strings in extract_from_code(data, + gettext_functions): + yield pos[1], funcname, strings, [] elif kind is SUB: subkind, substream = data - for lineno, funcname, text in self.extract(substream, - gettext_functions): - yield lineno, funcname, text + messages = self.extract(substream, gettext_functions, + search_text=search_text and not skip, + msgbuf=msgbuf) + for lineno, funcname, text, comments in messages: + yield lineno, funcname, text, comments + + +class MessageBuffer(object): + """Helper class for managing internationalized mixed content. + + :since: version 0.5 + """ + + def __init__(self, params=u'', comment=None, lineno=-1): + """Initialize the message buffer. + + :param params: comma-separated list of parameter names + :type params: `basestring` + :param lineno: the line number on which the first stream event + belonging to the message was found + """ + if isinstance(params, basestring): + params = [name.strip() for name in params.split(',')] + self.params = params + self.comment = comment + self.lineno = lineno + self.string = [] + self.events = {} + self.values = {} + self.depth = 1 + self.order = 1 + self.stack = [0] + + def append(self, kind, data, pos): + """Append a stream event to the buffer. + + :param kind: the stream event kind + :param data: the event data + :param pos: the position of the event in the source + """ + if kind is TEXT: + self.string.append(data) + self.events.setdefault(self.stack[-1], []).append(None) + elif kind is EXPR: + param = self.params.pop(0) + self.string.append('%%(%s)s' % param) + self.events.setdefault(self.stack[-1], []).append(None) + self.values[param] = (kind, data, pos) + else: + if kind is START: + self.string.append(u'[%d:' % self.order) + self.events.setdefault(self.order, []).append((kind, data, pos)) + self.stack.append(self.order) + self.depth += 1 + self.order += 1 + elif kind is END: + self.depth -= 1 + if self.depth: + self.events[self.stack[-1]].append((kind, data, pos)) + self.string.append(u']') + self.stack.pop() + + def format(self): + """Return a message identifier representing the content in the + buffer. + """ + return u''.join(self.string).strip() + + def translate(self, string, regex=re.compile(r'%\((\w+)\)s')): + """Interpolate the given message translation with the events in the + buffer and return the translated stream. + + :param string: the translated message string + """ + parts = parse_msg(string) + for order, string in parts: + events = self.events[order] + while events: + event = events.pop(0) + if event: + yield event + else: + if not string: + break + for idx, part in enumerate(regex.split(string)): + if idx % 2: + yield self.values[part] + elif part: + yield TEXT, part, (None, -1, -1) + if not self.events[order] or not self.events[order][0]: + break + + +def parse_msg(string, regex=re.compile(r'(?:\[(\d+)\:)|\]')): + """Parse a translated message using Genshi mixed content message + formatting. + + >>> parse_msg("See [1:Help].") + [(0, 'See '), (1, 'Help'), (0, '.')] + + >>> parse_msg("See [1:our [2:Help] page] for details.") + [(0, 'See '), (1, 'our '), (2, 'Help'), (1, ' page'), (0, ' for details.')] + + >>> parse_msg("[2:Details] finden Sie in [1:Hilfe].") + [(2, 'Details'), (0, ' finden Sie in '), (1, 'Hilfe'), (0, '.')] + + >>> parse_msg("[1:] Bilder pro Seite anzeigen.") + [(1, ''), (0, ' Bilder pro Seite anzeigen.')] + + :param string: the translated message string + :return: a list of ``(order, string)`` tuples + :rtype: `list` + """ + parts = [] + stack = [0] + while True: + mo = regex.search(string) + if not mo: + break + + if mo.start() or stack[-1]: + parts.append((stack[-1], string[:mo.start()])) + string = string[mo.end():] + + orderno = mo.group(1) + if orderno is not None: + stack.append(int(orderno)) + else: + stack.pop() + if not stack: + break + + if string: + parts.append((stack[-1], string)) + + return parts + + +def extract_from_code(code, gettext_functions): + """Extract strings from Python bytecode. + + >>> from genshi.template.eval import Expression + + >>> expr = Expression('_("Hello")') + >>> list(extract_from_code(expr, Translator.GETTEXT_FUNCTIONS)) + [('_', u'Hello')] + + >>> expr = Expression('ngettext("You have %(num)s item", ' + ... '"You have %(num)s items", num)') + >>> list(extract_from_code(expr, Translator.GETTEXT_FUNCTIONS)) + [('ngettext', (u'You have %(num)s item', u'You have %(num)s items', None))] + + :param code: the `Code` object + :type code: `genshi.template.eval.Code` + :param gettext_functions: a sequence of function names + :since: version 0.5 + """ + def _walk(node): + if isinstance(node, _ast.Call) and isinstance(node.func, _ast.Name) \ + and node.func.id in gettext_functions: + strings = [] + def _add(arg): + if isinstance(arg, _ast.Str) and isinstance(arg.s, basestring): + strings.append(unicode(arg.s, 'utf-8')) + elif arg: + strings.append(None) + [_add(arg) for arg in node.args] + _add(node.starargs) + _add(node.kwargs) + if len(strings) == 1: + strings = strings[0] + else: + strings = tuple(strings) + yield node.func.id, strings + elif node._fields: + children = [] + for field in node._fields: + child = getattr(node, field, None) + if isinstance(child, list): + for elem in child: + children.append(elem) + elif isinstance(child, _ast.AST): + children.append(child) + for child in children: + for funcname, strings in _walk(child): + yield funcname, strings + return _walk(code.ast) + + +def extract(fileobj, keywords, comment_tags, options): + """Babel extraction method for Genshi templates. + + :param fileobj: the file-like object the messages should be extracted from + :param keywords: a list of keywords (i.e. function names) that should be + recognized as translation functions + :param comment_tags: a list of translator tags to search for and include + in the results + :param options: a dictionary of additional options (optional) + :return: an iterator over ``(lineno, funcname, message, comments)`` tuples + :rtype: ``iterator`` + """ + template_class = options.get('template_class', MarkupTemplate) + if isinstance(template_class, basestring): + module, clsname = template_class.split(':', 1) + template_class = getattr(__import__(module, {}, {}, [clsname]), clsname) + encoding = options.get('encoding', None) + + extract_text = options.get('extract_text', True) + if isinstance(extract_text, basestring): + extract_text = extract_text.lower() in ('1', 'on', 'yes', 'true') + + ignore_tags = options.get('ignore_tags', Translator.IGNORE_TAGS) + if isinstance(ignore_tags, basestring): + ignore_tags = ignore_tags.split() + ignore_tags = [QName(tag) for tag in ignore_tags] + + include_attrs = options.get('include_attrs', Translator.INCLUDE_ATTRS) + if isinstance(include_attrs, basestring): + include_attrs = include_attrs.split() + include_attrs = [QName(attr) for attr in include_attrs] + + tmpl = template_class(fileobj, filename=getattr(fileobj, 'name', None), + encoding=encoding) + translator = Translator(None, ignore_tags, include_attrs, extract_text) + for message in translator.extract(tmpl.stream, gettext_functions=keywords): + yield message