# HG changeset patch
# User cmlenz
# Date 1184354972 0
# Node ID 7f49cc5eb6e390e1744ea5d71596c98c546da502
# Parent  98ff0f3fc03e0916022c9a0a749b6bfe3d60e109
newctxt: Merged [667:676/trunk].

diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -32,6 +32,8 @@
    it is not available for use through configuration files.
  * The I18n filter now extracts messages from gettext functions even inside
    ignored tags (ticket #132).
+ * The HTML sanitizer now strips any CSS comments in style attributes, which
+   could previously be used to hide malicious property values.
 
 Version 0.4.2
 
diff --git a/doc/i18n.txt b/doc/i18n.txt
--- a/doc/i18n.txt
+++ b/doc/i18n.txt
@@ -191,7 +191,7 @@
     from genshi.template import TemplateLoader
 
     def template_loaded(template):
-        template.filters.insert(0, , Translator(translations.ugettext))
+        template.filters.insert(0, Translator(translations.ugettext))
 
     loader = TemplateLoader('templates', callback=template_loaded)
     template = loader.load("...")
diff --git a/genshi/filters/html.py b/genshi/filters/html.py
--- a/genshi/filters/html.py
+++ b/genshi/filters/html.py
@@ -285,7 +285,9 @@
                 elif attr == 'style':
                     # Remove dangerous CSS declarations from inline styles
                     decls = []
-                    value = self._replace_unicode_escapes(value)
+                    value = self._strip_css_comments(
+                        self._replace_unicode_escapes(value)
+                    )
                     for decl in filter(None, value.split(';')):
                         is_evil = False
                         if 'expression' in decl:
@@ -322,3 +324,8 @@
         def _repl(match):
             return unichr(int(match.group(1), 16))
         return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))
+
+    _CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub
+
+    def _strip_css_comments(self, text):
+        return self._CSS_COMMENTS('', text)
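(For context: a minimal sketch of the bypass this sanitizer change closes, using the public `HTML` and `HTMLSanitizer` APIs that the new test further down also exercises. The exact markup is illustrative.)

    from genshi.input import HTML
    from genshi.filters import HTMLSanitizer

    # "exp/* */ression" slips past a plain substring check for "expression",
    # but IE drops the comment and evaluates the expression anyway. Stripping
    # comments first lets the existing check catch the declaration, so the
    # whole style attribute is discarded.
    html = HTML(u'<div style="top:exp/* */ression(alert())">XSS</div>')
    print unicode(html | HTMLSanitizer())  # -> <div>XSS</div>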
diff --git a/genshi/filters/i18n.py b/genshi/filters/i18n.py
--- a/genshi/filters/i18n.py
+++ b/genshi/filters/i18n.py
@@ -13,26 +13,23 @@
 
 """Utilities for internationalization and localization of templates."""
 
+from compiler import ast
 try:
     frozenset
 except NameError:
     from sets import ImmutableSet as frozenset
 from gettext import gettext
-from opcode import opmap
 import re
 
-from genshi.core import Attrs, Namespace, QName, START, END, TEXT, \
-                        XML_NAMESPACE, _ensure
+from genshi.core import Attrs, Namespace, QName, START, END, TEXT, START_NS, \
+                        END_NS, XML_NAMESPACE, _ensure
 from genshi.template.base import Template, EXPR, SUB
 from genshi.template.markup import MarkupTemplate, EXEC
 
 __all__ = ['Translator', 'extract']
 __docformat__ = 'restructuredtext en'
 
-_LOAD_NAME = chr(opmap['LOAD_NAME'])
-_LOAD_CONST = chr(opmap['LOAD_CONST'])
-_CALL_FUNCTION = chr(opmap['CALL_FUNCTION'])
-_BINARY_ADD = chr(opmap['BINARY_ADD'])
+I18N_NAMESPACE = Namespace('http://genshi.edgewall.org/i18n')
 
 
 class Translator(object):
@@ -108,7 +105,7 @@
         self.ignore_tags = ignore_tags
         self.include_attrs = include_attrs
 
-    def __call__(self, stream, ctxt=None, search_text=True):
+    def __call__(self, stream, ctxt=None, search_text=True, msgbuf=None):
         """Translate any localizable strings in the given stream.
 
         This function shouldn't be called directly. Instead, an instance of
@@ -121,12 +118,15 @@
         :param ctxt: the template context (not used)
         :param search_text: whether text nodes should be translated (used
                             internally)
+        :param msgbuf: a `MessageBuffer` object or `None` (used internally)
         :return: the localized stream
         """
         ignore_tags = self.ignore_tags
         include_attrs = self.include_attrs
         translate = self.translate
         skip = 0
+        i18n_msg = I18N_NAMESPACE['msg']
+        ns_prefixes = []
         xml_lang = XML_NAMESPACE['lang']
 
         for kind, data, pos in stream:
@@ -158,7 +158,7 @@
                             newval = self.translate(value)
                         else:
                             newval = list(self(_ensure(value), ctxt,
-                                search_text=False)
+                                search_text=False, msgbuf=msgbuf)
                             )
                         if newval != value:
                             value = newval
@@ -167,19 +167,43 @@
                 if changed:
                     attrs = new_attrs
 
+                if msgbuf:
+                    msgbuf.append(kind, data, pos)
+                    continue
+                elif i18n_msg in attrs:
+                    msgbuf = MessageBuffer()
+                    attrs -= i18n_msg
+
                 yield kind, (tag, attrs), pos
 
             elif search_text and kind is TEXT:
-                text = data.strip()
-                if text:
-                    data = data.replace(text, translate(text))
-                yield kind, data, pos
+                if not msgbuf:
+                    text = data.strip()
+                    if text:
+                        data = data.replace(text, translate(text))
+                    yield kind, data, pos
+                else:
+                    msgbuf.append(kind, data, pos)
+
+            elif not skip and msgbuf and kind is END:
+                msgbuf.append(kind, data, pos)
+                if not msgbuf.depth:
+                    for event in msgbuf.translate(translate(msgbuf.format())):
+                        yield event
+                    msgbuf = None
+                    yield kind, data, pos
 
             elif kind is SUB:
                 subkind, substream = data
-                new_substream = list(self(substream, ctxt))
+                new_substream = list(self(substream, ctxt, msgbuf=msgbuf))
                 yield kind, (subkind, new_substream), pos
 
+            elif kind is START_NS and data[1] == I18N_NAMESPACE:
+                ns_prefixes.append(data[0])
+
+            elif kind is END_NS and data in ns_prefixes:
+                ns_prefixes.remove(data)
+
             else:
                 yield kind, data, pos
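(A sketch of what the new `msgbuf` code path enables, mirroring the tests added further down: markup inside an `i18n:msg` element is collapsed into one compound message, translated as a unit, and the buffered markup events are re-attached to the translated text. The dict-backed gettext stand-in is illustrative.)

    from genshi.filters.i18n import Translator
    from genshi.template import MarkupTemplate

    tmpl = MarkupTemplate(u'<html xmlns:i18n="http://genshi.edgewall.org/i18n">'
                          u'<p i18n:msg="">Please see <a href="help.html">Help'
                          u'</a></p></html>')
    # The whole paragraph is extracted as the single compound message
    # u'Please see [1:Help]'
    catalog = {u'Please see [1:Help]': u'Siehe bitte [1:Hilfe]'}
    tmpl.filters.insert(0, Translator(lambda s: catalog.get(s, s)))
    print tmpl.generate().render()
    # -> <html><p>Siehe bitte <a href="help.html">Hilfe</a></p></html>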
""" - tagname = None skip = 0 + i18n_msg = I18N_NAMESPACE['msg'] xml_lang = XML_NAMESPACE['lang'] for kind, data, pos in stream: @@ -245,6 +269,7 @@ if kind is START and not skip: tag, attrs = data + if tag in self.ignore_tags or \ isinstance(attrs.get(xml_lang), basestring): skip += 1 @@ -262,49 +287,165 @@ search_text=False): yield lineno, funcname, text + if msgbuf: + msgbuf.append(kind, data, pos) + elif i18n_msg in attrs: + msgbuf = MessageBuffer(pos[1]) + elif not skip and search_text and kind is TEXT: - text = data.strip() - if text and filter(None, [ch.isalpha() for ch in text]): - yield pos[1], None, text + if not msgbuf: + text = data.strip() + if text and filter(None, [ch.isalpha() for ch in text]): + yield pos[1], None, text + else: + msgbuf.append(kind, data, pos) + + elif not skip and msgbuf and kind is END: + msgbuf.append(kind, data, pos) + if not msgbuf.depth: + yield msgbuf.lineno, None, msgbuf.format() + msgbuf = None elif kind is EXPR or kind is EXEC: - consts = dict([(n, chr(i) + '\x00') for i, n in - enumerate(data.code.co_consts)]) - gettext_locs = [consts[n] for n in gettext_functions - if n in consts] - ops = [ - _LOAD_CONST, '(', '|'.join(gettext_locs), ')', - _CALL_FUNCTION, '.\x00', - '((?:', _BINARY_ADD, '|', _LOAD_CONST, '.\x00)+)' - ] - for loc, opcodes in re.findall(''.join(ops), data.code.co_code): - funcname = data.code.co_consts[ord(loc[0])] - strings = [] - opcodes = iter(opcodes) - for opcode in opcodes: - if opcode == _BINARY_ADD: - arg = strings.pop() - strings[-1] += arg - else: - arg = data.code.co_consts[ord(opcodes.next())] - opcodes.next() # skip second byte - if not isinstance(arg, basestring): - break - strings.append(unicode(arg)) - if len(strings) == 1: - strings = strings[0] - else: - strings = tuple(strings) + for funcname, strings in extract_from_code(data, + gettext_functions): yield pos[1], funcname, strings elif kind is SUB: subkind, substream = data messages = self.extract(substream, gettext_functions, - search_text=search_text and not skip) + search_text=search_text and not skip, + msgbuf=msgbuf) for lineno, funcname, text in messages: yield lineno, funcname, text +class MessageBuffer(object): + """Helper class for managing localizable mixed content.""" + + def __init__(self, lineno=-1): + self.lineno = lineno + self.strings = [] + self.events = {} + self.depth = 1 + self.order = 1 + self.stack = [0] + + def append(self, kind, data, pos): + if kind is TEXT: + self.strings.append(data) + self.events.setdefault(self.stack[-1], []).append(None) + else: + if kind is START: + self.strings.append(u'[%d:' % self.order) + self.events.setdefault(self.order, []).append((kind, data, pos)) + self.stack.append(self.order) + self.depth += 1 + self.order += 1 + elif kind is END: + self.depth -= 1 + if self.depth: + self.events[self.stack[-1]].append((kind, data, pos)) + self.strings.append(u']') + self.stack.pop() + + def format(self): + return u''.join(self.strings).strip() + + def translate(self, string): + parts = parse_msg(string) + for order, string in parts: + events = self.events[order] + while events: + event = self.events[order].pop(0) + if not event: + if not string: + break + yield TEXT, string, (None, -1, -1) + if not self.events[order] or not self.events[order][0]: + break + else: + yield event + + +def extract_from_code(code, gettext_functions): + """Extract strings from Python bytecode. 
+
+
+def extract_from_code(code, gettext_functions):
+    """Extract strings from Python code.
+
+    >>> from genshi.template.eval import Expression
+
+    >>> expr = Expression('_("Hello")')
+    >>> list(extract_from_code(expr, Translator.GETTEXT_FUNCTIONS))
+    [('_', u'Hello')]
+
+    >>> expr = Expression('ngettext("You have %(num)s item", '
+    ...                   '"You have %(num)s items", num)')
+    >>> list(extract_from_code(expr, Translator.GETTEXT_FUNCTIONS))
+    [('ngettext', (u'You have %(num)s item', u'You have %(num)s items', None))]
+
+    :param code: the `Code` object
+    :type code: `genshi.template.eval.Code`
+    :param gettext_functions: a sequence of function names
+    """
+    def _walk(node):
+        if isinstance(node, ast.CallFunc) and isinstance(node.node, ast.Name) \
+                and node.node.name in gettext_functions:
+            strings = []
+            for arg in node.args:
+                if isinstance(arg, ast.Const) \
+                        and isinstance(arg.value, basestring):
+                    strings.append(unicode(arg.value))
+                elif not isinstance(arg, ast.Keyword):
+                    strings.append(None)
+            if len(strings) == 1:
+                strings = strings[0]
+            else:
+                strings = tuple(strings)
+            yield node.node.name, strings
+        else:
+            for child in node.getChildNodes():
+                for funcname, strings in _walk(child):
+                    yield funcname, strings
+    return _walk(code.ast)
+
+
+def parse_msg(string, regex=re.compile(r'(?:\[(\d+)\:)|\]')):
+    """Parse a message using Genshi compound message formatting.
+
+    >>> parse_msg("See [1:Help].")
+    [(0, 'See '), (1, 'Help'), (0, '.')]
+
+    >>> parse_msg("See [1:our [2:Help] page] for details.")
+    [(0, 'See '), (1, 'our '), (2, 'Help'), (1, ' page'), (0, ' for details.')]
+
+    >>> parse_msg("[2:Details] finden Sie in [1:Hilfe].")
+    [(2, 'Details'), (0, ' finden Sie in '), (1, 'Hilfe'), (0, '.')]
+
+    >>> parse_msg("[1:] Bilder pro Seite anzeigen.")
+    [(1, ''), (0, ' Bilder pro Seite anzeigen.')]
+    """
+    parts = []
+    stack = [0]
+    while True:
+        mo = regex.search(string)
+        if not mo:
+            break
+
+        if mo.start() or stack[-1]:
+            parts.append((stack[-1], string[:mo.start()]))
+        string = string[mo.end():]
+
+        orderno = mo.group(1)
+        if orderno is not None:
+            stack.append(int(orderno))
+        else:
+            stack.pop()
+        if not stack:
+            break
+
+    if string:
+        parts.append((stack[-1], string))
+
+    return parts
+
 
 def extract(fileobj, keywords, comment_tags, options):
     """Babel extraction method for Genshi templates.
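(The Babel entry point at the end of the module can also be exercised directly; a quick usage sketch with an illustrative template, matching the call convention used in the extraction tests below:)

    from StringIO import StringIO
    from genshi.filters.i18n import extract

    buf = StringIO("""<html xmlns:py="http://genshi.edgewall.org/">
      ${_('Hello, world!')}
    </html>""")
    # extract(fileobj, keywords, comment_tags, options) yields
    # (lineno, funcname, message, comments) tuples
    for message in extract(buf, ['_'], [], {}):
        print message
    # -> (2, '_', u'Hello, world!', [])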
diff --git a/genshi/filters/tests/html.py b/genshi/filters/tests/html.py
--- a/genshi/filters/tests/html.py
+++ b/genshi/filters/tests/html.py
@@ -332,6 +332,8 @@
         # IE expressions in CSS not allowed
         html = HTML('<div style="top:expression(alert())">XSS</div>')
         self.assertEqual('<div>XSS</div>', unicode(html | HTMLSanitizer()))
+        html = HTML('<div style="top:exp/* */ression(alert())">XSS</div>')
+        self.assertEqual('<div>XSS</div>', unicode(html | HTMLSanitizer()))
diff --git a/genshi/filters/tests/i18n.py b/genshi/filters/tests/i18n.py
--- a/genshi/filters/tests/i18n.py
+++ b/genshi/filters/tests/i18n.py
@@ ... @@
+    def test_extract_i18n_msg(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Please see <a href="help.html">Help</a> for details.
+          </p>
+        </html>""")
+        translator = Translator()
+        messages = list(translator.extract(tmpl.stream))
+        self.assertEqual(1, len(messages))
+        self.assertEqual('Please see [1:Help] for details.', messages[0][2])
+
+    def test_translate_i18n_msg(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Please see <a href="help.html">Help</a> for details.
+          </p>
+        </html>""")
+        gettext = lambda s: u"Für Details siehe bitte [1:Hilfe]."
+        tmpl.filters.insert(0, Translator(gettext))
+        self.assertEqual("""<html>
+          <p>Für Details siehe bitte <a href="help.html">Hilfe</a>.</p>
+        </html>""", tmpl.generate().render())
+
+    def test_extract_i18n_msg_nested(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Please see <a href="help.html"><em>Help</em> page</a> for details.
+          </p>
+        </html>""")
+        translator = Translator()
+        messages = list(translator.extract(tmpl.stream))
+        self.assertEqual(1, len(messages))
+        self.assertEqual('Please see [1:[2:Help] page] for details.',
+                         messages[0][2])
+
+    def test_translate_i18n_msg_nested(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Please see <a href="help.html"><em>Help</em> page</a> for details.
+          </p>
+        </html>""")
+        gettext = lambda s: u"Für Details siehe bitte [1:[2:Hilfeseite]]."
+        tmpl.filters.insert(0, Translator(gettext))
+        self.assertEqual("""<html>
+          <p>Für Details siehe bitte <a href="help.html"><em>Hilfeseite</em></a>.</p>
+        </html>""", tmpl.generate().render())
+
+    def test_extract_i18n_msg_empty(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Show me <input type="text" name="num"/> entries per page.
+          </p>
+        </html>""")
+        translator = Translator()
+        messages = list(translator.extract(tmpl.stream))
+        self.assertEqual(1, len(messages))
+        self.assertEqual('Show me [1:] entries per page.', messages[0][2])
+
+    def test_translate_i18n_msg_empty(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Show me <input type="text" name="num"/> entries per page.
+          </p>
+        </html>""")
+        gettext = lambda s: u"[1:] Einträge pro Seite anzeigen."
+        tmpl.filters.insert(0, Translator(gettext))
+        self.assertEqual("""<html>
+          <p><input type="text" name="num"/> Einträge pro Seite anzeigen.</p>
+        </html>""", tmpl.generate().render())
+
+    def test_extract_i18n_msg_multiple(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Please see <a href="help.html">Help</a> for <em>details</em>.
+          </p>
+        </html>""")
+        translator = Translator()
+        messages = list(translator.extract(tmpl.stream))
+        self.assertEqual(1, len(messages))
+        self.assertEqual('Please see [1:Help] for [2:details].', messages[0][2])
+
+    def test_translate_i18n_msg_multiple(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Please see <a href="help.html">Help</a> for <em>details</em>.
+          </p>
+        </html>""")
+        gettext = lambda s: u"Für [2:Details] siehe bitte [1:Hilfe]."
+        tmpl.filters.insert(0, Translator(gettext))
+        self.assertEqual("""<html>
+          <p>Für <em>Details</em> siehe bitte <a href="help.html">Hilfe</a>.</p>
+        </html>""", tmpl.generate().render())
+
+    def test_extract_i18n_msg_multiple_empty(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Show me <input type="text" name="num"/> entries per page, starting at page <input type="text" name="num"/>.
+          </p>
+        </html>""")
+        translator = Translator()
+        messages = list(translator.extract(tmpl.stream))
+        self.assertEqual(1, len(messages))
+        self.assertEqual('Show me [1:] entries per page, starting at page [2:].',
+                         messages[0][2])
+
+    def test_translate_i18n_msg_multiple_empty(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Show me <input type="text" name="num"/> entries per page, starting at page <input type="text" name="num"/>.
+          </p>
+        </html>""")
+        gettext = lambda s: u"[1:] Einträge pro Seite, beginnend auf Seite [2:]."
+        tmpl.filters.insert(0, Translator(gettext))
+        self.assertEqual("""<html>
+          <p><input type="text" name="num"/> Eintr\xc3\xa4ge pro Seite, beginnend auf Seite <input type="text" name="num"/>.</p>
+        </html>""", tmpl.generate().render())
+
+    def test_extract_i18n_msg_with_directive(self):
+        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+            xmlns:i18n="http://genshi.edgewall.org/i18n">
+          <p i18n:msg="">
+            Show me <input type="text" name="num" py:attrs="{'value': x}"/> entries per page.
+          </p>
+        </html>""")
+        translator = Translator()
+        messages = list(translator.extract(tmpl.stream))
+        self.assertEqual(1, len(messages))
+        self.assertEqual('Show me [1:] entries per page.', messages[0][2])
+
+    # FIXME: this currently fails :-/
+#    def test_translate_i18n_msg_with_directive(self):
+#        tmpl = MarkupTemplate("""<html xmlns:py="http://genshi.edgewall.org/"
+#            xmlns:i18n="http://genshi.edgewall.org/i18n">
+#          <p i18n:msg="">
+#            Show me <input type="text" name="num" py:attrs="{'value': x}"/> entries per page.
+#          </p>
+#        </html>""")
+#        gettext = lambda s: u"[1:] Einträge pro Seite anzeigen."
+#        tmpl.filters.insert(0, Translator(gettext))
+#        self.assertEqual("""<html>
+#          <p><input type="text" name="num"/> Einträge pro Seite anzeigen.</p>
+#        </html>""", tmpl.generate().render())
+
 
 class ExtractTestCase(unittest.TestCase):
 
@@ -110,7 +264,8 @@
             (3, None, u'Example', []),
             (6, None, u'Example', []),
             (7, '_', u'Hello, %(name)s', []),
-            (8, 'ngettext', (u'You have %d item', u'You have %d items'), []),
+            (8, 'ngettext', (u'You have %d item', u'You have %d items', None),
+             []),
         ], results)
 
     def test_text_template_extraction(self):
@@ -128,10 +283,28 @@
         }))
         self.assertEqual([
             (1, '_', u'Dear %(name)s', []),
-            (3, 'ngettext', (u'Your item:', u'Your items'), []),
+            (3, 'ngettext', (u'Your item:', u'Your items', None), []),
            (7, None, u'All the best,\n        Foobar', [])
         ], results)
 
+    def test_extraction_with_keyword_arg(self):
+        buf = StringIO("""<html xmlns:py="http://genshi.edgewall.org/">
+          ${gettext('Foobar', foo='bar')}
+        </html>""")
+        results = list(extract(buf, ['gettext'], [], {}))
+        self.assertEqual([
+            (2, 'gettext', (u'Foobar'), []),
+        ], results)
+
+    def test_extraction_with_nonstring_arg(self):
+        buf = StringIO("""<html xmlns:py="http://genshi.edgewall.org/">
+          ${dgettext(curdomain, 'Foobar')}
+        </html>""")
+        results = list(extract(buf, ['dgettext'], [], {}))
+        self.assertEqual([
+            (2, 'dgettext', (None, u'Foobar'), []),
+        ], results)
+
     def test_extraction_inside_ignored_tags(self):
         buf = StringIO("""