# HG changeset patch # User cmlenz # Date 1154386806 0 # Node ID e815c2c0757277f38d6761ca92eafcbc3072d6d8 # Parent a834a66696813ca08102ba5c42a119c4bc708de9 Removed the `sanitize()` method from the `Markup` class, and migrate the existing unit tests to `markup.tests.filters`. Provide a `Stream.filter()` method instead which can be used to conveniently apply a filter to a stream. diff --git a/markup/core.py b/markup/core.py --- a/markup/core.py +++ b/markup/core.py @@ -64,6 +64,14 @@ def __iter__(self): return iter(self.events) + def filter(self, filter): + """Apply a filter to the stream. + + This method returns a new stream with the given filter applied. The + filter must be a callable that accepts the stream object as parameter. + """ + return Stream(filter(self)) + def render(self, method='xml', encoding='utf-8', filters=None, **kwargs): """Return a string representation of the stream. @@ -238,6 +246,39 @@ return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) +def stripentities(text, keepxmlentities=False): + """Return a copy of the given text with any character or numeric entities + replaced by the equivalent UTF-8 characters. + + If the `keepxmlentities` parameter is provided and evaluates to `True`, + the core XML entities (&, ', >, < and ") are not + stripped. 
+ """ + def _replace_entity(match): + if match.group(1): # numeric entity + ref = match.group(1) + if ref[0] in 'xX': # hex marker may be upper- or lowercase + ref = int(ref[1:], 16) + else: + ref = int(ref, 10) + return unichr(ref) + else: # character entity + ref = match.group(2) + if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', + 'quot'): + return '&%s;' % ref + try: + codepoint = htmlentitydefs.name2codepoint[ref] + return unichr(codepoint) + except KeyError: + if keepxmlentities: + return '&%s;' % ref + else: + return ref + return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)', + _replace_entity, text) + + class Markup(unicode): """Marks a string as being safe for inclusion in HTML/XML output without needing to be escaped. @@ -276,29 +317,7 @@ the core XML entities (&, ', >, < and ") are not stripped. """ - def _replace_entity(match): - if match.group(1): # numeric entity - ref = match.group(1) - if ref.startswith('x'): - ref = int(ref[1:], 16) - else: - ref = int(ref, 10) - return unichr(ref) - else: # character entity - ref = match.group(2) - if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', - 'quot'): - return '&%s;' % ref - try: - codepoint = htmlentitydefs.name2codepoint[ref] - return unichr(codepoint) - except KeyError: - if keepxmlentities: - return '&%s;' % ref - else: - return ref - return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)', - _replace_entity, self)) + return Markup(stripentities(self, keepxmlentities=keepxmlentities)) def striptags(self): """Return a copy of the text with all XML/HTML tags removed.""" @@ -342,12 +361,6 @@ text = text.replace(u'\n', u' ') return text - def sanitize(self): - from markup.filters import HTMLSanitizer - from markup.input import HTMLParser - text = StringIO(self.stripentities(keepxmlentities=True)) - return Markup(Stream(HTMLSanitizer()(HTMLParser(text)))) - escape = Markup.escape diff --git a/markup/filters.py b/markup/filters.py --- a/markup/filters.py +++ b/markup/filters.py @@ -20,7 +20,7 @@ from 
sets import ImmutableSet as frozenset import re -from markup.core import Attributes, Markup, Namespace, escape +from markup.core import Attributes, Markup, Namespace, escape, stripentities from markup.core import END, END_NS, START, START_NS, TEXT from markup.path import Path @@ -130,9 +130,9 @@ del textbuf[:] yield TEXT, output, pos else: - output = escape(collapse_lines('\n', + output = Markup(collapse_lines('\n', trim_trailing_space('', - textbuf.pop())), quotes=False) + escape(textbuf.pop(), quotes=False)))) yield TEXT, output, pos if kind is not None: yield kind, data, pos @@ -182,6 +182,7 @@ new_attrib = [] for attr, value in attrib: + value = stripentities(value) if attr not in self._SAFE_ATTRS: continue elif attr in self._URI_ATTRS: diff --git a/markup/tests/core.py b/markup/tests/core.py --- a/markup/tests/core.py +++ b/markup/tests/core.py @@ -91,94 +91,6 @@ assert isinstance(markup, Markup) self.assertEquals('foo', markup) - def test_sanitize_unchanged(self): - markup = Markup('fo
o
') - self.assertEquals('fo
o
', str(markup.sanitize())) - - def test_sanitize_escape_text(self): - markup = Markup('fo&') - self.assertEquals('fo&', str(markup.sanitize())) - markup = Markup('<foo>') - self.assertEquals('<foo>', str(markup.sanitize())) - - def test_sanitize_entityref_text(self): - markup = Markup('foö') - self.assertEquals(u'foö', unicode(markup.sanitize())) - - def test_sanitize_escape_attr(self): - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - - def test_sanitize_close_empty_tag(self): - markup = Markup('fo
o
') - self.assertEquals('fo
o
', str(markup.sanitize())) - - def test_sanitize_invalid_entity(self): - markup = Markup('&junk;') - self.assertEquals('&junk;', str(markup.sanitize())) - - def test_sanitize_remove_script_elem(self): - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - markup = Markup('alert("foo")') - self.assertRaises(ParseError, markup.sanitize) - markup = Markup('') - self.assertRaises(ParseError, markup.sanitize) - - def test_sanitize_remove_onclick_attr(self): - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - - def test_sanitize_remove_style_scripts(self): - # Inline style with url() using javascript: scheme - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - # Inline style with url() using javascript: scheme, using control char - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - # Inline style with url() using javascript: scheme, in quotes - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - # IE expressions in CSS not allowed - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - markup = Markup('
') - self.assertEquals('
', str(markup.sanitize())) - - def test_sanitize_remove_src_javascript(self): - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - # Case-insensitive protocol matching - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - # Grave accents (not parsed) - markup = Markup('') - self.assertRaises(ParseError, markup.sanitize) - # Protocol encoded using UTF-8 numeric entities - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - # Protocol encoded using UTF-8 numeric entities without a semicolon - # (which is allowed because the max number of digits is used) - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - # Protocol encoded using UTF-8 numeric hex entities without a semicolon - # (which is allowed because the max number of digits is used) - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - # Embedded tab character in protocol - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - # Embedded tab character in protocol, but encoded this time - markup = Markup('') - self.assertEquals('', str(markup.sanitize())) - def suite(): suite = unittest.TestSuite() diff --git a/markup/tests/filters.py b/markup/tests/filters.py new file mode 100644 --- /dev/null +++ b/markup/tests/filters.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2006 Edgewall Software +# All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://markup.edgewall.org/wiki/License. +# +# This software consists of voluntary contributions made by many +# individuals. For the exact contribution history, see the revision +# history and logs, available at http://markup.edgewall.org/log/. 
+ +import doctest +import unittest + +from markup.core import Stream +from markup.input import HTML, ParseError +from markup.filters import HTMLSanitizer + + +class HTMLSanitizerTestCase(unittest.TestCase): + + def test_sanitize_unchanged(self): + html = HTML('fo
o
') + self.assertEquals('fo
o
', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_escape_text(self): + html = HTML('fo&') + self.assertEquals('fo&', + str(html.filter(HTMLSanitizer()))) + html = HTML('<foo>') + self.assertEquals('<foo>', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_entityref_text(self): + html = HTML('foö') + self.assertEquals(u'foö', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_escape_attr(self): + html = HTML('
') + self.assertEquals('
', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_close_empty_tag(self): + html = HTML('fo
o
') + self.assertEquals('fo
o
', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_invalid_entity(self): + html = HTML('&junk;') + self.assertEquals('&junk;', str(html.filter(HTMLSanitizer()))) + + def test_sanitize_remove_script_elem(self): + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + self.assertRaises(ParseError, HTML, 'alert("foo")') + self.assertRaises(ParseError, HTML, + '') + + def test_sanitize_remove_onclick_attr(self): + html = HTML('
') + self.assertEquals('
', str(html.filter(HTMLSanitizer()))) + + def test_sanitize_remove_style_scripts(self): + # Inline style with url() using javascript: scheme + html = HTML('
') + self.assertEquals('
', str(html.filter(HTMLSanitizer()))) + # Inline style with url() using javascript: scheme, using control char + html = HTML('
') + self.assertEquals('
', str(html.filter(HTMLSanitizer()))) + # Inline style with url() using javascript: scheme, in quotes + html = HTML('
') + self.assertEquals('
', str(html.filter(HTMLSanitizer()))) + # IE expressions in CSS not allowed + html = HTML('
') + self.assertEquals('
', str(html.filter(HTMLSanitizer()))) + html = HTML('
') + self.assertEquals('
', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_remove_src_javascript(self): + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + # Case-insensitive protocol matching + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + # Grave accents (not parsed) + self.assertRaises(ParseError, HTML, + '') + # Protocol encoded using UTF-8 numeric entities + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + # Protocol encoded using UTF-8 numeric entities without a semicolon + # (which is allowed because the max number of digits is used) + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + # Protocol encoded using UTF-8 numeric hex entities without a semicolon + # (which is allowed because the max number of digits is used) + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + # Embedded tab character in protocol + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + # Embedded tab character in protocol, but encoded this time + html = HTML('') + self.assertEquals('', str(html.filter(HTMLSanitizer()))) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(HTMLSanitizerTestCase, 'test')) + return suite + +if __name__ == '__main__': + unittest.main(defaultTest='suite')