Mercurial > genshi > mirror
changeset 113:d10fbba1d5e0 trunk
Removed the `sanitize()` method from the `Markup` class, and migrated the existing unit tests to `markup.tests.filters`. Provided a `Stream.filter()` method instead, which can be used to conveniently apply a filter to a stream.
author | cmlenz |
---|---|
date | Mon, 31 Jul 2006 23:00:06 +0000 |
parents | 5f9af749341c |
children | 4c4e81d12649 |
files | markup/core.py markup/filters.py markup/tests/core.py markup/tests/filters.py |
diffstat | 4 files changed, 171 insertions(+), 120 deletions(-) [+] |
line wrap: on
line diff
--- a/markup/core.py +++ b/markup/core.py @@ -64,6 +64,14 @@ def __iter__(self): return iter(self.events) + def filter(self, filter): + """Apply a filter to the stream. + + This method returns a new stream with the given filter applied. The + filter must be a callable that accepts the stream object as parameter. + """ + return Stream(filter(html)) + def render(self, method='xml', encoding='utf-8', filters=None, **kwargs): """Return a string representation of the stream. @@ -238,6 +246,39 @@ return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) +def stripentities(text, keepxmlentities=False): + """Return a copy of the given text with any character or numeric entities + replaced by the equivalent UTF-8 characters. + + If the `keepxmlentities` parameter is provided and evaluates to `True`, + the core XML entities (&, ', >, < and ") are not + stripped. + """ + def _replace_entity(match): + if match.group(1): # numeric entity + ref = match.group(1) + if ref.startswith('x'): + ref = int(ref[1:], 16) + else: + ref = int(ref, 10) + return unichr(ref) + else: # character entity + ref = match.group(2) + if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', + 'quot'): + return '&%s;' % ref + try: + codepoint = htmlentitydefs.name2codepoint[ref] + return unichr(codepoint) + except KeyError: + if keepxmlentities: + return '&%s;' % ref + else: + return ref + return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)', + _replace_entity, text) + + class Markup(unicode): """Marks a string as being safe for inclusion in HTML/XML output without needing to be escaped. @@ -276,29 +317,7 @@ the core XML entities (&, ', >, < and ") are not stripped. 
""" - def _replace_entity(match): - if match.group(1): # numeric entity - ref = match.group(1) - if ref.startswith('x'): - ref = int(ref[1:], 16) - else: - ref = int(ref, 10) - return unichr(ref) - else: # character entity - ref = match.group(2) - if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', - 'quot'): - return '&%s;' % ref - try: - codepoint = htmlentitydefs.name2codepoint[ref] - return unichr(codepoint) - except KeyError: - if keepxmlentities: - return '&%s;' % ref - else: - return ref - return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)', - _replace_entity, self)) + return Markup(stripentities(self, keepxmlentities=keepxmlentities)) def striptags(self): """Return a copy of the text with all XML/HTML tags removed.""" @@ -342,12 +361,6 @@ text = text.replace(u'\n', u' ') return text - def sanitize(self): - from markup.filters import HTMLSanitizer - from markup.input import HTMLParser - text = StringIO(self.stripentities(keepxmlentities=True)) - return Markup(Stream(HTMLSanitizer()(HTMLParser(text)))) - escape = Markup.escape
--- a/markup/filters.py +++ b/markup/filters.py @@ -20,7 +20,7 @@ from sets import ImmutableSet as frozenset import re -from markup.core import Attributes, Markup, Namespace, escape +from markup.core import Attributes, Markup, Namespace, escape, stripentities from markup.core import END, END_NS, START, START_NS, TEXT from markup.path import Path @@ -130,9 +130,9 @@ del textbuf[:] yield TEXT, output, pos else: - output = escape(collapse_lines('\n', + output = Markup(collapse_lines('\n', trim_trailing_space('', - textbuf.pop())), quotes=False) + escape(textbuf.pop(), quotes=False)))) yield TEXT, output, pos if kind is not None: yield kind, data, pos @@ -182,6 +182,7 @@ new_attrib = [] for attr, value in attrib: + value = stripentities(value) if attr not in self._SAFE_ATTRS: continue elif attr in self._URI_ATTRS:
--- a/markup/tests/core.py +++ b/markup/tests/core.py @@ -91,94 +91,6 @@ assert isinstance(markup, Markup) self.assertEquals('foo', markup) - def test_sanitize_unchanged(self): - markup = Markup('<a href="#">fo<br />o</a>') - self.assertEquals('<a href="#">fo<br/>o</a>', str(markup.sanitize())) - - def test_sanitize_escape_text(self): - markup = Markup('<a href="#">fo&</a>') - self.assertEquals('<a href="#">fo&</a>', str(markup.sanitize())) - markup = Markup('<a href="#"><foo></a>') - self.assertEquals('<a href="#"><foo></a>', str(markup.sanitize())) - - def test_sanitize_entityref_text(self): - markup = Markup('<a href="#">foö</a>') - self.assertEquals(u'<a href="#">foƶ</a>', unicode(markup.sanitize())) - - def test_sanitize_escape_attr(self): - markup = Markup('<div title="<foo>"></div>') - self.assertEquals('<div title="<foo>"/>', str(markup.sanitize())) - - def test_sanitize_close_empty_tag(self): - markup = Markup('<a href="#">fo<br>o</a>') - self.assertEquals('<a href="#">fo<br/>o</a>', str(markup.sanitize())) - - def test_sanitize_invalid_entity(self): - markup = Markup('&junk;') - self.assertEquals('&junk;', str(markup.sanitize())) - - def test_sanitize_remove_script_elem(self): - markup = Markup('<script>alert("Foo")</script>') - self.assertEquals('', str(markup.sanitize())) - markup = Markup('<SCRIPT SRC="http://example.com/"></SCRIPT>') - self.assertEquals('', str(markup.sanitize())) - markup = Markup('<SCR\0IPT>alert("foo")</SCR\0IPT>') - self.assertRaises(ParseError, markup.sanitize) - markup = Markup('<SCRIPT&XYZ SRC="http://example.com/"></SCRIPT>') - self.assertRaises(ParseError, markup.sanitize) - - def test_sanitize_remove_onclick_attr(self): - markup = Markup('<div onclick=\'alert("foo")\' />') - self.assertEquals('<div/>', str(markup.sanitize())) - - def test_sanitize_remove_style_scripts(self): - # Inline style with url() using javascript: scheme - markup = Markup('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') - 
self.assertEquals('<div/>', str(markup.sanitize())) - # Inline style with url() using javascript: scheme, using control char - markup = Markup('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') - self.assertEquals('<div/>', str(markup.sanitize())) - # Inline style with url() using javascript: scheme, in quotes - markup = Markup('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>') - self.assertEquals('<div/>', str(markup.sanitize())) - # IE expressions in CSS not allowed - markup = Markup('<DIV STYLE=\'width: expression(alert("foo"));\'>') - self.assertEquals('<div/>', str(markup.sanitize())) - markup = Markup('<DIV STYLE=\'background: url(javascript:alert("foo"));' - 'color: #fff\'>') - self.assertEquals('<div style="color: #fff"/>', str(markup.sanitize())) - - def test_sanitize_remove_src_javascript(self): - markup = Markup('<img src=\'javascript:alert("foo")\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - # Case-insensitive protocol matching - markup = Markup('<IMG SRC=\'JaVaScRiPt:alert("foo")\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - # Grave accents (not parsed) - markup = Markup('<IMG SRC=`javascript:alert("RSnake says, \'foo\'")`>') - self.assertRaises(ParseError, markup.sanitize) - # Protocol encoded using UTF-8 numeric entities - markup = Markup('<IMG SRC=\'javascri' - 'pt:alert("foo")\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - # Protocol encoded using UTF-8 numeric entities without a semicolon - # (which is allowed because the max number of digits is used) - markup = Markup('<IMG SRC=\'java' - 'script' - ':alert("foo")\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - # Protocol encoded using UTF-8 numeric hex entities without a semicolon - # (which is allowed because the max number of digits is used) - markup = Markup('<IMG SRC=\'javascri' - 'pt:alert("foo")\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - # Embedded tab character in protocol - markup = Markup('<IMG 
SRC=\'jav\tascript:alert("foo");\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - # Embedded tab character in protocol, but encoded this time - markup = Markup('<IMG SRC=\'jav	ascript:alert("foo");\'>') - self.assertEquals('<img/>', str(markup.sanitize())) - def suite(): suite = unittest.TestSuite()
new file mode 100644 --- /dev/null +++ b/markup/tests/filters.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2006 Edgewall Software +# All rights reserved. +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at http://markup.edgewall.org/wiki/License. +# +# This software consists of voluntary contributions made by many +# individuals. For the exact contribution history, see the revision +# history and logs, available at http://markup.edgewall.org/log/. + +import doctest +import unittest + +from markup.core import Stream +from markup.input import HTML, ParseError +from markup.filters import HTMLSanitizer + + +class HTMLSanitizerTestCase(unittest.TestCase): + + def test_sanitize_unchanged(self): + html = HTML('<a href="#">fo<br />o</a>') + self.assertEquals('<a href="#">fo<br/>o</a>', + str(html.filter(HTMLSanitizer())) + + def test_sanitize_escape_text(self): + html = HTML('<a href="#">fo&</a>') + self.assertEquals('<a href="#">fo&</a>', + str(html.filter(HTMLSanitizer())) + html = HTML('<a href="#"><foo></a>') + self.assertEquals('<a href="#"><foo></a>', + str(html.filter(HTMLSanitizer())) + + def test_sanitize_entityref_text(self): + html = HTML('<a href="#">foö</a>') + self.assertEquals(u'<a href="#">foö</a>', + str(html.filter(HTMLSanitizer())) + + def test_sanitize_escape_attr(self): + html = HTML('<div title="<foo>"></div>') + self.assertEquals('<div title="<foo>"/>', + str(html.filter(HTMLSanitizer()))) + + def test_sanitize_close_empty_tag(self): + html = HTML('<a href="#">fo<br>o</a>') + self.assertEquals('<a href="#">fo<br/>o</a>', + str(html.filter(HTMLSanitizer())) + + def test_sanitize_invalid_entity(self): + html = HTML('&junk;') + self.assertEquals('&junk;', str(html.filter(HTMLSanitizer())) + + def test_sanitize_remove_script_elem(self): + html = HTML('<script>alert("Foo")</script>') + self.assertEquals('', 
str(html.filter(HTMLSanitizer())) + html = HTML('<SCRIPT SRC="http://example.com/"></SCRIPT>') + self.assertEquals('', str(html.filter(HTMLSanitizer())) + self.assertRaises(ParseError, HTML, '<SCR\0IPT>alert("foo")</SCR\0IPT>') + self.assertRaises(ParseError, HTML, + '<SCRIPT&XYZ SRC="http://example.com/"></SCRIPT>') + + def test_sanitize_remove_onclick_attr(self): + html = HTML('<div onclick=\'alert("foo")\' />') + self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())) + + def test_sanitize_remove_style_scripts(self): + # Inline style with url() using javascript: scheme + html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') + self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())) + # Inline style with url() using javascript: scheme, using control char + html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') + self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())) + # Inline style with url() using javascript: scheme, in quotes + html = HTML('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>') + self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())) + # IE expressions in CSS not allowed + html = HTML('<DIV STYLE=\'width: expression(alert("foo"));\'>') + self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())) + html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"));' + 'color: #fff\'>') + self.assertEquals('<div style="color: #fff"/>', + str(html.filter(HTMLSanitizer())) + + def test_sanitize_remove_src_javascript(self): + html = HTML('<img src=\'javascript:alert("foo")\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + # Case-insensitive protocol matching + html = HTML('<IMG SRC=\'JaVaScRiPt:alert("foo")\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + # Grave accents (not parsed) + self.assertRaises(ParseError, HTML, + '<IMG SRC=`javascript:alert("RSnake says, \'foo\'")`>') + # Protocol encoded using UTF-8 numeric entities + html = 
HTML('<IMG SRC=\'javascri' + 'pt:alert("foo")\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + # Protocol encoded using UTF-8 numeric entities without a semicolon + # (which is allowed because the max number of digits is used) + html = HTML('<IMG SRC=\'java' + 'script' + ':alert("foo")\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + # Protocol encoded using UTF-8 numeric hex entities without a semicolon + # (which is allowed because the max number of digits is used) + html = HTML('<IMG SRC=\'javascri' + 'pt:alert("foo")\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + # Embedded tab character in protocol + html = HTML('<IMG SRC=\'jav\tascript:alert("foo");\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + # Embedded tab character in protocol, but encoded this time + html = HTML('<IMG SRC=\'jav	ascript:alert("foo");\'>') + self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())) + + +def suite(): + suite = unittest.TestSuite() + suite.addTest(unittest.makeSuite(HTMLSanitizerTestCase, 'test')) + return suite + +if __name__ == '__main__': + unittest.main(defaultTest='suite')