# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://markup.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://markup.edgewall.org/log/.

import doctest
import unittest

from markup.core import Stream
from markup.input import HTML, ParseError
from markup.filters import HTMLSanitizer


# NOTE(review): the markup payloads in the string literals below look
# truncated -- several are empty or contain only a line break, which does not
# match the comments next to them.  They are reproduced exactly as found
# (embedded line breaks written as ``\n``); presumably the tags were lost
# before this copy of the file was made.  Restore the literals from version
# control before relying on these tests.
class HTMLSanitizerTestCase(unittest.TestCase):
    """Tests that parse a snippet with ``HTML``, run it through
    ``HTMLSanitizer`` and compare the serialized result with an expected
    string, or assert that parsing raises ``ParseError``.
    """

    # Input and expected output are the same string.
    def test_sanitize_unchanged(self):
        html = HTML('fo\no\n')
        self.assertEquals('fo\no\n',
                          str(html.filter(HTMLSanitizer())))

    # Text containing '&', '<' and '>'.
    def test_sanitize_escape_text(self):
        html = HTML('fo&')
        self.assertEquals('fo&',
                          str(html.filter(HTMLSanitizer())))
        html = HTML('<foo>')
        self.assertEquals('<foo>',
                          str(html.filter(HTMLSanitizer())))

    # Text containing a non-ASCII character; the expected value is a
    # unicode literal.
    def test_sanitize_entityref_text(self):
        html = HTML('foö')
        self.assertEquals(u'foö',
                          str(html.filter(HTMLSanitizer())))

    def test_sanitize_escape_attr(self):
        html = HTML('\n')
        self.assertEquals('\n',
                          str(html.filter(HTMLSanitizer())))

    def test_sanitize_close_empty_tag(self):
        html = HTML('fo\no\n')
        self.assertEquals('fo\no\n',
                          str(html.filter(HTMLSanitizer())))

    # An unknown entity reference is expected to come back verbatim.
    def test_sanitize_invalid_entity(self):
        html = HTML('&junk;')
        self.assertEquals('&junk;', str(html.filter(HTMLSanitizer())))

    # Two inputs are expected to serialize to the empty string; two further
    # inputs are expected to be rejected by the parser itself.
    def test_sanitize_remove_script_elem(self):
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        self.assertRaises(ParseError, HTML, 'alert("foo")')
        self.assertRaises(ParseError, HTML,
                          '')

    def test_sanitize_remove_onclick_attr(self):
        html = HTML('\n')
        self.assertEquals('\n', str(html.filter(HTMLSanitizer())))

    def test_sanitize_remove_style_scripts(self):
        # Inline style with url() using javascript: scheme
        html = HTML('\n')
        self.assertEquals('\n', str(html.filter(HTMLSanitizer())))
        # Inline style with url() using javascript: scheme, using control char
        html = HTML('\n')
        self.assertEquals('\n', str(html.filter(HTMLSanitizer())))
        # Inline style with url() using javascript: scheme, in quotes
        html = HTML('\n')
        self.assertEquals('\n', str(html.filter(HTMLSanitizer())))
        # IE expressions in CSS not allowed
        html = HTML('\n')
        self.assertEquals('\n', str(html.filter(HTMLSanitizer())))
        html = HTML('\n')
        self.assertEquals('\n',
                          str(html.filter(HTMLSanitizer())))

    def test_sanitize_remove_src_javascript(self):
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        # Case-insensitive protocol matching
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        # Grave accents (not parsed)
        self.assertRaises(ParseError, HTML,
                          '')
        # Protocol encoded using UTF-8 numeric entities
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        # Protocol encoded using UTF-8 numeric entities without a semicolon
        # (which is allowed because the max number of digits is used)
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        # Protocol encoded using UTF-8 numeric hex entities without a semicolon
        # (which is allowed because the max number of digits is used)
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        # Embedded tab character in protocol
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))
        # Embedded tab character in protocol, but encoded this time
        html = HTML('')
        self.assertEquals('', str(html.filter(HTMLSanitizer())))


def suite():
    """Return a ``TestSuite`` holding every ``test*`` method of
    ``HTMLSanitizerTestCase``.
    """
    suite = unittest.TestSuite()
    suite.addTest(unittest.makeSuite(HTMLSanitizerTestCase, 'test'))
    return suite

if __name__ == '__main__':
    unittest.main(defaultTest='suite')