# HG changeset patch # User cmlenz # Date 1237312250 0 # Node ID 878306a5b465e91d614b5ce40e04d7f345d62908 # Parent 80bce2f6f9fed1f01979c85e091a4a217a506918 Ported some of the HTML sanitization improvements from Trac (see [T7658]). diff --git a/ChangeLog b/ChangeLog --- a/ChangeLog +++ b/ChangeLog @@ -12,6 +12,7 @@ with Google App Engine. This, too, is the result of integrating work done by Marcin Kurczych during GSoC 2008. * Added caching in the serialization stage for improved performance. + * Various improvements to the HTML sanitization filter. Version 0.5.2 diff --git a/genshi/filters/html.py b/genshi/filters/html.py --- a/genshi/filters/html.py +++ b/genshi/filters/html.py @@ -211,6 +211,10 @@ default because it is very hard for such sanitizing to be completely safe, especially considering how much error recovery current web browsers perform. + It also does some basic filtering of CSS properties that may be used for + typical phishing attacks. For more sophisticated filtering, this class + provides a couple of hooks that can be overridden in sub-classes. + :warn: Note that this special processing of CSS is currently only applied to style attributes, **not** style elements. """ @@ -274,7 +278,7 @@ if waiting_for: continue tag, attrs = data - if tag not in self.safe_tags: + if not self.is_safe_elem(tag, attrs): waiting_for = tag continue @@ -309,6 +313,43 @@ if not waiting_for: yield kind, data, pos + def is_safe_css(self, propname, value): + """Determine whether the given css property declaration is to be + considered safe for inclusion in the output. + + :param propname: the CSS property name + :param value: the value of the property + :return: whether the property value should be considered safe + :rtype: bool + :since: version 0.6 + """ + if propname == 'position': + return False + if propname.startswith('margin') and '-' in value: + # Negative margins can be used for phishing + return False + return True + + def is_safe_elem(self, tag, attrs): + """Determine whether the given element should be considered safe for + inclusion in the output. + + :param tag: the tag name of the element + :type tag: QName + :param attrs: the element attributes + :type attrs: Attrs + :return: whether the element should be considered safe + :rtype: bool + :since: version 0.6 + """ + if tag not in self.safe_tags: + return False + if tag.localname == 'input': + input_type = attrs.get('type', '').lower() + if input_type == 'password': + return False + return True + def is_safe_uri(self, uri): """Determine whether the given URI is to be considered safe for inclusion in the output. @@ -369,10 +410,16 @@ decl = decl.strip() if not decl: continue + try: + propname, value = decl.split(':', 1) + except ValueError: + continue + if not self.is_safe_css(propname.strip().lower(), value.strip()): + continue is_evil = False - if 'expression' in decl: + if 'expression' in value: is_evil = True - for match in re.finditer(r'url\s*\(([^)]+)', decl): + for match in re.finditer(r'url\s*\(([^)]+)', value): if not self.is_safe_uri(match.group(1)): is_evil = True break diff --git a/genshi/filters/tests/html.py b/genshi/filters/tests/html.py --- a/genshi/filters/tests/html.py +++ b/genshi/filters/tests/html.py @@ -357,6 +357,10 @@ html = HTML('
') self.assertEquals(u'
', unicode(html | HTMLSanitizer())) + def test_sanitize_remove_input_password(self): + html = HTML('
') + self.assertEquals(u'
', unicode(html | HTMLSanitizer())) + def test_sanitize_remove_comments(self): html = HTML('''
''') self.assertEquals(u'
', unicode(html | HTMLSanitizer())) @@ -394,6 +398,22 @@ html = HTML('
') self.assertEquals(u'
', unicode(html | sanitizer)) + def test_sanitize_remove_style_phishing(self): + sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) + # The position property is not allowed + html = HTML('
') + self.assertEquals(u'
', unicode(html | sanitizer)) + # Normal margins get passed through + html = HTML('
') + self.assertEquals(u'
', unicode(html | sanitizer)) + # But not negative margins + html = HTML('
') + self.assertEquals(u'
', unicode(html | sanitizer)) + html = HTML('
') + self.assertEquals(u'
', unicode(html | sanitizer)) + html = HTML('
') + self.assertEquals(u'
', unicode(html | sanitizer)) + def test_sanitize_remove_src_javascript(self): html = HTML('') self.assertEquals(u'', unicode(html | HTMLSanitizer())) @@ -433,5 +453,6 @@ suite.addTest(unittest.makeSuite(HTMLSanitizerTestCase, 'test')) return suite + if __name__ == '__main__': unittest.main(defaultTest='suite')