Mercurial > genshi > mirror
changeset 431:ad01564e87f2 trunk
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
* In case `style` attributes are explicitly allowed, also handle unicode escapes correctly.
author | cmlenz |
---|---|
date | Thu, 22 Mar 2007 18:13:02 +0000 |
parents | 77e99857b351 |
children | 406915754870 |
files | genshi/builder.py genshi/filters.py genshi/tests/filters.py |
diffstat | 3 files changed, 56 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/genshi/builder.py +++ b/genshi/builder.py @@ -49,7 +49,7 @@ stream: >>> stream = doc.generate() ->>> stream +>>> stream #doctest: +ELLIPSIS <genshi.core.Stream object at ...> >>> print stream <p class="intro">Some text and <a href="http://example.org/">a link</a>.<br/></p>
--- a/genshi/filters.py +++ b/genshi/filters.py @@ -180,6 +180,32 @@ class HTMLSanitizer(object): """A filter that removes potentially dangerous HTML tags and attributes from the stream. + + >>> from genshi import HTML + >>> html = HTML('<div><script>alert(document.cookie)</script></div>') + >>> print html | HTMLSanitizer() + <div/> + + The default set of safe tags and attributes can be modified when the filter + is instantiated. For example, to allow inline ``style`` attributes, the + following instantation would work: + + >>> html = HTML('<div style="background: #000"></div>') + >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) + >>> print html | sanitizer + <div style="background: #000"/> + + Note that even in this case, the filter *does* attempt to remove dangerous + constructs from style attributes: + + >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>') + >>> print html | sanitizer + <div style="color: #000"/> + + This handles HTML entities, unicode escapes in CSS and Javascript text, as + well as a lot of other things. However, the style tag is still excluded by + default because it is very hard for such sanitizing to be completely safe, + especially considering how much error recovery current web browsers perform. 
""" SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b', @@ -201,8 +227,8 @@ 'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target', - 'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) + 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', + 'type', 'usemap', 'valign', 'value', 'vspace', 'width']) SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None]) @@ -260,6 +286,7 @@ elif attr == 'style': # Remove dangerous CSS declarations from inline styles decls = [] + value = self._replace_unicode_escapes(value) for decl in filter(None, value.split(';')): is_evil = False if 'expression' in decl: @@ -288,3 +315,11 @@ else: if not waiting_for: yield kind, data, pos + + _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub + _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub + + def _replace_unicode_escapes(self, text): + def _repl(match): + return unichr(int(match.group(1), 16)) + return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))
--- a/genshi/tests/filters.py +++ b/genshi/tests/filters.py @@ -320,22 +320,35 @@ self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) def test_sanitize_remove_style_scripts(self): + sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) # Inline style with url() using javascript: scheme html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') - self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) + self.assertEquals(u'<div/>', unicode(html | sanitizer)) # Inline style with url() using javascript: scheme, using control char html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') - self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) + self.assertEquals(u'<div/>', unicode(html | sanitizer)) # Inline style with url() using javascript: scheme, in quotes html = HTML('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>') - self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) + self.assertEquals(u'<div/>', unicode(html | sanitizer)) # IE expressions in CSS not allowed html = HTML('<DIV STYLE=\'width: expression(alert("foo"));\'>') - self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) + self.assertEquals(u'<div/>', unicode(html | sanitizer)) html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"));' 'color: #fff\'>') self.assertEquals(u'<div style="color: #fff"/>', - unicode(html | HTMLSanitizer())) + unicode(html | sanitizer)) + # Inline style with url() using javascript: scheme, using unicode + # escapes + html = HTML('<DIV STYLE=\'background: \\75rl(javascript:alert("foo"))\'>') + self.assertEquals(u'<div/>', unicode(html | sanitizer)) + html = HTML('<DIV STYLE=\'background: \\000075rl(javascript:alert("foo"))\'>') + self.assertEquals(u'<div/>', unicode(html | sanitizer)) + html = HTML('<DIV STYLE=\'background: \\75 rl(javascript:alert("foo"))\'>') + self.assertEquals(u'<div/>', unicode(html | sanitizer)) + html = HTML('<DIV STYLE=\'background: \\000075 rl(javascript:alert("foo"))\'>') + self.assertEquals(u'<div/>', unicode(html | sanitizer)) + html = HTML('<DIV STYLE=\'background: \\000075\r\nrl(javascript:alert("foo"))\'>') + self.assertEquals(u'<div/>', unicode(html | sanitizer)) def test_sanitize_remove_src_javascript(self): html = HTML('<img src=\'javascript:alert("foo")\'>')