Mercurial > genshi > mirror

--- a/genshi/builder.py
+++ b/genshi/builder.py
@@ -49,7 +49,7 @@
 stream:

 >>> stream = doc.generate()
->>> stream
+>>> stream #doctest: +ELLIPSIS
 <genshi.core.Stream object at ...>
 >>> print stream
 <p class="intro">Some text and <a href="http://example.org/">a link</a>.<br/></p>
--- a/genshi/filters.py
+++ b/genshi/filters.py
@@ -180,6 +180,32 @@
 class HTMLSanitizer(object):
     """A filter that removes potentially dangerous HTML tags and attributes
     from the stream.
+
+    >>> from genshi import HTML
+    >>> html = HTML('<div><script>alert(document.cookie)</script></div>')
+    >>> print html | HTMLSanitizer()
+    <div/>
+
+    The default set of safe tags and attributes can be modified when the filter
+    is instantiated. For example, to allow inline ``style`` attributes, the
+    following instantation would work:
+
+    >>> html = HTML('<div style="background: #000"></div>')
+    >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
+    >>> print html | sanitizer
+    <div style="background: #000"/>
+
+    Note that even in this case, the filter *does* attempt to remove dangerous
+    constructs from style attributes:
+
+    >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>')
+    >>> print html | sanitizer
+    <div style="color: #000"/>
+
+    This handles HTML entities, unicode escapes in CSS and Javascript text, as
+    well as a lot of other things. However, the style tag is still excluded by
+    default because it is very hard for such sanitizing to be completely safe,
+    especially considering how much error recovery current web browsers perform.
     """

     SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -201,8 +227,8 @@
         'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
         'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
         'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
-        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
-        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
+        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+        'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

     SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

@@ -260,6 +286,7 @@
                     elif attr == 'style':
                         # Remove dangerous CSS declarations from inline styles
                         decls = []
+                        value = self._replace_unicode_escapes(value)
                         for decl in filter(None, value.split(';')):
                             is_evil = False
                             if 'expression' in decl:
@@ -288,3 +315,11 @@
             else:
                 if not waiting_for:
                     yield kind, data, pos
+
+    _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
+    _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub
+
+    def _replace_unicode_escapes(self, text):
+        def _repl(match):
+            return unichr(int(match.group(1), 16))
+        return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))
--- a/genshi/tests/filters.py
+++ b/genshi/tests/filters.py
@@ -320,22 +320,35 @@
         self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))

     def test_sanitize_remove_style_scripts(self):
+        sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
         # Inline style with url() using javascript: scheme
         html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>')
-        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
         # Inline style with url() using javascript: scheme, using control char
         html = HTML('<DIV STYLE=\'background: url(&#1;javascript:alert("foo"))\'>')
-        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
         # Inline style with url() using javascript: scheme, in quotes
         html = HTML('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>')
-        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
         # IE expressions in CSS not allowed
         html = HTML('<DIV STYLE=\'width: expression(alert("foo"));\'>')
-        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
         html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"));'
                                  'color: #fff\'>')
         self.assertEquals(u'<div style="color: #fff"/>',
-                          unicode(html | HTMLSanitizer()))
+                          unicode(html | sanitizer))
+        # Inline style with url() using javascript: scheme, using unicode
+        # escapes
+        html = HTML('<DIV STYLE=\'background: \\75rl(javascript:alert("foo"))\'>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+        html = HTML('<DIV STYLE=\'background: \\000075rl(javascript:alert("foo"))\'>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+        html = HTML('<DIV STYLE=\'background: \\75 rl(javascript:alert("foo"))\'>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+        html = HTML('<DIV STYLE=\'background: \\000075 rl(javascript:alert("foo"))\'>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+        html = HTML('<DIV STYLE=\'background: \\000075\r\nrl(javascript:alert("foo"))\'>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))

     def test_sanitize_remove_src_javascript(self):
         html = HTML('<img src=\'javascript:alert("foo")\'>')