# HG changeset patch
# User cmlenz
# Date 1237312250 0
# Node ID 878306a5b465e91d614b5ce40e04d7f345d62908
# Parent  80bce2f6f9fed1f01979c85e091a4a217a506918
Ported some of the HTML sanitization improvements from Trac (see [T7658]).

diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -12,6 +12,7 @@
    with Google App Engine. This, too, is the result of integrating work done
    by  Marcin Kurczych during GSoC 2008.
  * Added caching in the serialization stage for improved performance.
+ * Various improvements to the HTML sanitization filter.
 
 
 Version 0.5.2
diff --git a/genshi/filters/html.py b/genshi/filters/html.py
--- a/genshi/filters/html.py
+++ b/genshi/filters/html.py
@@ -211,6 +211,10 @@
     default because it is very hard for such sanitizing to be completely safe,
     especially considering how much error recovery current web browsers perform.
     
+    It also does some basic filtering of CSS properties that may be used for
+    typical phishing attacks. For more sophisticated filtering, this class
+    provides a couple of hooks that can be overridden in sub-classes.
+    
     :warn: Note that this special processing of CSS is currently only applied to
            style attributes, **not** style elements.
     """
@@ -274,7 +278,7 @@
                 if waiting_for:
                     continue
                 tag, attrs = data
-                if tag not in self.safe_tags:
+                if not self.is_safe_elem(tag, attrs):
                     waiting_for = tag
                     continue
 
@@ -309,6 +313,43 @@
                 if not waiting_for:
                     yield kind, data, pos
 
+    def is_safe_css(self, propname, value):
+        """Determine whether the given css property declaration is to be
+        considered safe for inclusion in the output.
+        
+        :param propname: the CSS property name
+        :param value: the value of the property
+        :return: whether the property value should be considered safe
+        :rtype: bool
+        :since: version 0.6
+        """
+        if propname == 'position':
+            return False
+        if propname.startswith('margin') and '-' in value:
+            # Negative margins can be used for phishing
+            return False
+        return True
+
+    def is_safe_elem(self, tag, attrs):
+        """Determine whether the given element should be considered safe for
+        inclusion in the output.
+        
+        :param tag: the tag name of the element
+        :type tag: QName
+        :param attrs: the element attributes
+        :type attrs: Attrs
+        :return: whether the element should be considered safe
+        :rtype: bool
+        :since: version 0.6
+        """
+        if tag not in self.safe_tags:
+            return False
+        if tag.localname == 'input':
+            input_type = attrs.get('type', '').lower()
+            if input_type == 'password':
+                return False
+        return True
+
     def is_safe_uri(self, uri):
         """Determine whether the given URI is to be considered safe for
         inclusion in the output.
@@ -369,10 +410,16 @@
             decl = decl.strip()
             if not decl:
                 continue
+            try:
+                propname, value = decl.split(':', 1)
+            except ValueError:
+                continue
+            if not self.is_safe_css(propname.strip().lower(), value.strip()):
+                continue
             is_evil = False
-            if 'expression' in decl:
+            if 'expression' in value:
                 is_evil = True
-            for match in re.finditer(r'url\s*\(([^)]+)', decl):
+            for match in re.finditer(r'url\s*\(([^)]+)', value):
                 if not self.is_safe_uri(match.group(1)):
                     is_evil = True
                     break
diff --git a/genshi/filters/tests/html.py b/genshi/filters/tests/html.py
--- a/genshi/filters/tests/html.py
+++ b/genshi/filters/tests/html.py
@@ -357,6 +357,10 @@
         html = HTML('<div onclick=\'alert("foo")\' />')
         self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
 
+    def test_sanitize_remove_input_password(self):
+        html = HTML('<form><input type="password" /></form>')
+        self.assertEquals(u'<form/>', unicode(html | HTMLSanitizer()))
+
     def test_sanitize_remove_comments(self):
         html = HTML('''<div><!-- conditional comment crap --></div>''')
         self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
@@ -394,6 +398,22 @@
         html = HTML('<DIV STYLE=\'background: \\000075\r\nrl(javascript:alert("foo"))\'>')
         self.assertEquals(u'<div/>', unicode(html | sanitizer))
 
+    def test_sanitize_remove_style_phishing(self):
+        sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
+        # The position property is not allowed
+        html = HTML('<div style="position:absolute;top:0"></div>')
+        self.assertEquals(u'<div style="top:0"/>', unicode(html | sanitizer))
+        # Normal margins get passed through
+        html = HTML('<div style="margin:10px 20px"></div>')
+        self.assertEquals(u'<div style="margin:10px 20px"/>', unicode(html | sanitizer))
+        # But not negative margins
+        html = HTML('<div style="margin:-1000px 0 0"></div>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+        html = HTML('<div style="margin-left:-2000px 0 0"></div>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+        html = HTML('<div style="margin-left:1em 1em 1em -4000px"></div>')
+        self.assertEquals(u'<div/>', unicode(html | sanitizer))
+
     def test_sanitize_remove_src_javascript(self):
         html = HTML('<img src=\'javascript:alert("foo")\'>')
         self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
@@ -433,5 +453,6 @@
     suite.addTest(unittest.makeSuite(HTMLSanitizerTestCase, 'test'))
     return suite
 
+
 if __name__ == '__main__':
     unittest.main(defaultTest='suite')