# HG changeset patch
# User hodgestar
# Date 1315000902 0
# Node ID 8bc6f32fdd456a0339bb013d22b2b14596662c5f
# Parent 0f9fe59dfa00a9c017823d3a72d983bdfac24032
Improve sanitizing of CSS in style attributes (note that the Genshi documentation already warns users that enabling the style attribute is dangerous -- now it is slightly less dangerous). Fixes #455. Patch taken from jomae's Trac commit trac:r10788 and modified for Genshi -- thanks!
diff --git a/genshi/filters/html.py b/genshi/filters/html.py
--- a/genshi/filters/html.py
+++ b/genshi/filters/html.py
@@ -230,7 +230,7 @@
:warn: Note that this special processing of CSS is currently only applied to
style attributes, **not** style elements.
- """
+ " """
SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
@@ -254,13 +254,42 @@
'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
+ SAFE_CSS = frozenset([
+ # CSS 3 properties
+ 'background', 'background-attachment', 'background-color',
+ 'background-image', 'background-position', 'background-repeat',
+ 'border', 'border-bottom', 'border-bottom-color',
+ 'border-bottom-style', 'border-bottom-width', 'border-collapse',
+ 'border-color', 'border-left', 'border-left-color',
+ 'border-left-style', 'border-left-width', 'border-right',
+ 'border-right-color', 'border-right-style', 'border-right-width',
+ 'border-spacing', 'border-style', 'border-top', 'border-top-color',
+ 'border-top-style', 'border-top-width', 'border-width', 'bottom',
+ 'caption-side', 'clear', 'clip', 'color', 'content',
+ 'counter-increment', 'counter-reset', 'cursor', 'direction', 'display',
+ 'empty-cells', 'float', 'font', 'font-family', 'font-size',
+ 'font-style', 'font-variant', 'font-weight', 'height', 'left',
+ 'letter-spacing', 'line-height', 'list-style', 'list-style-image',
+ 'list-style-position', 'list-style-type', 'margin', 'margin-bottom',
+ 'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width',
+ 'min-height', 'min-width', 'opacity', 'orphans', 'outline',
+ 'outline-color', 'outline-style', 'outline-width', 'overflow',
+ 'padding', 'padding-bottom', 'padding-left', 'padding-right',
+ 'padding-top', 'page-break-after', 'page-break-before',
+ 'page-break-inside', 'quotes', 'right', 'table-layout',
+ 'text-align', 'text-decoration', 'text-indent', 'text-transform',
+ 'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space',
+ 'widows', 'width', 'word-spacing', 'z-index',
+ ])
+
SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])
URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
'src'])
def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
- safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS):
+ safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS,
+ safe_css=SAFE_CSS):
"""Create the sanitizer.
The exact set of allowed elements and attributes can be configured.
@@ -271,13 +300,63 @@
:param uri_attrs: a set of names of attributes that contain URIs
"""
self.safe_tags = safe_tags
- "The set of tag names that are considered safe."
+ # The set of tag names that are considered safe.
self.safe_attrs = safe_attrs
- "The set of attribute names that are considered safe."
+ # The set of attribute names that are considered safe.
+ self.safe_css = safe_css
+ # The set of CSS properties that are considered safe.
self.uri_attrs = uri_attrs
- "The set of names of attributes that may contain URIs."
+ # The set of names of attributes that may contain URIs.
self.safe_schemes = safe_schemes
- "The set of URI schemes that are considered safe."
+ # The set of URI schemes that are considered safe.
+
+ # IE6
+ _EXPRESSION_SEARCH = re.compile(u"""
+ [eE
+ \uFF25 # FULLWIDTH LATIN CAPITAL LETTER E
+ \uFF45 # FULLWIDTH LATIN SMALL LETTER E
+ ]
+ [xX
+ \uFF38 # FULLWIDTH LATIN CAPITAL LETTER X
+ \uFF58 # FULLWIDTH LATIN SMALL LETTER X
+ ]
+ [pP
+ \uFF30 # FULLWIDTH LATIN CAPITAL LETTER P
+ \uFF50 # FULLWIDTH LATIN SMALL LETTER P
+ ]
+ [rR
+ \u0280 # LATIN LETTER SMALL CAPITAL R
+ \uFF32 # FULLWIDTH LATIN CAPITAL LETTER R
+ \uFF52 # FULLWIDTH LATIN SMALL LETTER R
+ ]
+ [eE
+ \uFF25 # FULLWIDTH LATIN CAPITAL LETTER E
+ \uFF45 # FULLWIDTH LATIN SMALL LETTER E
+ ]
+ [sS
+ \uFF33 # FULLWIDTH LATIN CAPITAL LETTER S
+ \uFF53 # FULLWIDTH LATIN SMALL LETTER S
+ ]{2}
+ [iI
+ \u026A # LATIN LETTER SMALL CAPITAL I
+ \uFF29 # FULLWIDTH LATIN CAPITAL LETTER I
+ \uFF49 # FULLWIDTH LATIN SMALL LETTER I
+ ]
+ [oO
+ \uFF2F # FULLWIDTH LATIN CAPITAL LETTER O
+ \uFF4F # FULLWIDTH LATIN SMALL LETTER O
+ ]
+ [nN
+ \u0274 # LATIN LETTER SMALL CAPITAL N
+ \uFF2E # FULLWIDTH LATIN CAPITAL LETTER N
+ \uFF4E # FULLWIDTH LATIN SMALL LETTER N
+ ]
+ """, re.VERBOSE).search
+
+ # IE6
+ # 7) Particular bit of Unicode characters
+ _URL_FINDITER = re.compile(
+ u'[Uu][Rr\u0280][Ll\u029F]\s*\(([^)]+)').finditer
def __call__(self, stream):
"""Apply the filter to the given stream.
@@ -336,7 +415,7 @@
:rtype: bool
:since: version 0.6
"""
- if propname == 'position':
+ if propname not in self.safe_css:
return False
if propname.startswith('margin') and '-' in value:
# Negative margins can be used for phishing
@@ -430,9 +509,9 @@
if not self.is_safe_css(propname.strip().lower(), value.strip()):
continue
is_evil = False
- if 'expression' in value:
+ if self._EXPRESSION_SEARCH(value):
is_evil = True
- for match in re.finditer(r'url\s*\(([^)]+)', value):
+ for match in self._URL_FINDITER(value):
if not self.is_safe_uri(match.group(1)):
is_evil = True
break
@@ -441,11 +520,20 @@
return decls
_NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
- _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub
+ _UNICODE_ESCAPE = re.compile(
+ r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""",
+ re.UNICODE).sub
def _replace_unicode_escapes(self, text):
def _repl(match):
- return unichr(int(match.group(1), 16))
+ t = match.group(1)
+ if t:
+ return unichr(int(t, 16))
+ t = match.group(2)
+ if t == '\\':
+ return r'\\'
+ else:
+ return t
return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))
_CSS_COMMENTS = re.compile(r'/\*.*?\*/').sub
diff --git a/genshi/filters/tests/test_html.py b/genshi/filters/tests/test_html.py
--- a/genshi/filters/tests/test_html.py
+++ b/genshi/filters/tests/test_html.py
@@ -361,6 +361,11 @@
""", html.render())
+def StyleSanitizer():
+ safe_attrs = HTMLSanitizer.SAFE_ATTRS | frozenset(['style'])
+ return HTMLSanitizer(safe_attrs=safe_attrs)
+
+
class HTMLSanitizerTestCase(unittest.TestCase):
def test_sanitize_unchanged(self):
@@ -420,7 +425,7 @@
self.assertEquals('', (html | HTMLSanitizer()).render())
def test_sanitize_remove_style_scripts(self):
- sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
+ sanitizer = StyleSanitizer()
# Inline style with url() using javascript: scheme
html = HTML(u'')
self.assertEquals('
', (html | sanitizer).render())
@@ -453,7 +458,7 @@
self.assertEquals('
', (html | sanitizer).render())
def test_sanitize_remove_style_phishing(self):
- sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
+ sanitizer = StyleSanitizer()
# The position property is not allowed
html = HTML(u'
')
self.assertEquals('
', (html | sanitizer).render())
@@ -500,6 +505,95 @@
html = HTML(u'
;\')
')
self.assertEquals('
![]()
', (html | HTMLSanitizer()).render())
+ def test_sanitize_expression(self):
+ html = HTML(ur'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_capital_expression(self):
+ html = HTML(ur'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_url_with_javascript(self):
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_capital_url_with_javascript(self):
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_unicode_escapes(self):
+ html = HTML(ur'
'
+ ur'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_backslash_without_hex(self):
+ html = HTML(ur'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+ html = HTML(ur'
XSS
')
+ self.assertEqual(r'
'
+ 'XSS
',
+ unicode(html | StyleSanitizer()))
+
+ def test_sanitize_unsafe_props(self):
+ html = HTML(u'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ html = HTML(u'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ html = HTML(u"""
XSS
""")
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ html = HTML(u"""
XSS
""")
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_negative_margin(self):
+ html = HTML(u'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+ html = HTML(u'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_css_hack(self):
+ html = HTML(u'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ html = HTML(u'
XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_property_name(self):
+ html = HTML(u'
prop
')
+ self.assertEqual('
prop
',
+ unicode(html | StyleSanitizer()))
+
+ def test_sanitize_unicode_expression(self):
+ # Fullwidth small letters
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+ # Fullwidth capital letters
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+ # IPA extensions
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
+ def test_sanitize_unicode_url(self):
+ # IPA extensions
+ html = HTML(u'
'
+ u'XSS
')
+ self.assertEqual('
XSS
', unicode(html | StyleSanitizer()))
+
def suite():
suite = unittest.TestSuite()