cmlenz@1: # -*- coding: utf-8 -*-
cmlenz@1: #
cmlenz@854: # Copyright (C) 2006-2009 Edgewall Software
cmlenz@1: # All rights reserved.
cmlenz@1: #
cmlenz@1: # This software is licensed as described in the file COPYING, which
cmlenz@1: # you should have received as part of this distribution. The terms
cmlenz@230: # are also available at http://genshi.edgewall.org/wiki/License.
cmlenz@1: #
cmlenz@1: # This software consists of voluntary contributions made by many
cmlenz@1: # individuals. For the exact contribution history, see the revision
cmlenz@230: # history and logs, available at http://genshi.edgewall.org/log/.
cmlenz@1: 
cmlenz@1: """Implementation of a number of stream filters."""
cmlenz@1: 
cmlenz@856: try:
cmlenz@856:     any
cmlenz@856: except NameError:
cmlenz@856:     from genshi.util import any
cmlenz@1: import re
cmlenz@1: 
cmlenz@403: from genshi.core import Attrs, QName, stripentities
cmlenz@571: from genshi.core import END, START, TEXT, COMMENT
cmlenz@1: 
cmlenz@363: __all__ = ['HTMLFormFiller', 'HTMLSanitizer']
cmlenz@425: __docformat__ = 'restructuredtext en'
cmlenz@275: 
cmlenz@275: 
class HTMLFormFiller(object):
    """A stream filter that can populate HTML forms from a dictionary of values.
    
    >>> from genshi.input import HTML
    >>> html = HTML('''<form>
    ...   <p><input type="text" name="foo" /></p>
    ... </form>''')
    >>> filler = HTMLFormFiller(data={'foo': 'bar'})
    >>> print(html | filler)
    <form>
      <p><input type="text" name="foo" value="bar"/></p>
    </form>
    """
    # TODO: only select the first radio button, and the first select option
    #       (if not in a multiple-select)
    # TODO: only apply to elements in the XHTML namespace (or no namespace)?

    def __init__(self, name=None, id=None, data=None, passwords=False):
        """Create the filter.
        
        :param name: The name of the form that should be populated. If this
                     parameter is given, only forms where the ``name`` attribute
                     value matches the parameter are processed.
        :param id: The ID of the form that should be populated. If this
                   parameter is given, only forms where the ``id`` attribute
                   value matches the parameter are processed.
        :param data: The dictionary of form values, where the keys are the names
                     of the form fields, and the values are the values to fill
                     in.
        :param passwords: Whether password input fields should be populated.
                          This is off by default for security reasons (for
                          example, a password may end up in the browser cache)
        :note: Changed in 0.5.2: added the `passwords` option
        """
        self.name = name
        self.id = id
        if data is None:
            data = {}
        self.data = data
        self.passwords = passwords

    def _fill_input(self, attrs):
        """Return the attributes of an ``<input>`` element with the
        ``checked`` or ``value`` attribute adjusted to reflect `self.data`.
        
        Inputs whose type is not handled, or whose name has no entry in the
        data dictionary, are returned unchanged.
        
        :param attrs: the attributes of the ``<input>`` start tag
        :return: the (possibly updated) attributes
        """
        input_type = attrs.get('type', '').lower()
        name = attrs.get('name')

        if input_type in ('checkbox', 'radio'):
            if not name or name not in self.data:
                return attrs
            value = self.data[name]
            declval = attrs.get('value')
            checked = False
            if isinstance(value, (list, tuple)):
                if declval:
                    checked = declval in [unicode(v) for v in value]
                else:
                    checked = any(value)
            elif declval:
                checked = declval == unicode(value)
            elif input_type == 'checkbox':
                checked = bool(value)
            if checked:
                attrs |= [(QName('checked'), 'checked')]
            elif 'checked' in attrs:
                attrs -= 'checked'

        elif input_type in ('', 'hidden', 'text') \
                or input_type == 'password' and self.passwords:
            if not name or name not in self.data:
                return attrs
            value = self.data[name]
            if isinstance(value, (list, tuple)):
                # Only the first item is used; an empty sequence means there
                # is nothing to fill in (indexing it would raise IndexError).
                if value:
                    value = value[0]
                else:
                    value = None
            if value is not None:
                attrs |= [(QName('value'), unicode(value))]

        return attrs

    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        Events outside of a matching ``<form>`` are passed through untouched.
        Inside a matching form, ``<input>``, ``<select>``/``<option>`` and
        ``<textarea>`` elements whose name is a key of the data dictionary are
        updated to reflect the corresponding value.
        
        :param stream: the markup event stream to filter
        """
        in_form = in_select = in_option = in_textarea = False
        select_value = option_value = textarea_value = None
        # The start event of an <option> is held back until its end tag is
        # seen, because the value to compare may be the option's text content.
        option_start = None
        option_text = []
        no_option_value = False

        for kind, data, pos in stream:

            if kind is START:
                tag, attrs = data
                tagname = tag.localname

                if tagname == 'form' and (
                        self.name and attrs.get('name') == self.name or
                        self.id and attrs.get('id') == self.id or
                        not (self.id or self.name)):
                    in_form = True

                elif in_form:
                    if tagname == 'input':
                        attrs = self._fill_input(attrs)
                    elif tagname == 'select':
                        name = attrs.get('name')
                        if name in self.data:
                            select_value = self.data[name]
                            in_select = True
                    elif tagname == 'textarea':
                        name = attrs.get('name')
                        if name in self.data:
                            textarea_value = self.data.get(name)
                            if isinstance(textarea_value, (list, tuple)):
                                # Guard against an empty sequence, which
                                # cannot be indexed.
                                if textarea_value:
                                    textarea_value = textarea_value[0]
                                else:
                                    textarea_value = None
                            in_textarea = True
                    elif in_select and tagname == 'option':
                        option_start = kind, data, pos
                        option_value = attrs.get('value')
                        if option_value is None:
                            # No value attribute: the text content collected
                            # below serves as the option value.
                            no_option_value = True
                            option_value = ''
                        in_option = True
                        continue
                yield kind, (tag, attrs), pos

            elif in_form and kind is TEXT:
                if in_select and in_option:
                    if no_option_value:
                        option_value += data
                    option_text.append((kind, data, pos))
                    continue
                elif in_textarea:
                    # The original content is replaced by the data value,
                    # which is emitted when the end tag is reached.
                    continue
                yield kind, data, pos

            elif in_form and kind is END:
                tagname = data.localname
                if tagname == 'form':
                    in_form = False
                elif tagname == 'select':
                    in_select = False
                    select_value = None
                elif in_select and tagname == 'option':
                    if isinstance(select_value, (tuple, list)):
                        selected = option_value in [unicode(v) for v
                                                    in select_value]
                    else:
                        selected = option_value == unicode(select_value)
                    okind, (tag, attrs), opos = option_start
                    if selected:
                        attrs |= [(QName('selected'), 'selected')]
                    elif 'selected' in attrs:
                        attrs -= 'selected'
                    yield okind, (tag, attrs), opos
                    for event in option_text:
                        yield event
                    in_option = False
                    no_option_value = False
                    option_start = option_value = None
                    option_text = []
                elif tagname == 'textarea':
                    # Only textareas that are actually being filled may emit
                    # the data value; otherwise the value of a previously
                    # filled textarea would leak into this one.
                    if in_textarea:
                        if textarea_value is not None:
                            text = unicode(textarea_value)
                            if text:
                                yield TEXT, text, pos
                        textarea_value = None
                        in_textarea = False
                yield kind, data, pos

            else:
                yield kind, data, pos
cmlenz@123: 
cmlenz@123: 
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.
    
    >>> from genshi import HTML
    >>> html = HTML('<div><script>alert(document.cookie)</script></div>')
    >>> print(html | HTMLSanitizer())
    <div/>
    
    The default set of safe tags and attributes can be modified when the filter
    is instantiated. For example, to allow inline ``style`` attributes, the
    following instantiation would work:
    
    >>> html = HTML('<div style="background: #000"></div>')
    >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
    >>> print(html | sanitizer)
    <div style="background: #000"/>
    
    Note that even in this case, the filter *does* attempt to remove dangerous
    constructs from style attributes:

    >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>')
    >>> print(html | sanitizer)
    <div style="color: #000"/>
    
    This handles HTML entities, unicode escapes in CSS and Javascript text, as
    well as a lot of other things. However, the style tag is still excluded by
    default because it is very hard for such sanitizing to be completely safe,
    especially considering how much error recovery current web browsers perform.
    
    It also does some basic filtering of CSS properties that may be used for
    typical phishing attacks. For more sophisticated filtering, this class
    provides a couple of hooks that can be overridden in sub-classes.
    
    :warn: Note that this special processing of CSS is currently only applied to
           style attributes, **not** style elements.
    """

    SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
        'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

    SAFE_CSS = frozenset([
        # CSS 3 properties <http://www.w3.org/TR/CSS/#properties>
        'background', 'background-attachment', 'background-color',
        'background-image', 'background-position', 'background-repeat',
        'border', 'border-bottom', 'border-bottom-color',
        'border-bottom-style', 'border-bottom-width', 'border-collapse',
        'border-color', 'border-left', 'border-left-color',
        'border-left-style', 'border-left-width', 'border-right',
        'border-right-color', 'border-right-style', 'border-right-width',
        'border-spacing', 'border-style', 'border-top', 'border-top-color',
        'border-top-style', 'border-top-width', 'border-width', 'bottom',
        'caption-side', 'clear', 'clip', 'color', 'content',
        'counter-increment', 'counter-reset', 'cursor', 'direction', 'display',
        'empty-cells', 'float', 'font', 'font-family', 'font-size',
        'font-style', 'font-variant', 'font-weight', 'height', 'left',
        'letter-spacing', 'line-height', 'list-style', 'list-style-image',
        'list-style-position', 'list-style-type', 'margin', 'margin-bottom',
        'margin-left', 'margin-right', 'margin-top', 'max-height', 'max-width',
        'min-height', 'min-width', 'opacity', 'orphans', 'outline',
        'outline-color', 'outline-style', 'outline-width', 'overflow',
        'padding', 'padding-bottom', 'padding-left', 'padding-right',
        'padding-top', 'page-break-after', 'page-break-before',
        'page-break-inside', 'quotes', 'right', 'table-layout',
        'text-align', 'text-decoration', 'text-indent', 'text-transform',
        'top', 'unicode-bidi', 'vertical-align', 'visibility', 'white-space',
        'widows', 'width', 'word-spacing', 'z-index',
     ])

    SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])

    def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
                 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS,
                 safe_css=SAFE_CSS):
        """Create the sanitizer.
        
        The exact set of allowed elements and attributes can be configured.
        
        :param safe_tags: a set of tag names that are considered safe
        :param safe_attrs: a set of attribute names that are considered safe
        :param safe_schemes: a set of URI schemes that are considered safe
        :param uri_attrs: a set of names of attributes that contain URIs
        :param safe_css: a set of CSS property names that are considered safe
        """
        self.safe_tags = safe_tags
        # The set of tag names that are considered safe.
        self.safe_attrs = safe_attrs
        # The set of attribute names that are considered safe.
        self.safe_css = safe_css
        # The set of CSS properties that are considered safe.
        self.uri_attrs = uri_attrs
        # The set of names of attributes that may contain URIs.
        self.safe_schemes = safe_schemes
        # The set of URI schemes that are considered safe.

    # Matches the word "expression" spelled with ASCII letters or with the
    # look-alike characters listed per position below.
    # IE6 <http://heideri.ch/jso/#80>
    #
    # NOTE: the pattern is deliberately built from adjacent string literals
    # rather than a ``re.VERBOSE`` pattern: verbose mode does not ignore
    # whitespace or ``#`` comments *inside* a character class, so annotating
    # the classes inline would add the comment text to the matched characters.
    _EXPRESSION_SEARCH = re.compile(
        u'[eE\uFF25\uFF45]'         # E, fullwidth capital/small E
        u'[xX\uFF38\uFF58]'         # X, fullwidth capital/small X
        u'[pP\uFF30\uFF50]'         # P, fullwidth capital/small P
        u'[rR\u0280\uFF32\uFF52]'   # R, small capital R, fullwidth R
        u'[eE\uFF25\uFF45]'         # E, fullwidth capital/small E
        u'[sS\uFF33\uFF53]{2}'      # SS, fullwidth capital/small S
        u'[iI\u026A\uFF29\uFF49]'   # I, small capital I, fullwidth I
        u'[oO\uFF2F\uFF4F]'         # O, fullwidth capital/small O
        u'[nN\u0274\uFF2E\uFF4E]'   # N, small capital N, fullwidth N
    ).search

    # IE6 <http://openmya.hacker.jp/hasegawa/security/expression.txt>
    #     7) Particular bit of Unicode characters
    _URL_FINDITER = re.compile(
        u'[Uu][Rr\u0280][Ll\u029F]\\s*\\(([^)]+)').finditer

    def __call__(self, stream):
        """Apply the filter to the given stream.
        
        Elements that are not considered safe are dropped together with all
        of their content. On the remaining elements, attributes that are not
        in `safe_attrs`, URI attributes with an unsafe URI, and unsafe CSS
        declarations in ``style`` attributes are removed. Comments are always
        dropped.
        
        :param stream: the markup event stream to filter
        """
        # Tag of the unsafe element currently being skipped, and how many
        # elements with that same tag are open inside of it. Without the
        # counter, the end tag of a nested element with the same name would
        # end the skipping too early and leak a stray end tag into the output.
        waiting_for = None
        depth = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrs = data
                if waiting_for:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if not self.is_safe_elem(tag, attrs):
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrs = []
                for attr, value in attrs:
                    if attr not in self.safe_attrs:
                        continue
                    value = stripentities(value)
                    if attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if not self.is_safe_uri(value):
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self.sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                if waiting_for:
                    if data == waiting_for:
                        depth -= 1
                        if not depth:
                            waiting_for = None
                else:
                    yield kind, data, pos

            elif kind is not COMMENT:
                if not waiting_for:
                    yield kind, data, pos

    def is_safe_css(self, propname, value):
        """Determine whether the given css property declaration is to be
        considered safe for inclusion in the output.
        
        :param propname: the CSS property name
        :param value: the value of the property
        :return: whether the property value should be considered safe
        :rtype: bool
        :since: version 0.6
        """
        if propname not in self.safe_css:
            return False
        if propname.startswith('margin') and '-' in value:
            # Negative margins can be used for phishing
            return False
        return True

    def is_safe_elem(self, tag, attrs):
        """Determine whether the given element should be considered safe for
        inclusion in the output.
        
        :param tag: the tag name of the element
        :type tag: QName
        :param attrs: the element attributes
        :type attrs: Attrs
        :return: whether the element should be considered safe
        :rtype: bool
        :since: version 0.6
        """
        if tag not in self.safe_tags:
            return False
        if tag.localname == 'input':
            # Password fields are never let through
            input_type = attrs.get('type', '').lower()
            if input_type == 'password':
                return False
        return True

    def is_safe_uri(self, uri):
        """Determine whether the given URI is to be considered safe for
        inclusion in the output.
        
        The default implementation checks whether the scheme of the URI is in
        the set of allowed URIs (`safe_schemes`).
        
        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.is_safe_uri('http://example.org/')
        True
        >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)')
        False
        
        :param uri: the URI to check
        :return: `True` if the URI can be considered safe, `False` otherwise
        :rtype: `bool`
        :since: version 0.4.3
        """
        if '#' in uri:
            uri = uri.split('#', 1)[0] # Strip out the fragment identifier
        if ':' not in uri:
            return True # This is a relative URI
        # Only the alphanumeric characters of the scheme are compared, so
        # that other characters mixed into it cannot hide an unsafe scheme
        chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower() in self.safe_schemes

    def sanitize_css(self, text):
        """Remove potentially dangerous property declarations from CSS code.
        
        In particular, properties using the CSS ``url()`` function with a scheme
        that is not considered safe are removed:
        
        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.sanitize_css(u'''
        ...   background: url(javascript:alert("foo"));
        ...   color: #000;
        ... ''')
        [u'color: #000']
        
        Also, the proprietary Internet Explorer function ``expression()`` is
        always stripped:
        
        >>> sanitizer.sanitize_css(u'''
        ...   background: #fff;
        ...   color: #000;
        ...   width: e/**/xpression(alert("foo"));
        ... ''')
        [u'background: #fff', u'color: #000']
        
        :param text: the CSS text; this is expected to be `unicode` and to not
                     contain any character or numeric references
        :return: a list of declarations that are considered safe
        :rtype: `list`
        :since: version 0.4.3
        """
        decls = []
        # Escapes and comments are resolved first so that they cannot be used
        # to disguise a dangerous construct from the checks below
        text = self._strip_css_comments(self._replace_unicode_escapes(text))
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            try:
                propname, value = decl.split(':', 1)
            except ValueError:
                # Not a "name: value" declaration
                continue
            if not self.is_safe_css(propname.strip().lower(), value.strip()):
                continue
            is_evil = False
            if self._EXPRESSION_SEARCH(value):
                is_evil = True
            for match in self._URL_FINDITER(value):
                if not self.is_safe_uri(match.group(1)):
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl.strip())
        return decls

    _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
    _UNICODE_ESCAPE = re.compile(
        r"""\\([0-9a-fA-F]{1,6})\s?|\\([^\r\n\f0-9a-fA-F'"{};:()#*])""",
        re.UNICODE).sub

    def _replace_unicode_escapes(self, text):
        """Replace CSS backslash escapes in `text` by the characters they
        stand for.
        
        Hexadecimal escapes that denote NUL, a surrogate, or a code point that
        cannot be represented are replaced by U+FFFD instead of raising an
        error, as the text usually comes from an untrusted source.
        """
        try:
            to_char = unichr
        except NameError: # Python 3, where chr() covers all of Unicode
            to_char = chr

        def _repl(match):
            t = match.group(1)
            if t:
                code = int(t, 16)
                if code == 0 or 0xD800 <= code <= 0xDFFF or code > 0x10FFFF:
                    return u'\ufffd'
                try:
                    return to_char(code)
                except ValueError:
                    # Beyond what this interpreter build can represent
                    return u'\ufffd'
            t = match.group(2)
            if t == '\\':
                return r'\\'
            else:
                return t
        return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))

    # DOTALL is required so that comments spanning several lines are removed
    # as well; otherwise a newline inside a comment could be used to split up
    # a dangerous keyword without the comment ever being stripped.
    _CSS_COMMENTS = re.compile(r'/\*.*?\*/', re.DOTALL).sub

    def _strip_css_comments(self, text):
        """Return `text` with all ``/* ... */`` comments removed."""
        return self._CSS_COMMENTS('', text)