# -*- coding: utf-8 -*-
#
# Copyright (C) 2006-2008 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.

"""Implementation of a number of stream filters."""

try:
    set
except NameError:
    from sets import ImmutableSet as frozenset
    from sets import Set as set
import re

from genshi.core import Attrs, QName, stripentities
from genshi.core import END, START, TEXT, COMMENT

__all__ = ['HTMLFormFiller', 'HTMLSanitizer']
__docformat__ = 'restructuredtext en'


class HTMLFormFiller(object):
    """A stream filter that can populate HTML forms from a dictionary of values.

    The filter is applied by piping a markup stream through it::

        filler = HTMLFormFiller(data={'foo': 'bar'})
        stream = html | filler

    With the data above, an ``<input type="text" name="foo" />`` element
    inside a matching form gains a ``value="bar"`` attribute. Checkboxes and
    radio buttons get their ``checked`` attribute set or removed, options of
    a ``<select>`` get their ``selected`` attribute set or removed, and the
    text content of a ``<textarea>`` is replaced by the supplied value.
    """
    # TODO: only select the first radio button, and the first select option
    #       (if not in a multiple-select)
    # TODO: only apply to elements in the XHTML namespace (or no namespace)?

    def __init__(self, name=None, id=None, data=None):
        """Create the filter.

        :param name: The name of the form that should be populated. If this
                     parameter is given, only forms where the ``name`` attribute
                     value matches the parameter are processed.
        :param id: The ID of the form that should be populated. If this
                   parameter is given, only forms where the ``id`` attribute
                   value matches the parameter are processed.
        :param data: The dictionary of form values, where the keys are the names
                     of the form fields, and the values are the values to fill
                     in.
        """
        self.name = name
        self.id = id
        if data is None:
            data = {}
        self.data = data

    def __call__(self, stream):
        """Apply the filter to the given stream.

        Events outside of a matching ``<form>`` are passed through unchanged.

        :param stream: the markup event stream to filter
        :return: a generator over the filtered ``(kind, data, pos)`` events
        """
        in_form = in_select = in_option = in_textarea = False
        select_value = option_value = textarea_value = None
        # The START event of an <option> is held back until its END event,
        # because an option without a ``value`` attribute is matched against
        # its text content, which is only known once the element is closed.
        option_start = None
        option_text = []
        no_option_value = False

        for kind, data, pos in stream:

            if kind is START:
                tag, attrs = data
                tagname = tag.localname

                if tagname == 'form' and (
                        self.name and attrs.get('name') == self.name or
                        self.id and attrs.get('id') == self.id or
                        not (self.id or self.name)):
                    in_form = True

                elif in_form:
                    if tagname == 'input':
                        # The ``type`` attribute is case-insensitive in HTML,
                        # so normalize before comparing.
                        input_type = attrs.get('type')
                        if input_type:
                            input_type = input_type.lower()
                        if input_type in ('checkbox', 'radio'):
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                declval = attrs.get('value')
                                checked = False
                                if isinstance(value, (list, tuple)):
                                    if declval:
                                        checked = declval in [unicode(v) for v
                                                              in value]
                                    else:
                                        checked = bool(filter(None, value))
                                else:
                                    if declval:
                                        checked = declval == unicode(value)
                                    elif input_type == 'checkbox':
                                        checked = bool(value)
                                if checked:
                                    attrs |= [(QName('checked'), 'checked')]
                                elif 'checked' in attrs:
                                    attrs -= 'checked'
                        elif input_type in (None, 'hidden', 'text'):
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                if isinstance(value, (list, tuple)):
                                    value = value[0]
                                if value is not None:
                                    attrs |= [(QName('value'), unicode(value))]
                    elif tagname == 'select':
                        name = attrs.get('name')
                        if name in self.data:
                            select_value = self.data[name]
                            in_select = True
                    elif tagname == 'textarea':
                        name = attrs.get('name')
                        if name in self.data:
                            textarea_value = self.data.get(name)
                            if isinstance(textarea_value, (list, tuple)):
                                textarea_value = textarea_value[0]
                            in_textarea = True
                    elif in_select and tagname == 'option':
                        option_start = kind, data, pos
                        option_value = attrs.get('value')
                        if option_value is None:
                            no_option_value = True
                            option_value = ''
                        in_option = True
                        # Emitted later, when the matching END is seen
                        continue
                yield kind, (tag, attrs), pos

            elif in_form and kind is TEXT:
                if in_select and in_option:
                    if no_option_value:
                        option_value += data
                    option_text.append((kind, data, pos))
                    continue
                elif in_textarea:
                    # Original content is replaced by the supplied value
                    continue
                yield kind, data, pos

            elif in_form and kind is END:
                tagname = data.localname
                if tagname == 'form':
                    in_form = False
                elif tagname == 'select':
                    in_select = False
                    select_value = None
                elif in_select and tagname == 'option':
                    if isinstance(select_value, (tuple, list)):
                        selected = option_value in [unicode(v) for v
                                                    in select_value]
                    else:
                        selected = option_value == unicode(select_value)
                    okind, (tag, attrs), opos = option_start
                    if selected:
                        attrs |= [(QName('selected'), 'selected')]
                    elif 'selected' in attrs:
                        attrs -= 'selected'
                    yield okind, (tag, attrs), opos
                    for event in option_text:
                        yield event
                    in_option = False
                    no_option_value = False
                    option_start = option_value = None
                    option_text = []
                elif in_textarea and tagname == 'textarea':
                    if textarea_value:
                        yield TEXT, unicode(textarea_value), pos
                    # Reset so the value cannot leak into a later textarea
                    # whose name is not present in the data dictionary.
                    textarea_value = None
                    in_textarea = False
                yield kind, data, pos

            else:
                yield kind, data, pos


class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    The filter is applied by piping a markup stream through it::

        stream = html | HTMLSanitizer()

    Elements whose tag is not in the set of safe tags (for example
    ``<script>``) are dropped together with their content, attributes that
    are not in the set of safe attributes are removed, and URI attributes
    using a scheme that is not considered safe are removed as well.

    The default set of safe tags and attributes can be modified when the filter
    is instantiated. For example, to allow inline ``style`` attributes, the
    following instantiation would work::

        sanitizer = HTMLSanitizer(
            safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])
        )

    Note that even in this case, the filter *does* attempt to remove dangerous
    constructs from style attributes: a declaration such as
    ``background: url(javascript:void)`` is removed, while a harmless
    declaration such as ``color: #000`` in the same attribute is kept.

    This handles HTML entities, unicode escapes in CSS and Javascript text, as
    well as a lot of other things. However, the ``style`` attribute is still
    excluded by default because it is very hard for such sanitizing to be
    completely safe, especially considering how much error recovery current
    web browsers perform.

    :warn: Note that this special processing of CSS is currently only applied to
           style attributes, **not** style elements.
    """

    SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
        'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

    SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])

    def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
                 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS):
        """Create the sanitizer.

        The exact set of allowed elements and attributes can be configured.

        :param safe_tags: a set of tag names that are considered safe
        :param safe_attrs: a set of attribute names that are considered safe
        :param safe_schemes: a set of URI schemes that are considered safe
        :param uri_attrs: a set of names of attributes that contain URIs
        """
        self.safe_tags = safe_tags
        "The set of tag names that are considered safe."
        self.safe_attrs = safe_attrs
        "The set of attribute names that are considered safe."
        self.uri_attrs = uri_attrs
        "The set of names of attributes that may contain URIs."
        self.safe_schemes = safe_schemes
        "The set of URI schemes that are considered safe."

    def __call__(self, stream):
        """Apply the filter to the given stream.

        Unsafe elements are dropped including everything nested inside them,
        and comments are always removed.

        :param stream: the markup event stream to filter
        :return: a generator over the filtered ``(kind, data, pos)`` events
        """
        # Tag of the unsafe element currently being skipped, if any
        waiting_for = None
        # Number of currently open elements with that tag. Needed so that a
        # nested element of the same name does not end the skipping early,
        # which would leak the rest of the outer element's content.
        depth = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrs = data
                if waiting_for:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self.safe_tags:
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrs = []
                for attr, value in attrs:
                    value = stripentities(value)
                    if attr not in self.safe_attrs:
                        continue
                    elif attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if not self.is_safe_uri(value):
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self.sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                if waiting_for:
                    if data == waiting_for:
                        depth -= 1
                        if not depth:
                            waiting_for = None
                else:
                    yield kind, data, pos

            elif kind is not COMMENT:
                if not waiting_for:
                    yield kind, data, pos

    def is_safe_uri(self, uri):
        """Determine whether the given URI is to be considered safe for
        inclusion in the output.

        The default implementation checks whether the scheme of the URI is in
        the set of allowed URIs (`safe_schemes`).

        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.is_safe_uri('http://example.org/')
        True
        >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)')
        False

        A colon that only appears in the fragment identifier does not denote a
        scheme:

        >>> sanitizer.is_safe_uri('page.html#section:1')
        True

        :param uri: the URI to check
        :return: `True` if the URI can be considered safe, `False` otherwise
        :rtype: `bool`
        :since: version 0.4.3
        """
        if '#' in uri:
            # Strip the fragment identifier, it cannot contain the scheme
            uri = uri.split('#', 1)[0]
        if ':' not in uri:
            return True # This is a relative URI
        # Drop anything that is not alphanumeric (e.g. embedded whitespace or
        # control characters) before comparing the scheme.
        chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower() in self.safe_schemes

    def sanitize_css(self, text):
        """Remove potentially dangerous property declarations from CSS code.

        In particular, properties using the CSS ``url()`` function with a scheme
        that is not considered safe are removed:

        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.sanitize_css(u'''
        ...   background: url(javascript:alert("foo"));
        ...   color: #000;
        ... ''')
        [u'color: #000']

        Also, the proprietary Internet Explorer function ``expression()`` is
        always stripped:

        >>> sanitizer.sanitize_css(u'''
        ...   background: #fff;
        ...   color: #000;
        ...   width: e/**/xpression(alert("foo"));
        ... ''')
        [u'background: #fff', u'color: #000']

        Both checks ignore case, as CSS function names are case-insensitive.

        :param text: the CSS text; this is expected to be `unicode` and to not
                     contain any character or numeric references
        :return: a list of declarations that are considered safe
        :rtype: `list`
        :since: version 0.4.3
        """
        decls = []
        text = self._strip_css_comments(self._replace_unicode_escapes(text))
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            is_evil = False
            if 'expression' in decl.lower():
                is_evil = True
            for match in self._CSS_URL(decl):
                if not self.is_safe_uri(match.group(1)):
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return decls

    _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
    _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE).finditer

    def _replace_unicode_escapes(self, text):
        """Replace CSS hexadecimal escapes such as ``\\65`` by the character
        they denote, after normalizing ``\\r\\n`` to ``\\n``.

        :param text: the CSS text
        :return: the text with the escapes replaced
        """
        def _repl(match):
            try:
                return unichr(int(match.group(1), 16))
            except ValueError:
                # Code point outside the range supported by ``unichr``; use
                # the replacement character rather than failing on input
                # that may be attacker-controlled.
                return u'\ufffd'
        return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))

    # DOTALL so that comments spanning several lines are removed as well
    _CSS_COMMENTS = re.compile(r'/\*.*?\*/', re.DOTALL).sub

    def _strip_css_comments(self, text):
        """Remove all ``/* ... */`` comments from the given CSS text.

        :param text: the CSS text
        :return: the text without comments
        """
        return self._CSS_COMMENTS('', text)