# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://markup.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://markup.edgewall.org/log/.

"""Implementation of a number of stream filters."""

from itertools import chain
try:
    frozenset
except NameError:
    from sets import ImmutableSet as frozenset
import re

from markup.core import Attributes, Markup, Namespace, escape, stripentities
from markup.core import END, END_NS, START, START_NS, TEXT
from markup.path import Path

__all__ = ['HTMLSanitizer', 'IncludeFilter']


class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements whose tag is not listed in `_SAFE_TAGS` are dropped together with
    everything they contain. On the remaining elements, attributes not listed
    in `_SAFE_ATTRS` are removed, URI-valued attributes are removed when their
    scheme is not listed in `_SAFE_SCHEMES`, and inline `style` attributes are
    reduced to the declarations that pass `_sanitize_css`.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    # `None` is the "scheme" reported by `_get_scheme` for text that contains
    # no colon at all (e.g. relative references).
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # Matches the argument of a CSS `url(...)` function. CSS function names
    # are case-insensitive, so `URL(` must be recognized as well.
    _CSS_URL_RE = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, dropping unsafe elements and attributes.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        @return: a generator over the sanitized `(kind, data, pos)` events
        """
        # Name of the unsafe element currently being skipped, or `None`.
        waiting_for = None
        # Number of currently open elements named `waiting_for`; needed so
        # that a nested element of the same name does not end the skip early
        # (which would leak the rest of the content and an unbalanced END).
        depth = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrib = data
                if waiting_for is not None:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrib = Attributes()
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    value = stripentities(value)
                    if attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self._sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrib.append((attr, value))

                yield kind, (tag, new_attrib), pos

            elif kind is END:
                if waiting_for is not None:
                    if data == waiting_for:
                        depth -= 1
                        if depth <= 0:
                            waiting_for = None
                            depth = 0
                else:
                    yield kind, data, pos

            else:
                if waiting_for is None:
                    yield kind, data, pos

    def _sanitize_css(self, text):
        """Return the declarations of an inline style that are considered safe.

        A declaration is dropped when it contains the word "expression" (in
        any letter case) or a `url(...)` reference whose scheme is not listed
        in `_SAFE_SCHEMES`. Empty and whitespace-only declarations are dropped
        as well.

        NOTE: CSS comments and backslash escapes are not decoded here, so
        obfuscations relying on them are not detected by this method.

        @param text: the value of a `style` attribute
        @return: a list of stripped declaration strings, possibly empty
        """
        decls = []
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for match in self._CSS_URL_RE.finditer(decl):
                if self._get_scheme(match.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return decls

    def _get_scheme(self, text):
        """Return the normalized URI scheme of the given text.

        Everything before the first colon is taken as the scheme; characters
        that are not alphanumeric are discarded (so that embedded whitespace
        or quotes cannot disguise a scheme) and the result is lower-cased.

        @param text: the URI (or URI-like) text to inspect
        @return: the scheme as a lower-case string, or `None` if the text
            contains no colon
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()


class IncludeFilter(object):
    """Template filter providing (very) basic XInclude support
    (see http://www.w3.org/TR/xinclude/) in templates.
    """

    NAMESPACE = Namespace('http://www.w3.org/2001/XInclude')

    def __init__(self, loader):
        """Initialize the filter.

        @param loader: the `TemplateLoader` to use for resolving references to
            external template files
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None):
        """Filter the stream, processing any XInclude directives it may
        contain.

        The events of an `include` element are replaced by the events
        generated by the referenced template. If that template cannot be
        found, the content of a nested `fallback` element is emitted instead;
        without a fallback the `TemplateNotFound` error propagates.

        @param stream: the markup event stream to filter
        @param ctxt: the template context
        """
        # Imported here rather than at module level (presumably to avoid a
        # circular import between the filter and template modules).
        from markup.template import Template, TemplateError, TemplateNotFound

        ns_prefixes = []
        in_fallback = False
        include_href, fallback_stream = None, None
        namespace = self.NAMESPACE

        for kind, data, pos in stream:

            if kind is START and not in_fallback and data[0] in namespace:
                # Opening XInclude element: remember what to do at its END.
                tag, attrib = data
                if tag.localname == 'include':
                    include_href = attrib.get('href')
                elif tag.localname == 'fallback':
                    in_fallback = True
                    fallback_stream = []

            elif kind is END and data in namespace:
                if data.localname == 'include':
                    try:
                        if not include_href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        # pos[0] is handed to the loader as the reference
                        # point for relative paths (presumably the name of
                        # the including file -- confirm against the loader).
                        template = self.loader.load(include_href,
                                                    relative_to=pos[0])
                        for event in template.generate(ctxt):
                            yield event

                    except TemplateNotFound:
                        if fallback_stream is None:
                            raise
                        for event in fallback_stream:
                            yield event

                    # Reset the per-include state for the next directive.
                    include_href = None
                    fallback_stream = None

                elif data.localname == 'fallback':
                    in_fallback = False

            elif in_fallback:
                # Buffer fallback content until the include is resolved.
                fallback_stream.append((kind, data, pos))

            elif kind is START_NS and data[1] == namespace:
                # Swallow the XInclude namespace declaration itself.
                ns_prefixes.append(data[0])

            elif kind is END_NS and data in ns_prefixes:
                ns_prefixes.pop()

            else:
                yield kind, data, pos