# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://markup.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://markup.edgewall.org/log/.

"""Implementation of a number of stream filters."""

try:
    frozenset
except NameError:
    from sets import ImmutableSet as frozenset
import re

from markup.core import Attributes, Markup, Namespace
from markup.core import END, END_NS, START, START_NS, TEXT
from markup.path import Path

__all__ = ['IncludeFilter', 'WhitespaceFilter', 'HTMLSanitizer']


class IncludeFilter(object):
    """Template filter providing (very) basic XInclude support
    (see http://www.w3.org/TR/xinclude/) in templates.

    Only the `include` and `fallback` elements of the XInclude namespace are
    acted upon; any other element in that namespace is dropped from the
    stream.
    """

    NAMESPACE = Namespace('http://www.w3.org/2001/XInclude')

    def __init__(self, loader):
        """Initialize the filter.

        @param loader: the `TemplateLoader` to use for resolving references to
            external template files
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Filter the stream, processing any XInclude directives it may
        contain.

        @param stream: the markup event stream to filter
        @param ctxt: the template context; passed on to the `generate()`
            method of every included template
        @param ns_prefixes: optional list used to record the prefixes bound
            to the XInclude namespace while filtering; a fresh list is used
            when omitted
        @raise TemplateError: if an `include` element has no `href` attribute
        @raise TemplateNotFound: if the referenced template cannot be loaded
            and the `include` element did not contain a `fallback` element
        """
        # Imported lazily, as in the original code (presumably to avoid a
        # circular import between the filter and template modules).
        from markup.template import TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        in_fallback = False
        include_href, fallback_stream = None, None
        namespace = self.NAMESPACE

        for kind, data, pos in stream:

            if kind is START and not in_fallback and data[0] in namespace:
                # Opening tag of an XInclude element: remember what we need
                # and swallow the event itself.
                tag, attrib = data
                if tag.localname == 'include':
                    include_href = attrib.get('href')
                elif tag.localname == 'fallback':
                    in_fallback = True
                    fallback_stream = []

            elif kind is END and data in namespace:
                if data.localname == 'include':
                    try:
                        if not include_href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        # pos[0] is handed to the loader as the reference
                        # point for resolving relative paths (presumably the
                        # name of the including file).
                        template = self.loader.load(include_href,
                                                    relative_to=pos[0])
                        for event in template.generate(ctxt):
                            yield event

                    except TemplateNotFound:
                        # Fall back to the buffered content of the
                        # <fallback> element, if there was one.
                        if fallback_stream is None:
                            raise
                        for event in fallback_stream:
                            yield event

                    include_href = None
                    fallback_stream = None

                elif data.localname == 'fallback':
                    in_fallback = False

            elif in_fallback:
                # Buffer everything inside <fallback>; it is only emitted if
                # loading the included template fails.
                fallback_stream.append((kind, data, pos))

            elif kind is START_NS and data[1] == namespace:
                # Do not forward the XInclude namespace declaration.
                ns_prefixes.append(data[0])

            elif kind is END_NS and data in ns_prefixes:
                ns_prefixes.pop()

            else:
                yield kind, data, pos


class WhitespaceFilter(object):
    """A filter that removes extraneous white space from the stream.

    Adjacent text events are merged into one; spaces and tabs directly in
    front of a line break are removed, and runs of two or more line breaks
    are collapsed into a single one.

    TODO:
     * Support for xml:space
    """
    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def _normalize(self, textbuf):
        """Join the buffered text chunks and strip the extraneous white space.

        @param textbuf: non-empty list of the data of consecutive text events
        @return: the cleaned-up text as a `Markup` instance
        """
        text = Markup('').join(textbuf, escape_quotes=False)
        text = self._TRAILING_SPACE.sub('', text)
        text = self._LINE_COLLAPSE.sub('\n', text)
        return Markup(text)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, merging and cleaning up text events.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        normalize = self._normalize

        textbuf = []
        for kind, data, pos in stream:
            if kind is TEXT:
                textbuf.append(data)
                continue
            if textbuf:
                # The merged text event carries the position of the event
                # that follows it.
                yield TEXT, normalize(textbuf), pos
                del textbuf[:]
            yield kind, data, pos

        # Flush text at the very end of the stream. `pos` is necessarily
        # bound here: the buffer can only be non-empty if the loop ran.
        if textbuf:
            yield TEXT, normalize(textbuf), pos


class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements not listed in `_SAFE_TAGS` are dropped together with all of
    their content. On the remaining elements, attributes not listed in
    `_SAFE_ATTRS` are removed, as are URI attributes using a scheme outside
    of `_SAFE_SCHEMES` and `style` declarations that look dangerous.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # CSS function names are case-insensitive, so `URL(...)` has to be
    # recognized just like `url(...)`.
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, dropping unsafe elements and attributes.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        waiting_for = None  # name of the unsafe element being skipped
        depth = 0           # nesting level of same-named elements inside it

        for kind, data, pos in stream:
            if kind is START:
                tag, attrib = data
                if waiting_for:
                    # Count nested elements with the same name, so that
                    # output only resumes at the END event that really
                    # closes the unsafe element.
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrib = []
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    elif attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self._sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrib.append((attr, value))

                yield kind, (tag, new_attrib), pos

            elif kind is END:
                tag = data
                if waiting_for:
                    if waiting_for == tag:
                        depth -= 1
                        if not depth:
                            waiting_for = None
                else:
                    yield kind, data, pos

            else:
                if not waiting_for:
                    yield kind, data, pos

    def _sanitize_css(self, text):
        """Return the declarations of an inline style that are considered
        safe.

        A declaration is rejected if it contains the word "expression" or a
        `url()` reference whose scheme is not in `_SAFE_SCHEMES`; both checks
        ignore case.

        @param text: the value of a `style` attribute
        @return: list of the remaining declarations, stripped of surrounding
            white space
        """
        decls = []
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                # Skip empty declarations (e.g. after a trailing semicolon).
                continue
            is_evil = False
            if 'expression' in decl.lower():
                is_evil = True
            for m in self._CSS_URL.finditer(decl):
                if self._get_scheme(m.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return decls

    def _get_scheme(self, text):
        """Return the normalized URI scheme of the given text.

        The scheme is whatever precedes the first colon, reduced to its
        alphanumeric characters and lower-cased.

        @param text: the (potential) URI to inspect
        @return: the scheme, or `None` if the text contains no colon
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()