cmlenz@1: # -*- coding: utf-8 -*-
cmlenz@1: #
cmlenz@66: # Copyright (C) 2006 Edgewall Software
cmlenz@1: # All rights reserved.
cmlenz@1: #
cmlenz@1: # This software is licensed as described in the file COPYING, which
cmlenz@1: # you should have received as part of this distribution. The terms
cmlenz@66: # are also available at http://markup.edgewall.org/wiki/License.
cmlenz@1: #
cmlenz@1: # This software consists of voluntary contributions made by many
cmlenz@1: # individuals. For the exact contribution history, see the revision
cmlenz@66: # history and logs, available at http://markup.edgewall.org/log/.
cmlenz@1: 
cmlenz@1: """Implementation of a number of stream filters."""
cmlenz@1: 
cmlenz@1: try:
cmlenz@1:     frozenset
cmlenz@1: except NameError:
cmlenz@1:     from sets import ImmutableSet as frozenset
cmlenz@1: import re
cmlenz@1: 
cmlenz@69: from markup.core import Attributes, Markup, Namespace
cmlenz@69: from markup.core import END, END_NS, START, START_NS, TEXT
cmlenz@1: from markup.path import Path
cmlenz@1: 
cmlenz@17: __all__ = ['IncludeFilter', 'WhitespaceFilter', 'HTMLSanitizer']
cmlenz@1: 
cmlenz@1: 
class IncludeFilter(object):
    """Stream filter implementing a minimal subset of XInclude
    (see http://www.w3.org/TR/xinclude/) for templates.

    Only the `include` element (with its `href` attribute) and the
    `fallback` element are interpreted.
    """

    NAMESPACE = Namespace('http://www.w3.org/2001/XInclude')

    def __init__(self, loader):
        """Create the filter.

        @param loader: the `TemplateLoader` used to look up the templates
            referenced by include elements
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Process the XInclude directives found in the stream.

        Events that do not belong to an include directive are passed through
        unchanged. When the end of an `include` element is reached, the events
        generated by the referenced template are emitted in its place; if the
        template cannot be found, the buffered content of the `fallback`
        element is emitted instead.

        @param stream: the markup event stream to filter
        @param ctxt: the template context, handed to the included template
        @param ns_prefixes: optional list that receives the prefixes bound to
            the XInclude namespace while they are in scope
        @raise TemplateError: if an include element has no `href` attribute
        @raise TemplateNotFound: if the referenced template is missing and no
            fallback was provided
        """
        from markup.template import Template, TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        xi = self.NAMESPACE
        href = None
        fallback_events = None
        collecting = False

        for kind, data, pos in stream:

            # Opening tag of an XInclude element (ignored inside a fallback,
            # where everything is buffered verbatim).
            if kind is START and not collecting and data[0] in xi:
                qname, attrs = data
                if qname.localname == 'include':
                    href = attrs.get('href')
                elif qname.localname == 'fallback':
                    collecting = True
                    fallback_events = []
                continue

            # Closing tag of an XInclude element.
            if kind is END and data in xi:
                if data.localname == 'fallback':
                    collecting = False
                elif data.localname == 'include':
                    try:
                        if not href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        included = self.loader.load(href, relative_to=pos[0])
                        for item in included.generate(ctxt):
                            yield item
                    except TemplateNotFound:
                        # Without a fallback the error is the caller's problem
                        if fallback_events is None:
                            raise
                        for item in fallback_events:
                            yield item

                    # Reset the per-include state for the next directive
                    href = None
                    fallback_events = None
                continue

            if collecting:
                fallback_events.append((kind, data, pos))
            elif kind is START_NS and data[1] == xi:
                # The XInclude namespace declaration itself is not emitted
                ns_prefixes.append(data[0])
            elif kind is END_NS and data in ns_prefixes:
                ns_prefixes.pop()
            else:
                yield kind, data, pos
cmlenz@1: 
cmlenz@1: 
class WhitespaceFilter(object):
    """Stream filter that strips superfluous white space.

    Consecutive text events are merged into one; in the merged text, spaces
    and tabs directly before a newline are dropped and runs of two or more
    newlines are reduced to a single newline.

    TODO:
     * Support for xml:space
    """
    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __call__(self, stream, ctxt=None):
        """Filter the stream, normalizing the white space of text events.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        strip_line_ends = self._TRAILING_SPACE.sub
        squeeze_newlines = self._LINE_COLLAPSE.sub
        concat = Markup('').join

        def normalized(chunks):
            # Merge the buffered text and clean up its white space
            merged = concat(chunks, escape_quotes=False)
            merged = strip_line_ends('', merged)
            merged = squeeze_newlines('\n', merged)
            return Markup(merged)

        pending = []
        for kind, data, pos in stream:
            if kind is TEXT:
                # Hold text back until the next non-text event shows up
                pending.append(data)
                continue
            if pending:
                yield TEXT, normalized(pending), pos
                del pending[:]
            yield kind, data, pos

        # Text at the very end of the stream still needs to be flushed; it is
        # reported at the position of the last event seen.
        if pending:
            yield TEXT, normalized(pending), pos
cmlenz@1: 
cmlenz@1: 
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements whose tag is not in `_SAFE_TAGS` are dropped together with
    everything they contain. On the remaining elements, attributes not in
    `_SAFE_ATTRS` are removed, URI attributes are removed unless their scheme
    is in `_SAFE_SCHEMES`, and inline `style` attributes are reduced to the
    declarations that pass `_sanitize_css`.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # CSS is case-insensitive, so "URL(" must be matched just like "url(".
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)
    # Comments are removed before inspection so that they cannot be used to
    # break up a keyword (e.g. "ex/**/pression").
    _CSS_COMMENT = re.compile(r'/\*.*?\*/', re.DOTALL)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, dropping unsafe elements and attributes.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        # Tag of the unsafe element currently being skipped, if any, and the
        # number of same-named elements opened inside it. Counting is needed
        # so that an inner end tag does not terminate the skipping early.
        waiting_for = None
        nested = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrib = data
                if waiting_for is not None:
                    if tag == waiting_for:
                        nested += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    nested = 0
                    continue

                new_attrib = []
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    if attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        value = self._sanitize_css(value)
                        if not value:
                            continue
                    new_attrib.append((attr, value))

                # NOTE(review): the attributes are emitted as a plain list of
                # (name, value) pairs -- confirm that downstream filters do not
                # rely on an `Attributes` instance here.
                yield kind, (tag, new_attrib), pos

            elif kind is END:
                if waiting_for is None:
                    yield kind, data, pos
                elif data == waiting_for:
                    if nested:
                        nested -= 1
                    else:
                        waiting_for = None

            elif waiting_for is None:
                yield kind, data, pos

    def _sanitize_css(self, text):
        """Return the inline style `text` without its dangerous declarations.

        A declaration is dropped if it contains the word "expression" (in any
        letter case) or a `url(...)` reference whose scheme is not in
        `_SAFE_SCHEMES`. CSS comments are stripped from the result.

        NOTE: backslash escapes (such as "\\65 xpression") are not decoded.

        @param text: the value of a `style` attribute
        @return: the remaining declarations joined by "; ", or an empty
            string if nothing is left
        """
        decls = []
        for decl in self._CSS_COMMENT.sub('', text).split(';'):
            decl = decl.strip()
            if not decl:
                continue
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for match in self._CSS_URL.finditer(decl):
                if self._get_scheme(match.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return '; '.join(decls)

    def _get_scheme(self, text):
        """Return the normalized URI scheme of `text`.

        The scheme is everything before the first colon, reduced to its
        alphanumeric characters and lower-cased, so that obfuscations such as
        embedded white space do not hide it.

        @param text: the URI to inspect
        @return: the scheme, or `None` if `text` contains no colon
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()