Mercurial > genshi > genshi-test
annotate markup/filters.py @ 92:3b75c6730b29
More performance improvements... this time for whitespace normalization and template loops.
author | cmlenz |
---|---|
date | Thu, 20 Jul 2006 23:06:36 +0000 |
parents | e9a3930f8823 |
children | f648152df7fd |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
14 """Implementation of a number of stream filters.""" | |
15 | |
92
3b75c6730b29
More performance improvements... this time for whitespace normalization and template loops.
cmlenz
parents:
69
diff
changeset
|
16 from itertools import chain |
1 | 17 try: |
18 frozenset | |
19 except NameError: | |
20 from sets import ImmutableSet as frozenset | |
21 import re | |
22 | |
92
3b75c6730b29
More performance improvements... this time for whitespace normalization and template loops.
cmlenz
parents:
69
diff
changeset
|
23 from markup.core import Attributes, Markup, Namespace, escape |
69 | 24 from markup.core import END, END_NS, START, START_NS, TEXT |
1 | 25 from markup.path import Path |
26 | |
17
ad63ad459524
Refactoring to address #6: all match templates are now processed by a single filter, which means that match templates added by included templates are properly applied. A side effect of this refactoring is that `Context` objects may not be reused across multiple template processing runs.
cmlenz
parents:
15
diff
changeset
|
27 __all__ = ['IncludeFilter', 'WhitespaceFilter', 'HTMLSanitizer'] |
1 | 28 |
29 | |
class IncludeFilter(object):
    """Stream filter that resolves XInclude directives found in templates.

    Only a small part of XInclude (see http://www.w3.org/TR/xinclude/) is
    handled: `include` elements carrying an `href` attribute, optionally
    containing a `fallback` element.
    """

    NAMESPACE = Namespace('http://www.w3.org/2001/XInclude')

    def __init__(self, loader):
        """Create the filter.

        @param loader: the `TemplateLoader` that is asked to load the
            templates referenced by include directives
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Yield the events of the stream with XInclude directives resolved.

        Events belonging to the XInclude namespace are never passed through.
        When an `include` element is closed, the events generated by the
        referenced template are emitted in its place; if that template can
        not be found, the events collected from the `fallback` element are
        emitted instead.

        @param stream: the markup event stream to filter
        @param ctxt: the template context, handed on to the `generate()`
            method of every included template
        @param ns_prefixes: optional list used to record the prefixes that
            are bound to the XInclude namespace; a new list is used when
            omitted
        @raise TemplateError: if an `include` element has no (or an empty)
            `href` attribute
        @raise TemplateNotFound: if the included template is not found and
            the `include` element provided no fallback
        """
        from markup.template import Template, TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        xinclude = self.NAMESPACE
        href = None
        fallback_events = None
        collecting_fallback = False

        for kind, data, pos in stream:

            if kind is START and not collecting_fallback \
                    and data[0] in xinclude:
                element, attributes = data
                name = element.localname
                if name == 'include':
                    # Only remember the target; the include itself happens
                    # when the element is closed, so that a fallback child
                    # can be collected first.
                    href = attributes.get('href')
                elif name == 'fallback':
                    collecting_fallback = True
                    fallback_events = []

            elif kind is END and data in xinclude:
                name = data.localname
                if name == 'fallback':
                    collecting_fallback = False

                elif name == 'include':
                    try:
                        if not href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        # pos[0] is passed so the loader can resolve the
                        # reference relative to the including template.
                        included = self.loader.load(href, relative_to=pos[0])
                        for included_event in included.generate(ctxt):
                            yield included_event

                    except TemplateNotFound:
                        if fallback_events is None:
                            raise
                        for fallback_event in fallback_events:
                            yield fallback_event

                    # Forget the state of the directive just processed.
                    href = None
                    fallback_events = None

            elif collecting_fallback:
                # Buffer the fallback content instead of emitting it.
                fallback_events.append((kind, data, pos))

            elif kind is START_NS and data[1] == xinclude:
                # Swallow the namespace declaration, but remember its prefix
                # so that the matching END_NS event can be swallowed too.
                ns_prefixes.append(data[0])

            elif kind is END_NS and data in ns_prefixes:
                ns_prefixes.pop()

            else:
                yield kind, data, pos
104 | |
105 | |
class WhitespaceFilter(object):
    """A filter that removes extraneous white space from the stream.

    Runs of adjacent `TEXT` events are merged into a single `TEXT` event. In
    the merged text, spaces and tabs directly in front of a newline are
    removed, and two or more consecutive newlines are collapsed into one.

    TODO:
     * Support for xml:space
    """
    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __call__(self, stream, ctxt=None):
        """Yield the events of the stream with text whitespace normalized.

        Non-text events are passed through unchanged. Each normalized `TEXT`
        event carries the position of the first text event of the run it
        was built from.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        trim_trailing_space = self._TRAILING_SPACE.sub
        collapse_lines = self._LINE_COLLAPSE.sub
        mjoin = Markup('').join

        textbuf = []
        text_pos = None
        # The sentinel appended to the stream forces buffered text at the
        # very end of the stream to be flushed by the same code path.
        for kind, data, pos in chain(stream, [(None, None, None)]):
            if kind is TEXT:
                if not textbuf:
                    # Remember where the run of text started; using the
                    # position of the following event would yield `None`
                    # for text at the end of the stream.
                    text_pos = pos
                textbuf.append(data)
                continue

            if textbuf:
                if len(textbuf) > 1:
                    output = Markup(collapse_lines('\n',
                        trim_trailing_space('',
                            mjoin(textbuf, escape_quotes=False))))
                    del textbuf[:]
                else:
                    # Shortcut for the common case of a single text event,
                    # which does not need to be joined.
                    output = escape(collapse_lines('\n',
                        trim_trailing_space('',
                            textbuf.pop())), quotes=False)
                yield TEXT, output, text_pos

            # The sentinel only exists to trigger the flush above; it must
            # not leak into the output stream as a bogus event.
            if kind is not None:
                yield kind, data, pos
138 | |
139 | |
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements whose tag is not listed in `_SAFE_TAGS` are removed together
    with everything they contain. On the remaining elements only attributes
    listed in `_SAFE_ATTRS` are kept; attributes listed in `_URI_ATTRS` must
    in addition use one of the `_SAFE_SCHEMES`, and `style` attributes are
    stripped of declarations that use `expression` or refer to a URL with
    an unsafe scheme.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # Captures the argument of a CSS `url(...)` reference. The match is
    # case-insensitive so that spellings such as `URL(...)` are checked too.
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Yield the events of the stream with unsafe markup removed.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        # Tag of the unsafe element currently being skipped, and how many
        # elements with that same tag are open inside (and including) it.
        # Counting is needed so that a nested element with the same tag
        # does not end the skipping at its own closing tag.
        skipped_tag = None
        skipped_depth = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrib = data
                if skipped_tag is not None:
                    if tag == skipped_tag:
                        skipped_depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    skipped_tag = tag
                    skipped_depth = 1
                    continue

                # NOTE(review): the attributes are emitted as a plain list
                # of tuples; confirm that downstream filters do not rely on
                # methods of the original attributes object.
                yield kind, (tag, self._sanitize_attrs(attrib)), pos

            elif kind is END:
                if skipped_tag is None:
                    yield kind, data, pos
                elif data == skipped_tag:
                    skipped_depth -= 1
                    if skipped_depth == 0:
                        skipped_tag = None

            elif skipped_tag is None:
                yield kind, data, pos

    def _sanitize_attrs(self, attrib):
        """Return the list of `(name, value)` pairs that are safe to keep.

        @param attrib: an iterable of `(name, value)` attribute pairs
        """
        safe = []
        for attr, value in attrib:
            if attr not in self._SAFE_ATTRS:
                continue
            if attr in self._URI_ATTRS:
                # Don't allow URI schemes such as "javascript:"
                if self._get_scheme(value) not in self._SAFE_SCHEMES:
                    continue
            elif attr == 'style':
                value = self._sanitize_style(value)
                if value is None:
                    continue
            safe.append((attr, value))
        return safe

    def _sanitize_style(self, value):
        """Remove dangerous CSS declarations from an inline style.

        NOTE(review): CSS escape sequences and comments are not normalized
        before the checks are applied.

        @param value: the text of a `style` attribute
        @return: the remaining declarations joined by `'; '`, or `None` if
            no declaration is left
        """
        decls = []
        for decl in value.split(';'):
            decl = decl.strip()
            if not decl:
                # Skip empty declarations, e.g. after a trailing semicolon.
                continue
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for match in self._CSS_URL.finditer(decl):
                if self._get_scheme(match.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        if not decls:
            return None
        return '; '.join(decls)

    def _get_scheme(self, text):
        """Return the normalized URI scheme of the text.

        The scheme is whatever precedes the first colon, reduced to its
        alphanumeric characters and lower-cased, so that a scheme obscured
        with whitespace or other characters is still recognized.

        @param text: the URI to examine
        @return: the scheme, or `None` if the text contains no colon
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()