Mercurial > genshi > genshi-test
annotate markup/filters.py @ 17:ad63ad459524
Refactoring to address #6: all match templates are now processed by a single filter, which means that match templates added by included templates are properly applied. A side effect of this refactoring is that `Context` objects may not be reused across multiple template processing runs.
Also, output filters are now applied in the `Stream.serialize()` method instead of by the `Template.generate()` method, which just makes more sense.
author | cmlenz |
---|---|
date | Sun, 18 Jun 2006 22:33:33 +0000 |
parents | f083101b8e8a |
children | 4cbebb15a834 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
3 # Copyright (C) 2006 Christopher Lenz | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://trac.edgewall.com/license.html. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://projects.edgewall.com/trac/. | |
13 | |
14 """Implementation of a number of stream filters.""" | |
15 | |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import re | |
21 | |
22 from markup.core import Attributes, Markup, Stream | |
23 from markup.path import Path | |
24 | |
17
ad63ad459524
Refactoring to address #6: all match templates are now processed by a single filter, which means that match templates added by included templates are properly applied. A side effect of this refactoring is that `Context` objects may not be reused across multiple template processing runs.
cmlenz
parents:
15
diff
changeset
|
25 __all__ = ['IncludeFilter', 'WhitespaceFilter', 'HTMLSanitizer'] |
1 | 26 |
27 | |
class IncludeFilter(object):
    """Template filter providing (very) basic XInclude support
    (see http://www.w3.org/TR/xinclude/) in templates.

    Only the `include` and `fallback` elements of the XInclude namespace are
    acted upon, and only the `href` attribute of `include` is read. Any other
    element in that namespace is dropped from the stream.
    """

    _NAMESPACE = 'http://www.w3.org/2001/XInclude'

    def __init__(self, loader):
        """Initialize the filter.

        @param loader: the `TemplateLoader` to use for resolving references to
            external template files
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Filter the stream, processing any XInclude directives it may
        contain.

        Each `include` element is replaced by the events generated by the
        template loaded from its `href`. If the loader raises
        `TemplateNotFound` and the include has a `fallback` child, the
        fallback content is emitted instead, after being passed through this
        filter again so that includes nested inside the fallback are
        processed too. Namespace start/end events for the XInclude namespace
        are removed from the output.

        @param stream: the markup event stream to filter
        @param ctxt: the template context, passed on to the included
            template's `generate()` method
        @param ns_prefixes: list of prefixes currently bound to the XInclude
            namespace; a new list is created when omitted. The list is
            mutated while filtering.
        @raise TemplateError: if an `include` element has no `href`
        @raise TemplateNotFound: if the included template cannot be loaded
            and the `include` element provides no fallback
        """
        from markup.template import Template, TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        namespace = self._NAMESPACE
        in_fallback = False
        nested_fallbacks = 0 # depth of fallback elements inside the fallback
        include_href, fallback_stream = None, None

        for kind, data, pos in stream:

            if in_fallback:
                # Buffer everything up to the END event that matches the
                # opening fallback tag. XInclude elements inside the fallback
                # are buffered untouched; they get processed when (and if)
                # the fallback is actually used.
                if kind is Stream.START \
                        and data[0].namespace == namespace \
                        and data[0].localname == 'fallback':
                    nested_fallbacks += 1
                elif kind is Stream.END and data.namespace == namespace \
                        and data.localname == 'fallback':
                    if not nested_fallbacks:
                        in_fallback = False
                        continue
                    nested_fallbacks -= 1
                fallback_stream.append((kind, data, pos))

            elif kind is Stream.START and data[0].namespace == namespace:
                tag, attrib = data
                if tag.localname == 'include':
                    include_href = attrib.get('href')
                elif tag.localname == 'fallback':
                    in_fallback = True
                    fallback_stream = []

            elif kind is Stream.END and data.namespace == namespace:
                if data.localname == 'include':
                    try:
                        if not include_href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        template = self.loader.load(include_href)
                        for event in template.generate(ctxt):
                            yield event

                    except TemplateNotFound:
                        if fallback_stream is None:
                            raise
                        # Re-filter the fallback so nested includes work
                        for event in self(fallback_stream, ctxt, ns_prefixes):
                            yield event

                    include_href = None
                    fallback_stream = None

            elif kind is Stream.START_NS and data[1] == namespace:
                ns_prefixes.append(data[0])

            elif kind is Stream.END_NS and data in ns_prefixes:
                ns_prefixes.pop()

            else:
                yield kind, data, pos
104 | |
105 | |
class WhitespaceFilter(object):
    """A filter that removes extraneous white space from the stream.

    Consecutive text events are merged into a single text event, spaces and
    tabs directly before a newline are removed, and runs of two or more
    newlines are collapsed into one.

    Todo:
     * Support for xml:space
    """

    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __call__(self, stream, ctxt=None):
        """Filter the stream, yielding normalized text events.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        textbuf = []
        pos = None
        for kind, data, pos in stream:
            if kind is Stream.TEXT:
                textbuf.append(data)
                continue
            if textbuf:
                # Flush the buffered text before the first non-text event
                # that follows it; the merged event carries that event's
                # position.
                text = self._normalize(Markup('').join(textbuf))
                yield Stream.TEXT, Markup(text), pos
                del textbuf[:]
            yield kind, data, pos

        if textbuf:
            # Text at the very end of the stream gets exactly the same
            # treatment as text followed by another event.
            text = self._normalize(Markup('').join(textbuf))
            yield Stream.TEXT, Markup(text), pos

    def _normalize(self, text):
        """Strip trailing blanks from lines and collapse blank lines.

        @param text: the text to clean up
        @return: the cleaned-up text as returned by the regular expression
            substitution
        """
        text = self._TRAILING_SPACE.sub('', text)
        return self._LINE_COLLAPSE.sub('\n', text)
1 | 135 |
136 | |
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements whose tag is not in `_SAFE_TAGS` are dropped together with all
    of their content. On the remaining elements, attributes not in
    `_SAFE_ATTRS` are dropped, URI attributes are dropped unless their scheme
    is in `_SAFE_SCHEMES`, and inline styles are reduced to the declarations
    that look harmless.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # CSS comments are removed before inspection so that they cannot be used
    # to split up a keyword (e.g. "expr/**/ession").
    _CSS_COMMENT = re.compile(r'/\*.*?\*/', re.DOTALL)
    # CSS function names are case-insensitive, so "URL(" must match as well.
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, dropping unsafe elements and attributes.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        waiting_for = None # tag of the unsafe element being skipped, if any
        depth = 0 # nested elements with that same tag seen while skipping

        for kind, data, pos in stream:
            if kind is Stream.START:
                tag, attrib = data
                if waiting_for is not None:
                    # Count same-named descendants so that skipping only ends
                    # at the END event of the outermost unsafe element.
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    depth = 0
                    continue

                new_attrib = []
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    elif attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        value = self._sanitize_style(value)
                        if not value:
                            continue
                    new_attrib.append((attr, value))

                yield kind, (tag, new_attrib), pos

            elif kind is Stream.END:
                if waiting_for is not None:
                    if data == waiting_for:
                        if depth:
                            depth -= 1
                        else:
                            waiting_for = None
                else:
                    yield kind, data, pos

            else:
                if waiting_for is None:
                    yield kind, data, pos

    def _sanitize_style(self, value):
        """Return the given inline style with dangerous declarations removed.

        A declaration is dropped when it contains the word "expression"
        (in any letter case) or a `url(...)` reference whose scheme is not in
        `_SAFE_SCHEMES`. CSS comments are stripped from the result.

        @param value: the value of a `style` attribute
        @return: the remaining declarations joined by "; ", or an empty
            string if none remain
        """
        decls = []
        for decl in self._CSS_COMMENT.sub('', value).split(';'):
            decl = decl.strip()
            if not decl:
                continue
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for match in self._CSS_URL.finditer(decl):
                if self._get_scheme(match.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return '; '.join(decls)

    def _get_scheme(self, text):
        """Return the normalized URI scheme of the given text.

        Everything before the first colon is taken as the scheme; characters
        that are not alphanumeric are removed from it (so that embedded
        whitespace or quotes cannot disguise a scheme) and the result is
        lower-cased.

        @param text: the URI (or URI-like text) to inspect
        @return: the scheme, or `None` if the text contains no colon
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()