Mercurial > genshi > genshi-test
annotate markup/filters.py @ 66:822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
author | cmlenz |
---|---|
date | Sun, 09 Jul 2006 17:46:12 +0000 |
parents | 01981cbc7575 |
children | e9a3930f8823 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
14 """Implementation of a number of stream filters.""" | |
15 | |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import re | |
21 | |
18
4cbebb15a834
Actually make use of the `markup.core.Namespace` class, and add a couple of doctests.
cmlenz
parents:
17
diff
changeset
|
22 from markup.core import Attributes, Markup, Namespace, Stream |
1 | 23 from markup.path import Path |
24 | |
17
ad63ad459524
Refactoring to address #6: all match templates are now processed by a single filter, which means that match templates added by included templates are properly applied. A side effect of this refactoring is that `Context` objects may not be reused across multiple template processing runs.
cmlenz
parents:
15
diff
changeset
|
25 __all__ = ['IncludeFilter', 'WhitespaceFilter', 'HTMLSanitizer'] |
1 | 26 |
27 | |
class IncludeFilter(object):
    """Template filter providing (very) basic XInclude support
    (see http://www.w3.org/TR/xinclude/) in templates.

    Elements in the XInclude namespace are consumed by the filter and never
    passed on: `include` elements are replaced by the events generated by the
    referenced template, and the content of a `fallback` child is used when
    that template cannot be found.
    """

    NAMESPACE = Namespace('http://www.w3.org/2001/XInclude')

    def __init__(self, loader):
        """Initialize the filter.

        @param loader: the `TemplateLoader` to use for resolving references to
            external template files
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Filter the stream, processing any XInclude directives it may
        contain.

        @param stream: the markup event stream to filter
        @param ctxt: the template context, handed to the `generate()` method
            of every included template
        @param ns_prefixes: list of prefixes currently bound to the XInclude
            namespace; a fresh list is created when omitted
        @raise TemplateError: if an `include` element has no `href`
        @raise TemplateNotFound: if the referenced template cannot be loaded
            and the `include` element provides no `fallback`
        """
        from markup.template import TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        include_href, fallback_stream = None, None
        # Nesting level of `fallback` elements; non-zero while the events of
        # a fallback are being buffered.
        fallback_depth = 0

        for kind, data, pos in stream:

            if fallback_depth:
                # Everything inside a fallback is buffered untouched,
                # including nested XInclude elements. Reacting to the END
                # event of a nested include here would resolve the *outer*
                # include too early and discard its fallback buffer.
                if (kind is Stream.START and data[0] in self.NAMESPACE
                        and data[0].localname == 'fallback'):
                    fallback_depth += 1
                elif (kind is Stream.END and data in self.NAMESPACE
                        and data.localname == 'fallback'):
                    fallback_depth -= 1
                    if not fallback_depth:
                        # This END closes the fallback that started the
                        # buffering; it is not part of the fallback content.
                        continue
                fallback_stream.append((kind, data, pos))

            elif kind is Stream.START and data[0] in self.NAMESPACE:
                tag, attrib = data
                if tag.localname == 'include':
                    include_href = attrib.get('href')
                elif tag.localname == 'fallback':
                    fallback_depth = 1
                    fallback_stream = []

            elif kind is Stream.END and data in self.NAMESPACE:
                if data.localname == 'include':
                    try:
                        if not include_href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        # pos[0] is presumably the name of the including
                        # template, so that relative hrefs resolve against it
                        # -- confirm against the loader implementation.
                        template = self.loader.load(include_href,
                                                    relative_to=pos[0])
                        for event in template.generate(ctxt):
                            yield event

                    except TemplateNotFound:
                        if fallback_stream is None:
                            raise
                        # Run the buffered fallback through the filter again
                        # so that includes nested inside it are processed.
                        for event in self(fallback_stream, ctxt, ns_prefixes):
                            yield event

                    include_href = None
                    fallback_stream = None

            elif kind is Stream.START_NS and data[1] == self.NAMESPACE:
                # Remember the prefix so the matching END_NS can be dropped.
                ns_prefixes.append(data[0])

            elif kind is Stream.END_NS and data in ns_prefixes:
                ns_prefixes.pop()

            else:
                yield kind, data, pos
102 | |
103 | |
class WhitespaceFilter(object):
    """Stream filter that strips superfluous whitespace from text events.

    Runs of consecutive text events are merged into a single text event.
    Within the merged text, blanks and tabs directly preceding a newline are
    removed and sequences of two or more newlines are collapsed into one.

    Todo:
     * Support for xml:space
    """

    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __call__(self, stream, ctxt=None):
        """Yield the events of `stream` with text events merged and cleaned.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        pending = []
        last_kind = None
        for kind, data, pos in stream:
            if kind is Stream.TEXT:
                # Hold text back until the next non-text event arrives.
                pending.append(data)
                last_kind = kind
                continue

            if last_kind is Stream.TEXT:
                # The merged text takes the position of the event that
                # follows it.
                yield Stream.TEXT, self._cleanup(pending), pos
                del pending[:]
            last_kind = kind
            yield kind, data, pos

        # The stream ended on text: emit whatever is still buffered.
        if pending:
            yield Stream.TEXT, self._cleanup(pending), pos

    def _cleanup(self, chunks):
        """Join the buffered text chunks and normalize their whitespace."""
        merged = Markup('').join(chunks, escape_quotes=False)
        merged = self._TRAILING_SPACE.sub('', merged)
        merged = self._LINE_COLLAPSE.sub('\n', merged)
        return Markup(merged)
1 | 135 |
136 | |
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements whose tag is not in `_SAFE_TAGS` are dropped together with
    everything they contain. On the remaining elements, attributes not in
    `_SAFE_ATTRS` are removed, URI attributes are removed unless their scheme
    is in `_SAFE_SCHEMES`, and inline styles are stripped of declarations
    that use `expression` or reference a URL with an unsafe scheme.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    # `None` stands for "no scheme at all", i.e. a relative reference.
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # Matches the argument of a CSS `url(...)` reference. CSS function names
    # are case-insensitive, so `URL(` must be recognized as well.
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Yield only the events of `stream` that are considered safe.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        # Tag of the unsafe element currently being skipped, if any, and the
        # number of still-open elements carrying that tag. Counting is needed
        # so that a nested element of the same name does not end the skipping
        # at its own END event, before the outer element has been closed.
        waiting_for = None
        depth = 0

        for kind, data, pos in stream:
            if kind is Stream.START:
                tag, attrib = data
                if waiting_for:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrib = []
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    elif attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        value = self._sanitize_css(value)
                        if not value:
                            continue
                    new_attrib.append((attr, value))

                yield kind, (tag, new_attrib), pos

            elif kind is Stream.END:
                if waiting_for:
                    if data == waiting_for:
                        depth -= 1
                        if not depth:
                            waiting_for = None
                else:
                    yield kind, data, pos

            else:
                # Text, comments etc. inside a skipped element are dropped.
                if not waiting_for:
                    yield kind, data, pos

    def _sanitize_css(self, text):
        """Return the inline style `text` reduced to its safe declarations.

        A declaration is dropped when it contains `expression` in any letter
        case, or when it references a URL whose scheme is not in
        `_SAFE_SCHEMES`. The remaining declarations are stripped of
        surrounding whitespace and joined with `'; '`; the result is the
        empty string when nothing is left.
        """
        decls = []
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for match in self._CSS_URL.finditer(decl):
                if self._get_scheme(match.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        return '; '.join(decls)

    def _get_scheme(self, text):
        """Return the lower-cased URI scheme of `text`, or `None` if it
        contains no colon.

        Non-alphanumeric characters in front of the first colon are ignored,
        so whitespace or quotes cannot be used to disguise a scheme.
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()