Mercurial > genshi > genshi-test
annotate markup/filters.py @ 85:db8f2958c670
Improve handling of DOCTYPE declarations.
author | cmlenz |
---|---|
date | Sun, 16 Jul 2006 11:07:34 +0000 |
parents | e9a3930f8823 |
children | 3b75c6730b29 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
54
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
14 """Implementation of a number of stream filters.""" | |
15 | |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import re | |
21 | |
69 | 22 from markup.core import Attributes, Markup, Namespace |
23 from markup.core import END, END_NS, START, START_NS, TEXT | |
1 | 24 from markup.path import Path |
25 | |
17
ad63ad459524
Refactoring to address #6: all match templates are now processed by a single filter, which means that match templates added by included templates are properly applied. A side effect of this refactoring is that `Context` objects may not be reused across multiple template processing runs.
cmlenz
parents:
15
diff
changeset
|
26 __all__ = ['IncludeFilter', 'WhitespaceFilter', 'HTMLSanitizer'] |
1 | 27 |
28 | |
class IncludeFilter(object):
    """Template filter providing (very) basic XInclude support
    (see http://www.w3.org/TR/xinclude/) in templates.

    Elements in the XInclude namespace are consumed by the filter: an
    `include` element is replaced by the events generated from the
    referenced template, and the content of a `fallback` element is buffered
    and only emitted when the referenced template cannot be found.
    """

    NAMESPACE = Namespace('http://www.w3.org/2001/XInclude')

    def __init__(self, loader):
        """Initialize the filter.

        @param loader: the `TemplateLoader` to use for resolving references to
            external template files
        """
        self.loader = loader

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Filter the stream, processing any XInclude directives it may
        contain.

        @param stream: the markup event stream to filter
        @param ctxt: the template context, passed on to the `generate()`
            method of every included template
        @param ns_prefixes: optional list used to record the prefixes that
            are currently bound to the XInclude namespace; a fresh list is
            created when omitted
        @raise TemplateError: if an include element has no `href` attribute
        @raise TemplateNotFound: if the referenced template cannot be loaded
            and the include provides no fallback
        """
        # Imported here rather than at module level.
        # NOTE(review): presumably this avoids a circular import between
        # markup.filters and markup.template -- confirm.
        from markup.template import TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        in_fallback = False
        include_href, fallback_stream = None, None
        namespace = self.NAMESPACE

        for kind, data, pos in stream:

            if kind is START and not in_fallback and data[0] in namespace:
                # Opening tag of an XInclude element: remember what it asks
                # for, but do not pass the element itself downstream.
                tag, attrib = data
                if tag.localname == 'include':
                    include_href = attrib.get('href')
                elif tag.localname == 'fallback':
                    in_fallback = True
                    fallback_stream = []

            elif kind is END and data in namespace:
                # NOTE(review): this branch is tested before `in_fallback`,
                # so the end tag of an include nested inside a fallback is
                # processed here rather than buffered -- confirm whether
                # nested includes are meant to be supported.
                if data.localname == 'include':
                    try:
                        if not include_href:
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        # pos[0] is handed to the loader so the href can be
                        # resolved relative to the including template.
                        template = self.loader.load(include_href,
                                                    relative_to=pos[0])
                        for event in template.generate(ctxt):
                            yield event

                    except TemplateNotFound:
                        # Without a fallback the error propagates unchanged.
                        if fallback_stream is None:
                            raise
                        for event in fallback_stream:
                            yield event

                    include_href = None
                    fallback_stream = None

                elif data.localname == 'fallback':
                    in_fallback = False

            elif in_fallback:
                # Buffer fallback content until we know whether it is needed.
                fallback_stream.append((kind, data, pos))

            elif kind is START_NS and data[1] == namespace:
                # Swallow the namespace declaration and remember its prefix
                # so the corresponding END_NS can be swallowed as well.
                ns_prefixes.append(data[0])

            elif kind is END_NS and data in ns_prefixes:
                # Remove the most recent binding of exactly this prefix. A
                # blind pop() would drop the wrong entry when several
                # prefixes are bound to the XInclude namespace and closed in
                # a different order than they were opened.
                for idx in range(len(ns_prefixes) - 1, -1, -1):
                    if ns_prefixes[idx] == data:
                        del ns_prefixes[idx]
                        break

            else:
                yield kind, data, pos
103 | |
104 | |
class WhitespaceFilter(object):
    """A filter that removes extraneous white space from the stream.

    Consecutive text events are merged into a single text event, spaces and
    tabs directly in front of a line break are dropped, and runs of two or
    more line breaks are reduced to a single one.

    TODO:
     * Support for xml:space
    """
    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __call__(self, stream, ctxt=None):
        """Filter the stream, normalizing white space in text events.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        pending = []
        for kind, data, pos in stream:
            if kind is TEXT:
                # Collect adjacent text so it can be cleaned up as one unit.
                pending.append(data)
                continue
            if pending:
                # The merged text is reported at the position of the event
                # that terminated the run of text events.
                yield TEXT, self._normalize(pending), pos
                del pending[:]
            yield kind, data, pos

        # Flush text left over when the stream ends on a text event.
        if pending:
            yield TEXT, self._normalize(pending), pos

    def _normalize(self, chunks):
        """Join the buffered text chunks and strip superfluous white space."""
        merged = Markup('').join(chunks, escape_quotes=False)
        without_trailing = self._TRAILING_SPACE.sub('', merged)
        return Markup(self._LINE_COLLAPSE.sub('\n', without_trailing))
1 | 137 |
138 | |
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements whose tag is not in `_SAFE_TAGS` are dropped together with all
    of their content. On the remaining elements, attributes are dropped when
    they are not in `_SAFE_ATTRS` or when they carry a URI with a scheme that
    is not in `_SAFE_SCHEMES`; inline styles are reduced to the declarations
    that look harmless.
    """

    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # Matches the argument of a CSS url(...) reference. Case-insensitive so
    # that spellings such as "URL(" are checked as well.
    _URL_PATTERN = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, dropping unsafe elements and attributes.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        """
        # Tag of the unsafe element currently being skipped, and how many
        # elements with that same tag are open inside it (including itself).
        # Counting the depth keeps a nested element of the same name from
        # ending the skip too early and leaking the rest of the content.
        waiting_for = None
        depth = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrib = data
                if waiting_for is not None:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrib = []
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    elif attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self._safe_declarations(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrib.append((attr, value))

                yield kind, (tag, new_attrib), pos

            elif kind is END:
                if waiting_for is not None:
                    if data == waiting_for:
                        depth -= 1
                        if not depth:
                            waiting_for = None
                else:
                    yield kind, data, pos

            else:
                if waiting_for is None:
                    yield kind, data, pos

    def _safe_declarations(self, style):
        """Return the stripped declarations of an inline style that neither
        use `expression` nor reference a URL with an unsafe scheme.

        Both checks ignore case. Obfuscation by means of CSS comments or
        escapes is not detected.
        """
        decls = []
        for decl in filter(None, style.split(';')):
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for match in self._URL_PATTERN.finditer(decl):
                if self._get_scheme(match.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl.strip())
        return decls

    def _get_scheme(self, text):
        """Return the lower-cased URI scheme of `text`, or `None` if the text
        contains no colon.

        Everything but alphanumeric characters is removed from the part in
        front of the first colon, so that schemes obfuscated with white
        space or control characters are still recognized.
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()