Mercurial > genshi > genshi-test
annotate genshi/filters/html.py @ 902:09cc3627654c experimental-inline
Sync `experimental/inline` branch with [source:trunk@1126].
author | cmlenz |
---|---|
date | Fri, 23 Apr 2010 21:08:26 +0000 |
parents | 1837f39efd6f |
children |
rev | line source |
---|---|
500 | 1 # -*- coding: utf-8 -*- |
2 # | |
902
09cc3627654c
Sync `experimental/inline` branch with [source:trunk@1126].
cmlenz
parents:
820
diff
changeset
|
3 # Copyright (C) 2006-2009 Edgewall Software |
500 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://genshi.edgewall.org/wiki/License. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://genshi.edgewall.org/log/. | |
13 | |
14 """Implementation of a number of stream filters.""" | |
15 | |
902
09cc3627654c
Sync `experimental/inline` branch with [source:trunk@1126].
cmlenz
parents:
820
diff
changeset
|
16 try: |
09cc3627654c
Sync `experimental/inline` branch with [source:trunk@1126].
cmlenz
parents:
820
diff
changeset
|
17 any |
09cc3627654c
Sync `experimental/inline` branch with [source:trunk@1126].
cmlenz
parents:
820
diff
changeset
|
18 except NameError: |
09cc3627654c
Sync `experimental/inline` branch with [source:trunk@1126].
cmlenz
parents:
820
diff
changeset
|
19 from genshi.util import any |
500 | 20 import re |
21 | |
22 from genshi.core import Attrs, QName, stripentities | |
820
1837f39efd6f
Sync (old) experimental inline branch with trunk@1027.
cmlenz
parents:
500
diff
changeset
|
23 from genshi.core import END, START, TEXT, COMMENT |
500 | 24 |
25 __all__ = ['HTMLFormFiller', 'HTMLSanitizer'] | |
26 __docformat__ = 'restructuredtext en' | |
27 | |
28 | |
class HTMLFormFiller(object):
    """A stream filter that can populate HTML forms from a dictionary of values.

    >>> from genshi.input import HTML
    >>> html = HTML('''<form>
    ...   <p><input type="text" name="foo" /></p>
    ... </form>''')
    >>> filler = HTMLFormFiller(data={'foo': 'bar'})
    >>> print(html | filler)
    <form>
      <p><input type="text" name="foo" value="bar"/></p>
    </form>
    """
    # TODO: only select the first radio button, and the first select option
    #       (if not in a multiple-select)
    # TODO: only apply to elements in the XHTML namespace (or no namespace)?

    def __init__(self, name=None, id=None, data=None, passwords=False):
        """Create the filter.

        :param name: The name of the form that should be populated. If this
                     parameter is given, only forms where the ``name`` attribute
                     value matches the parameter are processed.
        :param id: The ID of the form that should be populated. If this
                   parameter is given, only forms where the ``id`` attribute
                   value matches the parameter are processed.
        :param data: The dictionary of form values, where the keys are the names
                     of the form fields, and the values are the values to fill
                     in.
        :param passwords: Whether password input fields should be populated.
                          This is off by default for security reasons (for
                          example, a password may end up in the browser cache)
        :note: Changed in 0.5.2: added the `passwords` option
        """
        self.name = name
        self.id = id
        if data is None:
            data = {}
        self.data = data
        self.passwords = passwords

    def __call__(self, stream):
        """Apply the filter to the given stream.

        Events outside of a matching ``<form>`` element are passed through
        unchanged. Inside a matching form:

        * checkboxes and radio buttons get their ``checked`` attribute set or
          removed depending on the supplied value,
        * text-like inputs (and password inputs, if enabled) get their
          ``value`` attribute set,
        * ``<option>`` elements are buffered until their end tag so that the
          ``selected`` attribute can be decided from either the ``value``
          attribute or, when that is absent, the option text,
        * the content of a ``<textarea>`` whose name is in the data is
          replaced by the supplied value.

        :param stream: the markup event stream to filter
        """
        in_form = in_select = in_option = in_textarea = False
        select_value = option_value = textarea_value = None
        option_start = None
        option_text = []
        no_option_value = False

        for kind, data, pos in stream:

            if kind is START:
                tag, attrs = data
                tagname = tag.localname

                if tagname == 'form' and (
                        self.name and attrs.get('name') == self.name or
                        self.id and attrs.get('id') == self.id or
                        not (self.id or self.name)):
                    in_form = True

                elif in_form:
                    if tagname == 'input':
                        input_type = attrs.get('type', '').lower()
                        if input_type in ('checkbox', 'radio'):
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                declval = attrs.get('value')
                                checked = False
                                if isinstance(value, (list, tuple)):
                                    if declval:
                                        checked = declval in [unicode(v) for v
                                                              in value]
                                    else:
                                        checked = any(value)
                                else:
                                    if declval:
                                        checked = declval == unicode(value)
                                    elif input_type == 'checkbox':
                                        checked = bool(value)
                                if checked:
                                    attrs |= [(QName('checked'), 'checked')]
                                elif 'checked' in attrs:
                                    attrs -= 'checked'
                        elif input_type in ('', 'hidden', 'text') \
                                or input_type == 'password' and self.passwords:
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                if isinstance(value, (list, tuple)):
                                    # An empty sequence carries no value; do
                                    # not fail with an IndexError
                                    if value:
                                        value = value[0]
                                    else:
                                        value = None
                                if value is not None:
                                    attrs |= [
                                        (QName('value'), unicode(value))
                                    ]
                    elif tagname == 'select':
                        name = attrs.get('name')
                        if name in self.data:
                            select_value = self.data[name]
                            in_select = True
                    elif tagname == 'textarea':
                        name = attrs.get('name')
                        if name in self.data:
                            textarea_value = self.data.get(name)
                            if isinstance(textarea_value, (list, tuple)):
                                # Guard against an empty sequence
                                if textarea_value:
                                    textarea_value = textarea_value[0]
                                else:
                                    textarea_value = None
                            in_textarea = True
                    elif in_select and tagname == 'option':
                        # Hold back the start tag: whether the option is
                        # selected may depend on its text content
                        option_start = kind, data, pos
                        option_value = attrs.get('value')
                        if option_value is None:
                            no_option_value = True
                            option_value = ''
                        in_option = True
                        continue
                yield kind, (tag, attrs), pos

            elif in_form and kind is TEXT:
                if in_select and in_option:
                    if no_option_value:
                        option_value += data
                    option_text.append((kind, data, pos))
                    continue
                elif in_textarea:
                    # Original content is dropped; the replacement text is
                    # emitted right before the end tag
                    continue
                yield kind, data, pos

            elif in_form and kind is END:
                tagname = data.localname
                if tagname == 'form':
                    in_form = False
                elif tagname == 'select':
                    in_select = False
                    select_value = None
                elif in_select and tagname == 'option':
                    if isinstance(select_value, (tuple, list)):
                        selected = option_value in [unicode(v) for v
                                                    in select_value]
                    else:
                        selected = option_value == unicode(select_value)
                    okind, (tag, attrs), opos = option_start
                    if selected:
                        attrs |= [(QName('selected'), 'selected')]
                    elif 'selected' in attrs:
                        attrs -= 'selected'
                    yield okind, (tag, attrs), opos
                    if option_text:
                        for event in option_text:
                            yield event
                    in_option = False
                    no_option_value = False
                    option_start = option_value = None
                    option_text = []
                elif in_textarea and tagname == 'textarea':
                    # Only textareas that are actually being filled get the
                    # replacement text; the value is reset afterwards so that
                    # it cannot leak into a later textarea that has no entry
                    # in the data dictionary
                    if textarea_value:
                        yield TEXT, unicode(textarea_value), pos
                    textarea_value = None
                    in_textarea = False
                yield kind, data, pos

            else:
                yield kind, data, pos
195 | |
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    >>> from genshi import HTML
    >>> html = HTML('<div><script>alert(document.cookie)</script></div>')
    >>> print(html | HTMLSanitizer())
    <div/>

    The default set of safe tags and attributes can be modified when the filter
    is instantiated. For example, to allow inline ``style`` attributes, the
    following instantiation would work:

    >>> html = HTML('<div style="background: #000"></div>')
    >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
    >>> print(html | sanitizer)
    <div style="background: #000"/>

    Note that even in this case, the filter *does* attempt to remove dangerous
    constructs from style attributes:

    >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>')
    >>> print(html | sanitizer)
    <div style="color: #000"/>

    This handles HTML entities, unicode escapes in CSS and Javascript text, as
    well as a lot of other things. However, the style tag is still excluded by
    default because it is very hard for such sanitizing to be completely safe,
    especially considering how much error recovery current web browsers perform.

    It also does some basic filtering of CSS properties that may be used for
    typical phishing attacks. For more sophisticated filtering, this class
    provides a couple of hooks that can be overridden in sub-classes.

    :warn: Note that this special processing of CSS is currently only applied to
           style attributes, **not** style elements.
    """

    SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'bgcolor', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
        'type', 'usemap', 'valign', 'value', 'vspace', 'width'])

    SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])

    def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
                 safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS):
        """Create the sanitizer.

        The exact set of allowed elements and attributes can be configured.

        :param safe_tags: a set of tag names that are considered safe
        :param safe_attrs: a set of attribute names that are considered safe
        :param safe_schemes: a set of URI schemes that are considered safe
        :param uri_attrs: a set of names of attributes that contain URIs
        """
        self.safe_tags = safe_tags
        "The set of tag names that are considered safe."
        self.safe_attrs = safe_attrs
        "The set of attribute names that are considered safe."
        self.uri_attrs = uri_attrs
        "The set of names of attributes that may contain URIs."
        self.safe_schemes = safe_schemes
        "The set of URI schemes that are considered safe."

    def __call__(self, stream):
        """Apply the filter to the given stream.

        Unsafe elements are dropped together with everything they contain.
        On the remaining elements, attributes that are not in `safe_attrs`,
        URI attributes with an unsafe scheme, and unsafe CSS declarations in
        ``style`` attributes are removed. Comments are always stripped.

        :param stream: the markup event stream to filter
        """
        waiting_for = None
        # Number of currently open elements named like `waiting_for`; needed
        # so that a nested element with the same name as the unsafe one does
        # not end the skipping prematurely (which would leak the remaining
        # content and emit an END event without a matching START)
        depth = 0

        for kind, data, pos in stream:
            if kind is START:
                tag, attrs = data
                if waiting_for:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if not self.is_safe_elem(tag, attrs):
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrs = []
                for attr, value in attrs:
                    if attr not in self.safe_attrs:
                        continue
                    # Resolve entities first so that they cannot be used to
                    # hide a scheme or a CSS construct from the checks below
                    value = stripentities(value)
                    if attr in self.uri_attrs:
                        # Don't allow URI schemes such as "javascript:"
                        if not self.is_safe_uri(value):
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        decls = self.sanitize_css(value)
                        if not decls:
                            continue
                        value = '; '.join(decls)
                    new_attrs.append((attr, value))

                yield kind, (tag, Attrs(new_attrs)), pos

            elif kind is END:
                tag = data
                if waiting_for:
                    if waiting_for == tag:
                        depth -= 1
                        if depth == 0:
                            waiting_for = None
                else:
                    yield kind, data, pos

            elif kind is not COMMENT:
                if not waiting_for:
                    yield kind, data, pos

    def is_safe_css(self, propname, value):
        """Determine whether the given css property declaration is to be
        considered safe for inclusion in the output.

        The default implementation rejects the ``position`` property and any
        ``margin*`` property whose value contains a minus sign.

        :param propname: the CSS property name
        :param value: the value of the property
        :return: whether the property value should be considered safe
        :rtype: bool
        :since: version 0.6
        """
        if propname == 'position':
            return False
        if propname.startswith('margin') and '-' in value:
            # Negative margins can be used for phishing
            return False
        return True

    def is_safe_elem(self, tag, attrs):
        """Determine whether the given element should be considered safe for
        inclusion in the output.

        The default implementation rejects elements that are not in
        `safe_tags`, as well as password input fields.

        :param tag: the tag name of the element
        :type tag: QName
        :param attrs: the element attributes
        :type attrs: Attrs
        :return: whether the element should be considered safe
        :rtype: bool
        :since: version 0.6
        """
        if tag not in self.safe_tags:
            return False
        if tag.localname == 'input':
            input_type = attrs.get('type', '').lower()
            if input_type == 'password':
                return False
        return True

    def is_safe_uri(self, uri):
        """Determine whether the given URI is to be considered safe for
        inclusion in the output.

        The default implementation checks whether the scheme of the URI is in
        the set of allowed URIs (`safe_schemes`).

        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.is_safe_uri('http://example.org/')
        True
        >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)')
        False

        :param uri: the URI to check
        :return: `True` if the URI can be considered safe, `False` otherwise
        :rtype: `bool`
        :since: version 0.4.3
        """
        if '#' in uri:
            uri = uri.split('#', 1)[0] # Strip out the fragment identifier
        if ':' not in uri:
            return True # This is a relative URI
        # Ignore whitespace and control characters that browsers tolerate
        # inside the scheme, e.g. "java\tscript:"
        chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower() in self.safe_schemes

    def sanitize_css(self, text):
        """Remove potentially dangerous property declarations from CSS code.

        In particular, properties using the CSS ``url()`` function with a scheme
        that is not considered safe are removed:

        >>> sanitizer = HTMLSanitizer()
        >>> sanitizer.sanitize_css(u'''
        ...   background: url(javascript:alert("foo"));
        ...   color: #000;
        ... ''')
        [u'color: #000']

        Also, the proprietary Internet Explorer function ``expression()`` is
        always stripped:

        >>> sanitizer.sanitize_css(u'''
        ...   background: #fff;
        ...   color: #000;
        ...   width: e/**/xpression(alert("foo"));
        ... ''')
        [u'background: #fff', u'color: #000']

        Both checks are case-insensitive, as CSS function names are:

        >>> sanitizer.sanitize_css(u'width: EXPRESSION(alert("foo")); color: #000')
        [u'color: #000']

        :param text: the CSS text; this is expected to be `unicode` and to not
                     contain any character or numeric references
        :return: a list of declarations that are considered safe
        :rtype: `list`
        :since: version 0.4.3
        """
        decls = []
        text = self._strip_css_comments(self._replace_unicode_escapes(text))
        for decl in text.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            try:
                propname, value = decl.split(':', 1)
            except ValueError:
                # Not a "name: value" declaration
                continue
            if not self.is_safe_css(propname.strip().lower(), value.strip()):
                continue
            is_evil = False
            # CSS function names are case-insensitive
            if 'expression' in value.lower():
                is_evil = True
            for match in self._URL_FINDITER(value):
                if not self.is_safe_uri(match.group(1)):
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl.strip())
        return decls

    # Matches the argument of the CSS ``url()`` function, in any letter case
    _URL_FINDITER = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE).finditer

    _NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
    _UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub

    def _replace_unicode_escapes(self, text):
        """Replace CSS hexadecimal escapes (such as ``\\65``) in the given
        text by the characters they stand for.

        Escapes denoting a code point that cannot be represented are replaced
        by U+FFFD (the replacement character) instead of raising an error.
        """
        def _repl(match):
            try:
                return unichr(int(match.group(1), 16))
            except ValueError:
                # Code point out of range, e.g. "\FFFFFF"
                return u'\ufffd'
        return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))

    # DOTALL is required because CSS comments may span multiple lines
    _CSS_COMMENTS = re.compile(r'/\*.*?\*/', re.DOTALL).sub

    def _strip_css_comments(self, text):
        """Return the given CSS text with all ``/* ... */`` comments removed."""
        return self._CSS_COMMENTS('', text)