Mercurial > genshi > genshi-test
comparison genshi/filters/html.py @ 902:09cc3627654c experimental-inline
Sync `experimental/inline` branch with [source:trunk@1126].
author | cmlenz |
---|---|
date | Fri, 23 Apr 2010 21:08:26 +0000 |
parents | 1837f39efd6f |
children |
comparison
equal
deleted
inserted
replaced
830:de82830f8816 | 902:09cc3627654c |
---|---|
1 # -*- coding: utf-8 -*- | 1 # -*- coding: utf-8 -*- |
2 # | 2 # |
3 # Copyright (C) 2006-2008 Edgewall Software | 3 # Copyright (C) 2006-2009 Edgewall Software |
4 # All rights reserved. | 4 # All rights reserved. |
5 # | 5 # |
6 # This software is licensed as described in the file COPYING, which | 6 # This software is licensed as described in the file COPYING, which |
7 # you should have received as part of this distribution. The terms | 7 # you should have received as part of this distribution. The terms |
8 # are also available at http://genshi.edgewall.org/wiki/License. | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
11 # individuals. For the exact contribution history, see the revision | 11 # individuals. For the exact contribution history, see the revision |
12 # history and logs, available at http://genshi.edgewall.org/log/. | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
13 | 13 |
14 """Implementation of a number of stream filters.""" | 14 """Implementation of a number of stream filters.""" |
15 | 15 |
16 try: | |
17 any | |
18 except NameError: | |
19 from genshi.util import any | |
16 import re | 20 import re |
17 | 21 |
18 from genshi.core import Attrs, QName, stripentities | 22 from genshi.core import Attrs, QName, stripentities |
19 from genshi.core import END, START, TEXT, COMMENT | 23 from genshi.core import END, START, TEXT, COMMENT |
20 | 24 |
28 >>> from genshi.input import HTML | 32 >>> from genshi.input import HTML |
29 >>> html = HTML('''<form> | 33 >>> html = HTML('''<form> |
30 ... <p><input type="text" name="foo" /></p> | 34 ... <p><input type="text" name="foo" /></p> |
31 ... </form>''') | 35 ... </form>''') |
32 >>> filler = HTMLFormFiller(data={'foo': 'bar'}) | 36 >>> filler = HTMLFormFiller(data={'foo': 'bar'}) |
33 >>> print html | filler | 37 >>> print(html | filler) |
34 <form> | 38 <form> |
35 <p><input type="text" name="foo" value="bar"/></p> | 39 <p><input type="text" name="foo" value="bar"/></p> |
36 </form> | 40 </form> |
37 """ | 41 """ |
38 # TODO: only select the first radio button, and the first select option | 42 # TODO: only select the first radio button, and the first select option |
39 # (if not in a multiple-select) | 43 # (if not in a multiple-select) |
40 # TODO: only apply to elements in the XHTML namespace (or no namespace)? | 44 # TODO: only apply to elements in the XHTML namespace (or no namespace)? |
41 | 45 |
42 def __init__(self, name=None, id=None, data=None): | 46 def __init__(self, name=None, id=None, data=None, passwords=False): |
43 """Create the filter. | 47 """Create the filter. |
44 | 48 |
45 :param name: The name of the form that should be populated. If this | 49 :param name: The name of the form that should be populated. If this |
46 parameter is given, only forms where the ``name`` attribute | 50 parameter is given, only forms where the ``name`` attribute |
47 value matches the parameter are processed. | 51 value matches the parameter are processed. |
49 parameter is given, only forms where the ``id`` attribute | 53 parameter is given, only forms where the ``id`` attribute |
50 value matches the parameter are processed. | 54 value matches the parameter are processed. |
51 :param data: The dictionary of form values, where the keys are the names | 55 :param data: The dictionary of form values, where the keys are the names |
52 of the form fields, and the values are the values to fill | 56 of the form fields, and the values are the values to fill |
53 in. | 57 in. |
58 :param passwords: Whether password input fields should be populated. | |
59 This is off by default for security reasons (for | |
60 example, a password may end up in the browser cache) | |
61 :note: Changed in 0.5.2: added the `passwords` option | |
54 """ | 62 """ |
55 self.name = name | 63 self.name = name |
56 self.id = id | 64 self.id = id |
57 if data is None: | 65 if data is None: |
58 data = {} | 66 data = {} |
59 self.data = data | 67 self.data = data |
68 self.passwords = passwords | |
60 | 69 |
61 def __call__(self, stream): | 70 def __call__(self, stream): |
62 """Apply the filter to the given stream. | 71 """Apply the filter to the given stream. |
63 | 72 |
64 :param stream: the markup event stream to filter | 73 :param stream: the markup event stream to filter |
81 not (self.id or self.name)): | 90 not (self.id or self.name)): |
82 in_form = True | 91 in_form = True |
83 | 92 |
84 elif in_form: | 93 elif in_form: |
85 if tagname == 'input': | 94 if tagname == 'input': |
86 type = attrs.get('type') | 95 type = attrs.get('type', '').lower() |
87 if type in ('checkbox', 'radio'): | 96 if type in ('checkbox', 'radio'): |
88 name = attrs.get('name') | 97 name = attrs.get('name') |
89 if name and name in self.data: | 98 if name and name in self.data: |
90 value = self.data[name] | 99 value = self.data[name] |
91 declval = attrs.get('value') | 100 declval = attrs.get('value') |
93 if isinstance(value, (list, tuple)): | 102 if isinstance(value, (list, tuple)): |
94 if declval: | 103 if declval: |
95 checked = declval in [unicode(v) for v | 104 checked = declval in [unicode(v) for v |
96 in value] | 105 in value] |
97 else: | 106 else: |
98 checked = bool(filter(None, value)) | 107 checked = any(value) |
99 else: | 108 else: |
100 if declval: | 109 if declval: |
101 checked = declval == unicode(value) | 110 checked = declval == unicode(value) |
102 elif type == 'checkbox': | 111 elif type == 'checkbox': |
103 checked = bool(value) | 112 checked = bool(value) |
104 if checked: | 113 if checked: |
105 attrs |= [(QName('checked'), 'checked')] | 114 attrs |= [(QName('checked'), 'checked')] |
106 elif 'checked' in attrs: | 115 elif 'checked' in attrs: |
107 attrs -= 'checked' | 116 attrs -= 'checked' |
108 elif type in (None, 'hidden', 'text'): | 117 elif type in ('', 'hidden', 'text') \ |
118 or type == 'password' and self.passwords: | |
109 name = attrs.get('name') | 119 name = attrs.get('name') |
110 if name and name in self.data: | 120 if name and name in self.data: |
111 value = self.data[name] | 121 value = self.data[name] |
112 if isinstance(value, (list, tuple)): | 122 if isinstance(value, (list, tuple)): |
113 value = value[0] | 123 value = value[0] |
114 if value is not None: | 124 if value is not None: |
115 attrs |= [(QName('value'), unicode(value))] | 125 attrs |= [ |
126 (QName('value'), unicode(value)) | |
127 ] | |
116 elif tagname == 'select': | 128 elif tagname == 'select': |
117 name = attrs.get('name') | 129 name = attrs.get('name') |
118 if name in self.data: | 130 if name in self.data: |
119 select_value = self.data[name] | 131 select_value = self.data[name] |
120 in_select = True | 132 in_select = True |
185 """A filter that removes potentially dangerous HTML tags and attributes | 197 """A filter that removes potentially dangerous HTML tags and attributes |
186 from the stream. | 198 from the stream. |
187 | 199 |
188 >>> from genshi import HTML | 200 >>> from genshi import HTML |
189 >>> html = HTML('<div><script>alert(document.cookie)</script></div>') | 201 >>> html = HTML('<div><script>alert(document.cookie)</script></div>') |
190 >>> print html | HTMLSanitizer() | 202 >>> print(html | HTMLSanitizer()) |
191 <div/> | 203 <div/> |
192 | 204 |
193 The default set of safe tags and attributes can be modified when the filter | 205 The default set of safe tags and attributes can be modified when the filter |
194 is instantiated. For example, to allow inline ``style`` attributes, the | 206 is instantiated. For example, to allow inline ``style`` attributes, the |
195 following instantiation would work: | 207 following instantiation would work: |
196 | 208 |
197 >>> html = HTML('<div style="background: #000"></div>') | 209 >>> html = HTML('<div style="background: #000"></div>') |
198 >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) | 210 >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) |
199 >>> print html | sanitizer | 211 >>> print(html | sanitizer) |
200 <div style="background: #000"/> | 212 <div style="background: #000"/> |
201 | 213 |
202 Note that even in this case, the filter *does* attempt to remove dangerous | 214 Note that even in this case, the filter *does* attempt to remove dangerous |
203 constructs from style attributes: | 215 constructs from style attributes: |
204 | 216 |
205 >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>') | 217 >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>') |
206 >>> print html | sanitizer | 218 >>> print(html | sanitizer) |
207 <div style="color: #000"/> | 219 <div style="color: #000"/> |
208 | 220 |
209 This handles HTML entities, unicode escapes in CSS and Javascript text, as | 221 This handles HTML entities, unicode escapes in CSS and Javascript text, as |
210 well as a lot of other things. However, the style tag is still excluded by | 222 well as a lot of other things. However, the style tag is still excluded by |
211 default because it is very hard for such sanitizing to be completely safe, | 223 default because it is very hard for such sanitizing to be completely safe, |
212 especially considering how much error recovery current web browsers perform. | 224 especially considering how much error recovery current web browsers perform. |
225 | |
226 It also does some basic filtering of CSS properties that may be used for | |
227 typical phishing attacks. For more sophisticated filtering, this class | |
228 provides a couple of hooks that can be overridden in sub-classes. | |
213 | 229 |
214 :warn: Note that this special processing of CSS is currently only applied to | 230 :warn: Note that this special processing of CSS is currently only applied to |
215 style attributes, **not** style elements. | 231 style attributes, **not** style elements. |
216 """ | 232 """ |
217 | 233 |
272 for kind, data, pos in stream: | 288 for kind, data, pos in stream: |
273 if kind is START: | 289 if kind is START: |
274 if waiting_for: | 290 if waiting_for: |
275 continue | 291 continue |
276 tag, attrs = data | 292 tag, attrs = data |
277 if tag not in self.safe_tags: | 293 if not self.is_safe_elem(tag, attrs): |
278 waiting_for = tag | 294 waiting_for = tag |
279 continue | 295 continue |
280 | 296 |
281 new_attrs = [] | 297 new_attrs = [] |
282 for attr, value in attrs: | 298 for attr, value in attrs: |
307 | 323 |
308 elif kind is not COMMENT: | 324 elif kind is not COMMENT: |
309 if not waiting_for: | 325 if not waiting_for: |
310 yield kind, data, pos | 326 yield kind, data, pos |
311 | 327 |
328 def is_safe_css(self, propname, value): | |
329 """Determine whether the given css property declaration is to be | |
330 considered safe for inclusion in the output. | |
331 | |
332 :param propname: the CSS property name | |
333 :param value: the value of the property | |
334 :return: whether the property value should be considered safe | |
335 :rtype: bool | |
336 :since: version 0.6 | |
337 """ | |
338 if propname == 'position': | |
339 return False | |
340 if propname.startswith('margin') and '-' in value: | |
341 # Negative margins can be used for phishing | |
342 return False | |
343 return True | |
344 | |
345 def is_safe_elem(self, tag, attrs): | |
346 """Determine whether the given element should be considered safe for | |
347 inclusion in the output. | |
348 | |
349 :param tag: the tag name of the element | |
350 :type tag: QName | |
351 :param attrs: the element attributes | |
352 :type attrs: Attrs | |
353 :return: whether the element should be considered safe | |
354 :rtype: bool | |
355 :since: version 0.6 | |
356 """ | |
357 if tag not in self.safe_tags: | |
358 return False | |
359 if tag.localname == 'input': | |
360 input_type = attrs.get('type', '').lower() | |
361 if input_type == 'password': | |
362 return False | |
363 return True | |
364 | |
312 def is_safe_uri(self, uri): | 365 def is_safe_uri(self, uri): |
313 """Determine whether the given URI is to be considered safe for | 366 """Determine whether the given URI is to be considered safe for |
314 inclusion in the output. | 367 inclusion in the output. |
315 | 368 |
316 The default implementation checks whether the scheme of the URI is in | 369 The default implementation checks whether the scheme of the URI is in |
325 :param uri: the URI to check | 378 :param uri: the URI to check |
326 :return: `True` if the URI can be considered safe, `False` otherwise | 379 :return: `True` if the URI can be considered safe, `False` otherwise |
327 :rtype: `bool` | 380 :rtype: `bool` |
328 :since: version 0.4.3 | 381 :since: version 0.4.3 |
329 """ | 382 """ |
383 if '#' in uri: | |
384 uri = uri.split('#', 1)[0] # Strip out the fragment identifier | |
330 if ':' not in uri: | 385 if ':' not in uri: |
331 return True # This is a relative URI | 386 return True # This is a relative URI |
332 chars = [char for char in uri.split(':', 1)[0] if char.isalnum()] | 387 chars = [char for char in uri.split(':', 1)[0] if char.isalnum()] |
333 return ''.join(chars).lower() in self.safe_schemes | 388 return ''.join(chars).lower() in self.safe_schemes |
334 | 389 |
361 :rtype: `list` | 416 :rtype: `list` |
362 :since: version 0.4.3 | 417 :since: version 0.4.3 |
363 """ | 418 """ |
364 decls = [] | 419 decls = [] |
365 text = self._strip_css_comments(self._replace_unicode_escapes(text)) | 420 text = self._strip_css_comments(self._replace_unicode_escapes(text)) |
366 for decl in filter(None, text.split(';')): | 421 for decl in text.split(';'): |
367 decl = decl.strip() | 422 decl = decl.strip() |
368 if not decl: | 423 if not decl: |
369 continue | 424 continue |
425 try: | |
426 propname, value = decl.split(':', 1) | |
427 except ValueError: | |
428 continue | |
429 if not self.is_safe_css(propname.strip().lower(), value.strip()): | |
430 continue | |
370 is_evil = False | 431 is_evil = False |
371 if 'expression' in decl: | 432 if 'expression' in value: |
372 is_evil = True | 433 is_evil = True |
373 for match in re.finditer(r'url\s*\(([^)]+)', decl): | 434 for match in re.finditer(r'url\s*\(([^)]+)', value): |
374 if not self.is_safe_uri(match.group(1)): | 435 if not self.is_safe_uri(match.group(1)): |
375 is_evil = True | 436 is_evil = True |
376 break | 437 break |
377 if not is_evil: | 438 if not is_evil: |
378 decls.append(decl.strip()) | 439 decls.append(decl.strip()) |