comparison genshi/filters/html.py @ 902:09cc3627654c experimental-inline

Sync `experimental/inline` branch with [source:trunk@1126].
author cmlenz
date Fri, 23 Apr 2010 21:08:26 +0000
parents 1837f39efd6f
children
comparison
equal deleted inserted replaced
830:de82830f8816 902:09cc3627654c
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 # 2 #
3 # Copyright (C) 2006-2008 Edgewall Software 3 # Copyright (C) 2006-2009 Edgewall Software
4 # All rights reserved. 4 # All rights reserved.
5 # 5 #
6 # This software is licensed as described in the file COPYING, which 6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms 7 # you should have received as part of this distribution. The terms
8 # are also available at http://genshi.edgewall.org/wiki/License. 8 # are also available at http://genshi.edgewall.org/wiki/License.
11 # individuals. For the exact contribution history, see the revision 11 # individuals. For the exact contribution history, see the revision
12 # history and logs, available at http://genshi.edgewall.org/log/. 12 # history and logs, available at http://genshi.edgewall.org/log/.
13 13
14 """Implementation of a number of stream filters.""" 14 """Implementation of a number of stream filters."""
15 15
16 try:
17 any
18 except NameError:
19 from genshi.util import any
16 import re 20 import re
17 21
18 from genshi.core import Attrs, QName, stripentities 22 from genshi.core import Attrs, QName, stripentities
19 from genshi.core import END, START, TEXT, COMMENT 23 from genshi.core import END, START, TEXT, COMMENT
20 24
28 >>> from genshi.input import HTML 32 >>> from genshi.input import HTML
29 >>> html = HTML('''<form> 33 >>> html = HTML('''<form>
30 ... <p><input type="text" name="foo" /></p> 34 ... <p><input type="text" name="foo" /></p>
31 ... </form>''') 35 ... </form>''')
32 >>> filler = HTMLFormFiller(data={'foo': 'bar'}) 36 >>> filler = HTMLFormFiller(data={'foo': 'bar'})
33 >>> print html | filler 37 >>> print(html | filler)
34 <form> 38 <form>
35 <p><input type="text" name="foo" value="bar"/></p> 39 <p><input type="text" name="foo" value="bar"/></p>
36 </form> 40 </form>
37 """ 41 """
38 # TODO: only select the first radio button, and the first select option 42 # TODO: only select the first radio button, and the first select option
39 # (if not in a multiple-select) 43 # (if not in a multiple-select)
40 # TODO: only apply to elements in the XHTML namespace (or no namespace)? 44 # TODO: only apply to elements in the XHTML namespace (or no namespace)?
41 45
42 def __init__(self, name=None, id=None, data=None): 46 def __init__(self, name=None, id=None, data=None, passwords=False):
43 """Create the filter. 47 """Create the filter.
44 48
45 :param name: The name of the form that should be populated. If this 49 :param name: The name of the form that should be populated. If this
46 parameter is given, only forms where the ``name`` attribute 50 parameter is given, only forms where the ``name`` attribute
47 value matches the parameter are processed. 51 value matches the parameter are processed.
49 parameter is given, only forms where the ``id`` attribute 53 parameter is given, only forms where the ``id`` attribute
50 value matches the parameter are processed. 54 value matches the parameter are processed.
51 :param data: The dictionary of form values, where the keys are the names 55 :param data: The dictionary of form values, where the keys are the names
52 of the form fields, and the values are the values to fill 56 of the form fields, and the values are the values to fill
53 in. 57 in.
58 :param passwords: Whether password input fields should be populated.
59 This is off by default for security reasons (for
60 example, a password may end up in the browser cache)
61 :note: Changed in 0.5.2: added the `passwords` option
54 """ 62 """
55 self.name = name 63 self.name = name
56 self.id = id 64 self.id = id
57 if data is None: 65 if data is None:
58 data = {} 66 data = {}
59 self.data = data 67 self.data = data
68 self.passwords = passwords
60 69
61 def __call__(self, stream): 70 def __call__(self, stream):
62 """Apply the filter to the given stream. 71 """Apply the filter to the given stream.
63 72
64 :param stream: the markup event stream to filter 73 :param stream: the markup event stream to filter
81 not (self.id or self.name)): 90 not (self.id or self.name)):
82 in_form = True 91 in_form = True
83 92
84 elif in_form: 93 elif in_form:
85 if tagname == 'input': 94 if tagname == 'input':
86 type = attrs.get('type') 95 type = attrs.get('type', '').lower()
87 if type in ('checkbox', 'radio'): 96 if type in ('checkbox', 'radio'):
88 name = attrs.get('name') 97 name = attrs.get('name')
89 if name and name in self.data: 98 if name and name in self.data:
90 value = self.data[name] 99 value = self.data[name]
91 declval = attrs.get('value') 100 declval = attrs.get('value')
93 if isinstance(value, (list, tuple)): 102 if isinstance(value, (list, tuple)):
94 if declval: 103 if declval:
95 checked = declval in [unicode(v) for v 104 checked = declval in [unicode(v) for v
96 in value] 105 in value]
97 else: 106 else:
98 checked = bool(filter(None, value)) 107 checked = any(value)
99 else: 108 else:
100 if declval: 109 if declval:
101 checked = declval == unicode(value) 110 checked = declval == unicode(value)
102 elif type == 'checkbox': 111 elif type == 'checkbox':
103 checked = bool(value) 112 checked = bool(value)
104 if checked: 113 if checked:
105 attrs |= [(QName('checked'), 'checked')] 114 attrs |= [(QName('checked'), 'checked')]
106 elif 'checked' in attrs: 115 elif 'checked' in attrs:
107 attrs -= 'checked' 116 attrs -= 'checked'
108 elif type in (None, 'hidden', 'text'): 117 elif type in ('', 'hidden', 'text') \
118 or type == 'password' and self.passwords:
109 name = attrs.get('name') 119 name = attrs.get('name')
110 if name and name in self.data: 120 if name and name in self.data:
111 value = self.data[name] 121 value = self.data[name]
112 if isinstance(value, (list, tuple)): 122 if isinstance(value, (list, tuple)):
113 value = value[0] 123 value = value[0]
114 if value is not None: 124 if value is not None:
115 attrs |= [(QName('value'), unicode(value))] 125 attrs |= [
126 (QName('value'), unicode(value))
127 ]
116 elif tagname == 'select': 128 elif tagname == 'select':
117 name = attrs.get('name') 129 name = attrs.get('name')
118 if name in self.data: 130 if name in self.data:
119 select_value = self.data[name] 131 select_value = self.data[name]
120 in_select = True 132 in_select = True
185 """A filter that removes potentially dangerous HTML tags and attributes 197 """A filter that removes potentially dangerous HTML tags and attributes
186 from the stream. 198 from the stream.
187 199
188 >>> from genshi import HTML 200 >>> from genshi import HTML
189 >>> html = HTML('<div><script>alert(document.cookie)</script></div>') 201 >>> html = HTML('<div><script>alert(document.cookie)</script></div>')
190 >>> print html | HTMLSanitizer() 202 >>> print(html | HTMLSanitizer())
191 <div/> 203 <div/>
192 204
193 The default set of safe tags and attributes can be modified when the filter 205 The default set of safe tags and attributes can be modified when the filter
194 is instantiated. For example, to allow inline ``style`` attributes, the 206 is instantiated. For example, to allow inline ``style`` attributes, the
195 following instantiation would work: 207 following instantiation would work:
196 208
197 >>> html = HTML('<div style="background: #000"></div>') 209 >>> html = HTML('<div style="background: #000"></div>')
198 >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) 210 >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style']))
199 >>> print html | sanitizer 211 >>> print(html | sanitizer)
200 <div style="background: #000"/> 212 <div style="background: #000"/>
201 213
202 Note that even in this case, the filter *does* attempt to remove dangerous 214 Note that even in this case, the filter *does* attempt to remove dangerous
203 constructs from style attributes: 215 constructs from style attributes:
204 216
205 >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>') 217 >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>')
206 >>> print html | sanitizer 218 >>> print(html | sanitizer)
207 <div style="color: #000"/> 219 <div style="color: #000"/>
208 220
209 This handles HTML entities, unicode escapes in CSS and Javascript text, as 221 This handles HTML entities, unicode escapes in CSS and Javascript text, as
210 well as a lot of other things. However, the style tag is still excluded by 222 well as a lot of other things. However, the style tag is still excluded by
211 default because it is very hard for such sanitizing to be completely safe, 223 default because it is very hard for such sanitizing to be completely safe,
212 especially considering how much error recovery current web browsers perform. 224 especially considering how much error recovery current web browsers perform.
225
226 It also does some basic filtering of CSS properties that may be used for
227 typical phishing attacks. For more sophisticated filtering, this class
228 provides a couple of hooks that can be overridden in sub-classes.
213 229
214 :warn: Note that this special processing of CSS is currently only applied to 230 :warn: Note that this special processing of CSS is currently only applied to
215 style attributes, **not** style elements. 231 style attributes, **not** style elements.
216 """ 232 """
217 233
272 for kind, data, pos in stream: 288 for kind, data, pos in stream:
273 if kind is START: 289 if kind is START:
274 if waiting_for: 290 if waiting_for:
275 continue 291 continue
276 tag, attrs = data 292 tag, attrs = data
277 if tag not in self.safe_tags: 293 if not self.is_safe_elem(tag, attrs):
278 waiting_for = tag 294 waiting_for = tag
279 continue 295 continue
280 296
281 new_attrs = [] 297 new_attrs = []
282 for attr, value in attrs: 298 for attr, value in attrs:
307 323
308 elif kind is not COMMENT: 324 elif kind is not COMMENT:
309 if not waiting_for: 325 if not waiting_for:
310 yield kind, data, pos 326 yield kind, data, pos
311 327
328 def is_safe_css(self, propname, value):
329 """Determine whether the given css property declaration is to be
330 considered safe for inclusion in the output.
331
332 :param propname: the CSS property name
333 :param value: the value of the property
334 :return: whether the property value should be considered safe
335 :rtype: bool
336 :since: version 0.6
337 """
338 if propname == 'position':
339 return False
340 if propname.startswith('margin') and '-' in value:
341 # Negative margins can be used for phishing
342 return False
343 return True
344
345 def is_safe_elem(self, tag, attrs):
346 """Determine whether the given element should be considered safe for
347 inclusion in the output.
348
349 :param tag: the tag name of the element
350 :type tag: QName
351 :param attrs: the element attributes
352 :type attrs: Attrs
353 :return: whether the element should be considered safe
354 :rtype: bool
355 :since: version 0.6
356 """
357 if tag not in self.safe_tags:
358 return False
359 if tag.localname == 'input':
360 input_type = attrs.get('type', '').lower()
361 if input_type == 'password':
362 return False
363 return True
364
312 def is_safe_uri(self, uri): 365 def is_safe_uri(self, uri):
313 """Determine whether the given URI is to be considered safe for 366 """Determine whether the given URI is to be considered safe for
314 inclusion in the output. 367 inclusion in the output.
315 368
316 The default implementation checks whether the scheme of the URI is in 369 The default implementation checks whether the scheme of the URI is in
325 :param uri: the URI to check 378 :param uri: the URI to check
326 :return: `True` if the URI can be considered safe, `False` otherwise 379 :return: `True` if the URI can be considered safe, `False` otherwise
327 :rtype: `bool` 380 :rtype: `bool`
328 :since: version 0.4.3 381 :since: version 0.4.3
329 """ 382 """
383 if '#' in uri:
384 uri = uri.split('#', 1)[0] # Strip out the fragment identifier
330 if ':' not in uri: 385 if ':' not in uri:
331 return True # This is a relative URI 386 return True # This is a relative URI
332 chars = [char for char in uri.split(':', 1)[0] if char.isalnum()] 387 chars = [char for char in uri.split(':', 1)[0] if char.isalnum()]
333 return ''.join(chars).lower() in self.safe_schemes 388 return ''.join(chars).lower() in self.safe_schemes
334 389
361 :rtype: `list` 416 :rtype: `list`
362 :since: version 0.4.3 417 :since: version 0.4.3
363 """ 418 """
364 decls = [] 419 decls = []
365 text = self._strip_css_comments(self._replace_unicode_escapes(text)) 420 text = self._strip_css_comments(self._replace_unicode_escapes(text))
366 for decl in filter(None, text.split(';')): 421 for decl in text.split(';'):
367 decl = decl.strip() 422 decl = decl.strip()
368 if not decl: 423 if not decl:
369 continue 424 continue
425 try:
426 propname, value = decl.split(':', 1)
427 except ValueError:
428 continue
429 if not self.is_safe_css(propname.strip().lower(), value.strip()):
430 continue
370 is_evil = False 431 is_evil = False
371 if 'expression' in decl: 432 if 'expression' in value:
372 is_evil = True 433 is_evil = True
373 for match in re.finditer(r'url\s*\(([^)]+)', decl): 434 for match in re.finditer(r'url\s*\(([^)]+)', value):
374 if not self.is_safe_uri(match.group(1)): 435 if not self.is_safe_uri(match.group(1)):
375 is_evil = True 436 is_evil = True
376 break 437 break
377 if not is_evil: 438 if not is_evil:
378 decls.append(decl.strip()) 439 decls.append(decl.strip())
Copyright (C) 2012-2017 Edgewall Software