Mercurial > genshi > mirror
annotate genshi/filters/html.py @ 844:1ae18bca8de4 trunk
Fix two instances of using None, which would cause an AttributeError.
Submitted by: Jon Nelson
author | jruigrok |
---|---|
date | Mon, 29 Jun 2009 09:33:02 +0000 |
parents | 86b5cee4eb6c |
children | f33ecf3c319e |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
719 | 3 # Copyright (C) 2006-2008 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
14 """Implementation of a number of stream filters.""" | |
15 | |
16 import re | |
17 | |
403
228907abb726
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
363
diff
changeset
|
18 from genshi.core import Attrs, QName, stripentities |
571
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
19 from genshi.core import END, START, TEXT, COMMENT |
1 | 20 |
363
37e4b4bb0b53
Parse template includes at parse time to avoid some runtime overhead.
cmlenz
parents:
345
diff
changeset
|
21 __all__ = ['HTMLFormFiller', 'HTMLSanitizer'] |
425
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
415
diff
changeset
|
22 __docformat__ = 'restructuredtext en' |
275
d91cbdeb75e9
Integrated `HTMLFormFiller` filter initially presented as a [wiki:FormFilling#Usingatemplatefilter recipe].
cmlenz
parents:
230
diff
changeset
|
23 |
d91cbdeb75e9
Integrated `HTMLFormFiller` filter initially presented as a [wiki:FormFilling#Usingatemplatefilter recipe].
cmlenz
parents:
230
diff
changeset
|
24 |
d91cbdeb75e9
Integrated `HTMLFormFiller` filter initially presented as a [wiki:FormFilling#Usingatemplatefilter recipe].
cmlenz
parents:
230
diff
changeset
|
class HTMLFormFiller(object):
    """A stream filter that can populate HTML forms from a dictionary of values.

    The filter walks the markup event stream and, for every form control
    inside a matching ``<form>`` element whose ``name`` is a key of the data
    dictionary, rewrites the control so that it reflects the supplied value:

    * ``<input type="checkbox|radio">`` gets or loses the ``checked``
      attribute,
    * ``<input>`` of type text/hidden/untyped (and password, if enabled) gets
      a ``value`` attribute,
    * ``<option>`` elements in a ``<select>`` get or lose ``selected``,
    * the content of a ``<textarea>`` is replaced by the value.

    Controls whose name is not in the data dictionary are passed through
    unchanged.

    >>> from genshi.input import HTML
    >>> html = HTML('''<form>
    ...   <p><input type="text" name="foo" /></p>
    ... </form>''')
    >>> filler = HTMLFormFiller(data={'foo': 'bar'})
    >>> print html | filler
    <form>
      <p><input type="text" name="foo" value="bar"/></p>
    </form>
    """
    # TODO: only select the first radio button, and the first select option
    #       (if not in a multiple-select)
    # TODO: only apply to elements in the XHTML namespace (or no namespace)?

    def __init__(self, name=None, id=None, data=None, passwords=False):
        """Create the filter.

        :param name: The name of the form that should be populated. If this
                     parameter is given, only forms where the ``name`` attribute
                     value matches the parameter are processed.
        :param id: The ID of the form that should be populated. If this
                   parameter is given, only forms where the ``id`` attribute
                   value matches the parameter are processed.
        :param data: The dictionary of form values, where the keys are the names
                     of the form fields, and the values are the values to fill
                     in.
        :param passwords: Whether password input fields should be populated.
                          This is off by default for security reasons (for
                          example, a password may end up in the browser cache)
        :note: Changed in 0.5.2: added the `passwords` option
        """
        self.name = name
        self.id = id
        # Never share a mutable default between instances.
        if data is None:
            data = {}
        self.data = data
        self.passwords = passwords

    def __call__(self, stream):
        """Apply the filter to the given stream.

        :param stream: the markup event stream to filter
        :return: a generator yielding the (possibly rewritten) events
        """
        in_form = in_select = in_option = in_textarea = False
        select_value = option_value = textarea_value = None
        # The START event of the current <option> is held back until its END
        # event, because the option's value may have to be derived from its
        # text content when it has no ``value`` attribute.
        option_start = None
        option_text = []
        no_option_value = False

        for kind, data, pos in stream:

            if kind is START:
                tag, attrs = data
                tagname = tag.localname

                # A form matches if it has the requested name or ID, or if
                # neither a name nor an ID was requested at all.
                if tagname == 'form' and (
                        self.name and attrs.get('name') == self.name or
                        self.id and attrs.get('id') == self.id or
                        not (self.id or self.name)):
                    in_form = True

                elif in_form:
                    if tagname == 'input':
                        input_type = attrs.get('type', '').lower()
                        if input_type in ('checkbox', 'radio'):
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                declval = attrs.get('value')
                                checked = False
                                if isinstance(value, (list, tuple)):
                                    if declval:
                                        checked = declval in [unicode(v) for v
                                                              in value]
                                    else:
                                        # No declared value: checked if any
                                        # of the supplied items is truthy.
                                        # (A list is built explicitly so the
                                        # result does not depend on whether
                                        # ``filter`` is lazy.)
                                        checked = bool([v for v in value
                                                        if v])
                                else:
                                    if declval:
                                        checked = declval == unicode(value)
                                    elif input_type == 'checkbox':
                                        checked = bool(value)
                                if checked:
                                    attrs |= [(QName('checked'), 'checked')]
                                elif 'checked' in attrs:
                                    attrs -= 'checked'
                        elif input_type in ('', 'hidden', 'text') \
                                or input_type == 'password' and self.passwords:
                            name = attrs.get('name')
                            if name and name in self.data:
                                value = self.data[name]
                                if isinstance(value, (list, tuple)):
                                    # A single-valued field takes the first
                                    # item; an empty sequence means there is
                                    # nothing to fill in.
                                    if value:
                                        value = value[0]
                                    else:
                                        value = None
                                if value is not None:
                                    attrs |= [
                                        (QName('value'), unicode(value))
                                    ]
                    elif tagname == 'select':
                        name = attrs.get('name')
                        if name in self.data:
                            select_value = self.data[name]
                            in_select = True
                    elif tagname == 'textarea':
                        name = attrs.get('name')
                        if name in self.data:
                            textarea_value = self.data.get(name)
                            if isinstance(textarea_value, (list, tuple)):
                                # See the note on single-valued fields above.
                                if textarea_value:
                                    textarea_value = textarea_value[0]
                                else:
                                    textarea_value = None
                            in_textarea = True
                    elif in_select and tagname == 'option':
                        # Buffer the event; it is emitted when the option is
                        # closed and its effective value is known.
                        option_start = kind, data, pos
                        option_value = attrs.get('value')
                        if option_value is None:
                            no_option_value = True
                            option_value = ''
                        in_option = True
                        continue
                yield kind, (tag, attrs), pos

            elif in_form and kind is TEXT:
                if in_select and in_option:
                    if no_option_value:
                        # Without a ``value`` attribute the text content is
                        # the option's value.
                        option_value += data
                    option_text.append((kind, data, pos))
                    continue
                elif in_textarea:
                    # Original content is dropped; the replacement text is
                    # emitted when the textarea is closed.
                    continue
                yield kind, data, pos

            elif in_form and kind is END:
                tagname = data.localname
                if tagname == 'form':
                    in_form = False
                elif tagname == 'select':
                    in_select = False
                    select_value = None
                elif in_select and tagname == 'option':
                    if isinstance(select_value, (tuple, list)):
                        selected = option_value in [unicode(v) for v
                                                    in select_value]
                    else:
                        selected = option_value == unicode(select_value)
                    okind, (tag, attrs), opos = option_start
                    if selected:
                        attrs |= [(QName('selected'), 'selected')]
                    elif 'selected' in attrs:
                        attrs -= 'selected'
                    # Flush the buffered START event and text content.
                    yield okind, (tag, attrs), opos
                    for event in option_text:
                        yield event
                    in_option = False
                    no_option_value = False
                    option_start = option_value = None
                    option_text = []
                elif tagname == 'textarea':
                    if textarea_value:
                        yield TEXT, unicode(textarea_value), pos
                    # Reset the value so that it cannot leak into a later
                    # textarea whose name is not in the data dictionary.
                    textarea_value = None
                    in_textarea = False
                yield kind, data, pos

            else:
                yield kind, data, pos
123
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
190 |
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
191 |
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
192 class HTMLSanitizer(object): |
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
193 """A filter that removes potentially dangerous HTML tags and attributes |
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
194 from the stream. |
431
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
195 |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
196 >>> from genshi import HTML |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
197 >>> html = HTML('<div><script>alert(document.cookie)</script></div>') |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
198 >>> print html | HTMLSanitizer() |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
199 <div/> |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
200 |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
201 The default set of safe tags and attributes can be modified when the filter |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
202 is instantiated. For example, to allow inline ``style`` attributes, the |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
203 following instantiation would work: |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
204 |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
205 >>> html = HTML('<div style="background: #000"></div>') |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
206 >>> sanitizer = HTMLSanitizer(safe_attrs=HTMLSanitizer.SAFE_ATTRS | set(['style'])) |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
207 >>> print html | sanitizer |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
208 <div style="background: #000"/> |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
209 |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
210 Note that even in this case, the filter *does* attempt to remove dangerous |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
211 constructs from style attributes: |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
212 |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
213 >>> html = HTML('<div style="background: url(javascript:void); color: #000"></div>') |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
214 >>> print html | sanitizer |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
215 <div style="color: #000"/> |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
216 |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
217 This handles HTML entities, unicode escapes in CSS and Javascript text, as |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
218 well as a lot of other things. However, the style tag is still excluded by |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
219 default because it is very hard for such sanitizing to be completely safe, |
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
220 especially considering how much error recovery current web browsers perform. |
571
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
221 |
840
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
222 It also does some basic filtering of CSS properties that may be used for |
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
223 typical phishing attacks. For more sophisticated filtering, this class |
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
224 provides a couple of hooks that can be overridden in sub-classes. |
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
225 |
571
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
226 :warn: Note that this special processing of CSS is currently only applied to |
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
227 style attributes, **not** style elements. |
123
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
228 """ |
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
229 |
277
7e30bfa966ab
The `HTMLSanitizer` now lets you override the default set of tag and attribute names that are considered safe.
cmlenz
parents:
275
diff
changeset
|
# Default whitelist of element names; used as the default for the
# ``safe_tags`` argument of the constructor.
SAFE_TAGS = frozenset("""
    a abbr acronym address area b big blockquote br button caption center
    cite code col colgroup dd del dfn dir div dl dt em fieldset font form
    h1 h2 h3 h4 h5 h6 hr i img input ins kbd label legend li map menu ol
    optgroup option p pre q s samp select small span strike strong sub sup
    table tbody td textarea tfoot th thead tr tt u ul var
""".split())

# Default whitelist of attribute names; used as the default for the
# ``safe_attrs`` argument of the constructor. Note that ``style`` is not
# part of this set.
SAFE_ATTRS = frozenset("""
    abbr accept accept-charset accesskey action align alt axis bgcolor
    border cellpadding cellspacing char charoff charset checked cite class
    clear cols colspan color compact coords datetime dir disabled enctype
    for frame headers height href hreflang hspace id ismap label lang
    longdesc maxlength media method multiple name nohref noshade nowrap
    prompt readonly rel rev rows rowspan rules scope selected shape size
    span src start summary tabindex target title type usemap valign value
    vspace width
""".split())

# Default whitelist of URI schemes. ``None`` is a member next to the named
# schemes -- presumably to stand for "no scheme"; TODO confirm.
SAFE_SCHEMES = frozenset(('file', 'ftp', 'http', 'https', 'mailto', None))

# Default set of attribute names whose values are treated as URIs.
URI_ATTRS = frozenset('action background dynsrc href lowsrc src'.split())
277
7e30bfa966ab
The `HTMLSanitizer` now lets you override the default set of tag and attribute names that are considered safe.
cmlenz
parents:
275
diff
changeset
|
256 |
7e30bfa966ab
The `HTMLSanitizer` now lets you override the default set of tag and attribute names that are considered safe.
cmlenz
parents:
275
diff
changeset
|
def __init__(self, safe_tags=SAFE_TAGS, safe_attrs=SAFE_ATTRS,
             safe_schemes=SAFE_SCHEMES, uri_attrs=URI_ATTRS):
    """Set up the sanitizer with its whitelists.

    Every argument defaults to the corresponding class-level set, so the
    allowed elements, attributes and URI schemes can each be overridden
    individually.

    :param safe_tags: a set of tag names that are considered safe
    :param safe_attrs: a set of attribute names that are considered safe
    :param safe_schemes: a set of URI schemes that are considered safe
    :param uri_attrs: a set of names of attributes that contain URIs
    """
    #: The set of tag names that are considered safe.
    self.safe_tags = safe_tags
    #: The set of attribute names that are considered safe.
    self.safe_attrs = safe_attrs
    #: The set of URI schemes that are considered safe.
    self.safe_schemes = safe_schemes
    #: The set of names of attributes that may contain URIs.
    self.uri_attrs = uri_attrs
123
10279d2eeec9
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
113
diff
changeset
|
276 |
439
9f11c745fac9
Add support for adding custom template filters by passing a custom callback function to the `TemplateLoader`. Closes #89 (see added unit test).
cmlenz
parents:
431
diff
changeset
|
def __call__(self, stream):
    """Apply the filter to the given stream.

    Elements rejected by `is_safe_elem` are dropped together with
    everything they contain, up to and including their matching end tag.
    For the remaining elements, attributes are filtered: names outside
    `safe_attrs` are removed, URI attributes must pass `is_safe_uri`, and
    ``style`` values are reduced to the declarations returned by
    `sanitize_css`. Comment events are never passed through.

    :param stream: the markup event stream to filter
    """
    # Name of the unsafe element whose content is currently being dropped,
    # or None when events are passed through.
    waiting_for = None
    # Number of currently open elements named `waiting_for`. Without this
    # counter, the end tag of a nested element with the same name would
    # end the dropped region too early, leaking the remaining content and
    # an unbalanced end tag into the output.
    depth = 0

    for kind, data, pos in stream:
        if kind is START:
            tag, attrs = data
            if waiting_for:
                if tag == waiting_for:
                    depth += 1
                continue
            if not self.is_safe_elem(tag, attrs):
                waiting_for = tag
                depth = 1
                continue

            new_attrs = []
            for attr, value in attrs:
                if attr not in self.safe_attrs:
                    continue
                # Only whitelisted attributes need their entities stripped
                value = stripentities(value)
                if attr in self.uri_attrs:
                    # Don't allow URI schemes such as "javascript:"
                    if not self.is_safe_uri(value):
                        continue
                elif attr == 'style':
                    # Remove dangerous CSS declarations from inline styles
                    decls = self.sanitize_css(value)
                    if not decls:
                        continue
                    value = '; '.join(decls)
                new_attrs.append((attr, value))

            yield kind, (tag, Attrs(new_attrs)), pos

        elif kind is END:
            if waiting_for:
                if data == waiting_for:
                    depth -= 1
                    if not depth:
                        # Outermost dropped element is closed again
                        waiting_for = None
            else:
                yield kind, data, pos

        elif kind is not COMMENT:
            if not waiting_for:
                yield kind, data, pos
431
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
323 |
840
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
def is_safe_css(self, propname, value):
    """Decide whether a single CSS property declaration may be kept in
    the output.

    The ``position`` property is always rejected, and so is any property
    whose name starts with ``margin`` when its value contains a ``-``.

    :param propname: the CSS property name
    :param value: the value of the property
    :return: whether the property value should be considered safe
    :rtype: bool
    :since: version 0.6
    """
    is_positioning = propname == 'position'
    # Negative margins can be used for phishing
    is_negative_margin = propname.startswith('margin') and '-' in value
    return not (is_positioning or is_negative_margin)
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
340 |
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
def is_safe_elem(self, tag, attrs):
    """Decide whether an element may be kept in the output.

    An element is rejected when its tag is not in `safe_tags`, or when it
    is an ``input`` element whose ``type`` attribute is ``password``
    (compared case-insensitively).

    :param tag: the tag name of the element
    :type tag: QName
    :param attrs: the element attributes
    :type attrs: Attrs
    :return: whether the element should be considered safe
    :rtype: bool
    :since: version 0.6
    """
    if tag in self.safe_tags:
        if tag.localname != 'input':
            return True
        # Password fields are the one whitelisted element that is refused
        return attrs.get('type', '').lower() != 'password'
    return False
9eb84c75e5ac
Ported some of the HTML sanitization improvements from Trac (see [T7658]).
cmlenz
parents:
837
diff
changeset
|
360 |
571
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
def is_safe_uri(self, uri):
    """Decide whether a URI may be kept in the output.

    The fragment identifier is ignored. A URI without a colon is treated
    as relative and accepted; otherwise the text before the first colon,
    reduced to its alphanumeric characters and lower-cased, must be a
    member of `safe_schemes`.

    >>> sanitizer = HTMLSanitizer()
    >>> sanitizer.is_safe_uri('http://example.org/')
    True
    >>> sanitizer.is_safe_uri('javascript:alert(document.cookie)')
    False

    :param uri: the URI to check
    :return: `True` if the URI can be considered safe, `False` otherwise
    :rtype: `bool`
    :since: version 0.4.3
    """
    # Strip out the fragment identifier
    uri = uri.split('#', 1)[0]
    if ':' in uri:
        prefix = uri.split(':', 1)[0]
        # Drop everything but letters and digits before the lookup
        scheme = ''.join(ch for ch in prefix if ch.isalnum())
        return scheme.lower() in self.safe_schemes
    return True # This is a relative URI
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
385 |
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
def sanitize_css(self, text):
    """Remove potentially dangerous property declarations from CSS code.

    In particular, properties using the CSS ``url()`` function with a scheme
    that is not considered safe are removed:

    >>> sanitizer = HTMLSanitizer()
    >>> sanitizer.sanitize_css(u'''
    ...   background: url(javascript:alert("foo"));
    ...   color: #000;
    ... ''')
    [u'color: #000']

    Also, the proprietary Internet Explorer function ``expression()`` is
    always stripped:

    >>> sanitizer.sanitize_css(u'''
    ...   background: #fff;
    ...   color: #000;
    ...   width: e/**/xpression(alert("foo"));
    ... ''')
    [u'background: #fff', u'color: #000']

    Both function names are matched case-insensitively, so spellings such
    as ``EXPRESSION(...)`` or ``URL(...)`` are handled the same way as
    their lower-case forms.

    :param text: the CSS text; this is expected to be `unicode` and to not
                 contain any character or numeric references
    :return: a list of declarations that are considered safe
    :rtype: `list`
    :since: version 0.4.3
    """
    decls = []
    # Decode escapes first and strip comments second, so that neither can be
    # used to hide a keyword from the checks below.
    text = self._strip_css_comments(self._replace_unicode_escapes(text))
    for decl in text.split(';'):
        decl = decl.strip()
        if not decl:
            continue
        try:
            propname, value = decl.split(':', 1)
        except ValueError:
            # No colon, so this is not a ``property: value`` declaration.
            continue
        if not self.is_safe_css(propname.strip().lower(), value.strip()):
            continue
        # CSS function names are case-insensitive; a case-sensitive test
        # would let ``Expression(...)`` through.
        if re.search(r'expression', value, re.IGNORECASE):
            continue
        for match in re.finditer(r'url\s*\(([^)]+)', value, re.IGNORECASE):
            if not self.is_safe_uri(match.group(1)):
                break
        else:
            # Reached only when no ``url()`` argument was rejected.
            decls.append(decl)
    return decls
f0461dc3939a
* Cleaned up the implementation of the `HTMLSanitizer`.
cmlenz
parents:
556
diff
changeset
|
437 |
431
ad01564e87f2
* Don't allow `style` attributes by default in the `HTMLSanitizer`. Closes #97.
cmlenz
parents:
425
diff
changeset
|
# Substitution helper that matches Windows-style ``\r\n`` line endings.
_NORMALIZE_NEWLINES = re.compile(r'\r\n').sub
# Substitution helper that matches a CSS escape: a backslash, one to six hex
# digits, and at most one trailing whitespace character (which is consumed).
_UNICODE_ESCAPE = re.compile(r'\\([0-9a-fA-F]{1,6})\s?').sub

def _replace_unicode_escapes(self, text):
    """Decode CSS hexadecimal escapes in `text` into literal characters.

    ``\\r\\n`` sequences are first replaced by ``\\n``; every escape such as
    ``\\65`` is then replaced by the character with that code point.
    Escapes naming a code point that `unichr` rejects (for example
    ``\\110000``) are replaced by U+FFFD REPLACEMENT CHARACTER instead of
    raising, so hostile input cannot crash the sanitizer.

    :param text: the CSS text to decode
    :return: the text with escapes replaced
    """
    def _repl(match):
        try:
            return unichr(int(match.group(1), 16))
        except ValueError:
            # Up to six hex digits can name a value above the maximum code
            # point; unichr() rejects those.
            return u'\ufffd'
    return self._UNICODE_ESCAPE(_repl, self._NORMALIZE_NEWLINES('\n', text))
556
0d98569eaced
The HTML sanitizer now strips any CSS comments in style attributes, which could previously be used to hide malicious property values.
cmlenz
parents:
471
diff
changeset
|
445 |
0d98569eaced
The HTML sanitizer now strips any CSS comments in style attributes, which could previously be used to hide malicious property values.
cmlenz
parents:
471
diff
changeset
|
# Substitution helper that matches ``/* ... */`` comments, non-greedily.
# DOTALL is required so that a comment containing a newline is matched too;
# otherwise such a comment could be used to split a keyword across lines
# and hide it from later checks.
_CSS_COMMENTS = re.compile(r'/\*.*?\*/', re.DOTALL).sub

def _strip_css_comments(self, text):
    """Return `text` with every ``/* ... */`` CSS comment removed.

    Comments are deleted outright (nothing is put in their place), whether
    or not they span several lines.

    :param text: the CSS text
    :return: the text without comments
    """
    return self._CSS_COMMENTS('', text)