annotate markup/filters.py @ 13:bf9de5a4c896

Match directives should now also be applied when included indirectly. For example, a match directive defined in `a.html`, which is included by `b.html`, which in turn is included by `c.html` should now also be applied to `c.html`.
author cmlenz
date Mon, 05 Jun 2006 00:03:43 +0000
parents 87238328a71d
children 76b5d4b189e6
rev   line source
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
2 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
3 # Copyright (C) 2006 Christopher Lenz
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
5 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
8 # are also available at http://trac.edgewall.com/license.html.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
9 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
12 # history and logs, available at http://projects.edgewall.com/trac/.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
13
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
14 """Implementation of a number of stream filters."""
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
15
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
16 try:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
17 frozenset
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
18 except NameError:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
19 from sets import ImmutableSet as frozenset
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
20 import re
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
21
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
22 from markup.core import Attributes, Markup, Stream
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
23 from markup.path import Path
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
24
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
25 __all__ = ['EvalFilter', 'IncludeFilter', 'MatchFilter', 'WhitespaceFilter',
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
26 'HTMLSanitizer']
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
27
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
28
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
class EvalFilter(object):
    """Responsible for evaluating expressions in a template.

    The filter is a callable that consumes a stream of `(kind, data, pos)`
    events and yields a new stream in which `Template.EXPR` events (and
    expressions embedded in the attributes of start tags) have been replaced
    by the result of evaluating them against the template context.
    """

    def __call__(self, stream, ctxt=None):
        """Evaluate all expressions found in the stream.

        @param stream: the markup event stream to filter
        @param ctxt: the template context handed to every expression's
            `evaluate()` method
        @return: a generator of `(kind, data, pos)` events
        """
        # Imported lazily, as in the rest of this module.
        # NOTE(review): presumably this avoids a circular import between
        # markup.template and this module -- confirm.
        from markup.template import Template

        for kind, data, pos in stream:

            if kind is Stream.START:
                # Attributes may still contain expressions in start tags at
                # this point, so do some evaluation
                tag, attrib = data
                new_attrib = []
                for name, substream in attrib:
                    if isinstance(substream, basestring):
                        value = substream
                    else:
                        value = []
                        for subkind, subdata, subpos in substream:
                            if subkind is Template.EXPR:
                                subdata = subdata.evaluate(ctxt)
                            if subdata is None:
                                # Parts that evaluate to `None` contribute
                                # nothing to the attribute value
                                continue
                            if not isinstance(subdata, basestring):
                                # Expressions may evaluate to numbers or other
                                # objects; joining those directly would raise
                                # a TypeError
                                subdata = unicode(subdata)
                            value.append(subdata)
                        if not value:
                            # Every part was `None`: drop the attribute
                            continue
                    new_attrib.append((name, ''.join(value)))
                yield kind, (tag, Attributes(new_attrib)), pos

            elif kind is Template.EXPR:
                result = data.evaluate(ctxt)
                if result is None:
                    continue

                # First check for a string, otherwise the iterable test below
                # succeeds, and the string will be chopped up into characters
                if isinstance(result, basestring):
                    yield Stream.TEXT, result, pos
                else:
                    # Test if the expression evaluated to an iterable, in
                    # which case we yield the individual items as a substream.
                    # Only the `iter()` call is guarded, so that a TypeError
                    # raised elsewhere is never mistaken for "not iterable".
                    try:
                        substream = iter(result)
                    except TypeError:
                        # Neither a string nor an iterable, so just pass it
                        # through as text
                        yield Stream.TEXT, unicode(result), pos
                    else:
                        yield Template.SUB, ([], substream), pos

            else:
                yield kind, data, pos
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
81
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
82
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
class IncludeFilter(object):
    """Template filter providing (very) basic XInclude support
    (see http://www.w3.org/TR/xinclude/) in templates.

    Only the `include` and `fallback` elements of the XInclude namespace are
    handled. Namespace declaration events for the XInclude namespace itself
    are removed from the output.
    """

    # Namespace URI identifying XInclude elements in the stream
    _NAMESPACE = 'http://www.w3.org/2001/XInclude'

    def __init__(self, loader, template):
        """Initialize the filter.

        @param loader: the `TemplateLoader` to use for resolving references to
            external template files
        @param template: the template this filter is attached to; filters
            picked up from included templates are accumulated on its
            `_included_filters` list, which is created here if missing
        """
        self.loader = loader
        self.template = template
        if not hasattr(template, '_included_filters'):
            template._included_filters = []

    def __call__(self, stream, ctxt=None, ns_prefixes=None):
        """Filter the stream, processing any XInclude directives it may
        contain.

        @param ctxt: the template context
        @param stream: the markup event stream to filter
        @param ns_prefixes: list of prefixes currently bound to the XInclude
            namespace; only passed explicitly when the filter re-enters
            itself after an include has been processed
        """
        from markup.template import Template, TemplateError, TemplateNotFound

        if ns_prefixes is None:
            ns_prefixes = []
        in_fallback = False
        include_href, fallback_stream = None, None
        indent = 0

        for kind, data, pos in stream:

            if kind is Stream.START and data[0].namespace == self._NAMESPACE \
                    and not in_fallback:
                # Start tags of XInclude elements are consumed, never yielded
                tag, attrib = data
                if tag.localname == 'include':
                    include_href = attrib.get('href')
                    # NOTE(review): pos[1] is presumably the column offset of
                    # the include element -- confirm against the parser
                    indent = pos[1]
                elif tag.localname == 'fallback':
                    in_fallback = True
                    fallback_stream = []

            elif kind is Stream.END and data.namespace == self._NAMESPACE:
                if data.localname == 'include':
                    try:
                        if not include_href:
                            # Not a TemplateNotFound, so this propagates to
                            # the caller even if a fallback was given
                            raise TemplateError('Include misses required '
                                                'attribute "href"')
                        template = self.loader.load(include_href)
                        for ikind, idata, ipos in template.generate(ctxt):
                            # Fixup indentation of included markup
                            if ikind is Stream.TEXT:
                                idata = idata.replace('\n', '\n' + ' ' * indent)
                            yield ikind, idata, ipos

                        # If the included template defines any filters added at
                        # runtime (such as py:match templates), those need to be
                        # applied to the including template, too.
                        # NOTE(review): assumes every loaded template has an
                        # `_included_filters` attribute; in this class it is
                        # only created by __init__ -- confirm.
                        filters = template.filters + template._included_filters
                        for filter_ in filters:
                            # Rebinds `stream`: the remainder of the input is
                            # picked up by the re-entrant call at the bottom
                            stream = filter_(stream, ctxt)

                        # Runtime filters included need to be propagated up
                        # NOTE(review): appended on every processed include
                        # and never cleared in this class; repeated renders
                        # may accumulate duplicates -- confirm.
                        self.template._included_filters += filters

                    except TemplateNotFound:
                        # Without a fallback the error is re-raised, otherwise
                        # the buffered fallback content is emitted instead
                        if fallback_stream is None:
                            raise
                        for event in fallback_stream:
                            yield event

                    # Reset the per-include state and leave the loop so that
                    # the rest of the (possibly re-wrapped) stream is handled
                    # by a fresh invocation of this filter
                    include_href = None
                    fallback_stream = None
                    indent = 0
                    break
                elif data.localname == 'fallback':
                    in_fallback = False

            elif in_fallback:
                # Buffer fallback content; it is only emitted if the include
                # target cannot be found
                fallback_stream.append((kind, data, pos))

            elif kind is Stream.START_NS and data[1] == self._NAMESPACE:
                # Remember the prefix so the matching END_NS can be dropped
                ns_prefixes.append(data[0])
                continue

            elif kind is Stream.END_NS and data in ns_prefixes:
                # Removes the most recently added prefix, which is not
                # necessarily the one named by `data`
                ns_prefixes.pop()
                continue

            else:
                yield kind, data, pos
        else:
            # The loop exited normally, so there shouldn't be further events to
            # process
            return

        # Only reached via `break`: continue with the remaining events,
        # keeping the namespace prefixes seen so far
        for event in self(stream, ctxt, ns_prefixes=ns_prefixes):
            yield event
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
184
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
185
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
class MatchFilter(object):
    """A filter that delegates to a given handler function when the input stream
    matches some path expression.
    """

    def __init__(self, path, handler):
        """Initialize the filter.

        @param path: the path expression to match; it is compiled with `Path`
        @param handler: a callable invoked as `handler(content, ctxt)`, where
            `content` is the list of events making up the matched element
        """
        self.path = Path(path)
        self.handler = handler

    def __call__(self, stream, ctxt=None):
        """Replace every matched element by a `Template.SUB` event.

        The events of a matched element (from its start tag up to and
        including the corresponding end tag) are buffered and handed to the
        handler when the substream filter of the SUB event is applied.

        @param stream: the markup event stream to filter
        @param ctxt: the template context
        @return: a generator of `(kind, data, pos)` events
        """
        from markup.template import Template

        # The loop below and the explicit `next()` calls must advance the
        # same iterator; this also allows plain iterables such as lists
        stream = iter(stream)
        test = self.path.test()
        for kind, data, pos in stream:
            result = test(kind, data, pos)
            if result is True:
                content = [(kind, data, pos)]
                depth = 1
                while depth > 0:
                    ev = stream.next()
                    if ev[0] is Stream.START:
                        depth += 1
                    elif ev[0] is Stream.END:
                        depth -= 1
                    content.append(ev)
                    # Keep the path test in sync with the consumed events
                    test(*ev)

                yield (Template.SUB,
                       ([self._bind_handler(content)], []),
                       pos)
            else:
                yield kind, data, pos

    def _bind_handler(self, content):
        """Return a substream filter bound to one specific match.

        A lambda created inside the loop would close over the loop variable
        and see the content of a *later* match if it is invoked after the
        generator has advanced; binding through a function call avoids that.
        """
        def _apply(stream, ctxt):
            return self.handler(content, ctxt)
        return _apply
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
218
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
219
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
class WhitespaceFilter(object):
    """A filter that removes extraneous white space from the stream.

    Consecutive text events are merged into a single text event, spaces and
    tabs directly in front of a newline are removed, and runs of two or more
    newlines are collapsed into one.

    Todo:
     * Support for xml:space
    """

    # Spaces and tabs immediately followed by a newline
    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    # Two or more consecutive newlines
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __call__(self, stream, ctxt=None):
        """Filter the stream, normalizing white space in text events.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        @return: a generator of `(kind, data, pos)` events
        """
        textbuf = []
        pos = None
        for kind, data, pos in stream:
            if kind is Stream.TEXT:
                textbuf.append(data)
                continue
            if textbuf:
                # The merged text carries the position of the event that
                # follows it
                yield Stream.TEXT, self._normalize(textbuf), pos
                del textbuf[:]
            yield kind, data, pos

        if textbuf:
            # Text at the very end of the stream gets the same treatment as
            # any other text, including removal of trailing spaces
            yield Stream.TEXT, self._normalize(textbuf), pos

    def _normalize(self, chunks):
        """Join the buffered text chunks and strip superfluous white space."""
        text = ''.join(chunks)
        text = self._TRAILING_SPACE.sub('', text)
        return self._LINE_COLLAPSE.sub('\n', text)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
249
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
250
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
class HTMLSanitizer(object):
    """A filter that removes potentially dangerous HTML tags and attributes
    from the stream.

    Elements that are not whitelisted are dropped together with everything
    they contain. On the remaining elements, attributes that are not
    whitelisted, URI attributes using an unsafe scheme, and unsafe inline
    style declarations are removed.
    """

    # Elements that are passed through
    _SAFE_TAGS = frozenset(['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var'])

    # Attributes that are passed through
    _SAFE_ATTRS = frozenset(['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
    # Attributes whose value is a URI and needs its scheme checked
    _URI_ATTRS = frozenset(['action', 'background', 'dynsrc', 'href', 'lowsrc',
        'src'])
    # Acceptable URI schemes; `None` stands for "no scheme given"
    _SAFE_SCHEMES = frozenset(['file', 'ftp', 'http', 'https', 'mailto', None])

    # Matches `url(...)` references in CSS, regardless of letter case
    _CSS_URL = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE)

    def __call__(self, stream, ctxt=None):
        """Filter the stream, dropping unsafe elements and attributes.

        @param stream: the markup event stream to filter
        @param ctxt: the template context (not used by this filter)
        @return: a generator of `(kind, data, pos)` events
        """
        # Name of the unsafe element currently being skipped, and how many
        # elements of that name are open, so that a nested element of the
        # same name does not end the skipping too early
        waiting_for = None
        depth = 0

        for kind, data, pos in stream:
            if kind is Stream.START:
                tag, attrib = data
                if waiting_for:
                    if tag == waiting_for:
                        depth += 1
                    continue
                if tag not in self._SAFE_TAGS:
                    waiting_for = tag
                    depth = 1
                    continue

                new_attrib = []
                for attr, value in attrib:
                    if attr not in self._SAFE_ATTRS:
                        continue
                    elif attr in self._URI_ATTRS:
                        # Don't allow URI schemes such as "javascript:"
                        if self._get_scheme(value) not in self._SAFE_SCHEMES:
                            continue
                    elif attr == 'style':
                        # Remove dangerous CSS declarations from inline styles
                        value = self._sanitize_css(value)
                        if value is None:
                            continue
                    new_attrib.append((attr, value))

                # Emit the same attribute container type as the other filters
                # in this module
                yield kind, (tag, Attributes(new_attrib)), pos

            elif kind is Stream.END:
                if waiting_for:
                    if data == waiting_for:
                        depth -= 1
                        if depth == 0:
                            waiting_for = None
                else:
                    yield kind, data, pos

            else:
                if not waiting_for:
                    yield kind, data, pos

    def _sanitize_css(self, value):
        """Return the safe declarations of an inline style, or `None`.

        Declarations containing `expression` or a `url(...)` reference with
        an unsafe scheme are removed; both checks ignore letter case. This is
        a best-effort check that does not decode CSS escapes or comments.

        @param value: the raw value of a `style` attribute
        @return: the remaining declarations joined by `'; '`, or `None` if no
            declaration is left
        """
        decls = []
        for decl in value.split(';'):
            decl = decl.strip()
            if not decl:
                continue
            if 'expression' in decl.lower():
                continue
            is_evil = False
            for m in self._CSS_URL.finditer(decl):
                if self._get_scheme(m.group(1)) not in self._SAFE_SCHEMES:
                    is_evil = True
                    break
            if not is_evil:
                decls.append(decl)
        if not decls:
            return None
        return '; '.join(decls)

    def _get_scheme(self, text):
        """Return the lower-cased URI scheme of `text`, or `None`.

        Characters that are not alphanumeric are removed from the scheme, so
        that schemes obfuscated with white space or control characters are
        still recognized.
        """
        if ':' not in text:
            return None
        chars = [char for char in text.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()
Copyright (C) 2012-2017 Edgewall Software