Mercurial > genshi > genshi-test
annotate genshi/output.py @ 500:0742f421caba experimental-inline
Merged revisions 487-603 via svnmerge from
http://svn.edgewall.org/repos/genshi/trunk
author | cmlenz |
---|---|
date | Fri, 01 Jun 2007 17:21:47 +0000 |
parents | 49aa525b8f83 |
children | 1837f39efd6f |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
500 | 3 # Copyright (C) 2006-2007 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
14 """This module provides different kinds of serialization methods for XML event | |
15 streams. | |
16 """ | |
17 | |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
18 from itertools import chain |
1 | 19 try: |
20 frozenset | |
21 except NameError: | |
22 from sets import ImmutableSet as frozenset | |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
23 import re |
1 | 24 |
500 | 25 from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind |
26 from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \ | |
27 START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE | |
1 | 28 |
500 | 29 __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer', |
30 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] | |
31 __docformat__ = 'restructuredtext en' | |
32 | |
def encode(iterator, method='xml', encoding='utf-8'):
    """Encode serializer output into a string.

    :param iterator: the iterator returned from serializing a stream (basically
                     any iterator that yields unicode objects)
    :param method: the serialization method; determines how characters not
                   representable in the specified encoding are treated. This
                   may be a method name (compared case-insensitively, as in
                   `get_serializer`), a serializer class, or a serializer
                   instance
    :param encoding: how the output string should be encoded; if set to `None`,
                     this method returns a `unicode` object
    :return: a string or unicode object (depending on the `encoding` parameter)
    :since: version 0.4.1
    """
    output = u''.join(iterator)
    if encoding is None:
        return output

    # Decide whether the output is plain text. Text output cannot carry
    # numeric character references, so unencodable characters are replaced
    # with a placeholder; markup output uses character references instead.
    if hasattr(method, 'lower'):
        # A method name; `get_serializer` lowercases names too.
        is_text = method.lower() == 'text'
    elif isinstance(method, type):
        # A serializer class, which `get_serializer` also accepts.
        is_text = issubclass(method, TextSerializer)
    else:
        is_text = isinstance(method, TextSerializer)

    if is_text:
        errors = 'replace'
    else:
        errors = 'xmlcharrefreplace'
    return output.encode(encoding, errors)
52 | |
def get_serializer(method='xml', **kwargs):
    """Create and return a serializer for the requested method.

    :param method: the serialization method; can be either "xml", "xhtml",
                   "html", "text", or a custom serializer class

    Method names are matched case-insensitively; an unknown name raises a
    `KeyError`. All remaining keyword arguments are handed to the serializer
    constructor, so which ones are valid depends on `method`.

    :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer`
    :since: version 0.4.1
    """
    serializer_cls = method
    if isinstance(serializer_cls, basestring):
        # Resolve a method name to the built-in serializer class.
        builtin_serializers = {
            'xml': XMLSerializer,
            'xhtml': XHTMLSerializer,
            'html': HTMLSerializer,
            'text': TextSerializer,
        }
        serializer_cls = builtin_serializers[serializer_cls.lower()]
    return serializer_cls(**kwargs)
1 | 71 |
72 | |
class DocType(object):
    """Defines a number of commonly used DOCTYPE declarations as constants.

    Every constant is a ``(name, pubid, sysid)`` tuple.
    """

    HTML_STRICT = (
        'html', '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd'
    )
    HTML_TRANSITIONAL = (
        'html', '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd'
    )
    HTML_FRAMESET = (
        'html', '-//W3C//DTD HTML 4.01 Frameset//EN',
        'http://www.w3.org/TR/html4/frameset.dtd'
    )
    HTML = HTML_STRICT

    # HTML5 uses neither a public nor a system identifier.
    HTML5 = ('html', None, None)

    XHTML_STRICT = (
        'html', '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
    )
    XHTML_TRANSITIONAL = (
        'html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
    )
    XHTML_FRAMESET = (
        'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
    )
    XHTML = XHTML_STRICT

    def get(cls, name):
        """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
        declaration for the specified name.

        The following names are recognized in this version (the lookup is
        case-insensitive):

         * "html" or "html-strict" for the HTML 4.01 strict DTD
         * "html-transitional" for the HTML 4.01 transitional DTD
         * "html-frameset" for the HTML 4.01 frameset DTD
         * "html5" for the ``DOCTYPE`` proposed for HTML5
         * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
         * "xhtml-transitional" for the XHTML 1.0 transitional DTD
         * "xhtml-frameset" for the XHTML 1.0 frameset DTD

        :param name: the name of the ``DOCTYPE``
        :return: the ``(name, pubid, sysid)`` tuple for the requested
                 ``DOCTYPE``, or ``None`` if the name is not recognized
        :since: version 0.4.1
        """
        # Always look the constants up on `cls` so that subclasses overriding
        # any of them are honoured consistently.
        return {
            'html': cls.HTML, 'html-strict': cls.HTML_STRICT,
            'html-transitional': cls.HTML_TRANSITIONAL,
            'html-frameset': cls.HTML_FRAMESET,
            'html5': cls.HTML5,
            'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
            'xhtml-transitional': cls.XHTML_TRANSITIONAL,
            'xhtml-frameset': cls.XHTML_FRAMESET,
        }.get(name.lower())
    get = classmethod(get)
134 | |
85 | 135 |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
class XMLSerializer(object):
    """Produces XML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(XMLSerializer()(elem.generate()))
    <div><a href="foo"/><br/><hr noshade="True"/></div>
    """

    # Names handed to `WhitespaceFilter` as the elements in which whitespace
    # is preserved; generic XML has none.
    _PRESERVE_SPACE = frozenset()

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None):
        """Initialize the XML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output, or the name of a DOCTYPE as
                        defined in `DocType.get`
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param namespace_prefixes: passed on to `NamespaceFlattener` as its
                                   ``prefixes`` argument
        :raise ValueError: if `doctype` is a name that `DocType.get` does not
                           recognize
        :note: Changed in 0.4.2: The `doctype` parameter can now be a string.
        """
        self.preamble = []
        if doctype:
            if isinstance(doctype, basestring):
                doctype_name = doctype
                doctype = DocType.get(doctype_name)
                if doctype is None:
                    # Fail here rather than with an obscure unpacking error
                    # once the DOCTYPE event reaches `__call__`.
                    raise ValueError('Unknown DOCTYPE name %r' % doctype_name)
            self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))

    def __call__(self, stream):
        """Serialize the event stream, yielding chunks of XML text.

        The preamble events are prepended to `stream` and the result is run
        through every filter in ``self.filters`` before being rendered.

        :param stream: an iterable of ``(kind, data, pos)`` events
        :return: a generator yielding the serialized text piece by piece
        """
        # At most one XML declaration and one DOCTYPE are written; any later
        # ones in the stream are dropped.
        have_decl = have_doctype = False
        in_cdata = False

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    buf += [' ', attr, '="', escape(value), '"']
                # EMPTY events are written as self-closing tags.
                buf.append(kind is EMPTY and '/>' or '>')
                yield Markup(u''.join(buf))

            elif kind is END:
                yield Markup('</%s>' % data)

            elif kind is TEXT:
                if in_cdata:
                    # Text inside a CDATA section is written verbatim.
                    yield data
                else:
                    yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is XML_DECL and not have_decl:
                version, encoding, standalone = data
                buf = ['<?xml version="%s"' % version]
                if encoding:
                    buf.append(' encoding="%s"' % encoding)
                # -1 means the declaration carries no standalone attribute.
                if standalone != -1:
                    standalone = standalone and 'yes' or 'no'
                    buf.append(' standalone="%s"' % standalone)
                buf.append('?>\n')
                yield Markup(u''.join(buf))
                have_decl = True

            elif kind is DOCTYPE and not have_doctype:
                name, pubid, sysid = data
                # Build a template with one placeholder per non-empty part;
                # the parts are substituted by `Markup` below.
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(u''.join(buf), *filter(None, data))
                have_doctype = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)
233 | |
234 | |
class XHTMLSerializer(XMLSerializer):
    """Produces XHTML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(XHTMLSerializer()(elem.generate()))
    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
    """

    # Elements written as `<tag />` when empty; every other empty element is
    # written as an explicit start/end tag pair.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])
    # Attributes whose value is always written as the attribute name itself.
    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                'defer', 'disabled', 'ismap', 'multiple',
                                'nohref', 'noresize', 'noshade', 'nowrap'])
    # Names handed to `WhitespaceFilter` as the elements in which whitespace
    # is preserved.
    _PRESERVE_SPACE = frozenset([
        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
    ])

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None):
        """Initialize the XHTML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output, or the name of a DOCTYPE as
                        defined in `DocType.get`
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param namespace_prefixes: a mapping passed on to `NamespaceFlattener`
                                   as its ``prefixes`` argument, after the
                                   XHTML namespace has been mapped to the
                                   empty prefix; the caller's mapping is left
                                   untouched
        """
        # The parent only needs to set up the preamble; its filter chain is
        # replaced right below.
        super(XHTMLSerializer, self).__init__(doctype, False)
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        # Work on a copy so the caller's mapping is not modified.
        prefixes = dict(namespace_prefixes or {})
        prefixes['http://www.w3.org/1999/xhtml'] = ''
        self.filters.append(NamespaceFlattener(prefixes=prefixes))

    def __call__(self, stream):
        """Serialize the event stream, yielding chunks of XHTML text.

        The preamble events are prepended to `stream` and the result is run
        through every filter in ``self.filters`` before being rendered. No
        branch handles XML declaration events, so they produce no output.

        :param stream: an iterable of ``(kind, data, pos)`` events
        :return: a generator yielding the serialized text piece by piece
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        # Only the first DOCTYPE event is written.
        have_doctype = False
        in_cdata = False

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        # Expanded form, e.g. noshade="noshade".
                        value = attr
                    buf += [' ', attr, '="', escape(value), '"']
                if kind is EMPTY:
                    if tag in empty_elems:
                        buf.append(' />')
                    else:
                        buf.append('></%s>' % tag)
                else:
                    buf.append('>')
                yield Markup(u''.join(buf))

            elif kind is END:
                yield Markup('</%s>' % data)

            elif kind is TEXT:
                if in_cdata:
                    # Text inside a CDATA section is written verbatim.
                    yield data
                else:
                    yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE and not have_doctype:
                name, pubid, sysid = data
                # Build a template with one placeholder per non-empty part;
                # the parts are substituted by `Markup` below.
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(u''.join(buf), *filter(None, data))
                have_doctype = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)
334a338847af
Include processing instructions in serialized streams.
cmlenz
parents:
96
diff
changeset
|
327 |
96
35d681a94763
Add an XHTML serialization method. Now really need to get rid of some code duplication in the `markup.output` module.
cmlenz
parents:
89
diff
changeset
|
328 |
35d681a94763
Add an XHTML serialization method. Now really need to get rid of some code duplication in the `markup.output` module.
cmlenz
parents:
89
diff
changeset
|
class HTMLSerializer(XHTMLSerializer):
    """Produces HTML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(HTMLSerializer()(elem.generate()))
    <div><a href="foo"></a><br><hr noshade></div>
    """

    # Elements whose text content is written without escaping.
    _NOESCAPE_ELEMS = frozenset([
        QName('script'), QName('http://www.w3.org/1999/xhtml}script'),
        QName('style'), QName('http://www.w3.org/1999/xhtml}style')
    ])

    def __init__(self, doctype=None, strip_whitespace=True):
        """Initialize the HTML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        """
        # The parent only needs to set up the preamble; its filter chain is
        # replaced right below.
        super(HTMLSerializer, self).__init__(doctype, False)
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
                                                 self._NOESCAPE_ELEMS))
        self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml'))

    def __call__(self, stream):
        """Serialize the event stream, yielding chunks of HTML text.

        The preamble events are prepended to `stream` and the result is run
        through every filter in ``self.filters`` before being rendered. No
        branch handles XML declaration or CDATA marker events, so they
        produce no output.

        :param stream: an iterable of ``(kind, data, pos)`` events
        :return: a generator yielding the serialized text piece by piece
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        noescape_elems = self._NOESCAPE_ELEMS
        # Only the first DOCTYPE event is written.
        have_doctype = False
        # True while inside an element whose text is written unescaped.
        noescape = False

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        # Minimized form; the attribute is left out entirely
                        # when its value is false.
                        if value:
                            buf += [' ', attr]
                    else:
                        buf += [' ', attr, '="', escape(value), '"']
                buf.append('>')
                if kind is EMPTY:
                    if tag not in empty_elems:
                        buf.append('</%s>' % tag)
                yield Markup(u''.join(buf))
                # Only a START event opens an unescaped region. An EMPTY
                # event stands for a combined start/end pair, so no END event
                # follows to switch the flag off again and subsequent text
                # would wrongly be written unescaped.
                if kind is START and tag in noescape_elems:
                    noescape = True

            elif kind is END:
                yield Markup('</%s>' % data)
                noescape = False

            elif kind is TEXT:
                if noescape:
                    yield data
                else:
                    yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE and not have_doctype:
                name, pubid, sysid = data
                # Build a template with one placeholder per non-empty part;
                # the parts are substituted by `Markup` below.
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(u''.join(buf), *filter(None, data))
                have_doctype = True

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)
334a338847af
Include processing instructions in serialized streams.
cmlenz
parents:
96
diff
changeset
|
416 |
1 | 417 |
200
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
class TextSerializer(object):
    """Serializer that reduces an event stream to its plain text.

    Only `TEXT` events contribute to the output; all other events are
    dropped. Unlike the other serializers, special XML characters are not
    escaped:

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a('<Hello!>', href='foo'), tag.br)
    >>> print elem
    <div><a href="foo">&lt;Hello!&gt;</a><br/></div>
    >>> print ''.join(TextSerializer()(elem.generate()))
    <Hello!>

    If text events contain literal markup (instances of the `Markup` class),
    tags or entities are stripped from the output:

    >>> elem = tag.div(Markup('<a href="foo">Hello!</a><br/>'))
    >>> print elem
    <div><a href="foo">Hello!</a><br/></div>
    >>> print ''.join(TextSerializer()(elem.generate()))
    Hello!
    """

    def __call__(self, stream):
        """Yield the text of every `TEXT` event in `stream`.

        :param stream: an iterable of event tuples whose first item is the
                       event kind and whose second item is the event data
        :return: a generator producing one `unicode` object per `TEXT` event
        """
        for event in stream:
            if event[0] is not TEXT:
                continue
            text = event[1]
            # Exact type check: only objects whose type is `Markup` itself
            # have their tags and entities stripped.
            if type(text) is Markup:
                text = text.striptags()
                text = text.stripentities()
            # Always hand out plain `unicode`, never `Markup`.
            yield unicode(text)
200
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
448 |
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
449 |
212
e8c43127d9a9
Refactored the handling of empty tags in the serializer: use an `EmptyTagFilter` that combines adjacent start/end events, instead of the generic pushback-iterator.
cmlenz
parents:
201
diff
changeset
|
class EmptyTagFilter(object):
    """Combines a `START` event and the `END` event immediately following it
    into a single `EMPTY` event, for elements that have no contents.

    All other events are passed through unchanged and in their original
    order.
    """

    EMPTY = StreamEventKind('EMPTY')

    def __call__(self, stream):
        """Apply the filter to `stream`.

        :param stream: an iterable of ``(kind, data, pos)`` event tuples
        :return: a generator over the resulting events, in which every
                 `START` directly followed by an `END` has been replaced by
                 one `EMPTY` event carrying the data and position of the
                 `START` event
        """
        # A `START` event is held back until the next event shows whether
        # the element is empty.
        pending = None
        for event in stream:
            if pending is not None:
                if event[0] is END:
                    # START directly followed by END: emit one EMPTY event
                    # and swallow the END.
                    yield EMPTY, pending[1], pending[2]
                    pending = None
                    continue
                # The element has content, so release the START as it was.
                yield pending
                pending = None
            if event[0] is START:
                pending = event
            else:
                yield event
        # Do not silently drop a START that is the last event of the stream.
        if pending is not None:
            yield pending


# Module-level alias used by the filters and serializers in this module.
EMPTY = EmptyTagFilter.EMPTY
e8c43127d9a9
Refactored the handling of empty tags in the serializer: use an `EmptyTagFilter` that combines adjacent start/end events, instead of the generic pushback-iterator.
cmlenz
parents:
201
diff
changeset
|
473 |
e8c43127d9a9
Refactored the handling of empty tags in the serializer: use an `EmptyTagFilter` that combines adjacent start/end events, instead of the generic pushback-iterator.
cmlenz
parents:
201
diff
changeset
|
474 |
class NamespaceFlattener(object):
    r"""Output stream filter that replaces the namespace events of a stream
    with ``xmlns`` attributes and prefixed tag/attribute names.

    `START_NS` and `END_NS` events are consumed; the declarations they carry
    are attached as attributes to the next `START` or `EMPTY` event.

    :param prefixes: optional mapping of namespace URIs to prefixes

    >>> from genshi.input import XML
    >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
    ...   <two:item/>
    ... </doc>''')
    >>> for kind, data, pos in NamespaceFlattener()(xml):
    ...     print kind, repr(data)
    START (u'doc', Attrs([(u'xmlns', u'NS1'), (u'xmlns:two', u'NS2')]))
    TEXT u'\n  '
    START (u'two:item', Attrs())
    END u'two:item'
    TEXT u'\n'
    END u'doc'
    """

    def __init__(self, prefixes=None):
        """Store the URI-to-prefix mapping; the XML namespace is always
        mapped to the ``xml`` prefix unless `prefixes` overrides it.
        """
        mapping = {XML_NAMESPACE.uri: 'xml'}
        if prefixes is not None:
            mapping.update(prefixes)
        self.prefixes = mapping

    def __call__(self, stream):
        """Apply the filter to `stream`.

        :param stream: an iterable of ``(kind, data, pos)`` event tuples
        :return: a generator over the flattened events
        """
        # prefix -> stack of URIs currently bound to that prefix
        prefixes = {}
        for known_uri, known_prefix in self.prefixes.items():
            prefixes[known_prefix] = [known_uri]
        # URI -> stack of prefixes currently bound to that URI
        namespaces = {XML_NAMESPACE.uri: ['xml']}

        def _push_ns(prefix, uri):
            namespaces.setdefault(uri, []).append(prefix)
            prefixes.setdefault(prefix, []).append(uri)

        # Declarations collected since the last START/EMPTY event; they are
        # emitted as attributes of the next one.
        ns_attrs = []
        _push_ns_attr = ns_attrs.append

        def _make_ns_attr(prefix, uri):
            suffix = ''
            if prefix:
                suffix = ':%s' % prefix
            return u'xmlns%s' % suffix, uri

        def _prefix_sequence():
            # Endless supply of generated prefixes: ns1, ns2, ...
            counter = 0
            while 1:
                counter += 1
                yield 'ns%d' % counter
        _gen_prefix = _prefix_sequence().next

        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrs = data

                tagname = tag.localname
                tagns = tag.namespace
                if tagns:
                    if tagns not in namespaces:
                        # Undeclared namespace: make it the default one.
                        _push_ns_attr((u'xmlns', tagns))
                        _push_ns('', tagns)
                    else:
                        prefix = namespaces[tagns][-1]
                        if prefix:
                            tagname = u'%s:%s' % (prefix, tagname)

                new_attrs = []
                for attr, value in attrs:
                    attrname = attr.localname
                    attrns = attr.namespace
                    if attrns:
                        if attrns in namespaces:
                            prefix = namespaces[attrns][-1]
                        else:
                            # Undeclared namespace: invent a prefix for it.
                            prefix = _gen_prefix()
                            _push_ns(prefix, attrns)
                            _push_ns_attr(('xmlns:%s' % prefix, attrns))
                        if prefix:
                            attrname = u'%s:%s' % (prefix, attrname)
                    new_attrs.append((attrname, value))

                yield kind, (tagname, Attrs(ns_attrs + new_attrs)), pos
                del ns_attrs[:]

            elif kind is END:
                tagname = data.localname
                tagns = data.namespace
                if tagns:
                    prefix = namespaces[tagns][-1]
                    if prefix:
                        tagname = u'%s:%s' % (prefix, tagname)
                yield kind, tagname, pos

            elif kind is START_NS:
                prefix, uri = data
                if uri not in namespaces:
                    # NOTE(review): `prefixes` is keyed by prefix (see its
                    # construction above), yet it is queried with a URI
                    # here -- confirm that this is intended.
                    prefix = prefixes.get(uri, [prefix])[-1]
                    _push_ns_attr(_make_ns_attr(prefix, uri))
                _push_ns(prefix, uri)

            elif kind is END_NS:
                if data in prefixes:
                    bound_uris = prefixes[data]
                    uri = bound_uris.pop()
                    if not bound_uris:
                        del prefixes[data]
                    if uri not in bound_uris or uri != bound_uris[-1]:
                        bound_prefixes = namespaces[uri]
                        bound_prefixes.pop()
                        if not bound_prefixes:
                            del namespaces[uri]
                    if ns_attrs:
                        # The declaration went out of scope before any
                        # element could carry it, so discard it.
                        try:
                            ns_attrs.remove(_make_ns_attr(data, uri))
                        except ValueError:
                            pass

            else:
                yield kind, data, pos
587 | |
588 | |
class NamespaceStripper(object):
    r"""Stream filter that removes all namespace information from a stream, and
    optionally strips out all tags not in a given namespace.

    Only the `START`, `EMPTY` and `END` events of a stripped element are
    dropped; the events between them are still processed as usual.

    :param namespace: the URI of the namespace that should not be stripped. If
                      not set, only elements with no namespace are included in
                      the output.

    >>> from genshi.input import XML
    >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
    ...   <two:item/>
    ... </doc>''')
    >>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml):
    ...     print kind, repr(data)
    START (u'doc', Attrs())
    TEXT u'\n  '
    TEXT u'\n'
    END u'doc'
    """

    def __init__(self, namespace=None):
        """Remember the namespace whose elements and attributes are kept."""
        if namespace is not None:
            self.namespace = Namespace(namespace)
        else:
            # An empty container makes every `in` test in `__call__` fail,
            # so that only names without a namespace are kept.
            self.namespace = {}

    def __call__(self, stream):
        """Apply the filter to `stream`.

        :param stream: an iterable of ``(kind, data, pos)`` event tuples
        :return: a generator over the events that were kept; tag and
                 attribute names in them are reduced to their local names
        """
        namespace = self.namespace

        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrs = data
                if tag.namespace and tag not in namespace:
                    continue

                # Emit local names for attributes as well as for the tag, so
                # that no namespace information is left on retained
                # attributes.
                kept = [(attr.localname, value) for attr, value in attrs
                        if not attr.namespace or attr in namespace]
                data = tag.localname, Attrs(kept)

            elif kind is END:
                if data.namespace and data not in namespace:
                    continue
                data = data.localname

            elif kind is START_NS or kind is END_NS:
                # Namespace declarations never reach the output.
                continue

            yield kind, data, pos
641 | |
642 | |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
class WhitespaceFilter(object):
    """Stream filter that merges adjacent text events and strips ignorable
    white space from them.
    """

    def __init__(self, preserve=None, noescape=None):
        """Initialize the filter.

        :param preserve: a set or sequence of tag names for which white-space
                         should be preserved
        :param noescape: a set or sequence of tag names for which text content
                         should not be escaped

        The `noescape` set is expected to refer to elements that cannot contain
        further child elements (such as ``<style>`` or ``<script>`` in HTML
        documents).
        """
        if preserve is None:
            preserve = []
        if noescape is None:
            noescape = []
        self.preserve = frozenset(preserve)
        self.noescape = frozenset(noescape)

    def __call__(self, stream, ctxt=None, space=XML_NAMESPACE['space'],
                 trim_trailing_space=re.compile('[ \t]+(?=\n)').sub,
                 collapse_lines=re.compile('\n{2,}').sub):
        """Apply the filter to `stream`.

        Consecutive `TEXT` events are buffered and emitted as one `TEXT`
        event holding escaped `Markup`. Unless white space is being
        preserved, blanks and tabs in front of a newline are removed and
        runs of newlines are collapsed into a single one.

        :param stream: an iterable of ``(kind, data, pos)`` event tuples
        :param ctxt: not used by this filter
        :param space: the qualified name of the ``xml:space`` attribute
        :param trim_trailing_space: substitution function used to remove
                                    blanks and tabs preceding a newline
        :param collapse_lines: substitution function used to collapse
                               consecutive newlines
        :return: a generator over the filtered events
        """
        join_markup = Markup('').join
        preserve_tags = self.preserve
        noescape_tags = self.noescape
        # Counts how many open elements currently demand preservation.
        preserve_depth = 0
        # True while text must be taken over verbatim.
        raw_text = False

        pending = []
        # The trailing sentinel flushes text buffered at the end of the
        # stream; it is never yielded itself.
        for kind, data, pos in chain(stream, [(None, None, None)]):

            if kind is TEXT:
                if raw_text:
                    data = Markup(data)
                pending.append(data)
                continue

            if pending:
                if len(pending) == 1:
                    text = escape(pending.pop(), quotes=False)
                else:
                    text = join_markup(pending, escape_quotes=False)
                    del pending[:]
                if not preserve_depth:
                    text = trim_trailing_space('', text)
                    text = collapse_lines('\n', text)
                # The merged text carries the position of the event that
                # triggered the flush.
                yield TEXT, Markup(text), pos

            if kind is START:
                tag, attrs = data
                if preserve_depth or tag in preserve_tags \
                        or attrs.get(space) == 'preserve':
                    preserve_depth += 1
                if not raw_text and tag in noescape_tags:
                    raw_text = True
            elif kind is END:
                raw_text = False
                if preserve_depth:
                    preserve_depth -= 1
            elif kind is START_CDATA:
                raw_text = True
            elif kind is END_CDATA:
                raw_text = False

            if kind:
                yield kind, data, pos