Mercurial > genshi > genshi-test
annotate genshi/output.py @ 437:3d82c5bdbf46
Fix for #107.
author | cmlenz |
---|---|
date | Mon, 02 Apr 2007 15:52:21 +0000 |
parents | 5b248708bbed |
children | 0407937b2853 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
408 | 3 # Copyright (C) 2006-2007 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
14 """This module provides different kinds of serialization methods for XML event | |
15 streams. | |
16 """ | |
17 | |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
18 from itertools import chain |
1 | 19 try: |
20 frozenset | |
21 except NameError: | |
22 from sets import ImmutableSet as frozenset | |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
23 import re |
1 | 24 |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
25 from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind |
402
cc7f5b3fbbed
Fix output of namespace declarations for namespace URLs appearing more than once in a stream. Thanks to Jeff Cutsinger for reporting the problem.
cmlenz
parents:
397
diff
changeset
|
26 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ |
cc7f5b3fbbed
Fix output of namespace declarations for namespace URLs appearing more than once in a stream. Thanks to Jeff Cutsinger for reporting the problem.
cmlenz
parents:
397
diff
changeset
|
27 START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE |
1 | 28 |
200
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
29 __all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer', |
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
30 'TextSerializer'] |
425
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
412
diff
changeset
|
31 __docformat__ = 'restructuredtext en' |
1 | 32 |
33 | |
class DocType(object):
    """Collection of frequently used DOCTYPE declarations.

    Every constant is a three-item tuple holding the root element name, the
    public identifier and the system identifier (DTD location), in that order.
    """

    # --- HTML 4.01 ---------------------------------------------------------
    HTML_STRICT = ('html',
                   '-//W3C//DTD HTML 4.01//EN',
                   'http://www.w3.org/TR/html4/strict.dtd')
    HTML_TRANSITIONAL = ('html',
                         '-//W3C//DTD HTML 4.01 Transitional//EN',
                         'http://www.w3.org/TR/html4/loose.dtd')
    # Plain ``HTML`` is an alias for the strict variant.
    HTML = HTML_STRICT

    # --- XHTML 1.0 ---------------------------------------------------------
    XHTML_STRICT = ('html',
                    '-//W3C//DTD XHTML 1.0 Strict//EN',
                    'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')
    XHTML_TRANSITIONAL = (
        'html',
        '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')
    # Plain ``XHTML`` is an alias for the strict variant.
    XHTML = XHTML_STRICT
56 | |
57 | |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
class XMLSerializer(object):
    """Produces XML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(XMLSerializer()(elem.generate()))
    <div><a href="foo"/><br/><hr noshade="True"/></div>
    """

    # Elements whose whitespace must be left alone; plain XML has none.
    _PRESERVE_SPACE = frozenset()

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None):
        """Initialize the XML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param namespace_prefixes: handed unchanged to `NamespaceFlattener`
                                   as its ``prefixes`` argument
        """
        preamble = []
        if doctype:
            # Synthetic DOCTYPE event; it has no real source position.
            preamble.append((DOCTYPE, doctype, (None, -1, -1)))
        self.preamble = preamble

        filters = [EmptyTagFilter()]
        if strip_whitespace:
            filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
        self.filters = filters

    def __call__(self, stream):
        """Serialize `stream`, yielding one chunk of output per event.

        The preamble events are placed in front of the stream and the result
        is run through every filter in ``self.filters`` before it is
        rendered. Only the first DOCTYPE event is written out.

        :param stream: the event stream, an iterable of
                       ``(kind, data, pos)`` tuples
        """
        seen_doctype = False
        inside_cdata = False

        events = chain(self.preamble, stream)
        for apply_filter in self.filters:
            events = apply_filter(events)

        for kind, data, pos in events:

            if kind is START or kind is EMPTY:
                name, attrs = data
                parts = ['<', name]
                for attr_name, attr_value in attrs:
                    parts.extend(
                        [' ', attr_name, '="', escape(attr_value), '"']
                    )
                # EMPTY events are rendered as self-closing tags.
                if kind is EMPTY:
                    parts.append('/>')
                else:
                    parts.append('>')
                yield Markup(u''.join(parts))

            elif kind is END:
                yield Markup('</%s>' % data)

            elif kind is TEXT:
                # Text inside a CDATA section is emitted without escaping.
                if not inside_cdata:
                    data = escape(data, quotes=False)
                yield data

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE and not seen_doctype:
                name, pubid, sysid = data
                parts = ['<!DOCTYPE %s']
                if pubid:
                    parts.append(' PUBLIC "%s"')
                elif sysid:
                    parts.append(' SYSTEM')
                if sysid:
                    parts.append(' "%s"')
                parts.append('>\n')
                # One "%s" placeholder was added per non-empty item, so only
                # the non-empty items are passed along as arguments.
                present = [item for item in data if item]
                yield Markup(u''.join(parts), *present)
                seen_doctype = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                inside_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                inside_cdata = False

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)
334a338847af
Include processing instructions in serialized streams.
cmlenz
parents:
96
diff
changeset
|
139 |
1 | 140 |
96
35d681a94763
Add an XHTML serialization method. Now really need to get rid of some code duplication in the `markup.output` module.
cmlenz
parents:
89
diff
changeset
|
class XHTMLSerializer(XMLSerializer):
    """Produces XHTML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(XHTMLSerializer()(elem.generate()))
    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
    """

    # Elements that may be written in the short ``<tag />`` form.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])
    # Attributes whose value is replaced by the attribute name on output.
    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                'defer', 'disabled', 'ismap', 'multiple',
                                'nohref', 'noresize', 'noshade', 'nowrap'])
    # Elements handed to the whitespace filter as space-preserving.
    _PRESERVE_SPACE = frozenset([
        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
    ])

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None):
        """Initialize the XHTML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param namespace_prefixes: optional mapping of namespace URIs to
                                   prefixes; it is copied, never modified
        """
        # The base initializer sets up the preamble; the filter chain it
        # builds is replaced right below.
        super(XHTMLSerializer, self).__init__(doctype, False)
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        # Work on a copy so that adding the XHTML entry does not leak into
        # the dictionary owned by the caller.
        namespace_prefixes = dict(namespace_prefixes or {})
        # The XHTML namespace is mapped to the empty prefix.
        namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''
        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))

    def __call__(self, stream):
        """Serialize `stream`, yielding one chunk of output per event.

        The preamble events are placed in front of the stream and the result
        is run through every filter in ``self.filters`` before it is
        rendered. Only the first DOCTYPE event is written out.

        :param stream: the event stream, an iterable of
                       ``(kind, data, pos)`` tuples
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        have_doctype = False
        in_cdata = False

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        # e.g. noshade="noshade", whatever value was given
                        value = attr
                    buf += [' ', attr, '="', escape(value), '"']
                if kind is EMPTY:
                    if tag in empty_elems:
                        buf.append(' />')
                    else:
                        # Any other empty element gets an explicit end tag.
                        buf.append('></%s>' % tag)
                else:
                    buf.append('>')
                yield Markup(u''.join(buf))

            elif kind is END:
                yield Markup('</%s>' % data)

            elif kind is TEXT:
                # Text inside a CDATA section is emitted without escaping.
                if in_cdata:
                    yield data
                else:
                    yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE and not have_doctype:
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                # One "%s" placeholder was added per non-empty item, so only
                # the non-empty items are passed along as arguments.
                yield Markup(u''.join(buf), *filter(None, data))
                have_doctype = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)
334a338847af
Include processing instructions in serialized streams.
cmlenz
parents:
96
diff
changeset
|
233 |
96
35d681a94763
Add an XHTML serialization method. Now really need to get rid of some code duplication in the `markup.output` module.
cmlenz
parents:
89
diff
changeset
|
234 |
35d681a94763
Add an XHTML serialization method. Now really need to get rid of some code duplication in the `markup.output` module.
cmlenz
parents:
89
diff
changeset
|
class HTMLSerializer(XHTMLSerializer):
    """Produces HTML text from an event stream.
    
    Elements listed in ``_EMPTY_ELEMS`` are written without an end tag,
    boolean attributes are minimized, and the text content of ``<script>``
    and ``<style>`` elements is written without escaping:
    
    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(HTMLSerializer()(elem.generate()))
    <div><a href="foo"></a><br><hr noshade></div>
    """

    # Elements whose text content must not be escaped, listed both without a
    # namespace and in the XHTML namespace.
    # NOTE(review): the namespaced names are written without a leading '{';
    # presumably `QName` accepts that form -- confirm against genshi.core.
    _NOESCAPE_ELEMS = frozenset([
        QName('script'), QName('http://www.w3.org/1999/xhtml}script'),
        QName('style'), QName('http://www.w3.org/1999/xhtml}style')
    ])

    def __init__(self, doctype=None, strip_whitespace=True):
        """Initialize the HTML serializer.
        
        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        """
        # The base class is told not to strip whitespace because the filter
        # chain is rebuilt below with the HTML-specific no-escape elements.
        super(HTMLSerializer, self).__init__(doctype, False)
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
                                                 self._NOESCAPE_ELEMS))
        self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml'))

    def __call__(self, stream):
        """Serialize the given event stream to HTML.
        
        :param stream: an iterable of ``(kind, data, pos)`` events
        :return: a generator yielding the output one chunk per event
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        noescape_elems = self._NOESCAPE_ELEMS
        have_doctype = False
        noescape = False

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:

            if kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        # Minimized form: only the name, and only if truthy
                        if value:
                            buf += [' ', attr]
                    else:
                        buf += [' ', attr, '="', escape(value), '"']
                buf.append('>')
                if kind is EMPTY:
                    if tag not in empty_elems:
                        buf.append('</%s>' % tag)
                yield Markup(u''.join(buf))
                # Only a START event opens an element whose content follows.
                # An EMPTY event is already closed and no END event will come
                # to reset the flag, so setting it there would leave the text
                # after e.g. `<script src="..."></script>` unescaped.
                if kind is START and tag in noescape_elems:
                    noescape = True

            elif kind is END:
                yield Markup('</%s>' % data)
                noescape = False

            elif kind is TEXT:
                if noescape:
                    yield data
                else:
                    yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE and not have_doctype:
                # Only the first DOCTYPE event is written; later ones are
                # ignored because `have_doctype` is set below.
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                # Empty parts are filtered out so that the arguments line up
                # with the placeholders that were actually added above.
                yield Markup(u''.join(buf), *filter(None, data))
                have_doctype = True

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)
334a338847af
Include processing instructions in serialized streams.
cmlenz
parents:
96
diff
changeset
|
322 |
1 | 323 |
200
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
class TextSerializer(object):
    """Serializer that reduces an event stream to its plain text.
    
    All events other than text events are dropped. Unlike the other
    serializers, special XML characters are not escaped:
    
    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a('<Hello!>', href='foo'), tag.br)
    >>> print elem
    <div><a href="foo">&lt;Hello!&gt;</a><br/></div>
    >>> print ''.join(TextSerializer()(elem.generate()))
    <Hello!>
    
    Text events that carry literal markup (instances of the `Markup` class)
    have their tags and entities stripped before being written:
    
    >>> elem = tag.div(Markup('<a href="foo">Hello!</a><br/>'))
    >>> print elem
    <div><a href="foo">Hello!</a><br/></div>
    >>> print ''.join(TextSerializer()(elem.generate()))
    Hello!
    """

    def __call__(self, stream):
        """Yield the text of every `TEXT` event in the stream as `unicode`.
        
        :param stream: an iterable of events whose first item is the event
                       kind and whose second item is the event data
        """
        for event in stream:
            if event[0] is not TEXT:
                continue
            text = event[1]
            # Exact type check: only plain `Markup` instances are stripped
            if type(text) is Markup:
                text = text.striptags().stripentities()
            yield unicode(text)
200
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
354 |
50eab0469148
Add serialization to plain text, based on cboos' patch. Closes #41.
cmlenz
parents:
178
diff
changeset
|
355 |
212
e8c43127d9a9
Refactored the handling of empty tags in the serializer: use an `EmptyTagFilter` that combines adjacent start/end events, instead of the generic pushback-iterator.
cmlenz
parents:
201
diff
changeset
|
class EmptyTagFilter(object):
    """Combines `START` and `END` events into `EMPTY` events for elements that
    have no contents.
    
    A `START` event is held back until the next event is seen. If that event
    is an `END`, the pair is replaced by a single `EMPTY` event carrying the
    data and position of the `START` event; otherwise the `START` event is
    passed on unchanged ahead of the event that followed it.
    """

    EMPTY = StreamEventKind('EMPTY')

    def __call__(self, stream):
        """Filter the given stream, merging adjacent start/end pairs.
        
        :param stream: an iterable of ``(kind, data, pos)`` events
        :return: a generator over the filtered events
        """
        prev = (None, None, None)
        for ev in stream:
            if prev[0] is START:
                if ev[0] is END:
                    # Nothing between the tags: collapse the pair
                    prev = EMPTY, prev[1], prev[2]
                    yield prev
                    continue
                else:
                    yield prev
            # START events are buffered in `prev` instead of being yielded
            # right away; everything else passes straight through.
            if ev[0] is not START:
                yield ev
            prev = ev
        # A START that is the very last event is still buffered at this
        # point; emit it rather than silently dropping it.
        if prev[0] is START:
            yield prev


# Module-level alias for the event kind, used by the serializers and by
# `EmptyTagFilter.__call__` itself.
EMPTY = EmptyTagFilter.EMPTY
e8c43127d9a9
Refactored the handling of empty tags in the serializer: use an `EmptyTagFilter` that combines adjacent start/end events, instead of the generic pushback-iterator.
cmlenz
parents:
201
diff
changeset
|
379 |
e8c43127d9a9
Refactored the handling of empty tags in the serializer: use an `EmptyTagFilter` that combines adjacent start/end events, instead of the generic pushback-iterator.
cmlenz
parents:
201
diff
changeset
|
380 |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
381 class NamespaceFlattener(object): |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
382 r"""Output stream filter that removes namespace information from the stream, |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
383 instead adding namespace attributes and prefixes as needed. |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
384 |
425
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
412
diff
changeset
|
385 :param prefixes: optional mapping of namespace URIs to prefixes |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
386 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
387 >>> from genshi.input import XML |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
388 >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2"> |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
389 ... <two:item/> |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
390 ... </doc>''') |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
391 >>> for kind, data, pos in NamespaceFlattener()(xml): |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
392 ... print kind, repr(data) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
393 START (u'doc', Attrs([(u'xmlns', u'NS1'), (u'xmlns:two', u'NS2')])) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
394 TEXT u'\n ' |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
395 START (u'two:item', Attrs()) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
396 END u'two:item' |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
397 TEXT u'\n' |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
398 END u'doc' |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
399 """ |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
400 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
401 def __init__(self, prefixes=None): |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
402 self.prefixes = {XML_NAMESPACE.uri: 'xml'} |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
403 if prefixes is not None: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
404 self.prefixes.update(prefixes) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
405 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
406 def __call__(self, stream): |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
407 prefixes = dict([(v, [k]) for k, v in self.prefixes.items()]) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
408 namespaces = {XML_NAMESPACE.uri: ['xml']} |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
409 def _push_ns(prefix, uri): |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
410 namespaces.setdefault(uri, []).append(prefix) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
411 prefixes.setdefault(prefix, []).append(uri) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
412 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
413 ns_attrs = [] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
414 _push_ns_attr = ns_attrs.append |
437 | 415 def _make_ns_attr(prefix, uri): |
416 return u'xmlns%s' % (prefix and ':%s' % prefix or ''), uri | |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
417 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
418 def _gen_prefix(): |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
419 val = 0 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
420 while 1: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
421 val += 1 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
422 yield 'ns%d' % val |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
423 _gen_prefix = _gen_prefix().next |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
424 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
425 for kind, data, pos in stream: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
426 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
427 if kind is START or kind is EMPTY: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
428 tag, attrs = data |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
429 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
430 tagname = tag.localname |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
431 tagns = tag.namespace |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
432 if tagns: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
433 if tagns in namespaces: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
434 prefix = namespaces[tagns][-1] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
435 if prefix: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
436 tagname = u'%s:%s' % (prefix, tagname) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
437 else: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
438 _push_ns_attr((u'xmlns', tagns)) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
439 _push_ns('', tagns) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
440 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
441 new_attrs = [] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
442 for attr, value in attrs: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
443 attrname = attr.localname |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
444 attrns = attr.namespace |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
445 if attrns: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
446 if attrns not in namespaces: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
447 prefix = _gen_prefix() |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
448 _push_ns(prefix, attrns) |
412
29cddd600245
Actually write xmlns declaratons for generated attribute namespace prefixes.
cmlenz
parents:
410
diff
changeset
|
449 _push_ns_attr(('xmlns:%s' % prefix, attrns)) |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
450 else: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
451 prefix = namespaces[attrns][-1] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
452 if prefix: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
453 attrname = u'%s:%s' % (prefix, attrname) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
454 new_attrs.append((attrname, value)) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
455 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
456 yield kind, (tagname, Attrs(ns_attrs + new_attrs)), pos |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
457 del ns_attrs[:] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
458 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
459 elif kind is END: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
460 tagname = data.localname |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
461 tagns = data.namespace |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
462 if tagns: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
463 prefix = namespaces[tagns][-1] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
464 if prefix: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
465 tagname = u'%s:%s' % (prefix, tagname) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
466 yield kind, tagname, pos |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
467 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
468 elif kind is START_NS: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
469 prefix, uri = data |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
470 if uri not in namespaces: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
471 prefix = prefixes.get(uri, [prefix])[-1] |
437 | 472 _push_ns_attr(_make_ns_attr(prefix, uri)) |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
473 _push_ns(prefix, uri) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
474 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
475 elif kind is END_NS: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
476 if data in prefixes: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
477 uris = prefixes.get(data) |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
478 uri = uris.pop() |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
479 if not uris: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
480 del prefixes[data] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
481 if uri not in uris or uri != uris[-1]: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
482 uri_prefixes = namespaces[uri] |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
483 uri_prefixes.pop() |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
484 if not uri_prefixes: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
485 del namespaces[uri] |
437 | 486 if ns_attrs: |
487 attr = _make_ns_attr(data, uri) | |
488 if attr in ns_attrs: | |
489 ns_attrs.remove(attr) | |
410
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
490 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
491 else: |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
492 yield kind, data, pos |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
493 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
494 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
class NamespaceStripper(object):
    r"""Stream filter that drops namespace information from the events of a
    stream and, as a side effect of that, discards every tag that belongs to
    a namespace other than the one being kept.

    :param namespace: the URI of the namespace that should not be stripped. If
                      not set, only elements with no namespace are included in
                      the output.

    >>> from genshi.input import XML
    >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
    ...   <two:item/>
    ... </doc>''')
    >>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml):
    ...     print kind, repr(data)
    START (u'doc', Attrs())
    TEXT u'\n  '
    TEXT u'\n'
    END u'doc'
    """

    def __init__(self, namespace=None):
        """Remember which namespace (if any) survives the filter.

        :param namespace: the namespace URI to keep, or ``None`` to keep only
                          names that have no namespace at all
        """
        if namespace is None:
            # An empty dict serves as the "keep nothing" value: membership
            # tests against it always fail, so every namespaced name is
            # treated as foreign by `__call__`.
            self.namespace = {}
        else:
            self.namespace = Namespace(namespace)

    def __call__(self, stream):
        """Filter `stream`, yielding ``(kind, data, pos)`` events.

        * ``START_NS`` and ``END_NS`` events are never passed on.
        * ``START``, ``EMPTY`` and ``END`` events whose tag has a namespace
          that fails the membership test against the kept namespace are
          dropped; the remaining ones are re-emitted with the tag replaced
          by its local name.
        * Attributes are kept when they have no namespace or pass the same
          membership test; a kept attribute is passed through with its
          original name object.
        * Every other kind of event is yielded unchanged.

        :param stream: an iterable of ``(kind, data, pos)`` events
        """
        kept_ns = self.namespace

        for kind, data, pos in stream:

            # Namespace declarations carry nothing but namespace information.
            if kind is START_NS or kind is END_NS:
                continue

            if kind is START or kind is EMPTY:
                tag, attrs = data
                if tag.namespace and tag not in kept_ns:
                    continue
                surviving = [(attr, value) for attr, value in attrs
                             if not attr.namespace or attr in kept_ns]
                data = tag.localname, Attrs(surviving)

            elif kind is END:
                if data.namespace and data not in kept_ns:
                    continue
                data = data.localname

            yield kind, data, pos
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
547 |
3460b04daeac
Improve the handling of namespaces in serialization.
cmlenz
parents:
408
diff
changeset
|
548 |
123
93bbdcf9428b
Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
cmlenz
parents:
109
diff
changeset
|
class WhitespaceFilter(object):
    """Stream filter that merges adjacent text events and strips ignorable
    white space (trailing blanks on a line, runs of blank lines) from them.
    """

    def __init__(self, preserve=None, noescape=None):
        """Set up the filter.

        :param preserve: a set or sequence of tag names for which white-space
                         should be preserved
        :param noescape: a set or sequence of tag names for which text content
                         should not be escaped

        The `noescape` set is expected to refer to elements that cannot contain
        further child elements (such as ``<style>`` or ``<script>`` in HTML
        documents).
        """
        if preserve is not None:
            self.preserve = frozenset(preserve)
        else:
            self.preserve = frozenset([])

        if noescape is not None:
            self.noescape = frozenset(noescape)
        else:
            self.noescape = frozenset([])

    def __call__(self, stream, ctxt=None, space=XML_NAMESPACE['space'],
                 trim_trailing_space=re.compile('[ \t]+(?=\n)').sub,
                 collapse_lines=re.compile('\n{2,}').sub):
        """Filter `stream`, yielding ``(kind, data, pos)`` events.

        Consecutive ``TEXT`` events are buffered and emitted as a single
        ``TEXT`` event wrapping a `Markup` instance once the next non-text
        event (or the end of the stream) is reached. Unless a white-space
        preserving element is currently open, spaces and tabs in front of a
        newline are removed and runs of two or more newlines are collapsed
        into one.

        :param stream: an iterable of ``(kind, data, pos)`` events
        :param ctxt: not used by this method
        :param space: the attribute name looked up on start tags; an element
                      whose value for it is ``'preserve'`` keeps its
                      white-space
        :param trim_trailing_space: callable ``(replacement, text)`` used to
                                    drop trailing blanks on each line
        :param collapse_lines: callable ``(replacement, text)`` used to
                               collapse consecutive newlines
        """
        join_markup = Markup('').join
        keep_space_tags = self.preserve
        raw_text_tags = self.noescape

        # Nesting depth inside white-space preserving elements. Once inside
        # one, every nested start tag bumps the counter so that the matching
        # end tags bring it back down symmetrically.
        preserve_depth = 0
        # True while text must be passed through without escaping.
        in_raw_text = False
        # Text events collected since the last non-text event.
        pending = []

        # The trailing all-None sentinel forces one extra pass through the
        # flush logic so that text at the very end of the stream is emitted;
        # the sentinel itself is filtered out by the final `if kind` check.
        for kind, data, pos in chain(stream, [(None, None, None)]):

            if kind is TEXT:
                if in_raw_text:
                    # Wrap as Markup so the flush below does not escape it
                    # (presumably; relies on how escape/Markup.join treat
                    # Markup instances -- confirm against genshi.core).
                    data = Markup(data)
                pending.append(data)
                continue

            if pending:
                if len(pending) == 1:
                    text = escape(pending.pop(), quotes=False)
                else:
                    text = join_markup(pending, escape_quotes=False)
                    del pending[:]
                if not preserve_depth:
                    text = trim_trailing_space('', text)
                    text = collapse_lines('\n', text)
                # Note: the merged text is reported at the position of the
                # event that follows it.
                yield TEXT, Markup(text), pos

            if kind is START:
                tag, attrs = data
                if (preserve_depth or tag in keep_space_tags
                        or attrs.get(space) == 'preserve'):
                    preserve_depth += 1
                if not in_raw_text and tag in raw_text_tags:
                    in_raw_text = True

            elif kind is END:
                # Any end tag switches escaping back on; this relies on
                # no-escape elements not containing child elements.
                in_raw_text = False
                if preserve_depth:
                    preserve_depth -= 1

            elif kind is START_CDATA:
                in_raw_text = True

            elif kind is END_CDATA:
                in_raw_text = False

            if kind:
                yield kind, data, pos