annotate genshi/input.py @ 998:44fb098722ac stable-0.7.x

Merge r1210 and r1212 from trunk (remove unnecessary isinstance checks and skip mako benchmarks if mako isn't installed).
author hodgestar
date Sat, 26 Jan 2013 17:34:51 +0000
parents 9e30a7234290
children
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
854
4d9bef447df9 More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 853
diff changeset
3 # Copyright (C) 2006-2009 Edgewall Software
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
8 # are also available at http://genshi.edgewall.org/wiki/License.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
12 # history and logs, available at http://genshi.edgewall.org/log/.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
425
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
14 """Support for constructing markup streams from files, strings, or other
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
15 sources.
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
16 """
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
17
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
18 from itertools import chain
965
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
19 import codecs
859
f3d998cc941e More bits of 2to3 related cleanup.
cmlenz
parents: 857
diff changeset
20 import htmlentitydefs as entities
f3d998cc941e More bits of 2to3 related cleanup.
cmlenz
parents: 857
diff changeset
21 import HTMLParser as html
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
22 from xml.parsers import expat
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
23
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
24 from genshi.core import Attrs, QName, Stream, stripentities
859
f3d998cc941e More bits of 2to3 related cleanup.
cmlenz
parents: 857
diff changeset
25 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
f3d998cc941e More bits of 2to3 related cleanup.
cmlenz
parents: 857
diff changeset
26 END_NS, START_CDATA, END_CDATA, PI, COMMENT
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
27 from genshi.compat import StringIO, BytesIO
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
28
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
29
290
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
30 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
425
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
31 __docformat__ = 'restructuredtext en'
290
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
32
859
f3d998cc941e More bits of 2to3 related cleanup.
cmlenz
parents: 857
diff changeset
33
290
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    The element is walked recursively. For each element the generator yields
    a ``START`` event, a ``TEXT`` event for the element's text (if any), the
    events of every child element in document order, an ``END`` event, and
    finally a ``TEXT`` event for the element's tail (if any). No source
    location is available, so every event carries the position
    ``(None, -1, -1)``.

    :param element: an ElementTree element
    :return: a markup stream
    """
    # ElementTree writes namespaced names as "{uri}local"; only the leading
    # brace is removed here. Presumably `QName` understands the remaining
    # "uri}local" form -- confirm against `genshi.core`.
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs([(QName(attr.lstrip('{')), value)
                   for attr, value in element.items()])

    yield START, (tag_name, attrs), (None, -1, -1)
    if element.text:
        yield TEXT, element.text, (None, -1, -1)
    # Iterate over the element itself rather than calling `getchildren()`:
    # that method is deprecated and no longer exists in Python 3.9+, while
    # direct iteration yields the same child elements on every version.
    for child in element:
        for item in ET(child):
            yield item
    yield END, tag_name, (None, -1, -1)
    if element.tail:
        yield TEXT, element.tail, (None, -1, -1)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
53
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
54
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
class ParseError(Exception):
    """Raised when the input being parsed contains a fatal syntax error.

    The original parser message is kept in ``msg``; the exception text
    additionally names the file when one is known.
    """

    def __init__(self, message, filename=None, lineno=-1, offset=-1):
        """Create the exception.

        :param message: the error message from the parser
        :param filename: the path to the file that was parsed
        :param lineno: the number of the line on which the error was
                       encountered
        :param offset: the column number where the error was encountered
        """
        # `msg` keeps the bare parser message, without the file suffix
        self.msg = message
        text = message
        if filename:
            text = message + ', in ' + filename
        Exception.__init__(self, text)
        # Fall back to a placeholder name when no file path was supplied
        self.filename = filename if filename else '<string>'
        self.lineno = lineno
        self.offset = offset
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
75
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
76
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
77 class XMLParser(object):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
78 """Generator-based XML parser based on roughly equivalent code in
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
79 Kid/ElementTree.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
80
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
81 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
82
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
83 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
84 >>> for kind, data, pos in parser:
853
f33ecf3c319e Convert a bunch of print statements to py3k compatible syntax.
cmlenz
parents: 852
diff changeset
85 ... print('%s %s' % (kind, data))
857
129e54866a98 Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents: 856
diff changeset
86 START (QName('root'), Attrs([(QName('id'), u'2')]))
129e54866a98 Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents: 856
diff changeset
87 START (QName('child'), Attrs())
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
88 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
89 END child
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
90 END root
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
91 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
92
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
93 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
856
21308bd343b8 Add a couple of fallback imports for Python 3.0.
cmlenz
parents: 854
diff changeset
94 entities.name2codepoint.items()]
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
95 _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
96
316
a946edefac40 Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
def __init__(self, source, filename=None, encoding=None):
    """Prepare an Expat parser for the given XML input.

    Nothing is read from `source` here; this only creates and configures
    the Expat parser and an empty event queue.

    :param source: the XML text as a file-like object
    :param filename: the name of the file, if appropriate
    :param encoding: the encoding of the file; if not specified, the
                     encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                     whatever the encoding specified in the XML declaration
                     (if any)
    """
    self.source = source
    self.filename = filename

    # Setup the Expat parser; '}' is handed over as namespace separator
    xp = expat.ParserCreate(encoding, '}')
    xp.buffer_text = True
    # Python 3 does not have returns_unicode
    if hasattr(xp, 'returns_unicode'):
        xp.returns_unicode = True
    xp.ordered_attributes = True

    # Expat callback name -> method that receives the callback. The last
    # entry tells Expat that we'll handle non-XML entities ourselves
    # (in _handle_other).
    callbacks = (
        ('StartElementHandler', self._handle_start),
        ('EndElementHandler', self._handle_end),
        ('CharacterDataHandler', self._handle_data),
        ('StartDoctypeDeclHandler', self._handle_doctype),
        ('StartNamespaceDeclHandler', self._handle_start_ns),
        ('EndNamespaceDeclHandler', self._handle_end_ns),
        ('StartCdataSectionHandler', self._handle_start_cdata),
        ('EndCdataSectionHandler', self._handle_end_cdata),
        ('ProcessingInstructionHandler', self._handle_pi),
        ('XmlDeclHandler', self._handle_xml_decl),
        ('CommentHandler', self._handle_comment),
        ('DefaultHandler', self._handle_other),
    )
    for hook, method in callbacks:
        setattr(xp, hook, method)

    # Have Expat ask for a DTD even when the document declares none; the
    # request is answered by `_build_foreign`.
    xp.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    xp.UseForeignDTD()
    xp.ExternalEntityRefHandler = self._build_foreign

    self.expat = xp
    self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
139
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
140 def parse(self):
433
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
141 """Generator that parses the XML source, yielding markup events.
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
142
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
143 :return: a markup event stream
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
144 :raises ParseError: if the XML text is not well formed
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
145 """
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
146 def _generate():
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
147 try:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
148 bufsize = 4 * 1024 # 4K
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
149 done = False
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
150 while 1:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
151 while not done and len(self._queue) == 0:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
152 data = self.source.read(bufsize)
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
153 if not data: # end of data
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
154 if hasattr(self, 'expat'):
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
155 self.expat.Parse('', True)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
156 del self.expat # get rid of circular references
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
157 done = True
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
158 else:
207
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
159 if isinstance(data, unicode):
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
160 data = data.encode('utf-8')
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
161 self.expat.Parse(data, False)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
162 for event in self._queue:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
163 yield event
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
164 self._queue = []
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
165 if done:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
166 break
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
167 except expat.ExpatError, e:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
168 msg = str(e)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
169 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
170 return Stream(_generate()).filter(_coalesce)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
171
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def __iter__(self):
    """Start parsing and return an iterator over the resulting events."""
    events = self.parse()
    return iter(events)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
174
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
def _build_foreign(self, context, base, sysid, pubid):
    """Expat external-entity callback that supplies the built-in DTD.

    A sub-parser is created for `context` and fed the class-level
    `_external_dtd` bytes. The `base`, `sysid` and `pubid` arguments are
    accepted but not used.

    :return: always ``1``
    """
    dtd_parser = self.expat.ExternalEntityParserCreate(context)
    dtd_stream = BytesIO(self._external_dtd)
    dtd_parser.ParseFile(dtd_stream)
    # A non-zero result lets Expat continue parsing the main document
    return 1
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
179
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _enqueue(self, kind, data=None, pos=None):
    """Append an event to the internal queue.

    :param kind: the event kind
    :param data: the event payload
    :param pos: a ``(filename, lineno, offset)`` tuple; when omitted, the
                current position is obtained from `_getpos`
    """
    if pos is None:
        pos = self._getpos()
    if kind is TEXT:
        # Expat reports the *end* of the text event as current position, so
        # the start is estimated here. Only single-line text gets a usable
        # offset; for multi-line text the starting column cannot be
        # determined and is reported as -1.
        # NOTE(review): splitlines() yields no trailing empty item for text
        # ending in '\n', so such text counts as one line fewer than its
        # number of newlines -- confirm the resulting start line is intended.
        if '\n' not in data:
            pos = (pos[0], pos[1], pos[2] - len(data))
        else:
            line_count = len(data.splitlines())
            pos = (pos[0], pos[1] - line_count + 1, -1)
    self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
198
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _getpos_unknown(self):
    """Return a placeholder position for when no location is available.

    :return: a ``(filename, lineno, offset)`` tuple in which both the line
             number and the offset are ``-1``
    """
    unknown = -1
    return (self.filename, unknown, unknown)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
201
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
def _getpos(self):
    """Return the current position as reported by the expat parser.

    :return: a ``(filename, lineno, column)`` tuple built from the parser's
             ``CurrentLineNumber`` and ``CurrentColumnNumber`` attributes
    """
    parser = self.expat
    lineno = parser.CurrentLineNumber
    column = parser.CurrentColumnNumber
    return (self.filename, lineno, column)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
205
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_start(self, tag, attrib):
    """Queue a ``START`` event for an opening tag.

    :param tag: the element name
    :param attrib: a flat iterable of alternating attribute names and
                   values (``name1, value1, name2, value2, ...``)
    """
    # Drawing twice from one shared iterator pairs every name with the
    # value that immediately follows it.
    flat = iter(attrib)
    pairs = []
    for name, value in zip(flat, flat):
        pairs.append((QName(name), value))
    attrs = Attrs(pairs)
    self._enqueue(START, (QName(tag), attrs))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
210
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_end(self, tag):
    """Queue an ``END`` event for a closing tag.

    :param tag: the element name
    """
    qname = QName(tag)
    self._enqueue(END, qname)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
213
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_data(self, text):
    """Queue a ``TEXT`` event for a run of character data.

    :param text: the character data
    """
    kind = TEXT
    self._enqueue(kind, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
216
460
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 458
diff changeset
def _handle_xml_decl(self, version, encoding, standalone):
    """Queue an ``XML_DECL`` event for the XML declaration.

    The event data is the ``(version, encoding, standalone)`` tuple, passed
    through unchanged.
    """
    declaration = (version, encoding, standalone)
    self._enqueue(XML_DECL, declaration)
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 458
diff changeset
219
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    """Queue a ``DOCTYPE`` event for the document type declaration.

    Note that the event data is ordered ``(name, pubid, sysid)``, i.e. the
    public and system identifiers are swapped relative to the parameter
    order. `has_internal_subset` is accepted but not used.
    """
    doctype = (name, pubid, sysid)
    self._enqueue(DOCTYPE, doctype)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
222
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_start_ns(self, prefix, uri):
    """Queue a ``START_NS`` event for a namespace declaration.

    :param prefix: the namespace prefix; any false value (such as `None`)
                   is replaced by the empty string
    :param uri: the namespace URI
    """
    if not prefix:
        prefix = ''
    self._enqueue(START_NS, (prefix, uri))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
225
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_end_ns(self, prefix):
    """Queue an ``END_NS`` event for the end of a namespace scope.

    :param prefix: the namespace prefix; any false value (such as `None`)
                   is replaced by the empty string
    """
    if not prefix:
        prefix = ''
    self._enqueue(END_NS, prefix)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
228
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _handle_start_cdata(self):
    """Queue a ``START_CDATA`` event marking the start of a CDATA section."""
    kind = START_CDATA
    self._enqueue(kind)
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
231
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _handle_end_cdata(self):
    """Queue an ``END_CDATA`` event marking the end of a CDATA section."""
    kind = END_CDATA
    self._enqueue(kind)
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
234
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_pi(self, target, data):
    """Queue a ``PI`` event for a processing instruction.

    :param target: the processing instruction target
    :param data: the processing instruction data
    """
    instruction = (target, data)
    self._enqueue(PI, instruction)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
237
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_comment(self, text):
    """Queue a ``COMMENT`` event.

    :param text: the text of the comment
    """
    kind = COMMENT
    self._enqueue(kind, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
240
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_other(self, text):
    """Handle input that no other handler dealt with.

    Only text starting with ``&`` is processed; anything else is ignored.
    Such text is treated as an entity reference: the name between the
    leading ``&`` and the final character is looked up in
    `entities.name2codepoint` and the resulting character is queued as a
    ``TEXT`` event.

    :param text: the unhandled input
    :raises expat.error: if the entity name is not known; the error carries
                         the ``XML_ERROR_UNDEFINED_ENTITY`` code and the
                         current line number and offset
    """
    if not text.startswith('&'):
        return

    # deal with undefined entities
    try:
        text = unichr(entities.name2codepoint[text[1:-1]])
        self._enqueue(TEXT, text)
    except KeyError:
        _, lineno, offset = self._getpos()
        message = 'undefined entity "%s": line %d, column %d' % (
            text, lineno, offset
        )
        error = expat.error(message)
        error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
        error.lineno = lineno
        error.offset = offset
        raise error
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
255
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
256
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the given XML source and return a markup stream.

    The events are collected into a list before being wrapped in a `Stream`,
    so unlike with `XMLParser` the returned stream is reusable, meaning it
    can be iterated over multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print(xml)
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print(xml.select('elem'))
    <elem>Foo</elem><elem>Bar</elem>
    >>> print(xml.select('elem/text()'))
    FooBar

    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    source = StringIO(text)
    events = list(XMLParser(source))
    return Stream(events)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
276
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
277
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
278 class HTMLParser(html.HTMLParser, object):
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
279 """Parser for HTML input based on the Python `HTMLParser` module.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
280
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
281 This class provides the same interface for generating stream events as
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
282 `XMLParser`, and attempts to automatically balance tags.
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
283
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
284 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
285
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
286 >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
287 >>> for kind, data, pos in parser:
853
f33ecf3c319e Convert a bunch of print statements to py3k compatible syntax.
cmlenz
parents: 852
diff changeset
288 ... print('%s %s' % (kind, data))
857
129e54866a98 Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents: 856
diff changeset
289 START (QName('ul'), Attrs([(QName('compact'), u'compact')]))
129e54866a98 Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents: 856
diff changeset
290 START (QName('li'), Attrs())
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
291 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
292 END li
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
293 END ul
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
294 """
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
295
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
296 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
297 'hr', 'img', 'input', 'isindex', 'link', 'meta',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
298 'param'])
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
299
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
300 def __init__(self, source, filename=None, encoding=None):
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
301 """Initialize the parser for the given HTML input.
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
302
425
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
303 :param source: the HTML text as a file-like object
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
304 :param filename: the name of the file, if known
073640758a42 Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents: 423
diff changeset
305 :param filename: encoding of the file; ignored if the input is unicode
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
306 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
307 html.HTMLParser.__init__(self)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
308 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
309 self.filename = filename
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
310 self.encoding = encoding
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
311 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
312 self._open_tags = []
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
313
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
314 def parse(self):
433
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
315 """Generator that parses the HTML source, yielding markup events.
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
316
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
317 :return: a markup event stream
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
318 :raises ParseError: if the HTML text is not well formed
bc430fd7c54d More API docs.
cmlenz
parents: 425
diff changeset
319 """
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
320 def _generate():
965
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
321 if self.encoding:
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
322 reader = codecs.getreader(self.encoding)
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
323 source = reader(self.source)
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
324 else:
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
325 source = self.source
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
326 try:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
327 bufsize = 4 * 1024 # 4K
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
328 done = False
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
329 while 1:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
330 while not done and len(self._queue) == 0:
965
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
331 data = source.read(bufsize)
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
332 if not data: # end of data
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
333 self.close()
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
334 done = True
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
335 else:
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
336 if not isinstance(data, unicode):
965
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
337 raise UnicodeError("source returned bytes, but no encoding specified")
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
338 self.feed(data)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
339 for kind, data, pos in self._queue:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
340 yield kind, data, pos
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
341 self._queue = []
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
342 if done:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
343 open_tags = self._open_tags
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
344 open_tags.reverse()
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
345 for tag in open_tags:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
346 yield END, QName(tag), pos
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
347 break
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
348 except html.HTMLParseError, e:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
349 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
350 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
351 return Stream(_generate()).filter(_coalesce)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
352
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
353 def __iter__(self):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
354 return iter(self.parse())
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
355
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
356 def _enqueue(self, kind, data, pos=None):
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
357 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
358 pos = self._getpos()
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
359 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
360
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
361 def _getpos(self):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
362 lineno, column = self.getpos()
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
363 return (self.filename, lineno, column)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
364
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
365 def handle_starttag(self, tag, attrib):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
366 fixed_attrib = []
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
367 for name, value in attrib: # Fixup minimized attributes
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
368 if value is None:
998
44fb098722ac Merge r1210 and r1212 from trunk (remove unnecessary isinstance checks and skip mako benchmarks if mako isn't installed).
hodgestar
parents: 997
diff changeset
369 value = name
403
228907abb726 Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents: 378
diff changeset
370 fixed_attrib.append((QName(name), stripentities(value)))
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
371
182
2f30ce3fb85e Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
372 self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
373 if tag in self._EMPTY_ELEMS:
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
374 self._enqueue(END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
375 else:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
376 self._open_tags.append(tag)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
377
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
378 def handle_endtag(self, tag):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
379 if tag not in self._EMPTY_ELEMS:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
380 while self._open_tags:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
381 open_tag = self._open_tags.pop()
378
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
382 self._enqueue(END, QName(open_tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
383 if open_tag.lower() == tag.lower():
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
384 break
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
385
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
386 def handle_data(self, text):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
387 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
388
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
389 def handle_charref(self, name):
423
56bbe1d94da0 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 422
diff changeset
390 if name.lower().startswith('x'):
56bbe1d94da0 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 422
diff changeset
391 text = unichr(int(name[1:], 16))
56bbe1d94da0 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 422
diff changeset
392 else:
56bbe1d94da0 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 422
diff changeset
393 text = unichr(int(name))
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
394 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
395
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
396 def handle_entityref(self, name):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
397 try:
856
21308bd343b8 Add a couple of fallback imports for Python 3.0.
cmlenz
parents: 854
diff changeset
398 text = unichr(entities.name2codepoint[name])
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
399 except KeyError:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
400 text = '&%s;' % name
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
401 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
402
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
403 def handle_pi(self, data):
376
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 326
diff changeset
404 if data.endswith('?'):
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 326
diff changeset
405 data = data[:-1]
997
9e30a7234290 Merge r1219 from trunk (fix for PIs without data, fixes #368).
hodgestar
parents: 965
diff changeset
406 try:
9e30a7234290 Merge r1219 from trunk (fix for PIs without data, fixes #368).
hodgestar
parents: 965
diff changeset
407 target, data = data.split(None, 1)
9e30a7234290 Merge r1219 from trunk (fix for PIs without data, fixes #368).
hodgestar
parents: 965
diff changeset
408 except ValueError:
9e30a7234290 Merge r1219 from trunk (fix for PIs without data, fixes #368).
hodgestar
parents: 965
diff changeset
409 # PI with no data
9e30a7234290 Merge r1219 from trunk (fix for PIs without data, fixes #368).
hodgestar
parents: 965
diff changeset
410 target = data
9e30a7234290 Merge r1219 from trunk (fix for PIs without data, fixes #368).
hodgestar
parents: 965
diff changeset
411 data = ''
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
412 self._enqueue(PI, (target.strip(), data.strip()))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
413
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_comment(self, text):
    """Enqueue a ``COMMENT`` event whose data is the comment text as given.

    :param text: the content of the comment
    """
    self._enqueue(COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
416
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
417
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 859
diff changeset
def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
    iterated over multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :param encoding: the encoding handed to `HTMLParser` when `text` is a
                     byte string; ignored when `text` is unicode
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, unicode):
        # Unicode text is parsed with the encoding forced to None. Accepting
        # (and discarding) an encoding here makes it easier to write
        # doctests that work in both Python 2.x and 3.x.
        source = StringIO(text)
        encoding = None
    else:
        source = BytesIO(text)
    return Stream(list(HTMLParser(source, encoding=encoding)))
859
f3d998cc941e More bits of 2to3 related cleanup.
cmlenz
parents: 857
diff changeset
443
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
444
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
def _coalesce(stream):
    """Merge each run of adjacent TEXT events into a single TEXT event.

    The merged event carries the concatenated text of the run and the
    position of the first event in the run whose position is not ``None``.
    Events of other kinds are passed through unchanged, except that events
    with a false kind are dropped.

    :param stream: an iterable of ``(kind, data, pos)`` event tuples
    :return: a generator over the coalesced events
    """
    pending = []
    first_pos = None
    for kind, data, pos in stream:
        if kind is TEXT:
            if first_pos is None:
                first_pos = pos
            pending.append(data)
            continue
        if pending:
            # A non-text event ends the current run: emit it as one event
            yield TEXT, ''.join(pending), first_pos
            pending = []
            first_pos = None
        if kind:
            yield kind, data, pos
    # Emit any text still buffered when the stream ends
    if pending:
        yield TEXT, ''.join(pending), first_pos
Copyright (C) 2012-2017 Edgewall Software