annotate genshi/input.py @ 403:228907abb726 trunk

Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
author cmlenz
date Wed, 21 Feb 2007 09:51:43 +0000
parents 873ca2a7ec05
children 4675d5cf6c67
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
3 # Copyright (C) 2006 Edgewall Software
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
8 # are also available at http://genshi.edgewall.org/wiki/License.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
12 # history and logs, available at http://genshi.edgewall.org/log/.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
14 from itertools import chain
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
15 from xml.parsers import expat
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
16 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
17 frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
18 except NameError:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
19 from sets import ImmutableSet as frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
20 import HTMLParser as html
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
21 import htmlentitydefs
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
22 from StringIO import StringIO
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
23
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
24 from genshi.core import Attrs, QName, Stream, stripentities
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
25 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
26 START_CDATA, END_CDATA, PI, COMMENT
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
27
290
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
28 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
29
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
30 def ET(element):
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
31 """Convert a given ElementTree element to a markup stream."""
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
32 tag_name = QName(element.tag.lstrip('{'))
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
33 attrs = Attrs(element.items())
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
34
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
35 yield START, (tag_name, attrs), (None, -1, -1)
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
36 if element.text:
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
37 yield TEXT, element.text, (None, -1, -1)
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
38 for child in element.getchildren():
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
39 for item in ET(child):
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
40 yield item
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
41 yield END, tag_name, (None, -1, -1)
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
42 if element.tail:
94f9f2cc66c8 Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
43 yield TEXT, element.tail, (None, -1, -1)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
44
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
45
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
46 class ParseError(Exception):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
47 """Exception raised when fatal syntax errors are found in the input being
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
48 parsed."""
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
49
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
50 def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
51 Exception.__init__(self, message)
213
13d2d4420628 Store original message in exceptions as `msg` ivar.
cmlenz
parents: 209
diff changeset
52 self.msg = message
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
53 self.filename = filename
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
54 self.lineno = lineno
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
55 self.offset = offset
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
56
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
57
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
58 class XMLParser(object):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
59 """Generator-based XML parser based on roughly equivalent code in
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
60 Kid/ElementTree.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
61
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
62 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
63
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
64 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
65 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
66 ... print kind, data
326
f999da894391 Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents: 316
diff changeset
67 START (QName(u'root'), Attrs([(QName(u'id'), u'2')]))
f999da894391 Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents: 316
diff changeset
68 START (QName(u'child'), Attrs())
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
69 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
70 END child
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
71 END root
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
72 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
73
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
74 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
75 htmlentitydefs.name2codepoint.items()]
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
76 _external_dtd = '\n'.join(_entitydefs)
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
77
316
a946edefac40 Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
78 def __init__(self, source, filename=None, encoding=None):
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
79 """Initialize the parser for the given XML input.
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
80
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
81 @param source: the XML text as a file-like object
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
82 @param filename: the name of the file, if appropriate
316
a946edefac40 Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
83 @param encoding: the encoding of the file; if not specified, the
a946edefac40 Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
84 encoding is assumed to be ASCII, UTF-8, or UTF-16, or the
a946edefac40 Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
85 encoding specified in the XML declaration (if any)
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
86 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
87 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
88 self.filename = filename
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
89
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
90 # Setup the Expat parser
316
a946edefac40 Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
91 parser = expat.ParserCreate(encoding, '}')
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
92 parser.buffer_text = True
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
93 parser.returns_unicode = True
160
d19e8a2c549e Attribute order in parsed XML is now preserved.
cmlenz
parents: 146
diff changeset
94 parser.ordered_attributes = True
d19e8a2c549e Attribute order in parsed XML is now preserved.
cmlenz
parents: 146
diff changeset
95
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
96 parser.StartElementHandler = self._handle_start
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
97 parser.EndElementHandler = self._handle_end
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
98 parser.CharacterDataHandler = self._handle_data
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
99 parser.StartDoctypeDeclHandler = self._handle_doctype
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
100 parser.StartNamespaceDeclHandler = self._handle_start_ns
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
101 parser.EndNamespaceDeclHandler = self._handle_end_ns
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
102 parser.StartCdataSectionHandler = self._handle_start_cdata
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
103 parser.EndCdataSectionHandler = self._handle_end_cdata
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
104 parser.ProcessingInstructionHandler = self._handle_pi
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
105 parser.CommentHandler = self._handle_comment
209
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
106
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
107 # Tell Expat that we'll handle non-XML entities ourselves
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
108 # (in _handle_other)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
109 parser.DefaultHandler = self._handle_other
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
110 parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
209
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
111 parser.UseForeignDTD()
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
112 parser.ExternalEntityRefHandler = self._build_foreign
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
113
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
114 # Location reporting is only supported in Python >= 2.4
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
115 if not hasattr(parser, 'CurrentLineNumber'):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
116 self._getpos = self._getpos_unknown
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
117
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
118 self.expat = parser
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
119 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
120
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
121 def parse(self):
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
122 def _generate():
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
123 try:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
124 bufsize = 4 * 1024 # 4K
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
125 done = False
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
126 while 1:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
127 while not done and len(self._queue) == 0:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
128 data = self.source.read(bufsize)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
129 if data == '': # end of data
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
130 if hasattr(self, 'expat'):
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
131 self.expat.Parse('', True)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
132 del self.expat # get rid of circular references
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
133 done = True
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
134 else:
207
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
135 if isinstance(data, unicode):
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
136 data = data.encode('utf-8')
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
137 self.expat.Parse(data, False)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
138 for event in self._queue:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
139 yield event
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
140 self._queue = []
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
141 if done:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
142 break
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
143 except expat.ExpatError, e:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
144 msg = str(e)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
145 if self.filename:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
146 msg += ', in ' + self.filename
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
147 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
148 return Stream(_generate()).filter(_coalesce)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
149
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
150 def __iter__(self):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
151 return iter(self.parse())
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
152
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
153 def _build_foreign(self, context, base, sysid, pubid):
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
154 parser = self.expat.ExternalEntityParserCreate(context)
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
155 parser.ParseFile(StringIO(self._external_dtd))
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
156 return 1
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
157
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
158 def _enqueue(self, kind, data=None, pos=None):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
159 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
160 pos = self._getpos()
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
161 if kind is TEXT:
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
162 # Expat reports the *end* of the text event as current position. We
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
163 # try to fix that up here as much as possible. Unfortunately, the
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
164 # offset is only valid for single-line text. For multi-line text,
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
165 # it is apparently not possible to determine at what offset it
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
166 # started
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
167 if '\n' in data:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
168 lines = data.splitlines()
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
169 lineno = pos[1] - len(lines) + 1
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
170 offset = -1
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
171 else:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
172 lineno = pos[1]
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
173 offset = pos[2] - len(data)
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
174 pos = (pos[0], lineno, offset)
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
175 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
176
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
177 def _getpos_unknown(self):
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
178 return (self.filename, -1, -1)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
179
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
180 def _getpos(self):
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
181 return (self.filename, self.expat.CurrentLineNumber,
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
182 self.expat.CurrentColumnNumber)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
183
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_start(self, tag, attrib):
    """Queue a START event for an opening tag.

    `attrib` is consumed as a flat sequence of alternating attribute names
    and values; every name is wrapped in a `QName` and the resulting pairs
    are collected into an `Attrs` instance.
    """
    flat = iter(attrib)
    # Zipping one iterator with itself yields consecutive (name, value)
    # pairs; a trailing unpaired item is dropped.
    pairs = [(QName(name), value) for name, value in zip(flat, flat)]
    attrs = Attrs(pairs)
    self._enqueue(START, (QName(tag), attrs))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
188
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_end(self, tag):
    """Queue an END event whose data is the tag name wrapped in `QName`."""
    qname = QName(tag)
    self._enqueue(END, qname)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
191
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_data(self, text):
    """Queue a TEXT event carrying the given text unchanged."""
    kind = TEXT
    self._enqueue(kind, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
194
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    """Queue a DOCTYPE event.

    The event data is the tuple ``(name, pubid, sysid)``: the public
    identifier is placed before the system identifier, which is the
    reverse of the order in which the two are received.  The
    `has_internal_subset` argument is not used.
    """
    doctype = (name, pubid, sysid)
    self._enqueue(DOCTYPE, doctype)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
197
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_start_ns(self, prefix, uri):
    """Queue a START_NS event with ``(prefix, uri)`` as its data.

    A false `prefix` (such as None) is recorded as the empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(START_NS, (prefix, uri))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
200
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_end_ns(self, prefix):
    """Queue an END_NS event with the namespace prefix as its data.

    A false `prefix` (such as None) is recorded as the empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(END_NS, prefix)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
203
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _handle_start_cdata(self):
    """Queue a START_CDATA event; no event data is supplied."""
    kind = START_CDATA
    self._enqueue(kind)
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
206
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _handle_end_cdata(self):
    """Queue an END_CDATA event; no event data is supplied."""
    kind = END_CDATA
    self._enqueue(kind)
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
209
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_pi(self, target, data):
    """Queue a PI event whose data is the ``(target, data)`` pair."""
    instruction = (target, data)
    self._enqueue(PI, instruction)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
212
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def _handle_comment(self, text):
    """Queue a COMMENT event carrying the given text unchanged."""
    kind = COMMENT
    self._enqueue(kind, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
215
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
216 def _handle_other(self, text):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
217 if text.startswith('&'):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
218 # deal with undefined entities
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
219 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
220 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
221 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
222 except KeyError:
209
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
223 filename, lineno, offset = self._getpos()
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
224 error = expat.error('undefined entity "%s": line %d, column %d'
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
225 % (text, lineno, offset))
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
226 error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
227 error.lineno = lineno
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
228 error.offset = offset
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
229 raise error
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
230
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
231
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the given XML text and return the events as a `Stream`.

    The text is wrapped in a `StringIO` and run through `XMLParser`; all
    events are collected into a list before the stream is returned.
    """
    source = StringIO(text)
    events = list(XMLParser(source))
    return Stream(events)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
234
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
235
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
236 class HTMLParser(html.HTMLParser, object):
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
237 """Parser for HTML input based on the Python `HTMLParser` module.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
238
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
239 This class provides the same interface for generating stream events as
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
240 `XMLParser`, and attempts to automatically balance tags.
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
241
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
242 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
243
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
244 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
245 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
246 ... print kind, data
326
f999da894391 Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents: 316
diff changeset
247 START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')]))
f999da894391 Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents: 316
diff changeset
248 START (QName(u'li'), Attrs())
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
249 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
250 END li
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
251 END ul
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
252 """
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
253
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
254 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
255 'hr', 'img', 'input', 'isindex', 'link', 'meta',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
256 'param'])
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
257
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
258 def __init__(self, source, filename=None, encoding='utf-8'):
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
259 """Initialize the parser for the given HTML input.
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
260
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
261 @param source: the HTML text as a file-like object
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
262 @param filename: the name of the file, if known
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
263 @param filename: encoding of the file; ignored if the input is unicode
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
264 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
265 html.HTMLParser.__init__(self)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
266 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
267 self.filename = filename
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
268 self.encoding = encoding
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
269 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
270 self._open_tags = []
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
271
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
272 def parse(self):
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
273 def _generate():
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
274 try:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
275 bufsize = 4 * 1024 # 4K
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
276 done = False
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
277 while 1:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
278 while not done and len(self._queue) == 0:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
279 data = self.source.read(bufsize)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
280 if data == '': # end of data
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
281 self.close()
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
282 done = True
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
283 else:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
284 self.feed(data)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
285 for kind, data, pos in self._queue:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
286 yield kind, data, pos
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
287 self._queue = []
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
288 if done:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
289 open_tags = self._open_tags
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
290 open_tags.reverse()
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
291 for tag in open_tags:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
292 yield END, QName(tag), pos
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
293 break
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
294 except html.HTMLParseError, e:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
295 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
296 if self.filename:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
297 msg += ', in %s' % self.filename
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
298 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
299 return Stream(_generate()).filter(_coalesce)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
300
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
301 def __iter__(self):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
302 return iter(self.parse())
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
303
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
304 def _enqueue(self, kind, data, pos=None):
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
305 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
306 pos = self._getpos()
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
307 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
308
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
309 def _getpos(self):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
310 lineno, column = self.getpos()
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
311 return (self.filename, lineno, column)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
312
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
313 def handle_starttag(self, tag, attrib):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
314 fixed_attrib = []
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
315 for name, value in attrib: # Fixup minimized attributes
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
316 if value is None:
312
cb7326367f91 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
317 value = unicode(name)
cb7326367f91 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
318 elif not isinstance(value, unicode):
cb7326367f91 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
319 value = value.decode(self.encoding, 'replace')
403
228907abb726 Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents: 378
diff changeset
320 fixed_attrib.append((QName(name), stripentities(value)))
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
321
182
2f30ce3fb85e Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
322 self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
323 if tag in self._EMPTY_ELEMS:
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
324 self._enqueue(END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
325 else:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
326 self._open_tags.append(tag)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
327
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
328 def handle_endtag(self, tag):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
329 if tag not in self._EMPTY_ELEMS:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
330 while self._open_tags:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
331 open_tag = self._open_tags.pop()
378
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
332 self._enqueue(END, QName(open_tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
333 if open_tag.lower() == tag.lower():
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
334 break
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
335
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
336 def handle_data(self, text):
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
337 if not isinstance(text, unicode):
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
338 text = text.decode(self.encoding, 'replace')
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
339 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
340
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
341 def handle_charref(self, name):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
342 text = unichr(int(name))
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
343 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
344
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
345 def handle_entityref(self, name):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
346 try:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
347 text = unichr(htmlentitydefs.name2codepoint[name])
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
348 except KeyError:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
349 text = '&%s;' % name
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
350 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
351
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
352 def handle_pi(self, data):
376
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 326
diff changeset
353 target, data = data.split(None, 1)
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 326
diff changeset
354 if data.endswith('?'):
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 326
diff changeset
355 data = data[:-1]
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
356 self._enqueue(PI, (target.strip(), data.strip()))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
357
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
358 def handle_comment(self, text):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
359 self._enqueue(COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
360
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
361
311
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
def HTML(text, encoding='utf-8'):
    """Parse the given HTML text and return the events as a `Stream`.

    The text is wrapped in a `StringIO` and run through `HTMLParser`, which
    is given `encoding`; all events are collected into a list before the
    stream is returned.
    """
    source = StringIO(text)
    events = list(HTMLParser(source, encoding=encoding))
    return Stream(events)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
364
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    A run of consecutive TEXT events is replaced by one TEXT event holding
    the joined text and the first position in the run that is not None.
    All other events are passed through unchanged.
    """
    pending = []
    first_pos = None
    # The trailing all-None event flushes text left over at the end of the
    # stream; it is never yielded itself because its kind is false.
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            pending.append(data)
            if first_pos is None:
                first_pos = pos
            continue
        if pending:
            yield TEXT, u''.join(pending), first_pos
            pending = []
            first_pos = None
        if kind:
            yield kind, data, pos
Copyright (C) 2012-2017 Edgewall Software