annotate genshi/input.py @ 312:7e743338a799

Follow-up to [385]: also decode attribute values in the `HTMLParser`.
author cmlenz
date Sun, 22 Oct 2006 16:44:18 +0000
parents 01e2c48f6dfb
children 4ab9edf5e83b
rev   line source
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
2 #
66
822089ae65ce Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
3 # Copyright (C) 2006 Edgewall Software
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
5 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
8 # are also available at http://genshi.edgewall.org/wiki/License.
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
9 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
12 # history and logs, available at http://genshi.edgewall.org/log/.
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
13
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
14 from itertools import chain
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
15 from xml.parsers import expat
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
16 try:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
17 frozenset
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
18 except NameError:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
19 from sets import ImmutableSet as frozenset
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
20 import HTMLParser as html
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
21 import htmlentitydefs
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
22 from StringIO import StringIO
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
23
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
24 from genshi.core import Attrs, QName, Stream, stripentities
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
25 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
26 START_CDATA, END_CDATA, PI, COMMENT
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
27
290
a6738047c85e Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
28 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
a6738047c85e Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
29
a6738047c85e Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents: 230
diff changeset
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    Yields a START event for the element, a TEXT event for its text (if
    any), the events of each child element in turn (recursively), an END
    event, and finally a TEXT event for the element's tail (if any).

    ElementTree elements carry no source location, so every event uses the
    placeholder position `(None, -1, -1)`.

    @param element: the ElementTree element to convert
    """
    # ElementTree spells qualified names as `{uri}local`; the leading brace
    # is dropped, presumably so that the name has the same `uri}local` form
    # the `XMLParser` hands to `QName` -- TODO confirm against `QName`.
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs(element.items())
    pos = (None, -1, -1)

    yield START, (tag_name, attrs), pos
    if element.text:
        yield TEXT, element.text, pos
    # Iterate over the element itself instead of calling `getchildren()`:
    # both visit the direct children in order, but `getchildren()` was
    # deprecated and later removed from `xml.etree.ElementTree`.
    for child in element:
        for item in ET(child):
            yield item
    yield END, tag_name, pos
    if element.tail:
        yield TEXT, element.tail, pos
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
44
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
45
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
class ParseError(Exception):
    """Signals a fatal syntax error in the input being parsed.

    The original message is kept in the `msg` attribute. The `filename`,
    `lineno` and `offset` attributes hold the location passed in by the
    caller; they default to `'<string>'`, `-1` and `-1` respectively.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        self.msg = message
        self.filename, self.lineno, self.offset = filename, lineno, offset
        Exception.__init__(self, message)
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
56
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
57
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
58 class XMLParser(object):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
59 """Generator-based XML parser based on roughly equivalent code in
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
60 Kid/ElementTree.
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
61
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
62 The parsing is initiated by iterating over the parser object:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
63
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
64 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
65 >>> for kind, data, pos in parser:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
66 ... print kind, data
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
67 START (u'root', [(u'id', u'2')])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
68 START (u'child', [])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
69 TEXT Foo
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
70 END child
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
71 END root
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
72 """
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
73
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
74 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
75 htmlentitydefs.name2codepoint.items()]
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
76 _external_dtd = '\n'.join(_entitydefs)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
77
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
def __init__(self, source, filename=None):
    """Set up an Expat-based parser for the given XML input.

    @param source: the XML text as a file-like object
    @param filename: the name of the file, if appropriate
    """
    self.source = source
    self.filename = filename
    # Events produced by the Expat callbacks, waiting to be yielded
    self._queue = []

    # Create and configure the Expat parser; the second argument is the
    # namespace separator character
    xmlparser = expat.ParserCreate('utf-8', '}')
    xmlparser.buffer_text = True
    xmlparser.returns_unicode = True
    xmlparser.ordered_attributes = True

    # Route the Expat callbacks to the corresponding handler methods
    callbacks = (
        ('StartElementHandler', self._handle_start),
        ('EndElementHandler', self._handle_end),
        ('CharacterDataHandler', self._handle_data),
        ('StartDoctypeDeclHandler', self._handle_doctype),
        ('StartNamespaceDeclHandler', self._handle_start_ns),
        ('EndNamespaceDeclHandler', self._handle_end_ns),
        ('StartCdataSectionHandler', self._handle_start_cdata),
        ('EndCdataSectionHandler', self._handle_end_cdata),
        ('ProcessingInstructionHandler', self._handle_pi),
        ('CommentHandler', self._handle_comment),
    )
    for handler_name, callback in callbacks:
        setattr(xmlparser, handler_name, callback)

    # Tell Expat that we'll handle non-XML entities ourselves
    # (in _handle_other)
    xmlparser.DefaultHandler = self._handle_other
    xmlparser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    xmlparser.UseForeignDTD()
    xmlparser.ExternalEntityRefHandler = self._build_foreign

    # Location reporting is only supported in Python >= 2.4; without it,
    # fall back to reporting unknown positions
    if not hasattr(xmlparser, 'CurrentLineNumber'):
        self._getpos = self._getpos_unknown

    self.expat = xmlparser
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
117
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
118 def parse(self):
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
119 def _generate():
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
120 try:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
121 bufsize = 4 * 1024 # 4K
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
122 done = False
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
123 while 1:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
124 while not done and len(self._queue) == 0:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
125 data = self.source.read(bufsize)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
126 if data == '': # end of data
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
127 if hasattr(self, 'expat'):
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
128 self.expat.Parse('', True)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
129 del self.expat # get rid of circular references
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
130 done = True
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
131 else:
207
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
132 if isinstance(data, unicode):
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
133 data = data.encode('utf-8')
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
134 self.expat.Parse(data, False)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
135 for event in self._queue:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
136 yield event
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
137 self._queue = []
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
138 if done:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
139 break
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
140 except expat.ExpatError, e:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
141 msg = str(e)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
142 if self.filename:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
143 msg += ', in ' + self.filename
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
144 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
db0dacc1239a Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
145 return Stream(_generate()).filter(_coalesce)
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
146
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
147 def __iter__(self):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
148 return iter(self.parse())
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
149
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
150 def _build_foreign(self, context, base, sysid, pubid):
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
151 parser = self.expat.ExternalEntityParserCreate(context)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
152 parser.ParseFile(StringIO(self._external_dtd))
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
153 return 1
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 290
diff changeset
154
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _enqueue(self, kind, data=None, pos=None):
    """Append an event to the queue of pending events.

    @param kind: the event kind
    @param data: the event data
    @param pos: the `(filename, lineno, offset)` position of the event; if
        omitted, the current position of the parser is used
    """
    if pos is None:
        pos = self._getpos()
    if kind is TEXT:
        # Expat reports the *end* of the text event as current position, so
        # the start is approximated from the text itself. The offset can
        # only be recovered for single-line text; for multi-line text there
        # is apparently no way to tell at what offset it started, so it is
        # reported as -1.
        if '\n' not in data:
            pos = (pos[0], pos[1], pos[2] - len(data))
        else:
            line_count = len(data.splitlines())
            pos = (pos[0], pos[1] - line_count + 1, -1)
    self._queue.append((kind, data, pos))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
173
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
174 def _getpos_unknown(self):
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
175 return (self.filename, -1, -1)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
176
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
177 def _getpos(self):
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
178 return (self.filename, self.expat.CurrentLineNumber,
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
179 self.expat.CurrentColumnNumber)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
180
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_start(self, tag, attrib):
    """Queue a START event for an opening tag.

    @param tag: the element name, wrapped in a `QName` before queuing
    @param attrib: a flat sequence that is regrouped into pairs of
        neighbouring items (name, value) before being wrapped in `Attrs`
    """
    name = QName(tag)
    flat = iter(attrib)
    # Zipping one iterator with itself pairs up consecutive items.
    pairs = zip(flat, flat)
    self._enqueue(START, (name, Attrs(pairs)))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
183
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_end(self, tag):
    """Queue an END event for a closing tag.

    @param tag: the element name, wrapped in a `QName` before queuing
    """
    name = QName(tag)
    self._enqueue(END, name)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
186
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_data(self, text):
    """Queue a TEXT event carrying the given character data unchanged.

    @param text: the character data
    """
    event_data = text
    self._enqueue(TEXT, event_data)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
189
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    """Queue a DOCTYPE event.

    The event data is a `(name, pubid, sysid)` tuple, i.e. the public
    identifier is placed before the system identifier. The
    `has_internal_subset` argument is not used.
    """
    doctype = (name, pubid, sysid)
    self._enqueue(DOCTYPE, doctype)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
192
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_start_ns(self, prefix, uri):
    """Queue a START_NS event for a namespace declaration.

    @param prefix: the namespace prefix; any false value (such as `None`)
        is replaced by the empty string
    @param uri: the namespace URI
    """
    if not prefix:
        prefix = ''
    self._enqueue(START_NS, (prefix, uri))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
195
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_end_ns(self, prefix):
    """Queue an END_NS event for the end of a namespace scope.

    @param prefix: the namespace prefix; any false value (such as `None`)
        is replaced by the empty string
    """
    if not prefix:
        prefix = ''
    self._enqueue(END_NS, prefix)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
198
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _handle_start_cdata(self):
    """Queue a START_CDATA event; no event data is passed along."""
    kind = START_CDATA
    self._enqueue(kind)
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
201
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
def _handle_end_cdata(self):
    """Queue an END_CDATA event; no event data is passed along."""
    kind = END_CDATA
    self._enqueue(kind)
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
204
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_pi(self, target, data):
    """Queue a PI event for a processing instruction.

    @param target: the target of the processing instruction
    @param data: the remaining content of the processing instruction
    """
    instruction = (target, data)
    self._enqueue(PI, instruction)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
207
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_comment(self, text):
    """Queue a COMMENT event carrying the given comment text unchanged.

    @param text: the text of the comment
    """
    comment = text
    self._enqueue(COMMENT, comment)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
210
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def _handle_other(self, text):
    """Resolve entity references that the parser handed over unprocessed.

    Text that does not start with an ampersand is ignored. Otherwise the
    name between the leading '&' and the final character is looked up in
    `htmlentitydefs.name2codepoint` and queued as a TEXT event.

    @param text: the raw text, e.g. `&nbsp;`
    @raise expat.error: if the entity name is not known; the error carries
        `code`, `lineno` and `offset` attributes
    """
    if not text.startswith('&'):
        return

    # deal with undefined entities
    try:
        codepoint = htmlentitydefs.name2codepoint[text[1:-1]]
        text = unichr(codepoint)
        self._enqueue(TEXT, text)
    except KeyError:
        _, lineno, offset = self._getpos()
        message = ('undefined entity "%s": line %d, column %d'
                   % (text, lineno, offset))
        error = expat.error(message)
        error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
        error.lineno = lineno
        error.offset = offset
        raise error
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
225
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
226
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the given XML string and return the events as a `Stream`.

    The whole input is parsed eagerly, so the resulting stream is backed by
    a list of events.

    @param text: the XML source as a string
    """
    source = StringIO(text)
    events = list(XMLParser(source))
    return Stream(events)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
229
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
230
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
231 class HTMLParser(html.HTMLParser, object):
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
232 """Parser for HTML input based on the Python `HTMLParser` module.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
233
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
234 This class provides the same interface for generating stream events as
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
235 `XMLParser`, and attempts to automatically balance tags.
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
236
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
237 The parsing is initiated by iterating over the parser object:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
238
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
239 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
240 >>> for kind, data, pos in parser:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
241 ... print kind, data
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
242 START (u'ul', [(u'compact', u'compact')])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
243 START (u'li', [])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
244 TEXT Foo
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
245 END li
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
246 END ul
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
247 """
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
248
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
249 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
250 'hr', 'img', 'input', 'isindex', 'link', 'meta',
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
251 'param'])
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
252
311
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
253 def __init__(self, source, filename=None, encoding='utf-8'):
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
254 """Initialize the parser for the given HTML input.
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
255
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
256 @param source: the HTML text as a file-like object
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
257 @param filename: the name of the file, if known
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
258 @param filename: encoding of the file; ignored if the input is unicode
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
259 """
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
260 html.HTMLParser.__init__(self)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
261 self.source = source
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
262 self.filename = filename
311
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
263 self.encoding = encoding
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
264 self._queue = []
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
265 self._open_tags = []
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
266
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
267 def parse(self):
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
268 def _generate():
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
269 try:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
270 bufsize = 4 * 1024 # 4K
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
271 done = False
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
272 while 1:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
273 while not done and len(self._queue) == 0:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
274 data = self.source.read(bufsize)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
275 if data == '': # end of data
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
276 self.close()
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
277 done = True
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
278 else:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
279 self.feed(data)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
280 for kind, data, pos in self._queue:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
281 yield kind, data, pos
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
282 self._queue = []
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
283 if done:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
284 open_tags = self._open_tags
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
285 open_tags.reverse()
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
286 for tag in open_tags:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
287 yield END, QName(tag), pos
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
288 break
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
289 except html.HTMLParseError, e:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
290 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
291 if self.filename:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
292 msg += ', in %s' % self.filename
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
293 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
db0dacc1239a Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
294 return Stream(_generate()).filter(_coalesce)
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
295
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
296 def __iter__(self):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
297 return iter(self.parse())
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
298
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
299 def _enqueue(self, kind, data, pos=None):
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
300 if pos is None:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
301 pos = self._getpos()
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
302 self._queue.append((kind, data, pos))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
303
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
304 def _getpos(self):
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
305 lineno, column = self.getpos()
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
306 return (self.filename, lineno, column)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
307
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
308 def handle_starttag(self, tag, attrib):
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
309 fixed_attrib = []
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
310 for name, value in attrib: # Fixup minimized attributes
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
311 if value is None:
312
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
312 value = unicode(name)
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
313 elif not isinstance(value, unicode):
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
314 value = value.decode(self.encoding, 'replace')
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
315 fixed_attrib.append((name, stripentities(value)))
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
316
182
41db0260ebb1 Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
317 self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
318 if tag in self._EMPTY_ELEMS:
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
319 self._enqueue(END, QName(tag))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
320 else:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
321 self._open_tags.append(tag)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
322
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
323 def handle_endtag(self, tag):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
324 if tag not in self._EMPTY_ELEMS:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
325 while self._open_tags:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
326 open_tag = self._open_tags.pop()
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
327 if open_tag.lower() == tag.lower():
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
328 break
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
329 self._enqueue(END, QName(open_tag))
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
330 self._enqueue(END, QName(tag))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
331
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
332 def handle_data(self, text):
311
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
333 if not isinstance(text, unicode):
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
334 text = text.decode(self.encoding, 'replace')
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
335 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
336
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
337 def handle_charref(self, name):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
338 text = unichr(int(name))
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
339 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
340
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
341 def handle_entityref(self, name):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
342 try:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
343 text = unichr(htmlentitydefs.name2codepoint[name])
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
344 except KeyError:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
345 text = '&%s;' % name
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
346 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
347
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
348 def handle_pi(self, data):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
349 target, data = data.split(maxsplit=1)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
350 data = data.rstrip('?')
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
351 self._enqueue(PI, (target.strip(), data.strip()))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
352
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
353 def handle_comment(self, text):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
354 self._enqueue(COMMENT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
355
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
356
311
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
def HTML(text, encoding='utf-8'):
    """Parse the given HTML string and return the events as a `Stream`.

    The whole input is parsed eagerly, so the resulting stream is backed by
    a list of events.

    @param text: the HTML source as a string
    @param encoding: the encoding passed on to `HTMLParser`
    """
    parser = HTMLParser(StringIO(text), encoding=encoding)
    events = list(parser)
    return Stream(events)
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
359
146
db0dacc1239a Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
def _coalesce(stream):
    """Merge each run of adjacent TEXT events into a single TEXT event.

    The merged event carries the position of the first event of the run
    whose position is not `None`. All other events are passed through
    unchanged, except that events with a false `kind` are dropped.
    """
    pending = []
    start = None
    for kind, data, pos in stream:
        if kind is TEXT:
            # Keep collecting; nothing is emitted until the run ends.
            pending.append(data)
            if start is None:
                start = pos
            continue
        if pending:
            yield TEXT, u''.join(pending), start
            pending = []
            start = None
        if kind:
            yield kind, data, pos
    # Flush text that was still buffered when the stream ended.
    if pending:
        yield TEXT, u''.join(pending), start
Copyright (C) 2012-2017 Edgewall Software