annotate markup/input.py @ 211:e5151983df0d trunk

Fix another regression introduced in [258]: some kinds of cascaded match templates were broken, for example in the TurboGears example app.
author cmlenz
date Tue, 29 Aug 2006 21:14:58 +0000
parents fc6b2fb66518
children 13d2d4420628
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
3 # Copyright (C) 2006 Edgewall Software
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
8 # are also available at http://markup.edgewall.org/wiki/License.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
12 # history and logs, available at http://markup.edgewall.org/log/.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
14 from itertools import chain
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
15 from xml.parsers import expat
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
16 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
17 frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
18 except NameError:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
19 from sets import ImmutableSet as frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
20 import HTMLParser as html
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
21 import htmlentitydefs
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
22 from StringIO import StringIO
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
23
182
2f30ce3fb85e Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
24 from markup.core import Attrs, QName, Stream
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
25 from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
26 START_CDATA, END_CDATA, PI, COMMENT
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
27
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
28 __all__ = ['ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
29
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
30
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
31 class ParseError(Exception):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
32 """Exception raised when fatal syntax errors are found in the input being
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
33 parsed."""
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
34
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
35 def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
36 Exception.__init__(self, message)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
37 self.filename = filename
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
38 self.lineno = lineno
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
39 self.offset = offset
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
40
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
41
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
42 class XMLParser(object):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
43 """Generator-based XML parser based on roughly equivalent code in
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
44 Kid/ElementTree.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
45
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
46 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
47
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
48 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
49 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
50 ... print kind, data
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
51 START (u'root', [(u'id', u'2')])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
52 START (u'child', [])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
53 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
54 END child
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
55 END root
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
56 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
57
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
58 def __init__(self, source, filename=None):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
59 """Initialize the parser for the given XML text.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
60
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
61 @param source: the XML text as a file-like object
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
62 @param filename: the name of the file, if appropriate
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
63 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
64 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
65 self.filename = filename
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
66
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
67 # Setup the Expat parser
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
68 parser = expat.ParserCreate('utf-8', '}')
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
69 parser.buffer_text = True
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
70 parser.returns_unicode = True
160
d19e8a2c549e Attribute order in parsed XML is now preserved.
cmlenz
parents: 146
diff changeset
71 parser.ordered_attributes = True
d19e8a2c549e Attribute order in parsed XML is now preserved.
cmlenz
parents: 146
diff changeset
72
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
73 parser.StartElementHandler = self._handle_start
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
74 parser.EndElementHandler = self._handle_end
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
75 parser.CharacterDataHandler = self._handle_data
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
76 parser.StartDoctypeDeclHandler = self._handle_doctype
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
77 parser.StartNamespaceDeclHandler = self._handle_start_ns
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
78 parser.EndNamespaceDeclHandler = self._handle_end_ns
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
79 parser.StartCdataSectionHandler = self._handle_start_cdata
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
80 parser.EndCdataSectionHandler = self._handle_end_cdata
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
81 parser.ProcessingInstructionHandler = self._handle_pi
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
82 parser.CommentHandler = self._handle_comment
209
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
83
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
84 # Tell Expat that we'll handle non-XML entities ourselves
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
85 # (in _handle_other)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
86 parser.DefaultHandler = self._handle_other
209
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
87 parser.UseForeignDTD()
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
88
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
89 # Location reporting is only supported in Python >= 2.4
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
90 if not hasattr(parser, 'CurrentLineNumber'):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
91 self._getpos = self._getpos_unknown
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
92
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
93 self.expat = parser
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
94 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
95
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
96 def parse(self):
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
97 def _generate():
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
98 try:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
99 bufsize = 4 * 1024 # 4K
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
100 done = False
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
101 while 1:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
102 while not done and len(self._queue) == 0:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
103 data = self.source.read(bufsize)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
104 if data == '': # end of data
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
105 if hasattr(self, 'expat'):
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
106 self.expat.Parse('', True)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
107 del self.expat # get rid of circular references
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
108 done = True
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
109 else:
207
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
110 if isinstance(data, unicode):
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
111 data = data.encode('utf-8')
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
112 self.expat.Parse(data, False)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
113 for event in self._queue:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
114 yield event
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
115 self._queue = []
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
116 if done:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
117 break
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
118 except expat.ExpatError, e:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
119 msg = str(e)
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
120 if self.filename:
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
121 msg += ', in ' + self.filename
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
122 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
04799355362d Simplified `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
123 return Stream(_generate()).filter(_coalesce)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
124
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
125 def __iter__(self):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
126 return iter(self.parse())
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
127
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
128 def _enqueue(self, kind, data=None, pos=None):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
129 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
130 pos = self._getpos()
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
131 if kind is TEXT:
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
132 # Expat reports the *end* of the text event as current position. We
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
133 # try to fix that up here as much as possible. Unfortunately, the
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
134 # offset is only valid for single-line text. For multi-line text,
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
135 # it is apparently not possible to determine at what offset it
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
136 # started
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
137 if '\n' in data:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
138 lines = data.splitlines()
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
139 lineno = pos[1] - len(lines) + 1
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
140 offset = -1
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
141 else:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
142 lineno = pos[1]
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
143 offset = pos[2] - len(data)
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
144 pos = (pos[0], lineno, offset)
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
145 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
146
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
147 def _getpos_unknown(self):
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
148 return (self.filename, -1, -1)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
149
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
150 def _getpos(self):
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
151 return (self.filename, self.expat.CurrentLineNumber,
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
152 self.expat.CurrentColumnNumber)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
153
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
154 def _handle_start(self, tag, attrib):
182
2f30ce3fb85e Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
155 self._enqueue(START, (QName(tag), Attrs(zip(*[iter(attrib)] * 2))))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
156
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
157 def _handle_end(self, tag):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
158 self._enqueue(END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
159
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
160 def _handle_data(self, text):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
161 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
162
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
163 def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
164 self._enqueue(DOCTYPE, (name, pubid, sysid))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
165
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
166 def _handle_start_ns(self, prefix, uri):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
167 self._enqueue(START_NS, (prefix or '', uri))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
168
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
169 def _handle_end_ns(self, prefix):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
170 self._enqueue(END_NS, prefix or '')
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
171
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
172 def _handle_start_cdata(self):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
173 self._enqueue(START_CDATA)
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
174
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
175 def _handle_end_cdata(self):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
176 self._enqueue(END_CDATA)
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
177
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
178 def _handle_pi(self, target, data):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
179 self._enqueue(PI, (target, data))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
180
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
181 def _handle_comment(self, text):
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
182 self._enqueue(COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
183
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
184 def _handle_other(self, text):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
185 if text.startswith('&'):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
186 # deal with undefined entities
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
187 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
188 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
189 self._enqueue(TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
190 except KeyError:
209
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
191 filename, lineno, offset = self._getpos()
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
192 error = expat.error('undefined entity "%s": line %d, column %d'
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
193 % (text, lineno, offset))
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
194 error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
195 error.lineno = lineno
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
196 error.offset = offset
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
197 raise error
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
198
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
199
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
200 def XML(text):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
201 return Stream(list(XMLParser(StringIO(text))))
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
202
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
203
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
class HTMLParser(html.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.
    
    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.
    
    The parsing is initiated by iterating over the parser object:
    
    >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (u'ul', [(u'compact', u'compact')])
    START (u'li', [])
    TEXT Foo
    END li
    END ul
    """

    # Elements that never have content: an END event is generated right
    # after their START event, and explicit end tags for them are ignored.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source, filename=None):
        """Initialize the parser.

        `source` is a file-like object; its `read(size)` method is called
        repeatedly while parsing. `filename` is optional and is only used
        for event positions and error messages.
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self._queue = []      # events produced by the handlers, not yet yielded
        self._open_tags = []  # stack of elements that still need an END event

    def parse(self):
        """Return a `Stream` of the events found in the source.

        The source is read in 4K chunks. Elements still open when the input
        ends are closed automatically, innermost first. The resulting stream
        is passed through `_coalesce` so that text split across chunk
        boundaries arrives as a single TEXT event.

        A `ParseError` carrying filename, line and column is raised during
        iteration if the underlying parser reports an `HTMLParseError`.
        """
        def _generate():
            try:
                bufsize = 4 * 1024 # 4K
                done = False
                while 1:
                    # Keep feeding the parser until it has produced at least
                    # one event or the input is exhausted
                    while not done and not self._queue:
                        chunk = self.source.read(bufsize)
                        if not chunk: # end of data
                            self.close()
                            done = True
                        else:
                            self.feed(chunk)
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if done:
                        # Balance the stream: close whatever is still open,
                        # innermost element first. `pos` is the position of
                        # the last event yielded above; it is always bound
                        # here because every open tag was announced by a
                        # START event that has already been yielded.
                        while self._open_tags:
                            yield END, QName(self._open_tags.pop()), pos
                        break
            except html.HTMLParseError as e:
                msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
                if self.filename:
                    msg += ', in %s' % self.filename
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        """Iterate over the events produced by `parse()`."""
        return iter(self.parse())

    def _enqueue(self, kind, data, pos=None):
        """Append an event to the queue, defaulting `pos` to the current
        parser position."""
        if pos is None:
            pos = self._getpos()
        self._queue.append((kind, data, pos))

    def _getpos(self):
        """Return the current position as a `(filename, lineno, column)`
        tuple."""
        lineno, column = self.getpos()
        return (self.filename, lineno, column)

    def handle_starttag(self, tag, attrib):
        """Queue a START event; empty elements get their END event at once,
        all other elements are pushed on the stack of open tags."""
        fixed_attrib = []
        for name, value in attrib: # Fixup minimized attributes
            if value is None:
                value = name
            fixed_attrib.append((name, unicode(value)))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        """Queue the END event for `tag`, first closing any elements that
        were opened inside it and not closed explicitly.

        End tags of empty elements are ignored because their END event was
        already queued by `handle_starttag`. End tags without a matching
        open element are ignored too: emitting an END for them, or closing
        unrelated open elements on their behalf, would unbalance the stream.
        """
        if tag in self._EMPTY_ELEMS:
            return
        wanted = tag.lower()
        if wanted not in [open_tag.lower() for open_tag in self._open_tags]:
            return # stray end tag, nothing to close
        while self._open_tags:
            open_tag = self._open_tags.pop()
            if open_tag.lower() == wanted:
                break
            # Implicitly close elements nested inside the one being closed
            self._enqueue(END, QName(open_tag))
        self._enqueue(END, QName(tag))

    def handle_data(self, text):
        """Queue character data as a TEXT event."""
        self._enqueue(TEXT, text)

    def handle_charref(self, name):
        """Queue the character for a numeric reference as a TEXT event.

        `name` is the reference without the surrounding `&#` and `;`; it is
        either decimal (`65`) or hexadecimal with an `x` prefix (`x41`).
        References that cannot be converted to a character are passed
        through as literal text, like unknown entities in
        `handle_entityref`.
        """
        try:
            if name.lower().startswith('x'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = unichr(codepoint)
        except ValueError: # malformed number or code point out of range
            text = '&#%s;' % name
        self._enqueue(TEXT, text)

    def handle_entityref(self, name):
        """Queue the character for a named entity as a TEXT event; unknown
        entities are passed through as literal `&name;` text."""
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = '&%s;' % name
        self._enqueue(TEXT, text)

    def handle_pi(self, data):
        """Queue a PI event holding a `(target, data)` tuple.

        `data` is the text between `<?` and `>`, so XML-style instructions
        arrive with a trailing `?` which is removed here. An instruction
        that consists of a target only yields an empty data string.
        """
        # Positional arguments: `str.split` takes no keywords in Python 2
        parts = data.rstrip('?').split(None, 1)
        # Pad so that a missing target and/or data becomes an empty string
        target, data = (parts + ['', ''])[:2]
        self._enqueue(PI, (target.strip(), data.strip()))

    def handle_comment(self, text):
        """Queue the comment text as a COMMENT event."""
        self._enqueue(COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
317
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
318
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def HTML(text):
    """Parse a string of HTML markup and return the events as a `Stream`.

    The text is wrapped in a `StringIO` and run through `HTMLParser`. All
    events are collected into a list before the stream is created, so the
    whole input is parsed (and any parse error raised) by the time this
    function returns.
    """
    source = StringIO(text)
    events = list(HTMLParser(source))
    return Stream(events)
144
d1ce85a7f296 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
321
146
04799355362d Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
def _coalesce(stream):
    """Merge every run of consecutive TEXT events into one TEXT event.

    The merged event carries the concatenated text and the position of the
    first event of the run that has a position. All other events are passed
    through unchanged.
    """
    pending = []    # text fragments of the run currently being collected
    run_pos = None  # position reported for the merged event
    # A trailing sentinel that is neither TEXT nor a real event flushes a
    # text run that ends the stream; it is never yielded itself.
    sentinel = (None, None, None)
    for event in chain(stream, [sentinel]):
        kind, data, pos = event
        if kind is TEXT:
            if run_pos is None:
                run_pos = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), run_pos
            pending = []
            run_pos = None
        if kind:
            yield kind, data, pos
Copyright (C) 2012-2017 Edgewall Software