annotate genshi/input.py @ 230:24757b771651

Renamed Markup to Genshi in repository.
author cmlenz
date Mon, 11 Sep 2006 15:07:07 +0000
parents markup/input.py@bafa1cc49c2f
children a6738047c85e 2fd22ce67ed0
rev   line source
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
2 #
66
822089ae65ce Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
3 # Copyright (C) 2006 Edgewall Software
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
5 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
8 # are also available at http://genshi.edgewall.org/wiki/License.
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
9 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
12 # history and logs, available at http://genshi.edgewall.org/log/.
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
13
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
14 from itertools import chain
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
15 from xml.parsers import expat
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
16 try:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
17 frozenset
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
18 except NameError:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
19 from sets import ImmutableSet as frozenset
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
20 import HTMLParser as html
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
21 import htmlentitydefs
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
22 from StringIO import StringIO
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
23
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
24 from genshi.core import Attrs, QName, Stream
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 213
diff changeset
25 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
26 START_CDATA, END_CDATA, PI, COMMENT
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
27
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
28 __all__ = ['ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
29
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
30
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed.

    @ivar msg: the original error message, as passed to the constructor
    @ivar filename: the name of the file being parsed; falls back to
        `'<string>'` when no name is available
    @ivar lineno: the line at which the error was detected, or -1 if unknown
    @ivar offset: the column at which the error was detected, or -1 if unknown
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message
        @param filename: the name of the file being parsed; an empty value
            or `None` is treated like the default `'<string>'`
        @param lineno: the number of the line containing the error
        @param offset: the column offset of the error inside that line
        """
        Exception.__init__(self, message)
        self.msg = message
        # Callers such as `XMLParser` forward their own `filename`, which
        # defaults to None; normalize so that the attribute is always a usable
        # name rather than silently bypassing the declared default.
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
41
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
42
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
43 class XMLParser(object):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
44 """Generator-based XML parser based on roughly equivalent code in
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
45 Kid/ElementTree.
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
46
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
47 The parsing is initiated by iterating over the parser object:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
48
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
49 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
50 >>> for kind, data, pos in parser:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
51 ... print kind, data
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
52 START (u'root', [(u'id', u'2')])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
53 START (u'child', [])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
54 TEXT Foo
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
55 END child
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
56 END root
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
57 """
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
58
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
59 def __init__(self, source, filename=None):
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
60 """Initialize the parser for the given XML text.
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
61
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
62 @param source: the XML text as a file-like object
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
63 @param filename: the name of the file, if appropriate
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
64 """
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
65 self.source = source
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
66 self.filename = filename
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
67
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
68 # Setup the Expat parser
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
69 parser = expat.ParserCreate('utf-8', '}')
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
70 parser.buffer_text = True
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
71 parser.returns_unicode = True
160
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 146
diff changeset
72 parser.ordered_attributes = True
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 146
diff changeset
73
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
74 parser.StartElementHandler = self._handle_start
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
75 parser.EndElementHandler = self._handle_end
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
76 parser.CharacterDataHandler = self._handle_data
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
77 parser.StartDoctypeDeclHandler = self._handle_doctype
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
78 parser.StartNamespaceDeclHandler = self._handle_start_ns
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
79 parser.EndNamespaceDeclHandler = self._handle_end_ns
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
80 parser.StartCdataSectionHandler = self._handle_start_cdata
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
81 parser.EndCdataSectionHandler = self._handle_end_cdata
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
82 parser.ProcessingInstructionHandler = self._handle_pi
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
83 parser.CommentHandler = self._handle_comment
209
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
84
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
85 # Tell Expat that we'll handle non-XML entities ourselves
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
86 # (in _handle_other)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
87 parser.DefaultHandler = self._handle_other
209
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
88 parser.UseForeignDTD()
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
89
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
90 # Location reporting is only support in Python >= 2.4
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
91 if not hasattr(parser, 'CurrentLineNumber'):
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
92 self._getpos = self._getpos_unknown
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
93
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
94 self.expat = parser
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
95 self._queue = []
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
96
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
97 def parse(self):
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
98 def _generate():
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
99 try:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
100 bufsize = 4 * 1024 # 4K
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
101 done = False
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
102 while 1:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
103 while not done and len(self._queue) == 0:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
104 data = self.source.read(bufsize)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
105 if data == '': # end of data
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
106 if hasattr(self, 'expat'):
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
107 self.expat.Parse('', True)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
108 del self.expat # get rid of circular references
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
109 done = True
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
110 else:
207
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
111 if isinstance(data, unicode):
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 182
diff changeset
112 data = data.encode('utf-8')
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
113 self.expat.Parse(data, False)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
114 for event in self._queue:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
115 yield event
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
116 self._queue = []
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
117 if done:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
118 break
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
119 except expat.ExpatError, e:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
120 msg = str(e)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
121 if self.filename:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
122 msg += ', in ' + self.filename
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
123 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
db0dacc1239a Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
124 return Stream(_generate()).filter(_coalesce)
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
125
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
126 def __iter__(self):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
127 return iter(self.parse())
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
128
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
129 def _enqueue(self, kind, data=None, pos=None):
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
130 if pos is None:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
131 pos = self._getpos()
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
132 if kind is TEXT:
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
133 # Expat reports the *end* of the text event as current position. We
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
134 # try to fix that up here as much as possible. Unfortunately, the
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
135 # offset is only valid for single-line text. For multi-line text,
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
136 # it is apparently not possible to determine at what offset it
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
137 # started
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
138 if '\n' in data:
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
139 lines = data.splitlines()
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
140 lineno = pos[1] - len(lines) + 1
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
141 offset = -1
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
142 else:
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
143 lineno = pos[1]
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
144 offset = pos[2] - len(data)
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
145 pos = (pos[0], lineno, offset)
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
146 self._queue.append((kind, data, pos))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
147
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
148 def _getpos_unknown(self):
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
149 return (self.filename, -1, -1)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
150
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
151 def _getpos(self):
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
152 return (self.filename, self.expat.CurrentLineNumber,
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
153 self.expat.CurrentColumnNumber)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
154
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
155 def _handle_start(self, tag, attrib):
182
41db0260ebb1 Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
156 self._enqueue(START, (QName(tag), Attrs(zip(*[iter(attrib)] * 2))))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
157
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
158 def _handle_end(self, tag):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
159 self._enqueue(END, QName(tag))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
160
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
161 def _handle_data(self, text):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
162 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
163
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
164 def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
165 self._enqueue(DOCTYPE, (name, pubid, sysid))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
166
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
167 def _handle_start_ns(self, prefix, uri):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
168 self._enqueue(START_NS, (prefix or '', uri))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
169
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
170 def _handle_end_ns(self, prefix):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
171 self._enqueue(END_NS, prefix or '')
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
172
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
173 def _handle_start_cdata(self):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
174 self._enqueue(START_CDATA)
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
175
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
176 def _handle_end_cdata(self):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
177 self._enqueue(END_CDATA)
143
ef761afcedff CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
178
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
179 def _handle_pi(self, target, data):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
180 self._enqueue(PI, (target, data))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
181
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
182 def _handle_comment(self, text):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
183 self._enqueue(COMMENT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
184
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
185 def _handle_other(self, text):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
186 if text.startswith('&'):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
187 # deal with undefined entities
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
188 try:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
189 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
190 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
191 except KeyError:
209
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
192 filename, lineno, offset = self._getpos()
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
193 error = expat.error('undefined entity "%s": line %d, column %d'
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
194 % (text, lineno, offset))
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
195 error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
196 error.lineno = lineno
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
197 error.offset = offset
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
198 raise error
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
199
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
200
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the given XML text and return the events as a `Stream`.

    The text is parsed eagerly: all events are collected into a list before
    the stream is created.

    @param text: the XML text to parse, as a string
    """
    events = list(XMLParser(StringIO(text)))
    return Stream(events)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
203
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
204
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
205 class HTMLParser(html.HTMLParser, object):
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
206 """Parser for HTML input based on the Python `HTMLParser` module.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
207
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
208 This class provides the same interface for generating stream events as
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
209 `XMLParser`, and attempts to automatically balance tags.
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
210
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
211 The parsing is initiated by iterating over the parser object:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
212
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
213 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
214 >>> for kind, data, pos in parser:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
215 ... print kind, data
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
216 START (u'ul', [(u'compact', u'compact')])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
217 START (u'li', [])
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
218 TEXT Foo
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
219 END li
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
220 END ul
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
221 """
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
222
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
223 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
224 'hr', 'img', 'input', 'isindex', 'link', 'meta',
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
225 'param'])
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
226
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
227 def __init__(self, source, filename=None):
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
228 html.HTMLParser.__init__(self)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
229 self.source = source
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
230 self.filename = filename
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
231 self._queue = []
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
232 self._open_tags = []
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
233
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
234 def parse(self):
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
235 def _generate():
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
236 try:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
237 bufsize = 4 * 1024 # 4K
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
238 done = False
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
239 while 1:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
240 while not done and len(self._queue) == 0:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
241 data = self.source.read(bufsize)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
242 if data == '': # end of data
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
243 self.close()
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
244 done = True
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
245 else:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
246 self.feed(data)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
247 for kind, data, pos in self._queue:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
248 yield kind, data, pos
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
249 self._queue = []
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
250 if done:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
251 open_tags = self._open_tags
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
252 open_tags.reverse()
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
253 for tag in open_tags:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
254 yield END, QName(tag), pos
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
255 break
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
256 except html.HTMLParseError, e:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
257 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
258 if self.filename:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
259 msg += ', in %s' % self.filename
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
260 raise ParseError(msg, self.filename, e.lineno, e.offset)
146
db0dacc1239a Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
261 return Stream(_generate()).filter(_coalesce)
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
262
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
263 def __iter__(self):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
264 return iter(self.parse())
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
265
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
266 def _enqueue(self, kind, data, pos=None):
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
267 if pos is None:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
268 pos = self._getpos()
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
269 self._queue.append((kind, data, pos))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
270
21
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
271 def _getpos(self):
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
272 lineno, column = self.getpos()
eca77129518a * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
273 return (self.filename, lineno, column)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
274
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
275 def handle_starttag(self, tag, attrib):
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
276 fixed_attrib = []
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
277 for name, value in attrib: # Fixup minimized attributes
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
278 if value is None:
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
279 value = name
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
280 fixed_attrib.append((name, unicode(value)))
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
281
182
41db0260ebb1 Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents: 160
diff changeset
282 self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
283 if tag in self._EMPTY_ELEMS:
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
284 self._enqueue(END, QName(tag))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
285 else:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
286 self._open_tags.append(tag)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
287
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
288 def handle_endtag(self, tag):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
289 if tag not in self._EMPTY_ELEMS:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
290 while self._open_tags:
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
291 open_tag = self._open_tags.pop()
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
292 if open_tag.lower() == tag.lower():
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
293 break
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
294 self._enqueue(END, QName(open_tag))
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
295 self._enqueue(END, QName(tag))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
296
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
297 def handle_data(self, text):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
298 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
299
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
300 def handle_charref(self, name):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
301 text = unichr(int(name))
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
302 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
303
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
304 def handle_entityref(self, name):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
305 try:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
306 text = unichr(htmlentitydefs.name2codepoint[name])
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
307 except KeyError:
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
308 text = '&%s;' % name
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
309 self._enqueue(TEXT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
310
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
311 def handle_pi(self, data):
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
312 target, data = data.split(maxsplit=1)
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
313 data = data.rstrip('?')
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
314 self._enqueue(PI, (target.strip(), data.strip()))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
315
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
316 def handle_comment(self, text):
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
317 self._enqueue(COMMENT, text)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
318
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
319
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
def HTML(text):
    """Parse a string of HTML and return the result as a buffered stream.

    The text is wrapped in a `StringIO` object, run through `HTMLParser`, and
    the resulting events are collected into a list before being wrapped in a
    `Stream`.
    """
    source = StringIO(text)
    events = list(HTMLParser(source))
    return Stream(events)
144
28b56f09a7e1 * Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents: 143
diff changeset
322
146
db0dacc1239a Simplifed `CoalesceFilter` (now a function)
cmlenz
parents: 145
diff changeset
def _coalesce(stream):
    """Merge every run of consecutive TEXT events into one TEXT event.

    The merged event carries the concatenated text and the position of the
    first event in the run that had a position. All other events are passed
    through unchanged.
    """
    pending = []      # text fragments of the run currently being collected
    first_pos = None  # position to report for that run
    # A trailing dummy event makes sure a run of text at the very end of the
    # stream is flushed as well
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if first_pos is None:
                first_pos = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), first_pos
            pending = []
            first_pos = None
        if kind: # false only for the sentinel
            yield kind, data, pos
Copyright (C) 2012-2017 Edgewall Software