Mercurial > genshi > genshi-test
annotate markup/input.py @ 140:a2edde90ad24
Fix bug in HTML serializer, plus some other minor tweaks.
author | cmlenz |
---|---|
date | Wed, 09 Aug 2006 21:00:15 +0000 |
parents | df44110ca91d |
children | ef761afcedff |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
14 from xml.parsers import expat | |
15 try: | |
16 frozenset | |
17 except NameError: | |
18 from sets import ImmutableSet as frozenset | |
19 import HTMLParser as html | |
20 import htmlentitydefs | |
21 from StringIO import StringIO | |
22 | |
23 from markup.core import Attributes, Markup, QName, Stream | |
24 | |
25 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
class ParseError(Exception):
    """Error signalling a fatal syntax problem in the input being parsed.

    In addition to the message, instances record where the problem was found
    in the `filename`, `lineno` and `offset` attributes.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message
        @param filename: name of the input the error was found in
        @param lineno: line number of the error, `-1` by default
        @param offset: column offset of the error, `-1` by default
        """
        self.filename, self.lineno, self.offset = filename, lineno, offset
        # Explicit base-class call (rather than `super`) keeps this working
        # where `Exception` is an old-style class.
        Exception.__init__(self, message)
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
35 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
36 |
1 | 37 class XMLParser(object): |
38 """Generator-based XML parser based on roughly equivalent code in | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
39 Kid/ElementTree. |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
40 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
41 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
42 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
43 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
44 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
45 ... print kind, data |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
46 START (u'root', [(u'id', u'2')]) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
47 START (u'child', []) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
48 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
49 END child |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
50 END root |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
51 """ |
1 | 52 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
53 def __init__(self, source, filename=None): |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
54 """Initialize the parser for the given XML text. |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
55 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
56 @param source: the XML text as a file-like object |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
57 @param filename: the name of the file, if appropriate |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
58 """ |
1 | 59 self.source = source |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
60 self.filename = filename |
1 | 61 |
62 # Setup the Expat parser | |
63 parser = expat.ParserCreate('utf-8', '}') | |
64 parser.buffer_text = True | |
65 parser.returns_unicode = True | |
66 parser.StartElementHandler = self._handle_start | |
67 parser.EndElementHandler = self._handle_end | |
68 parser.CharacterDataHandler = self._handle_data | |
69 parser.StartDoctypeDeclHandler = self._handle_doctype | |
70 parser.StartNamespaceDeclHandler = self._handle_start_ns | |
71 parser.EndNamespaceDeclHandler = self._handle_end_ns | |
72 parser.ProcessingInstructionHandler = self._handle_pi | |
73 parser.CommentHandler = self._handle_comment | |
74 parser.DefaultHandler = self._handle_other | |
75 | |
76 # Location reporting is only support in Python >= 2.4 | |
77 if not hasattr(parser, 'CurrentLineNumber'): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
78 self._getpos = self._getpos_unknown |
1 | 79 |
80 self.expat = parser | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
81 self._queue = [] |
1 | 82 |
83 def __iter__(self): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
84 try: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
85 bufsize = 4 * 1024 # 4K |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
86 done = False |
69 | 87 while 1: |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
88 while not done and len(self._queue) == 0: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
89 data = self.source.read(bufsize) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
90 if data == '': # end of data |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
91 if hasattr(self, 'expat'): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
92 self.expat.Parse('', True) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
93 del self.expat # get rid of circular references |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
94 done = True |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
95 else: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
96 self.expat.Parse(data, False) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
97 for event in self._queue: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
98 yield event |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
99 self._queue = [] |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
100 if done: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
101 break |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
102 except expat.ExpatError, e: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
103 msg = str(e) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
104 if self.filename: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
105 msg += ', in ' + self.filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
106 raise ParseError(msg, self.filename, e.lineno, e.offset) |
1 | 107 |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
108 def _enqueue(self, kind, data, pos=None): |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
109 if pos is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
110 pos = self._getpos() |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
111 if kind is Stream.TEXT: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
112 # Expat reports the *end* of the text event as current position. We |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
113 # try to fix that up here as much as possible. Unfortunately, the |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
114 # offset is only valid for single-line text. For multi-line text, |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
115 # it is apparently not possible to determine at what offset it |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
116 # started |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
117 if '\n' in data: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
118 lines = data.splitlines() |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
119 lineno = pos[1] - len(lines) + 1 |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
120 offset = -1 |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
121 else: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
122 lineno = pos[1] |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
123 offset = pos[2] - len(data) |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
124 pos = (pos[0], lineno, offset) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
125 self._queue.append((kind, data, pos)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
126 |
1 | 127 def _getpos_unknown(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
128 return (self.filename, -1, -1) |
1 | 129 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
130 def _getpos(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
131 return (self.filename, self.expat.CurrentLineNumber, |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
132 self.expat.CurrentColumnNumber) |
1 | 133 |
134 def _handle_start(self, tag, attrib): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
135 self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items()))) |
1 | 136 |
137 def _handle_end(self, tag): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
138 self._enqueue(Stream.END, QName(tag)) |
1 | 139 |
140 def _handle_data(self, text): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
141 self._enqueue(Stream.TEXT, text) |
1 | 142 |
143 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
144 self._enqueue(Stream.DOCTYPE, (name, pubid, sysid)) |
1 | 145 |
146 def _handle_start_ns(self, prefix, uri): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
147 self._enqueue(Stream.START_NS, (prefix or '', uri)) |
1 | 148 |
149 def _handle_end_ns(self, prefix): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
150 self._enqueue(Stream.END_NS, prefix or '') |
1 | 151 |
152 def _handle_pi(self, target, data): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
153 self._enqueue(Stream.PI, (target, data)) |
1 | 154 |
155 def _handle_comment(self, text): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
156 self._enqueue(Stream.COMMENT, text) |
1 | 157 |
158 def _handle_other(self, text): | |
159 if text.startswith('&'): | |
160 # deal with undefined entities | |
161 try: | |
162 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
163 self._enqueue(Stream.TEXT, text) |
1 | 164 except KeyError: |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
165 lineno, offset = self._getpos() |
1 | 166 raise expat.error("undefined entity %s: line %d, column %d" % |
167 (text, lineno, offset)) | |
168 | |
169 | |
def XML(text):
    """Parse a string of XML and return the resulting events as a `Stream`.

    @param text: the XML source text
    """
    events = list(XMLParser(StringIO(text)))
    return Stream(events)
172 | |
173 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
174 class HTMLParser(html.HTMLParser, object): |
1 | 175 """Parser for HTML input based on the Python `HTMLParser` module. |
176 | |
177 This class provides the same interface for generating stream events as | |
178 `XMLParser`, and attempts to automatically balance tags. | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
179 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
180 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
181 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
182 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
183 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
184 ... print kind, data |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
185 START (u'ul', [(u'compact', u'compact')]) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
186 START (u'li', []) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
187 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
188 END li |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
189 END ul |
1 | 190 """ |
191 | |
192 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
193 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
194 'param']) | |
195 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
196 def __init__(self, source, filename=None): |
1 | 197 html.HTMLParser.__init__(self) |
198 self.source = source | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
199 self.filename = filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
200 self._queue = [] |
1 | 201 self._open_tags = [] |
202 | |
203 def __iter__(self): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
204 try: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
205 bufsize = 4 * 1024 # 4K |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
206 done = False |
69 | 207 while 1: |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
208 while not done and len(self._queue) == 0: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
209 data = self.source.read(bufsize) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
210 if data == '': # end of data |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
211 self.close() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
212 done = True |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
213 else: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
214 self.feed(data) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
215 for kind, data, pos in self._queue: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
216 yield kind, data, pos |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
217 self._queue = [] |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
218 if done: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
219 open_tags = self._open_tags |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
220 open_tags.reverse() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
221 for tag in open_tags: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
222 yield Stream.END, QName(tag), pos |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
223 break |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
224 except html.HTMLParseError, e: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
225 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
226 if self.filename: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
227 msg += ', in %s' % self.filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
228 raise ParseError(msg, self.filename, e.lineno, e.offset) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
229 |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
230 def _enqueue(self, kind, data, pos=None): |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
231 if pos is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
232 pos = self._getpos() |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
233 self._queue.append((kind, data, pos)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
234 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
235 def _getpos(self): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
236 lineno, column = self.getpos() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
237 return (self.filename, lineno, column) |
1 | 238 |
239 def handle_starttag(self, tag, attrib): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
240 fixed_attrib = [] |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
241 for name, value in attrib: # Fixup minimized attributes |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
242 if value is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
243 value = name |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
244 fixed_attrib.append((name, unicode(value))) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
245 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
246 self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib))) |
1 | 247 if tag in self._EMPTY_ELEMS: |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
248 self._enqueue(Stream.END, QName(tag)) |
1 | 249 else: |
250 self._open_tags.append(tag) | |
251 | |
252 def handle_endtag(self, tag): | |
253 if tag not in self._EMPTY_ELEMS: | |
254 while self._open_tags: | |
255 open_tag = self._open_tags.pop() | |
256 if open_tag.lower() == tag.lower(): | |
257 break | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
258 self._enqueue(Stream.END, QName(open_tag)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
259 self._enqueue(Stream.END, QName(tag)) |
1 | 260 |
261 def handle_data(self, text): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
262 self._enqueue(Stream.TEXT, text) |
1 | 263 |
264 def handle_charref(self, name): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
265 self._enqueue(Stream.TEXT, Markup('&#%s;' % name)) |
1 | 266 |
267 def handle_entityref(self, name): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
268 self._enqueue(Stream.TEXT, Markup('&%s;' % name)) |
1 | 269 |
270 def handle_pi(self, data): | |
271 target, data = data.split(maxsplit=1) | |
272 data = data.rstrip('?') | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
273 self._enqueue(Stream.PI, (target.strip(), data.strip())) |
1 | 274 |
275 def handle_comment(self, text): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
276 self._enqueue(Stream.COMMENT, text) |
1 | 277 |
278 | |
def HTML(text):
    """Parse a string of HTML and return the resulting events as a `Stream`.

    @param text: the HTML source text
    """
    events = list(HTMLParser(StringIO(text)))
    return Stream(events)