Mercurial > genshi > mirror
annotate markup/input.py @ 143:3d4c214c979a trunk
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
author | cmlenz |
---|---|
date | Fri, 11 Aug 2006 14:08:13 +0000 |
parents | c1f4390d50f8 |
children | d1ce85a7f296 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
14 from xml.parsers import expat | |
15 try: | |
16 frozenset | |
17 except NameError: | |
18 from sets import ImmutableSet as frozenset | |
19 import HTMLParser as html | |
20 import htmlentitydefs | |
21 from StringIO import StringIO | |
22 | |
23 from markup.core import Attributes, Markup, QName, Stream | |
24 | |
25 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
26 class ParseError(Exception): |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
27 """Exception raised when fatal syntax errors are found in the input being |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
28 parsed.""" |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
29 |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
30 def __init__(self, message, filename='<string>', lineno=-1, offset=-1): |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
31 Exception.__init__(self, message) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
32 self.filename = filename |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
33 self.lineno = lineno |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
34 self.offset = offset |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
35 |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
36 |
1 | 37 class XMLParser(object): |
38 """Generator-based XML parser based on roughly equivalent code in | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
39 Kid/ElementTree. |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
40 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
41 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
42 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
43 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
44 >>> for kind, data, pos in parser: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
45 ... print kind, data |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
46 START (u'root', [(u'id', u'2')]) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
47 START (u'child', []) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
48 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
49 END child |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
50 END root |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
51 """ |
1 | 52 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
53 def __init__(self, source, filename=None): |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
54 """Initialize the parser for the given XML text. |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
55 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
56 @param source: the XML text as a file-like object |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
57 @param filename: the name of the file, if appropriate |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
58 """ |
1 | 59 self.source = source |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
60 self.filename = filename |
1 | 61 |
62 # Setup the Expat parser | |
63 parser = expat.ParserCreate('utf-8', '}') | |
64 parser.buffer_text = True | |
65 parser.returns_unicode = True | |
66 parser.StartElementHandler = self._handle_start | |
67 parser.EndElementHandler = self._handle_end | |
68 parser.CharacterDataHandler = self._handle_data | |
69 parser.StartDoctypeDeclHandler = self._handle_doctype | |
70 parser.StartNamespaceDeclHandler = self._handle_start_ns | |
71 parser.EndNamespaceDeclHandler = self._handle_end_ns | |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
72 parser.StartCdataSectionHandler = self._handle_start_cdata |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
73 parser.EndCdataSectionHandler = self._handle_end_cdata |
1 | 74 parser.ProcessingInstructionHandler = self._handle_pi |
75 parser.CommentHandler = self._handle_comment | |
76 parser.DefaultHandler = self._handle_other | |
77 | |
78 # Location reporting is only support in Python >= 2.4 | |
79 if not hasattr(parser, 'CurrentLineNumber'): | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
80 self._getpos = self._getpos_unknown |
1 | 81 |
82 self.expat = parser | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
83 self._queue = [] |
1 | 84 |
85 def __iter__(self): | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
86 try: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
87 bufsize = 4 * 1024 # 4K |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
88 done = False |
69 | 89 while 1: |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
90 while not done and len(self._queue) == 0: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
91 data = self.source.read(bufsize) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
92 if data == '': # end of data |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
93 if hasattr(self, 'expat'): |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
94 self.expat.Parse('', True) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
95 del self.expat # get rid of circular references |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
96 done = True |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
97 else: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
98 self.expat.Parse(data, False) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
99 for event in self._queue: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
100 yield event |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
101 self._queue = [] |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
102 if done: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
103 break |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
104 except expat.ExpatError, e: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
105 msg = str(e) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
106 if self.filename: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
107 msg += ', in ' + self.filename |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
108 raise ParseError(msg, self.filename, e.lineno, e.offset) |
1 | 109 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
110 def _enqueue(self, kind, data=None, pos=None): |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
111 if pos is None: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
112 pos = self._getpos() |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
113 if kind is Stream.TEXT: |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
114 # Expat reports the *end* of the text event as current position. We |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
115 # try to fix that up here as much as possible. Unfortunately, the |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
116 # offset is only valid for single-line text. For multi-line text, |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
117 # it is apparently not possible to determine at what offset it |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
118 # started |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
119 if '\n' in data: |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
120 lines = data.splitlines() |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
121 lineno = pos[1] - len(lines) + 1 |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
122 offset = -1 |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
123 else: |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
124 lineno = pos[1] |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
125 offset = pos[2] - len(data) |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
126 pos = (pos[0], lineno, offset) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
127 self._queue.append((kind, data, pos)) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
128 |
1 | 129 def _getpos_unknown(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
130 return (self.filename, -1, -1) |
1 | 131 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
132 def _getpos(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
133 return (self.filename, self.expat.CurrentLineNumber, |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
134 self.expat.CurrentColumnNumber) |
1 | 135 |
136 def _handle_start(self, tag, attrib): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
137 self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items()))) |
1 | 138 |
139 def _handle_end(self, tag): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
140 self._enqueue(Stream.END, QName(tag)) |
1 | 141 |
142 def _handle_data(self, text): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
143 self._enqueue(Stream.TEXT, text) |
1 | 144 |
145 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
146 self._enqueue(Stream.DOCTYPE, (name, pubid, sysid)) |
1 | 147 |
148 def _handle_start_ns(self, prefix, uri): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
149 self._enqueue(Stream.START_NS, (prefix or '', uri)) |
1 | 150 |
151 def _handle_end_ns(self, prefix): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
152 self._enqueue(Stream.END_NS, prefix or '') |
1 | 153 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
154 def _handle_start_cdata(self): |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
155 self._enqueue(Stream.START_CDATA) |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
156 |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
157 def _handle_end_cdata(self): |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
158 self._enqueue(Stream.END_CDATA) |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
159 |
1 | 160 def _handle_pi(self, target, data): |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
161 self._enqueue(Stream.PI, (target, data)) |
1 | 162 |
163 def _handle_comment(self, text): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
164 self._enqueue(Stream.COMMENT, text) |
1 | 165 |
166 def _handle_other(self, text): | |
167 if text.startswith('&'): | |
168 # deal with undefined entities | |
169 try: | |
170 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
171 self._enqueue(Stream.TEXT, text) |
1 | 172 except KeyError: |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
173 lineno, offset = self._getpos() |
1 | 174 raise expat.error("undefined entity %s: line %d, column %d" % |
175 (text, lineno, offset)) | |
176 | |
177 | |
178 def XML(text): | |
179 return Stream(list(XMLParser(StringIO(text)))) | |
180 | |
181 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
182 class HTMLParser(html.HTMLParser, object): |
1 | 183 """Parser for HTML input based on the Python `HTMLParser` module. |
184 | |
185 This class provides the same interface for generating stream events as | |
186 `XMLParser`, and attempts to automatically balance tags. | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
187 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
188 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
189 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
190 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
191 >>> for kind, data, pos in parser: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
192 ... print kind, data |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
193 START (u'ul', [(u'compact', u'compact')]) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
194 START (u'li', []) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
195 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
196 END li |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
197 END ul |
1 | 198 """ |
199 | |
200 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
201 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
202 'param']) | |
203 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
204 def __init__(self, source, filename=None): |
1 | 205 html.HTMLParser.__init__(self) |
206 self.source = source | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
207 self.filename = filename |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
208 self._queue = [] |
1 | 209 self._open_tags = [] |
210 | |
211 def __iter__(self): | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
212 try: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
213 bufsize = 4 * 1024 # 4K |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
214 done = False |
69 | 215 while 1: |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
216 while not done and len(self._queue) == 0: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
217 data = self.source.read(bufsize) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
218 if data == '': # end of data |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
219 self.close() |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
220 done = True |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
221 else: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
222 self.feed(data) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
223 for kind, data, pos in self._queue: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
224 yield kind, data, pos |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
225 self._queue = [] |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
226 if done: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
227 open_tags = self._open_tags |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
228 open_tags.reverse() |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
229 for tag in open_tags: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
230 yield Stream.END, QName(tag), pos |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
231 break |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
232 except html.HTMLParseError, e: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
233 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
234 if self.filename: |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
235 msg += ', in %s' % self.filename |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
236 raise ParseError(msg, self.filename, e.lineno, e.offset) |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
237 |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
238 def _enqueue(self, kind, data, pos=None): |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
239 if pos is None: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
240 pos = self._getpos() |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
241 self._queue.append((kind, data, pos)) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
242 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
243 def _getpos(self): |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
244 lineno, column = self.getpos() |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
245 return (self.filename, lineno, column) |
1 | 246 |
247 def handle_starttag(self, tag, attrib): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
248 fixed_attrib = [] |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
249 for name, value in attrib: # Fixup minimized attributes |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
250 if value is None: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
251 value = name |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
252 fixed_attrib.append((name, unicode(value))) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
253 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
254 self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib))) |
1 | 255 if tag in self._EMPTY_ELEMS: |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
256 self._enqueue(Stream.END, QName(tag)) |
1 | 257 else: |
258 self._open_tags.append(tag) | |
259 | |
260 def handle_endtag(self, tag): | |
261 if tag not in self._EMPTY_ELEMS: | |
262 while self._open_tags: | |
263 open_tag = self._open_tags.pop() | |
264 if open_tag.lower() == tag.lower(): | |
265 break | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
266 self._enqueue(Stream.END, QName(open_tag)) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
267 self._enqueue(Stream.END, QName(tag)) |
1 | 268 |
269 def handle_data(self, text): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
270 self._enqueue(Stream.TEXT, text) |
1 | 271 |
272 def handle_charref(self, name): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
273 self._enqueue(Stream.TEXT, Markup('&#%s;' % name)) |
1 | 274 |
275 def handle_entityref(self, name): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
276 self._enqueue(Stream.TEXT, Markup('&%s;' % name)) |
1 | 277 |
278 def handle_pi(self, data): | |
279 target, data = data.split(maxsplit=1) | |
280 data = data.rstrip('?') | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
281 self._enqueue(Stream.PI, (target.strip(), data.strip())) |
1 | 282 |
283 def handle_comment(self, text): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
284 self._enqueue(Stream.COMMENT, text) |
1 | 285 |
286 | |
287 def HTML(text): | |
288 return Stream(list(HTMLParser(StringIO(text)))) |