Mercurial > genshi > genshi-test
annotate markup/input.py @ 225:0edf663b97d6
support slices in expressions (fixes #51)
author | mgood |
---|---|
date | Thu, 07 Sep 2006 20:40:56 +0000 |
parents | bafa1cc49c2f |
children |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
14 from itertools import chain |
1 | 15 from xml.parsers import expat |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import HTMLParser as html | |
21 import htmlentitydefs | |
22 from StringIO import StringIO | |
23 | |
182
41db0260ebb1
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
24 from markup.core import Attrs, QName, Stream |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
25 from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
26 START_CDATA, END_CDATA, PI, COMMENT |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
27 |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
28 __all__ = ['ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
1 | 29 |
30 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
31 class ParseError(Exception): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
32 """Exception raised when fatal syntax errors are found in the input being |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
33 parsed.""" |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
34 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
35 def __init__(self, message, filename='<string>', lineno=-1, offset=-1): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
36 Exception.__init__(self, message) |
213 | 37 self.msg = message |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
38 self.filename = filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
39 self.lineno = lineno |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
40 self.offset = offset |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
41 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
42 |
1 | 43 class XMLParser(object): |
44 """Generator-based XML parser based on roughly equivalent code in | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
45 Kid/ElementTree. |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
46 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
47 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
48 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
49 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
50 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
51 ... print kind, data |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
52 START (u'root', [(u'id', u'2')]) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
53 START (u'child', []) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
54 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
55 END child |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
56 END root |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
57 """ |
1 | 58 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
59 def __init__(self, source, filename=None): |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
60 """Initialize the parser for the given XML text. |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
61 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
62 @param source: the XML text as a file-like object |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
63 @param filename: the name of the file, if appropriate |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
64 """ |
1 | 65 self.source = source |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
66 self.filename = filename |
1 | 67 |
68 # Set up the Expat parser | |
69 parser = expat.ParserCreate('utf-8', '}') | |
70 parser.buffer_text = True | |
71 parser.returns_unicode = True | |
160 | 72 parser.ordered_attributes = True |
73 | |
1 | 74 parser.StartElementHandler = self._handle_start |
75 parser.EndElementHandler = self._handle_end | |
76 parser.CharacterDataHandler = self._handle_data | |
77 parser.StartDoctypeDeclHandler = self._handle_doctype | |
78 parser.StartNamespaceDeclHandler = self._handle_start_ns | |
79 parser.EndNamespaceDeclHandler = self._handle_end_ns | |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
80 parser.StartCdataSectionHandler = self._handle_start_cdata |
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
81 parser.EndCdataSectionHandler = self._handle_end_cdata |
1 | 82 parser.ProcessingInstructionHandler = self._handle_pi |
83 parser.CommentHandler = self._handle_comment | |
209
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
84 |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
85 # Tell Expat that we'll handle non-XML entities ourselves |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
86 # (in _handle_other) |
1 | 87 parser.DefaultHandler = self._handle_other |
209
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
88 parser.UseForeignDTD() |
1 | 89 |
90 # Location reporting is only supported in Python >= 2.4 | |
91 if not hasattr(parser, 'CurrentLineNumber'): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
92 self._getpos = self._getpos_unknown |
1 | 93 |
94 self.expat = parser | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
95 self._queue = [] |
1 | 96 |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
97 def parse(self): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
98 def _generate(): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
99 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
100 bufsize = 4 * 1024 # 4K |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
101 done = False |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
102 while 1: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
103 while not done and len(self._queue) == 0: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
104 data = self.source.read(bufsize) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
105 if data == '': # end of data |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
106 if hasattr(self, 'expat'): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
107 self.expat.Parse('', True) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
108 del self.expat # get rid of circular references |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
109 done = True |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
110 else: |
207
0619a27f5e67
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
111 if isinstance(data, unicode): |
0619a27f5e67
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
112 data = data.encode('utf-8') |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
113 self.expat.Parse(data, False) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
114 for event in self._queue: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
115 yield event |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
116 self._queue = [] |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
117 if done: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
118 break |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
119 except expat.ExpatError, e: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
120 msg = str(e) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
121 if self.filename: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
122 msg += ', in ' + self.filename |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
123 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 124 return Stream(_generate()).filter(_coalesce) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
125 |
1 | 126 def __iter__(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
127 return iter(self.parse()) |
1 | 128 |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
129 def _enqueue(self, kind, data=None, pos=None): |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
130 if pos is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
131 pos = self._getpos() |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
132 if kind is TEXT: |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
133 # Expat reports the *end* of the text event as current position. We |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
134 # try to fix that up here as much as possible. Unfortunately, the |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
135 # offset is only valid for single-line text. For multi-line text, |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
136 # it is apparently not possible to determine at what offset it |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
137 # started |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
138 if '\n' in data: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
139 lines = data.splitlines() |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
140 lineno = pos[1] - len(lines) + 1 |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
141 offset = -1 |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
142 else: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
143 lineno = pos[1] |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
144 offset = pos[2] - len(data) |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
145 pos = (pos[0], lineno, offset) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
146 self._queue.append((kind, data, pos)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
147 |
1 | 148 def _getpos_unknown(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
149 return (self.filename, -1, -1) |
1 | 150 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
151 def _getpos(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
152 return (self.filename, self.expat.CurrentLineNumber, |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
153 self.expat.CurrentColumnNumber) |
1 | 154 |
155 def _handle_start(self, tag, attrib): | |
182
41db0260ebb1
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
156 self._enqueue(START, (QName(tag), Attrs(zip(*[iter(attrib)] * 2)))) |
1 | 157 |
158 def _handle_end(self, tag): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
159 self._enqueue(END, QName(tag)) |
1 | 160 |
161 def _handle_data(self, text): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
162 self._enqueue(TEXT, text) |
1 | 163 |
164 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
165 self._enqueue(DOCTYPE, (name, pubid, sysid)) |
1 | 166 |
167 def _handle_start_ns(self, prefix, uri): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
168 self._enqueue(START_NS, (prefix or '', uri)) |
1 | 169 |
170 def _handle_end_ns(self, prefix): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
171 self._enqueue(END_NS, prefix or '') |
1 | 172 |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
173 def _handle_start_cdata(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
174 self._enqueue(START_CDATA) |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
175 |
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
176 def _handle_end_cdata(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
177 self._enqueue(END_CDATA) |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
178 |
1 | 179 def _handle_pi(self, target, data): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
180 self._enqueue(PI, (target, data)) |
1 | 181 |
182 def _handle_comment(self, text): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
183 self._enqueue(COMMENT, text) |
1 | 184 |
185 def _handle_other(self, text): | |
186 if text.startswith('&'): | |
187 # deal with undefined entities | |
188 try: | |
189 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
190 self._enqueue(TEXT, text) |
1 | 191 except KeyError: |
209
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
192 filename, lineno, offset = self._getpos() |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
193 error = expat.error('undefined entity "%s": line %d, column %d' |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
194 % (text, lineno, offset)) |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
195 error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
196 error.lineno = lineno |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
197 error.offset = offset |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
198 raise error |
1 | 199 |
200 | |
201 def XML(text): | |
202 return Stream(list(XMLParser(StringIO(text)))) | |
203 | |
204 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
205 class HTMLParser(html.HTMLParser, object): |
1 | 206 """Parser for HTML input based on the Python `HTMLParser` module. |
207 | |
208 This class provides the same interface for generating stream events as | |
209 `XMLParser`, and attempts to automatically balance tags. | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
210 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
211 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
212 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
213 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
214 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
215 ... print kind, data |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
216 START (u'ul', [(u'compact', u'compact')]) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
217 START (u'li', []) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
218 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
219 END li |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
220 END ul |
1 | 221 """ |
222 | |
223 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
224 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
225 'param']) | |
226 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
227 def __init__(self, source, filename=None): |
1 | 228 html.HTMLParser.__init__(self) |
229 self.source = source | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
230 self.filename = filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
231 self._queue = [] |
1 | 232 self._open_tags = [] |
233 | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
234 def parse(self): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
235 def _generate(): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
236 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
237 bufsize = 4 * 1024 # 4K |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
238 done = False |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
239 while 1: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
240 while not done and len(self._queue) == 0: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
241 data = self.source.read(bufsize) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
242 if data == '': # end of data |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
243 self.close() |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
244 done = True |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
245 else: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
246 self.feed(data) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
247 for kind, data, pos in self._queue: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
248 yield kind, data, pos |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
249 self._queue = [] |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
250 if done: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
251 open_tags = self._open_tags |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
252 open_tags.reverse() |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
253 for tag in open_tags: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
254 yield END, QName(tag), pos |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
255 break |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
256 except html.HTMLParseError, e: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
257 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
258 if self.filename: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
259 msg += ', in %s' % self.filename |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
260 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 261 return Stream(_generate()).filter(_coalesce) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
262 |
def __iter__(self):
    """Iterate over the events produced by `parse()`."""
    stream = self.parse()
    return iter(stream)
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
265 |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
def _enqueue(self, kind, data, pos=None):
    """Append a ``(kind, data, pos)`` event to the pending event queue.

    If no `pos` is given, the current position as reported by
    `_getpos()` is used.
    """
    if pos is not None:
        where = pos
    else:
        where = self._getpos()
    event = (kind, data, where)
    self._queue.append(event)
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
270 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
def _getpos(self):
    """Return the current position as ``(filename, lineno, column)``.

    The line and column are taken from `getpos()`.
    """
    return (self.filename,) + tuple(self.getpos())
1 | 274 |
def handle_starttag(self, tag, attrib):
    """Queue a START event for an opening tag.

    `attrib` is a sequence of ``(name, value)`` pairs.  Minimized
    attributes (those whose value is ``None``) get their own name as
    value, and all values are converted to `unicode`.

    Tags listed in `_EMPTY_ELEMS` are closed immediately with an END
    event; every other tag is remembered as open.
    """
    normalized = []
    for name, value in attrib:
        if value is None:
            # Fixup minimized attributes
            value = name
        normalized.append((name, unicode(value)))

    self._enqueue(START, (QName(tag), Attrs(normalized)))
    if tag not in self._EMPTY_ELEMS:
        self._open_tags.append(tag)
        return
    self._enqueue(END, QName(tag))
287 | |
def handle_endtag(self, tag):
    """Queue the END events implied by a closing tag.

    End tags of elements listed in `_EMPTY_ELEMS` are ignored, because
    `handle_starttag` already closed those elements.  For any other tag,
    open tags are popped from the stack, innermost first, and an END
    event is queued for each of them, up to and including the first one
    whose name matches `tag` (compared case-insensitively).

    An END event is only ever queued for a tag that was actually on the
    open-tag stack, so a stray end tag can no longer add an END event
    that has no corresponding START event.
    """
    if tag in self._EMPTY_ELEMS:
        return
    wanted = tag.lower()
    while self._open_tags:
        open_tag = self._open_tags.pop()
        self._enqueue(END, QName(open_tag))
        if open_tag.lower() == wanted:
            break
1 | 296 |
def handle_data(self, text):
    """Queue a TEXT event carrying the given character data."""
    self._enqueue(TEXT, text, None)
1 | 299 |
def handle_charref(self, name):
    """Queue a TEXT event for a numeric character reference.

    `name` is the reference without the leading ``&#`` and the trailing
    ``;``.  Both decimal (``65``) and hexadecimal (``x41`` / ``X41``)
    forms are supported.  If the number does not denote a character that
    can be represented, the reference is passed through as literal text,
    the same way `handle_entityref` treats unknown entities.
    """
    try:
        if name.lower().startswith('x'):
            # Hexadecimal form: strip the 'x' marker and parse base 16.
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        text = unichr(codepoint)
    except (ValueError, OverflowError):
        # Malformed or out-of-range reference: keep it verbatim.
        text = '&#%s;' % name
    self._enqueue(TEXT, text)
1 | 303 |
def handle_entityref(self, name):
    """Queue a TEXT event for a named entity reference.

    Entities known to `htmlentitydefs` are replaced by the character
    they stand for; unknown entities are passed through literally as
    ``&name;``.
    """
    codepoint = htmlentitydefs.name2codepoint.get(name)
    if codepoint is None:
        text = '&%s;' % name
    else:
        text = unichr(codepoint)
    self._enqueue(TEXT, text)
1 | 310 |
def handle_pi(self, data):
    """Queue a PI event for a processing instruction.

    `data` is the text of the processing instruction; any trailing
    ``?`` characters are removed.  The first whitespace-separated word
    is the target and the remainder is the PI data; the event data is
    the tuple ``(target, data)`` with both parts stripped of surrounding
    whitespace.  A processing instruction that consists of a target only
    yields an empty data string.
    """
    # `str.split` does not accept keyword arguments in Python 2, so the
    # separator and the split limit are passed positionally.
    parts = data.rstrip('?').split(None, 1)
    if len(parts) == 2:
        target, content = parts
    elif parts:
        # PI with no data
        target, content = parts[0], ''
    else:
        target = content = ''
    self._enqueue(PI, (target.strip(), content.strip()))
1 | 315 |
def handle_comment(self, text):
    """Queue a COMMENT event carrying the text of the comment."""
    self._enqueue(COMMENT, text, None)
1 | 318 |
319 | |
def HTML(text):
    """Parse the given HTML string and return it as a `Stream`.

    The text is parsed completely up front; the returned stream wraps
    the resulting list of events.
    """
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
322 |
def _coalesce(stream):
    """Merge runs of adjacent TEXT events into one TEXT event each.

    The merged event carries the position of the first event of the run.
    All other events are passed through unchanged.
    """
    pending = []
    start = None
    # The trailing dummy event makes sure that text collected at the very
    # end of the stream gets flushed; it is never yielded itself because
    # its kind is not true.
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if start is None:
                start = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), start
            pending = []
            start = None
        if kind:
            yield kind, data, pos