Mercurial > genshi > mirror
annotate markup/input.py @ 210:9fd7535883f2 trunk
Fix regression introduced in [258]. More fixes needed?
author | cmlenz |
---|---|
date | Tue, 29 Aug 2006 17:35:32 +0000 |
parents | fc6b2fb66518 |
children | 13d2d4420628 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
8 # are also available at http://markup.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
12 # history and logs, available at http://markup.edgewall.org/log/. |
1 | 13 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
14 from itertools import chain |
1 | 15 from xml.parsers import expat |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import HTMLParser as html | |
21 import htmlentitydefs | |
22 from StringIO import StringIO | |
23 | |
182
2f30ce3fb85e
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
24 from markup.core import Attrs, QName, Stream |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
25 from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
26 START_CDATA, END_CDATA, PI, COMMENT |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
27 |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
28 __all__ = ['ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
1 | 29 |
30 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed.

    Besides the message, the exception carries the location of the error in
    the attributes `filename`, `lineno` and `offset`.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message
        @param filename: the name of the input the error was found in; any
            false value (such as `None`) is replaced by `'<string>'`
        @param lineno: the line number of the error, or -1 if unknown
        @param offset: the column of the error, or -1 if unknown
        """
        Exception.__init__(self, message)
        # `XMLParser.parse` passes its own `filename` attribute through, and
        # that is `None` when the parser was created without a file name.
        # Fall back to the documented default instead of storing `None`.
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
40 |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
41 |
1 | 42 class XMLParser(object): |
43 """Generator-based XML parser based on roughly equivalent code in | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
44 Kid/ElementTree. |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
45 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
46 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
47 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
48 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
49 >>> for kind, data, pos in parser: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
50 ... print kind, data |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
51 START (u'root', [(u'id', u'2')]) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
52 START (u'child', []) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
53 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
54 END child |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
55 END root |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
56 """ |
1 | 57 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
58 def __init__(self, source, filename=None): |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
59 """Initialize the parser for the given XML text. |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
60 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
61 @param source: the XML text as a file-like object |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
62 @param filename: the name of the file, if appropriate |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
63 """ |
1 | 64 self.source = source |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
65 self.filename = filename |
1 | 66 |
67 # Setup the Expat parser | |
68 parser = expat.ParserCreate('utf-8', '}') | |
69 parser.buffer_text = True | |
70 parser.returns_unicode = True | |
160 | 71 parser.ordered_attributes = True |
72 | |
1 | 73 parser.StartElementHandler = self._handle_start |
74 parser.EndElementHandler = self._handle_end | |
75 parser.CharacterDataHandler = self._handle_data | |
76 parser.StartDoctypeDeclHandler = self._handle_doctype | |
77 parser.StartNamespaceDeclHandler = self._handle_start_ns | |
78 parser.EndNamespaceDeclHandler = self._handle_end_ns | |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
79 parser.StartCdataSectionHandler = self._handle_start_cdata |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
80 parser.EndCdataSectionHandler = self._handle_end_cdata |
1 | 81 parser.ProcessingInstructionHandler = self._handle_pi |
82 parser.CommentHandler = self._handle_comment | |
209
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
83 |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
84 # Tell Expat that we'll handle non-XML entities ourselves |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
85 # (in _handle_other) |
1 | 86 parser.DefaultHandler = self._handle_other |
209
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
87 parser.UseForeignDTD() |
1 | 88 |
89 # Location reporting is only support in Python >= 2.4 | |
90 if not hasattr(parser, 'CurrentLineNumber'): | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
91 self._getpos = self._getpos_unknown |
1 | 92 |
93 self.expat = parser | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
94 self._queue = [] |
1 | 95 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
96 def parse(self): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
97 def _generate(): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
98 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
99 bufsize = 4 * 1024 # 4K |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
100 done = False |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
101 while 1: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
102 while not done and len(self._queue) == 0: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
103 data = self.source.read(bufsize) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
104 if data == '': # end of data |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
105 if hasattr(self, 'expat'): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
106 self.expat.Parse('', True) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
107 del self.expat # get rid of circular references |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
108 done = True |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
109 else: |
207
28bfc6aafab7
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
110 if isinstance(data, unicode): |
28bfc6aafab7
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
111 data = data.encode('utf-8') |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
112 self.expat.Parse(data, False) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
113 for event in self._queue: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
114 yield event |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
115 self._queue = [] |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
116 if done: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
117 break |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
118 except expat.ExpatError, e: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
119 msg = str(e) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
120 if self.filename: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
121 msg += ', in ' + self.filename |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
122 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 123 return Stream(_generate()).filter(_coalesce) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
124 |
1 | 125 def __iter__(self): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
126 return iter(self.parse()) |
1 | 127 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
128 def _enqueue(self, kind, data=None, pos=None): |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
129 if pos is None: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
130 pos = self._getpos() |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
131 if kind is TEXT: |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
132 # Expat reports the *end* of the text event as current position. We |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
133 # try to fix that up here as much as possible. Unfortunately, the |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
134 # offset is only valid for single-line text. For multi-line text, |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
135 # it is apparently not possible to determine at what offset it |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
136 # started |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
137 if '\n' in data: |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
138 lines = data.splitlines() |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
139 lineno = pos[1] - len(lines) + 1 |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
140 offset = -1 |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
141 else: |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
142 lineno = pos[1] |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
143 offset = pos[2] - len(data) |
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
144 pos = (pos[0], lineno, offset) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
145 self._queue.append((kind, data, pos)) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
146 |
1 | 147 def _getpos_unknown(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
148 return (self.filename, -1, -1) |
1 | 149 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
150 def _getpos(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
151 return (self.filename, self.expat.CurrentLineNumber, |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
152 self.expat.CurrentColumnNumber) |
1 | 153 |
154 def _handle_start(self, tag, attrib): | |
182
2f30ce3fb85e
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
155 self._enqueue(START, (QName(tag), Attrs(zip(*[iter(attrib)] * 2)))) |
1 | 156 |
157 def _handle_end(self, tag): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
158 self._enqueue(END, QName(tag)) |
1 | 159 |
160 def _handle_data(self, text): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
161 self._enqueue(TEXT, text) |
1 | 162 |
163 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
164 self._enqueue(DOCTYPE, (name, pubid, sysid)) |
1 | 165 |
166 def _handle_start_ns(self, prefix, uri): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
167 self._enqueue(START_NS, (prefix or '', uri)) |
1 | 168 |
169 def _handle_end_ns(self, prefix): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
170 self._enqueue(END_NS, prefix or '') |
1 | 171 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
172 def _handle_start_cdata(self): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
173 self._enqueue(START_CDATA) |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
174 |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
175 def _handle_end_cdata(self): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
176 self._enqueue(END_CDATA) |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
177 |
1 | 178 def _handle_pi(self, target, data): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
179 self._enqueue(PI, (target, data)) |
1 | 180 |
181 def _handle_comment(self, text): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
182 self._enqueue(COMMENT, text) |
1 | 183 |
184 def _handle_other(self, text): | |
185 if text.startswith('&'): | |
186 # deal with undefined entities | |
187 try: | |
188 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
189 self._enqueue(TEXT, text) |
1 | 190 except KeyError: |
209
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
191 filename, lineno, offset = self._getpos() |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
192 error = expat.error('undefined entity "%s": line %d, column %d' |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
193 % (text, lineno, offset)) |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
194 error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
195 error.lineno = lineno |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
196 error.offset = offset |
fc6b2fb66518
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
197 raise error |
1 | 198 |
199 | |
def XML(text):
    """Parse the given XML text and return the events as a `Stream`.

    The text is wrapped in a `StringIO` object and parsed with `XMLParser`.
    The events are collected into a list before the stream is created, so
    the complete text is parsed by the time this function returns.

    @param text: the XML text as a string
    @return: a `Stream` over the list of parsed events
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
202 | |
203 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
204 class HTMLParser(html.HTMLParser, object): |
1 | 205 """Parser for HTML input based on the Python `HTMLParser` module. |
206 | |
207 This class provides the same interface for generating stream events as | |
208 `XMLParser`, and attempts to automatically balance tags. | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
209 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
210 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
211 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
212 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
213 >>> for kind, data, pos in parser: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
214 ... print kind, data |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
215 START (u'ul', [(u'compact', u'compact')]) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
216 START (u'li', []) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
217 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
218 END li |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
219 END ul |
1 | 220 """ |
221 | |
222 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
223 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
224 'param']) | |
225 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
226 def __init__(self, source, filename=None): |
1 | 227 html.HTMLParser.__init__(self) |
228 self.source = source | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
229 self.filename = filename |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
230 self._queue = [] |
1 | 231 self._open_tags = [] |
232 | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
233 def parse(self): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
234 def _generate(): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
235 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
236 bufsize = 4 * 1024 # 4K |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
237 done = False |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
238 while 1: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
239 while not done and len(self._queue) == 0: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
240 data = self.source.read(bufsize) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
241 if data == '': # end of data |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
242 self.close() |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
243 done = True |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
244 else: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
245 self.feed(data) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
246 for kind, data, pos in self._queue: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
247 yield kind, data, pos |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
248 self._queue = [] |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
249 if done: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
250 open_tags = self._open_tags |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
251 open_tags.reverse() |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
252 for tag in open_tags: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
253 yield END, QName(tag), pos |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
254 break |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
255 except html.HTMLParseError, e: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
256 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
257 if self.filename: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
258 msg += ', in %s' % self.filename |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
259 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 260 return Stream(_generate()).filter(_coalesce) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
261 |
def __iter__(self):
    """Iterate over the events produced by `parse()`."""
    stream = self.parse()
    return iter(stream)
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
264 |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
def _enqueue(self, kind, data, pos=None):
    """Append an event to the queue of pending events.

    When no position is given, the current position as reported by
    `_getpos()` is used.
    """
    where = pos
    if where is None:
        where = self._getpos()
    event = (kind, data, where)
    self._queue.append(event)
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
269 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
def _getpos(self):
    """Return the current position as a `(filename, lineno, column)` tuple.

    The line and column come from the `getpos()` method of the parser.
    """
    position = self.getpos()
    line, col = position
    return self.filename, line, col
1 | 273 |
def handle_starttag(self, tag, attrib):
    """Queue a START event for an opening tag.

    Minimized attributes (those reported with a value of `None`) get
    their own name as value, and every value is converted to unicode.
    Elements listed in `_EMPTY_ELEMS` are closed immediately with an END
    event; all others are remembered as open.
    """
    pairs = []
    for name, value in attrib:
        if value is None: # Fixup minimized attributes
            pairs.append((name, unicode(name)))
        else:
            pairs.append((name, unicode(value)))

    self._enqueue(START, (QName(tag), Attrs(pairs)))
    if tag not in self._EMPTY_ELEMS:
        self._open_tags.append(tag)
    else:
        self._enqueue(END, QName(tag))
286 | |
def handle_endtag(self, tag):
    """Queue END events for a closing tag.

    Elements listed in `_EMPTY_ELEMS` are ignored here because their END
    event was already queued together with the START event. For any
    other tag, the open elements are closed innermost-first up to and
    including the nearest element with the same (case-insensitive) name.

    A closing tag that matches no open element is ignored, so that a
    stray end tag can neither close unrelated elements nor produce an END
    event that has no corresponding START event.
    """
    if tag in self._EMPTY_ELEMS:
        return

    wanted = tag.lower()
    if wanted not in [name.lower() for name in self._open_tags]:
        # Stray end tag: nothing to close.
        return

    while self._open_tags:
        open_tag = self._open_tags.pop()
        self._enqueue(END, QName(open_tag))
        if open_tag.lower() == wanted:
            break
1 | 295 |
def handle_data(self, text):
    """Queue a run of character data as a TEXT event."""
    kind, data = TEXT, text
    self._enqueue(kind, data)
1 | 298 |
def handle_charref(self, name):
    """Queue the character for a numeric character reference as TEXT.

    `name` is the reference without the leading ``&#`` and the trailing
    ``;``. It is either a decimal number (``8364``) or a hexadecimal
    number prefixed with ``x`` or ``X`` (``x20AC``).

    A reference that cannot be converted to a character is passed through
    as literal text, in the same way `handle_entityref` treats unknown
    entity names.
    """
    try:
        if name[:1] in ('x', 'X'):
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        text = unichr(codepoint)
    except (ValueError, OverflowError):
        # Malformed number, or a code point outside the supported range.
        text = '&#%s;' % name
    self._enqueue(TEXT, text)
1 | 302 |
def handle_entityref(self, name):
    """Queue the character for a named entity reference as TEXT.

    Names that are not found in `htmlentitydefs.name2codepoint` are
    passed through as the literal ``&name;`` text.
    """
    codepoint = htmlentitydefs.name2codepoint.get(name)
    if codepoint is None:
        text = '&%s;' % name
    else:
        text = unichr(codepoint)
    self._enqueue(TEXT, text)
1 | 309 |
def handle_pi(self, data):
    """Queue a PI event for a processing instruction.

    `data` is the text of the processing instruction as passed in by the
    parser. Trailing ``?`` characters are removed, and the remainder is
    split at the first run of whitespace into the target and its data.
    A processing instruction that consists of a target only yields an
    empty data string.
    """
    # `str.split` takes positional arguments only on Python 2, so the
    # maximum number of splits cannot be passed as `maxsplit=1`.
    fields = data.rstrip('?').split(None, 1)
    if len(fields) == 2:
        target, body = fields
    elif fields:
        target, body = fields[0], ''
    else:
        target, body = '', ''
    self._enqueue(PI, (target.strip(), body.strip()))
1 | 314 |
def handle_comment(self, text):
    """Queue the text of a comment as a COMMENT event."""
    kind, data = COMMENT, text
    self._enqueue(kind, data)
1 | 317 |
318 | |
def HTML(text):
    """Parse the given HTML source string and return it as a `Stream`.

    The string is wrapped in a `StringIO` object and parsed with
    `HTMLParser`; all events are collected into a list before the stream
    is created.
    """
    source = StringIO(text)
    events = list(HTMLParser(source))
    return Stream(events)
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
321 |
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    The merged event carries the position of the first TEXT event of the
    run that has a position. Events of any other kind are passed through
    unchanged, except that events with a false kind are dropped.
    """
    pending = []
    start = None
    for kind, data, pos in stream:
        if kind is TEXT:
            pending.append(data)
            if start is None:
                start = pos
            continue
        if pending:
            # A non-text event ends the current run of text.
            yield TEXT, u''.join(pending), start
            pending = []
            start = None
        if kind:
            yield kind, data, pos
    if pending:
        # Flush text left over at the very end of the stream.
        yield TEXT, u''.join(pending), start