annotate markup/input.py @ 143:3d4c214c979a trunk

CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
author cmlenz
date Fri, 11 Aug 2006 14:08:13 +0000
parents c1f4390d50f8
children d1ce85a7f296
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
3 # Copyright (C) 2006 Edgewall Software
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
8 # are also available at http://markup.edgewall.org/wiki/License.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
66
59eb24184e9c Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents: 27
diff changeset
12 # history and logs, available at http://markup.edgewall.org/log/.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
14 from xml.parsers import expat
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
15 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
16 frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
17 except NameError:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
18 from sets import ImmutableSet as frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
19 import HTMLParser as html
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
20 import htmlentitydefs
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
21 from StringIO import StringIO
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
22
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
23 from markup.core import Attributes, Markup, QName, Stream
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
24
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
25
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
class ParseError(Exception):
    """Error signalling a fatal syntax problem in the input being parsed.

    In addition to the message, an instance records where the problem was
    found in its `filename`, `lineno` and `offset` attributes.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the error.

        @param message: the text describing the problem
        @param filename: the name of the input, `'<string>'` by default
        @param lineno: the line of the problem, `-1` by default
        @param offset: the column of the problem, `-1` by default
        """
        Exception.__init__(self, message)
        self.filename, self.lineno, self.offset = filename, lineno, offset
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
35
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
36
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
37 class XMLParser(object):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
38 """Generator-based XML parser based on roughly equivalent code in
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
39 Kid/ElementTree.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
40
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
41 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
42
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
43 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
44 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
45 ... print kind, data
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
46 START (u'root', [(u'id', u'2')])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
47 START (u'child', [])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
48 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
49 END child
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
50 END root
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
51 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
52
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
53 def __init__(self, source, filename=None):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
54 """Initialize the parser for the given XML text.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
55
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
56 @param source: the XML text as a file-like object
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
57 @param filename: the name of the file, if appropriate
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
58 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
59 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
60 self.filename = filename
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
61
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
62 # Setup the Expat parser
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
63 parser = expat.ParserCreate('utf-8', '}')
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
64 parser.buffer_text = True
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
65 parser.returns_unicode = True
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
66 parser.StartElementHandler = self._handle_start
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
67 parser.EndElementHandler = self._handle_end
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
68 parser.CharacterDataHandler = self._handle_data
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
69 parser.StartDoctypeDeclHandler = self._handle_doctype
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
70 parser.StartNamespaceDeclHandler = self._handle_start_ns
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
71 parser.EndNamespaceDeclHandler = self._handle_end_ns
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
72 parser.StartCdataSectionHandler = self._handle_start_cdata
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
73 parser.EndCdataSectionHandler = self._handle_end_cdata
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
74 parser.ProcessingInstructionHandler = self._handle_pi
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
75 parser.CommentHandler = self._handle_comment
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
76 parser.DefaultHandler = self._handle_other
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
77
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
78 # Location reporting is only support in Python >= 2.4
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
79 if not hasattr(parser, 'CurrentLineNumber'):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
80 self._getpos = self._getpos_unknown
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
81
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
82 self.expat = parser
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
83 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
84
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
85 def __iter__(self):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
86 try:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
87 bufsize = 4 * 1024 # 4K
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
88 done = False
69
c40a5dcd2b55 A couple of minor performance improvements.
cmlenz
parents: 66
diff changeset
89 while 1:
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
90 while not done and len(self._queue) == 0:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
91 data = self.source.read(bufsize)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
92 if data == '': # end of data
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
93 if hasattr(self, 'expat'):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
94 self.expat.Parse('', True)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
95 del self.expat # get rid of circular references
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
96 done = True
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
97 else:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
98 self.expat.Parse(data, False)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
99 for event in self._queue:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
100 yield event
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
101 self._queue = []
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
102 if done:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
103 break
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
104 except expat.ExpatError, e:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
105 msg = str(e)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
106 if self.filename:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
107 msg += ', in ' + self.filename
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
108 raise ParseError(msg, self.filename, e.lineno, e.offset)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
109
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
110 def _enqueue(self, kind, data=None, pos=None):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
111 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
112 pos = self._getpos()
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
113 if kind is Stream.TEXT:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
114 # Expat reports the *end* of the text event as current position. We
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
115 # try to fix that up here as much as possible. Unfortunately, the
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
116 # offset is only valid for single-line text. For multi-line text,
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
117 # it is apparently not possible to determine at what offset it
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
118 # started
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
119 if '\n' in data:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
120 lines = data.splitlines()
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
121 lineno = pos[1] - len(lines) + 1
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
122 offset = -1
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
123 else:
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
124 lineno = pos[1]
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
125 offset = pos[2] - len(data)
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
126 pos = (pos[0], lineno, offset)
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
127 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
128
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
129 def _getpos_unknown(self):
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
130 return (self.filename, -1, -1)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
131
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
132 def _getpos(self):
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 69
diff changeset
133 return (self.filename, self.expat.CurrentLineNumber,
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
134 self.expat.CurrentColumnNumber)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
135
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
136 def _handle_start(self, tag, attrib):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
137 self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
138
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
139 def _handle_end(self, tag):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
140 self._enqueue(Stream.END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
141
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
142 def _handle_data(self, text):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
143 self._enqueue(Stream.TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
144
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
145 def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
146 self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
147
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
148 def _handle_start_ns(self, prefix, uri):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
149 self._enqueue(Stream.START_NS, (prefix or '', uri))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
150
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
151 def _handle_end_ns(self, prefix):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
152 self._enqueue(Stream.END_NS, prefix or '')
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
153
143
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
154 def _handle_start_cdata(self):
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
155 self._enqueue(Stream.START_CDATA)
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
156
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
157 def _handle_end_cdata(self):
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
158 self._enqueue(Stream.END_CDATA)
3d4c214c979a CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents: 140
diff changeset
159
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
160 def _handle_pi(self, target, data):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
161 self._enqueue(Stream.PI, (target, data))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
162
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
163 def _handle_comment(self, text):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
164 self._enqueue(Stream.COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
165
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
166 def _handle_other(self, text):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
167 if text.startswith('&'):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
168 # deal with undefined entities
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
169 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
170 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
171 self._enqueue(Stream.TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
172 except KeyError:
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
173 lineno, offset = self._getpos()
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
174 raise expat.error("undefined entity %s: line %d, column %d" %
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
175 (text, lineno, offset))
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
176
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
177
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the given XML text and return the result as a `Stream`.

    The text is wrapped in a `StringIO` and run through `XMLParser`; all
    events are collected into a list before the stream is created.

    @param text: the XML text as a string
    """
    events = list(XMLParser(StringIO(text)))
    return Stream(events)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
180
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
181
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
182 class HTMLParser(html.HTMLParser, object):
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
183 """Parser for HTML input based on the Python `HTMLParser` module.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
184
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
185 This class provides the same interface for generating stream events as
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
186 `XMLParser`, and attempts to automatically balance tags.
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
187
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
188 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
189
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
190 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
191 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
192 ... print kind, data
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
193 START (u'ul', [(u'compact', u'compact')])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
194 START (u'li', [])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
195 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
196 END li
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
197 END ul
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
198 """
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
199
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
200 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
201 'hr', 'img', 'input', 'isindex', 'link', 'meta',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
202 'param'])
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
203
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
def __init__(self, source, filename=None):
    """Initialize the parser for the given HTML input.

    @param source: the object the HTML text is read from, via `read()`
    @param filename: the name of the file, if appropriate
    """
    html.HTMLParser.__init__(self)
    # Pending events, and the tags consulted once the input is exhausted
    # (presumably the elements that are still open -- confirm in __iter__)
    self._queue, self._open_tags = [], []
    self.source, self.filename = source, filename
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
210
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def __iter__(self):
    """Parse the source incrementally, yielding `(kind, data, pos)` events.

    The source is read in 4K chunks that are fed to the underlying parser;
    the `handle_*` callbacks fill `self._queue`, which is drained after
    every refill. Once the source is exhausted, a `Stream.END` event is
    generated for every element that is still open.

    An `html.HTMLParseError` raised along the way is re-raised as a
    `ParseError` whose message carries the line, the column and (when
    known) the filename.
    """
    try:
        bufsize = 4 * 1024 # 4K
        done = False
        while 1:
            # Keep reading until the callbacks have produced at least one
            # event, or there is nothing left to read.
            while not done and len(self._queue) == 0:
                data = self.source.read(bufsize)
                if data == '': # end of data
                    self.close()
                    done = True
                else:
                    self.feed(data)
            # NOTE: this loop rebinds `data`, and `pos` stays bound after
            # the loop ends; the closing logic below depends on the latter.
            for kind, data, pos in self._queue:
                yield kind, data, pos
            self._queue = []
            if done:
                # Close the elements left open, innermost first. The list
                # is reversed in place, so `self._open_tags` is reversed
                # as well.
                open_tags = self._open_tags
                open_tags.reverse()
                for tag in open_tags:
                    # `pos` is the position of the last event yielded above.
                    yield Stream.END, QName(tag), pos
                break
    except html.HTMLParseError, e:
        msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
        if self.filename:
            msg += ', in %s' % self.filename
        raise ParseError(msg, self.filename, e.lineno, e.offset)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
237
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
def _enqueue(self, kind, data, pos=None):
    """Append an event to the queue of pending events.

    :param kind: the event kind
    :param data: the payload of the event
    :param pos: the position to record for the event; when `None`, the
                current position reported by `_getpos()` is used
    """
    location = pos
    if location is None:
        location = self._getpos()
    event = (kind, data, location)
    self._queue.append(event)
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
242
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
def _getpos(self):
    """Return the current position as a `(filename, lineno, column)` tuple.

    The line and column come from the `getpos()` method of the parser; the
    filename is the one given to the constructor and may be `None`.
    """
    line, offset = self.getpos()
    location = (self.filename, line, offset)
    return location
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
246
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_starttag(self, tag, attrib):
    """Queue a `Stream.START` event for an opening tag.

    Minimized attributes (those given without a value) receive their own
    name as value, and every attribute value is converted to `unicode`.
    Elements listed in `_EMPTY_ELEMS` are closed immediately with a
    matching `Stream.END` event; any other element is remembered as open.

    :param tag: the name of the element
    :param attrib: a sequence of `(name, value)` pairs, where the value is
                   `None` for minimized attributes
    """
    normalized = []
    for attr_name, attr_value in attrib:
        if attr_value is not None:
            text = attr_value
        else:
            # Fixup minimized attributes
            text = attr_name
        normalized.append((attr_name, unicode(text)))

    self._enqueue(Stream.START, (QName(tag), Attributes(normalized)))
    if tag not in self._EMPTY_ELEMS:
        self._open_tags.append(tag)
        return
    self._enqueue(Stream.END, QName(tag))
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
259
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_endtag(self, tag):
    """Queue the `Stream.END` event(s) triggered by a closing tag.

    End tags of elements listed in `_EMPTY_ELEMS` are ignored. Otherwise
    open elements are popped until one matches `tag` (compared without
    regard to case); each element popped before the match gets its own
    `Stream.END` event, and finally one is queued for `tag` itself.

    :param tag: the name of the element being closed
    """
    if tag in self._EMPTY_ELEMS:
        return
    pending = self._open_tags
    while pending:
        candidate = pending.pop()
        if candidate.lower() == tag.lower():
            break
        # An element nested inside `tag` was never closed explicitly.
        self._enqueue(Stream.END, QName(candidate))
    self._enqueue(Stream.END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
268
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_data(self, text):
    """Queue a run of character data as a `Stream.TEXT` event.

    :param text: the character data reported by the parser
    """
    kind = Stream.TEXT
    self._enqueue(kind, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
271
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_charref(self, name):
    """Queue a numeric character reference as a `Stream.TEXT` event.

    The reference is written back out in its ``&#...;`` form and wrapped
    in `Markup` rather than being resolved to a character.

    :param name: the text found between ``&#`` and ``;``
    """
    reference = '&#%s;' % name
    self._enqueue(Stream.TEXT, Markup(reference))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
274
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_entityref(self, name):
    """Queue a named entity reference as a `Stream.TEXT` event.

    The reference is written back out in its ``&...;`` form and wrapped
    in `Markup` rather than being resolved to a character.

    :param name: the name of the entity, without ``&`` and ``;``
    """
    reference = '&%s;' % name
    self._enqueue(Stream.TEXT, Markup(reference))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
277
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_pi(self, data):
    """Queue a processing instruction as a `Stream.PI` event.

    The payload of the event is a `(target, data)` tuple of stripped
    strings. The data part is empty when the instruction consists of a
    target only (for example ``<?foo?>``), and both parts are empty when
    the instruction has no content at all.

    :param data: the text the parser found between ``<?`` and ``>``; for
                 instructions closed with ``?>`` it still ends with the
                 ``?`` of the terminator
    """
    # Only one trailing '?' belongs to the '?>' terminator; any further
    # question marks are part of the instruction's own content.
    if data.endswith('?'):
        data = data[:-1]
    # The arguments must be positional: on Python 2 `str.split` rejects
    # keyword arguments with a TypeError.
    parts = data.split(None, 1)
    target = ''
    rest = ''
    if parts:
        target = parts[0]
        if len(parts) > 1:
            rest = parts[1]
    self._enqueue(Stream.PI, (target.strip(), rest.strip()))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
282
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def handle_comment(self, text):
    """Queue a comment as a `Stream.COMMENT` event.

    :param text: the content of the comment as reported by the parser
    """
    kind = Stream.COMMENT
    self._enqueue(kind, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
285
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
286
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def HTML(text):
    """Parse a string of HTML and return the events as a `Stream`.

    The whole input is parsed eagerly: all events are collected in a list
    before the stream is created.

    :param text: the HTML source as a string
    """
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
Copyright (C) 2012-2017 Edgewall Software