annotate markup/input.py @ 27:b4f78c05e5c9 trunk

* Fix the boilerplate in the Python source files. * Some more docstrings and cosmetic fixes.
author cmlenz
date Wed, 28 Jun 2006 09:28:09 +0000
parents 3c1a022be04c
children 59eb24184e9c
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
3 # Copyright (C) 2006 Christopher Lenz
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
27
b4f78c05e5c9 * Fix the boilerplate in the Python source files.
cmlenz
parents: 26
diff changeset
8 # are also available at http://markup.cmlenz.net/wiki/License.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
27
b4f78c05e5c9 * Fix the boilerplate in the Python source files.
cmlenz
parents: 26
diff changeset
12 # history and logs, available at http://markup.cmlenz.net/log/.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
14 from xml.parsers import expat
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
15 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
16 frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
17 except NameError:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
18 from sets import ImmutableSet as frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
19 import HTMLParser as html
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
20 import htmlentitydefs
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
21 from StringIO import StringIO
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
22
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
23 from markup.core import Attributes, Markup, QName, Stream
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
24
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
25
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
class ParseError(Exception):
    """Error signalling a fatal syntax problem in the markup being parsed.

    In addition to the message, instances expose the location that was
    reported for the error through the `filename`, `lineno` and `offset`
    attributes.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message
        @param filename: name of the input being parsed; defaults to
            `'<string>'`
        @param lineno: line number of the error; defaults to -1
        @param offset: column offset of the error; defaults to -1
        """
        # Explicit base-class call rather than `super()`, because `Exception`
        # is not a new-style class on older Python 2 releases.
        Exception.__init__(self, message)
        self.filename, self.lineno, self.offset = filename, lineno, offset
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
35
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
36
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
37 class XMLParser(object):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
38 """Generator-based XML parser based on roughly equivalent code in
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
39 Kid/ElementTree.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
40
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
41 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
42
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
43 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
44 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
45 ... print kind, data
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
46 START (u'root', [(u'id', u'2')])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
47 START (u'child', [])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
48 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
49 END child
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
50 END root
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
51 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
52
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
53 def __init__(self, source, filename=None):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
54 """Initialize the parser for the given XML text.
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
55
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
56 @param source: the XML text as a file-like object
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
57 @param filename: the name of the file, if appropriate
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
58 """
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
59 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
60 self.filename = filename
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
61
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
62 # Setup the Expat parser
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
63 parser = expat.ParserCreate('utf-8', '}')
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
64 parser.buffer_text = True
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
65 parser.returns_unicode = True
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
66 parser.StartElementHandler = self._handle_start
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
67 parser.EndElementHandler = self._handle_end
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
68 parser.CharacterDataHandler = self._handle_data
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
69 parser.XmlDeclHandler = self._handle_prolog
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
70 parser.StartDoctypeDeclHandler = self._handle_doctype
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
71 parser.StartNamespaceDeclHandler = self._handle_start_ns
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
72 parser.EndNamespaceDeclHandler = self._handle_end_ns
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
73 parser.ProcessingInstructionHandler = self._handle_pi
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
74 parser.CommentHandler = self._handle_comment
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
75 parser.DefaultHandler = self._handle_other
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
76
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
77 # Location reporting is only support in Python >= 2.4
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
78 if not hasattr(parser, 'CurrentLineNumber'):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
79 self._getpos = self._getpos_unknown
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
80
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
81 self.expat = parser
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
82 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
83
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
84 def __iter__(self):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
85 try:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
86 bufsize = 4 * 1024 # 4K
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
87 done = False
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
88 while True:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
89 while not done and len(self._queue) == 0:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
90 data = self.source.read(bufsize)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
91 if data == '': # end of data
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
92 if hasattr(self, 'expat'):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
93 self.expat.Parse('', True)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
94 del self.expat # get rid of circular references
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
95 done = True
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
96 else:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
97 self.expat.Parse(data, False)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
98 for event in self._queue:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
99 yield event
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
100 self._queue = []
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
101 if done:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
102 break
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
103 except expat.ExpatError, e:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
104 msg = str(e)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
105 if self.filename:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
106 msg += ', in ' + self.filename
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
107 raise ParseError(msg, self.filename, e.lineno, e.offset)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
108
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
109 def _enqueue(self, kind, data, pos=None):
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
110 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
111 pos = self._getpos()
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
112 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
113
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
114 def _getpos_unknown(self):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
115 return (self.filename or '<string>', -1, -1)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
116
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
117 def _getpos(self):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
118 return (self.filename or '<string>', self.expat.CurrentLineNumber,
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
119 self.expat.CurrentColumnNumber)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
120
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
121 def _handle_start(self, tag, attrib):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
122 self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
123
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
124 def _handle_end(self, tag):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
125 self._enqueue(Stream.END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
126
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
127 def _handle_data(self, text):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
128 self._enqueue(Stream.TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
129
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
130 def _handle_prolog(self, version, encoding, standalone):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
131 self._enqueue(Stream.PROLOG, (version, encoding, standalone))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
132
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
133 def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
134 self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
135
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
136 def _handle_start_ns(self, prefix, uri):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
137 self._enqueue(Stream.START_NS, (prefix or '', uri))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
138
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
139 def _handle_end_ns(self, prefix):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
140 self._enqueue(Stream.END_NS, prefix or '')
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
141
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
142 def _handle_pi(self, target, data):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
143 self._enqueue(Stream.PI, (target, data))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
144
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
145 def _handle_comment(self, text):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
146 self._enqueue(Stream.COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
147
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
148 def _handle_other(self, text):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
149 if text.startswith('&'):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
150 # deal with undefined entities
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
151 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
152 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
153 self._enqueue(Stream.TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
154 except KeyError:
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
155 lineno, offset = self._getpos()
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
156 raise expat.error("undefined entity %s: line %d, column %d" %
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
157 (text, lineno, offset))
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
158
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
159
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the XML string `text` and return the result as a `Stream`.

    The string is wrapped in a `StringIO`, parsed completely by an
    `XMLParser`, and the collected list of events is handed to `Stream`.
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
162
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
163
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
164 class HTMLParser(html.HTMLParser, object):
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
165 """Parser for HTML input based on the Python `HTMLParser` module.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
166
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
167 This class provides the same interface for generating stream events as
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
168 `XMLParser`, and attempts to automatically balance tags.
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
169
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
170 The parsing is initiated by iterating over the parser object:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
171
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
172 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
173 >>> for kind, data, pos in parser:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
174 ... print kind, data
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
175 START (u'ul', [(u'compact', u'compact')])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
176 START (u'li', [])
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
177 TEXT Foo
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
178 END li
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
179 END ul
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
180 """
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
181
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
182 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
183 'hr', 'img', 'input', 'isindex', 'link', 'meta',
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
184 'param'])
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
185
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
186 def __init__(self, source, filename=None):
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
187 html.HTMLParser.__init__(self)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
188 self.source = source
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
189 self.filename = filename
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
190 self._queue = []
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
191 self._open_tags = []
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
192
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
193 def __iter__(self):
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
194 try:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
195 bufsize = 4 * 1024 # 4K
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
196 done = False
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
197 while True:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
198 while not done and len(self._queue) == 0:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
199 data = self.source.read(bufsize)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
200 if data == '': # end of data
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
201 self.close()
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
202 done = True
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
203 else:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
204 self.feed(data)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
205 for kind, data, pos in self._queue:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
206 yield kind, data, pos
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
207 self._queue = []
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
208 if done:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
209 open_tags = self._open_tags
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
210 open_tags.reverse()
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
211 for tag in open_tags:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
212 yield Stream.END, QName(tag), pos
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
213 break
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
214 except html.HTMLParseError, e:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
215 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
216 if self.filename:
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
217 msg += ', in %s' % self.filename
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
218 raise ParseError(msg, self.filename, e.lineno, e.offset)
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
219
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
220 def _enqueue(self, kind, data, pos=None):
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
221 if pos is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
222 pos = self._getpos()
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
223 self._queue.append((kind, data, pos))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
224
21
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
225 def _getpos(self):
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
226 lineno, column = self.getpos()
b4d17897d053 * Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents: 1
diff changeset
227 return (self.filename, lineno, column)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
228
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
229 def handle_starttag(self, tag, attrib):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
230 fixed_attrib = []
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
231 for name, value in attrib: # Fixup minimized attributes
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
232 if value is None:
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
233 value = name
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
234 fixed_attrib.append((name, unicode(value)))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
235
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
236 self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib)))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
237 if tag in self._EMPTY_ELEMS:
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
238 self._enqueue(Stream.END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
239 else:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
240 self._open_tags.append(tag)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
241
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
242 def handle_endtag(self, tag):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
243 if tag not in self._EMPTY_ELEMS:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
244 while self._open_tags:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
245 open_tag = self._open_tags.pop()
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
246 if open_tag.lower() == tag.lower():
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
247 break
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
248 self._enqueue(Stream.END, QName(open_tag))
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
249 self._enqueue(Stream.END, QName(tag))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
250
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
251 def handle_data(self, text):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
252 self._enqueue(Stream.TEXT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
253
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
254 def handle_charref(self, name):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
255 self._enqueue(Stream.TEXT, Markup('&#%s;' % name))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
256
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
257 def handle_entityref(self, name):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
258 self._enqueue(Stream.TEXT, Markup('&%s;' % name))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
259
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
260 def handle_pi(self, data):
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
261 target, data = data.split(maxsplit=1)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
262 data = data.rstrip('?')
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
263 self._enqueue(Stream.PI, (target.strip(), data.strip()))
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
264
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
265 def handle_comment(self, text):
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 21
diff changeset
266 self._enqueue(Stream.COMMENT, text)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
267
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
268
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def HTML(text):
    """Parse the HTML string `text` and return the result as a `Stream`.

    The string is wrapped in a `StringIO`, parsed completely by an
    `HTMLParser`, and the collected list of events is handed to `Stream`.
    """
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
Copyright (C) 2012-2017 Edgewall Software