comparison markup/input.py @ 26:3c1a022be04c trunk

* Split out the XPath tests into a separate `unittest`-based file.
* Added many more docstrings.
* Cleaned up the implementation of the XML/HTML parsers a bit.
* The HTML parser now correctly handles minimized attributes.
* Added `COPYING` and `README` files.
author cmlenz
date Wed, 28 Jun 2006 08:55:04 +0000
parents b4d17897d053
children b4f78c05e5c9
comparison
equal deleted inserted replaced
25:e3be27f5bcf5 26:3c1a022be04c
16 frozenset 16 frozenset
17 except NameError: 17 except NameError:
18 from sets import ImmutableSet as frozenset 18 from sets import ImmutableSet as frozenset
19 import HTMLParser as html 19 import HTMLParser as html
20 import htmlentitydefs 20 import htmlentitydefs
21 import re
22 from StringIO import StringIO 21 from StringIO import StringIO
23 22
24 from markup.core import Attributes, Markup, QName, Stream 23 from markup.core import Attributes, Markup, QName, Stream
25 24
26 25
35 self.offset = offset 34 self.offset = offset
36 35
37 36
38 class XMLParser(object): 37 class XMLParser(object):
39 """Generator-based XML parser based on roughly equivalent code in 38 """Generator-based XML parser based on roughly equivalent code in
40 Kid/ElementTree.""" 39 Kid/ElementTree.
40
41 The parsing is initiated by iterating over the parser object:
42
43 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
44 >>> for kind, data, pos in parser:
45 ... print kind, data
46 START (u'root', [(u'id', u'2')])
47 START (u'child', [])
48 TEXT Foo
49 END child
50 END root
51 """
41 52
42 def __init__(self, source, filename=None): 53 def __init__(self, source, filename=None):
54 """Initialize the parser for the given XML text.
55
56 @param source: the XML text as a file-like object
57 @param filename: the name of the file, if appropriate
58 """
43 self.source = source 59 self.source = source
44 self.filename = filename 60 self.filename = filename
45 61
46 # Setup the Expat parser 62 # Setup the Expat parser
47 parser = expat.ParserCreate('utf-8', '}') 63 parser = expat.ParserCreate('utf-8', '}')
88 msg = str(e) 104 msg = str(e)
89 if self.filename: 105 if self.filename:
90 msg += ', in ' + self.filename 106 msg += ', in ' + self.filename
91 raise ParseError(msg, self.filename, e.lineno, e.offset) 107 raise ParseError(msg, self.filename, e.lineno, e.offset)
92 108
109 def _enqueue(self, kind, data, pos=None):
110 if pos is None:
111 pos = self._getpos()
112 self._queue.append((kind, data, pos))
113
93 def _getpos_unknown(self): 114 def _getpos_unknown(self):
94 return (self.filename or '<string>', -1, -1) 115 return (self.filename or '<string>', -1, -1)
95 116
96 def _getpos(self): 117 def _getpos(self):
97 return (self.filename or '<string>', self.expat.CurrentLineNumber, 118 return (self.filename or '<string>', self.expat.CurrentLineNumber,
98 self.expat.CurrentColumnNumber) 119 self.expat.CurrentColumnNumber)
99 120
100 def _handle_start(self, tag, attrib): 121 def _handle_start(self, tag, attrib):
101 self._queue.append((Stream.START, (QName(tag), Attributes(attrib.items())), 122 self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))
102 self._getpos()))
103 123
104 def _handle_end(self, tag): 124 def _handle_end(self, tag):
105 self._queue.append((Stream.END, QName(tag), self._getpos())) 125 self._enqueue(Stream.END, QName(tag))
106 126
107 def _handle_data(self, text): 127 def _handle_data(self, text):
108 self._queue.append((Stream.TEXT, text, self._getpos())) 128 self._enqueue(Stream.TEXT, text)
109 129
110 def _handle_prolog(self, version, encoding, standalone): 130 def _handle_prolog(self, version, encoding, standalone):
111 self._queue.append((Stream.PROLOG, (version, encoding, standalone), 131 self._enqueue(Stream.PROLOG, (version, encoding, standalone))
112 self._getpos()))
113 132
114 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): 133 def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
115 self._queue.append((Stream.DOCTYPE, (name, pubid, sysid), self._getpos())) 134 self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))
116 135
117 def _handle_start_ns(self, prefix, uri): 136 def _handle_start_ns(self, prefix, uri):
118 self._queue.append((Stream.START_NS, (prefix or '', uri), self._getpos())) 137 self._enqueue(Stream.START_NS, (prefix or '', uri))
119 138
120 def _handle_end_ns(self, prefix): 139 def _handle_end_ns(self, prefix):
121 self._queue.append((Stream.END_NS, prefix or '', self._getpos())) 140 self._enqueue(Stream.END_NS, prefix or '')
122 141
123 def _handle_pi(self, target, data): 142 def _handle_pi(self, target, data):
124 self._queue.append((Stream.PI, (target, data), self._getpos())) 143 self._enqueue(Stream.PI, (target, data))
125 144
126 def _handle_comment(self, text): 145 def _handle_comment(self, text):
127 self._queue.append((Stream.COMMENT, text, self._getpos())) 146 self._enqueue(Stream.COMMENT, text)
128 147
129 def _handle_other(self, text): 148 def _handle_other(self, text):
130 if text.startswith('&'): 149 if text.startswith('&'):
131 # deal with undefined entities 150 # deal with undefined entities
132 try: 151 try:
133 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) 152 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
134 self._queue.append((Stream.TEXT, text, self._getpos())) 153 self._enqueue(Stream.TEXT, text)
135 except KeyError: 154 except KeyError:
136 lineno, offset = self._getpos() 155 lineno, offset = self._getpos()
137 raise expat.error("undefined entity %s: line %d, column %d" % 156 raise expat.error("undefined entity %s: line %d, column %d" %
138 (text, lineno, offset)) 157 (text, lineno, offset))
139 158
145 class HTMLParser(html.HTMLParser, object): 164 class HTMLParser(html.HTMLParser, object):
146 """Parser for HTML input based on the Python `HTMLParser` module. 165 """Parser for HTML input based on the Python `HTMLParser` module.
147 166
148 This class provides the same interface for generating stream events as 167 This class provides the same interface for generating stream events as
149 `XMLParser`, and attempts to automatically balance tags. 168 `XMLParser`, and attempts to automatically balance tags.
169
170 The parsing is initiated by iterating over the parser object:
171
172 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
173 >>> for kind, data, pos in parser:
174 ... print kind, data
175 START (u'ul', [(u'compact', u'compact')])
176 START (u'li', [])
177 TEXT Foo
178 END li
179 END ul
150 """ 180 """
151 181
152 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', 182 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
153 'hr', 'img', 'input', 'isindex', 'link', 'meta', 183 'hr', 'img', 'input', 'isindex', 'link', 'meta',
154 'param']) 184 'param'])
185 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) 215 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
186 if self.filename: 216 if self.filename:
187 msg += ', in %s' % self.filename 217 msg += ', in %s' % self.filename
188 raise ParseError(msg, self.filename, e.lineno, e.offset) 218 raise ParseError(msg, self.filename, e.lineno, e.offset)
189 219
220 def _enqueue(self, kind, data, pos=None):
221 if pos is None:
222 pos = self._getpos()
223 self._queue.append((kind, data, pos))
224
190 def _getpos(self): 225 def _getpos(self):
191 lineno, column = self.getpos() 226 lineno, column = self.getpos()
192 return (self.filename, lineno, column) 227 return (self.filename, lineno, column)
193 228
194 def handle_starttag(self, tag, attrib): 229 def handle_starttag(self, tag, attrib):
195 pos = self._getpos() 230 fixed_attrib = []
196 self._queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos)) 231 for name, value in attrib: # Fixup minimized attributes
232 if value is None:
233 value = name
234 fixed_attrib.append((name, unicode(value)))
235
236 self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib)))
197 if tag in self._EMPTY_ELEMS: 237 if tag in self._EMPTY_ELEMS:
198 self._queue.append((Stream.END, QName(tag), pos)) 238 self._enqueue(Stream.END, QName(tag))
199 else: 239 else:
200 self._open_tags.append(tag) 240 self._open_tags.append(tag)
201 241
202 def handle_endtag(self, tag): 242 def handle_endtag(self, tag):
203 if tag not in self._EMPTY_ELEMS: 243 if tag not in self._EMPTY_ELEMS:
204 pos = self._getpos()
205 while self._open_tags: 244 while self._open_tags:
206 open_tag = self._open_tags.pop() 245 open_tag = self._open_tags.pop()
207 if open_tag.lower() == tag.lower(): 246 if open_tag.lower() == tag.lower():
208 break 247 break
209 self._queue.append((Stream.END, QName(open_tag), pos)) 248 self._enqueue(Stream.END, QName(open_tag))
210 self._queue.append((Stream.END, QName(tag), pos)) 249 self._enqueue(Stream.END, QName(tag))
211 250
212 def handle_data(self, text): 251 def handle_data(self, text):
213 self._queue.append((Stream.TEXT, text, self._getpos())) 252 self._enqueue(Stream.TEXT, text)
214 253
215 def handle_charref(self, name): 254 def handle_charref(self, name):
216 self._queue.append((Stream.TEXT, Markup('&#%s;' % name), self._getpos())) 255 self._enqueue(Stream.TEXT, Markup('&#%s;' % name))
217 256
218 def handle_entityref(self, name): 257 def handle_entityref(self, name):
219 self._queue.append((Stream.TEXT, Markup('&%s;' % name), self._getpos())) 258 self._enqueue(Stream.TEXT, Markup('&%s;' % name))
220 259
221 def handle_pi(self, data): 260 def handle_pi(self, data):
222 target, data = data.split(maxsplit=1) 261 target, data = data.split(maxsplit=1)
223 data = data.rstrip('?') 262 data = data.rstrip('?')
224 self._queue.append((Stream.PI, (target.strip(), data.strip()), 263 self._enqueue(Stream.PI, (target.strip(), data.strip()))
225 self._getpos()))
226 264
227 def handle_comment(self, text): 265 def handle_comment(self, text):
228 self._queue.append((Stream.COMMENT, text, self._getpos())) 266 self._enqueue(Stream.COMMENT, text)
229 267
230 268
231 def HTML(text): 269 def HTML(text):
232 return Stream(list(HTMLParser(StringIO(text)))) 270 return Stream(list(HTMLParser(StringIO(text))))
Copyright (C) 2012-2017 Edgewall Software