Mercurial > genshi > mirror
comparison markup/input.py @ 26:3c1a022be04c trunk
* Split out the XPath tests into a separate `unittest`-based file.
* Added many more docstrings.
* Cleaned up the implementation of the XML/HTML parsers a bit.
* The HTML parser now correctly handles minimized attributes.
* Added `COPYING` and `README` files.
author | cmlenz |
---|---|
date | Wed, 28 Jun 2006 08:55:04 +0000 |
parents | b4d17897d053 |
children | b4f78c05e5c9 |
comparison (legend: equal, deleted, inserted, replaced)
25:e3be27f5bcf5 | 26:3c1a022be04c |
---|---|
16 frozenset | 16 frozenset |
17 except NameError: | 17 except NameError: |
18 from sets import ImmutableSet as frozenset | 18 from sets import ImmutableSet as frozenset |
19 import HTMLParser as html | 19 import HTMLParser as html |
20 import htmlentitydefs | 20 import htmlentitydefs |
21 import re | |
22 from StringIO import StringIO | 21 from StringIO import StringIO |
23 | 22 |
24 from markup.core import Attributes, Markup, QName, Stream | 23 from markup.core import Attributes, Markup, QName, Stream |
25 | 24 |
26 | 25 |
35 self.offset = offset | 34 self.offset = offset |
36 | 35 |
37 | 36 |
38 class XMLParser(object): | 37 class XMLParser(object): |
39 """Generator-based XML parser based on roughly equivalent code in | 38 """Generator-based XML parser based on roughly equivalent code in |
40 Kid/ElementTree.""" | 39 Kid/ElementTree. |
40 | |
41 The parsing is initiated by iterating over the parser object: | |
42 | |
43 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) | |
44 >>> for kind, data, pos in parser: | |
45 ... print kind, data | |
46 START (u'root', [(u'id', u'2')]) | |
47 START (u'child', []) | |
48 TEXT Foo | |
49 END child | |
50 END root | |
51 """ | |
41 | 52 |
42 def __init__(self, source, filename=None): | 53 def __init__(self, source, filename=None): |
54 """Initialize the parser for the given XML text. | |
55 | |
56 @param source: the XML text as a file-like object | |
57 @param filename: the name of the file, if appropriate | |
58 """ | |
43 self.source = source | 59 self.source = source |
44 self.filename = filename | 60 self.filename = filename |
45 | 61 |
46 # Setup the Expat parser | 62 # Setup the Expat parser |
47 parser = expat.ParserCreate('utf-8', '}') | 63 parser = expat.ParserCreate('utf-8', '}') |
88 msg = str(e) | 104 msg = str(e) |
89 if self.filename: | 105 if self.filename: |
90 msg += ', in ' + self.filename | 106 msg += ', in ' + self.filename |
91 raise ParseError(msg, self.filename, e.lineno, e.offset) | 107 raise ParseError(msg, self.filename, e.lineno, e.offset) |
92 | 108 |
109 def _enqueue(self, kind, data, pos=None): | |
110 if pos is None: | |
111 pos = self._getpos() | |
112 self._queue.append((kind, data, pos)) | |
113 | |
93 def _getpos_unknown(self): | 114 def _getpos_unknown(self): |
94 return (self.filename or '<string>', -1, -1) | 115 return (self.filename or '<string>', -1, -1) |
95 | 116 |
96 def _getpos(self): | 117 def _getpos(self): |
97 return (self.filename or '<string>', self.expat.CurrentLineNumber, | 118 return (self.filename or '<string>', self.expat.CurrentLineNumber, |
98 self.expat.CurrentColumnNumber) | 119 self.expat.CurrentColumnNumber) |
99 | 120 |
100 def _handle_start(self, tag, attrib): | 121 def _handle_start(self, tag, attrib): |
101 self._queue.append((Stream.START, (QName(tag), Attributes(attrib.items())), | 122 self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items()))) |
102 self._getpos())) | |
103 | 123 |
104 def _handle_end(self, tag): | 124 def _handle_end(self, tag): |
105 self._queue.append((Stream.END, QName(tag), self._getpos())) | 125 self._enqueue(Stream.END, QName(tag)) |
106 | 126 |
107 def _handle_data(self, text): | 127 def _handle_data(self, text): |
108 self._queue.append((Stream.TEXT, text, self._getpos())) | 128 self._enqueue(Stream.TEXT, text) |
109 | 129 |
110 def _handle_prolog(self, version, encoding, standalone): | 130 def _handle_prolog(self, version, encoding, standalone): |
111 self._queue.append((Stream.PROLOG, (version, encoding, standalone), | 131 self._enqueue(Stream.PROLOG, (version, encoding, standalone)) |
112 self._getpos())) | |
113 | 132 |
114 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): | 133 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): |
115 self._queue.append((Stream.DOCTYPE, (name, pubid, sysid), self._getpos())) | 134 self._enqueue(Stream.DOCTYPE, (name, pubid, sysid)) |
116 | 135 |
117 def _handle_start_ns(self, prefix, uri): | 136 def _handle_start_ns(self, prefix, uri): |
118 self._queue.append((Stream.START_NS, (prefix or '', uri), self._getpos())) | 137 self._enqueue(Stream.START_NS, (prefix or '', uri)) |
119 | 138 |
120 def _handle_end_ns(self, prefix): | 139 def _handle_end_ns(self, prefix): |
121 self._queue.append((Stream.END_NS, prefix or '', self._getpos())) | 140 self._enqueue(Stream.END_NS, prefix or '') |
122 | 141 |
123 def _handle_pi(self, target, data): | 142 def _handle_pi(self, target, data): |
124 self._queue.append((Stream.PI, (target, data), self._getpos())) | 143 self._enqueue(Stream.PI, (target, data)) |
125 | 144 |
126 def _handle_comment(self, text): | 145 def _handle_comment(self, text): |
127 self._queue.append((Stream.COMMENT, text, self._getpos())) | 146 self._enqueue(Stream.COMMENT, text) |
128 | 147 |
129 def _handle_other(self, text): | 148 def _handle_other(self, text): |
130 if text.startswith('&'): | 149 if text.startswith('&'): |
131 # deal with undefined entities | 150 # deal with undefined entities |
132 try: | 151 try: |
133 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | 152 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) |
134 self._queue.append((Stream.TEXT, text, self._getpos())) | 153 self._enqueue(Stream.TEXT, text) |
135 except KeyError: | 154 except KeyError: |
136 lineno, offset = self._getpos() | 155 lineno, offset = self._getpos() |
137 raise expat.error("undefined entity %s: line %d, column %d" % | 156 raise expat.error("undefined entity %s: line %d, column %d" % |
138 (text, lineno, offset)) | 157 (text, lineno, offset)) |
139 | 158 |
145 class HTMLParser(html.HTMLParser, object): | 164 class HTMLParser(html.HTMLParser, object): |
146 """Parser for HTML input based on the Python `HTMLParser` module. | 165 """Parser for HTML input based on the Python `HTMLParser` module. |
147 | 166 |
148 This class provides the same interface for generating stream events as | 167 This class provides the same interface for generating stream events as |
149 `XMLParser`, and attempts to automatically balance tags. | 168 `XMLParser`, and attempts to automatically balance tags. |
169 | |
170 The parsing is initiated by iterating over the parser object: | |
171 | |
172 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) | |
173 >>> for kind, data, pos in parser: | |
174 ... print kind, data | |
175 START (u'ul', [(u'compact', u'compact')]) | |
176 START (u'li', []) | |
177 TEXT Foo | |
178 END li | |
179 END ul | |
150 """ | 180 """ |
151 | 181 |
152 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | 182 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', |
153 'hr', 'img', 'input', 'isindex', 'link', 'meta', | 183 'hr', 'img', 'input', 'isindex', 'link', 'meta', |
154 'param']) | 184 'param']) |
185 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) | 215 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
186 if self.filename: | 216 if self.filename: |
187 msg += ', in %s' % self.filename | 217 msg += ', in %s' % self.filename |
188 raise ParseError(msg, self.filename, e.lineno, e.offset) | 218 raise ParseError(msg, self.filename, e.lineno, e.offset) |
189 | 219 |
220 def _enqueue(self, kind, data, pos=None): | |
221 if pos is None: | |
222 pos = self._getpos() | |
223 self._queue.append((kind, data, pos)) | |
224 | |
190 def _getpos(self): | 225 def _getpos(self): |
191 lineno, column = self.getpos() | 226 lineno, column = self.getpos() |
192 return (self.filename, lineno, column) | 227 return (self.filename, lineno, column) |
193 | 228 |
194 def handle_starttag(self, tag, attrib): | 229 def handle_starttag(self, tag, attrib): |
195 pos = self._getpos() | 230 fixed_attrib = [] |
196 self._queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos)) | 231 for name, value in attrib: # Fixup minimized attributes |
232 if value is None: | |
233 value = name | |
234 fixed_attrib.append((name, unicode(value))) | |
235 | |
236 self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib))) | |
197 if tag in self._EMPTY_ELEMS: | 237 if tag in self._EMPTY_ELEMS: |
198 self._queue.append((Stream.END, QName(tag), pos)) | 238 self._enqueue(Stream.END, QName(tag)) |
199 else: | 239 else: |
200 self._open_tags.append(tag) | 240 self._open_tags.append(tag) |
201 | 241 |
202 def handle_endtag(self, tag): | 242 def handle_endtag(self, tag): |
203 if tag not in self._EMPTY_ELEMS: | 243 if tag not in self._EMPTY_ELEMS: |
204 pos = self._getpos() | |
205 while self._open_tags: | 244 while self._open_tags: |
206 open_tag = self._open_tags.pop() | 245 open_tag = self._open_tags.pop() |
207 if open_tag.lower() == tag.lower(): | 246 if open_tag.lower() == tag.lower(): |
208 break | 247 break |
209 self._queue.append((Stream.END, QName(open_tag), pos)) | 248 self._enqueue(Stream.END, QName(open_tag)) |
210 self._queue.append((Stream.END, QName(tag), pos)) | 249 self._enqueue(Stream.END, QName(tag)) |
211 | 250 |
212 def handle_data(self, text): | 251 def handle_data(self, text): |
213 self._queue.append((Stream.TEXT, text, self._getpos())) | 252 self._enqueue(Stream.TEXT, text) |
214 | 253 |
215 def handle_charref(self, name): | 254 def handle_charref(self, name): |
216 self._queue.append((Stream.TEXT, Markup('&#%s;' % name), self._getpos())) | 255 self._enqueue(Stream.TEXT, Markup('&#%s;' % name)) |
217 | 256 |
218 def handle_entityref(self, name): | 257 def handle_entityref(self, name): |
219 self._queue.append((Stream.TEXT, Markup('&%s;' % name), self._getpos())) | 258 self._enqueue(Stream.TEXT, Markup('&%s;' % name)) |
220 | 259 |
221 def handle_pi(self, data): | 260 def handle_pi(self, data): |
222 target, data = data.split(maxsplit=1) | 261 target, data = data.split(maxsplit=1) |
223 data = data.rstrip('?') | 262 data = data.rstrip('?') |
224 self._queue.append((Stream.PI, (target.strip(), data.strip()), | 263 self._enqueue(Stream.PI, (target.strip(), data.strip())) |
225 self._getpos())) | |
226 | 264 |
227 def handle_comment(self, text): | 265 def handle_comment(self, text): |
228 self._queue.append((Stream.COMMENT, text, self._getpos())) | 266 self._enqueue(Stream.COMMENT, text) |
229 | 267 |
230 | 268 |
231 def HTML(text): | 269 def HTML(text): |
232 return Stream(list(HTMLParser(StringIO(text)))) | 270 return Stream(list(HTMLParser(StringIO(text)))) |