Mercurial > genshi > genshi-test
comparison genshi/input.py @ 932:e53161c2773c
Merge r1140 from py3k:
Add support for Python 3 to the core Genshi components (genshi.core, genshi.input and genshi.output):
* Default input and output encodings changed from UTF-8 to None (i.e. unicode strings).
* Namespace and QName objects do not call stringrepr in __repr__ in Python 3, since repr() returns a unicode string there.
* Track changes to the expat parser in Python 3 (mostly, it accepts bytes instead of strings).
author | hodgestar |
---|---|
date | Fri, 18 Mar 2011 09:08:12 +0000 |
parents | fbe34d12acde |
children |
comparison
equal
deleted
inserted
replaced
931:ade3abe742e9 | 932:e53161c2773c |
---|---|
16 """ | 16 """ |
17 | 17 |
18 from itertools import chain | 18 from itertools import chain |
19 import htmlentitydefs as entities | 19 import htmlentitydefs as entities |
20 import HTMLParser as html | 20 import HTMLParser as html |
21 from StringIO import StringIO | |
22 from xml.parsers import expat | 21 from xml.parsers import expat |
23 | 22 |
24 from genshi.core import Attrs, QName, Stream, stripentities | 23 from genshi.core import Attrs, QName, Stream, stripentities |
25 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \ | 24 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \ |
26 END_NS, START_CDATA, END_CDATA, PI, COMMENT | 25 END_NS, START_CDATA, END_CDATA, PI, COMMENT |
26 from genshi.compat import StringIO, BytesIO | |
27 | |
27 | 28 |
28 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] | 29 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
29 __docformat__ = 'restructuredtext en' | 30 __docformat__ = 'restructuredtext en' |
30 | 31 |
31 | 32 |
88 END root | 89 END root |
89 """ | 90 """ |
90 | 91 |
91 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in | 92 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in |
92 entities.name2codepoint.items()] | 93 entities.name2codepoint.items()] |
93 _external_dtd = '\n'.join(_entitydefs) | 94 _external_dtd = u'\n'.join(_entitydefs).encode('utf-8') |
94 | 95 |
95 def __init__(self, source, filename=None, encoding=None): | 96 def __init__(self, source, filename=None, encoding=None): |
96 """Initialize the parser for the given XML input. | 97 """Initialize the parser for the given XML input. |
97 | 98 |
98 :param source: the XML text as a file-like object | 99 :param source: the XML text as a file-like object |
106 self.filename = filename | 107 self.filename = filename |
107 | 108 |
108 # Setup the Expat parser | 109 # Setup the Expat parser |
109 parser = expat.ParserCreate(encoding, '}') | 110 parser = expat.ParserCreate(encoding, '}') |
110 parser.buffer_text = True | 111 parser.buffer_text = True |
111 parser.returns_unicode = True | 112 # Python 3 does not have returns_unicode |
113 if hasattr(parser, 'returns_unicode'): | |
114 parser.returns_unicode = True | |
112 parser.ordered_attributes = True | 115 parser.ordered_attributes = True |
113 | 116 |
114 parser.StartElementHandler = self._handle_start | 117 parser.StartElementHandler = self._handle_start |
115 parser.EndElementHandler = self._handle_end | 118 parser.EndElementHandler = self._handle_end |
116 parser.CharacterDataHandler = self._handle_data | 119 parser.CharacterDataHandler = self._handle_data |
144 bufsize = 4 * 1024 # 4K | 147 bufsize = 4 * 1024 # 4K |
145 done = False | 148 done = False |
146 while 1: | 149 while 1: |
147 while not done and len(self._queue) == 0: | 150 while not done and len(self._queue) == 0: |
148 data = self.source.read(bufsize) | 151 data = self.source.read(bufsize) |
149 if data == '': # end of data | 152 if not data: # end of data |
150 if hasattr(self, 'expat'): | 153 if hasattr(self, 'expat'): |
151 self.expat.Parse('', True) | 154 self.expat.Parse('', True) |
152 del self.expat # get rid of circular references | 155 del self.expat # get rid of circular references |
153 done = True | 156 done = True |
154 else: | 157 else: |
168 def __iter__(self): | 171 def __iter__(self): |
169 return iter(self.parse()) | 172 return iter(self.parse()) |
170 | 173 |
171 def _build_foreign(self, context, base, sysid, pubid): | 174 def _build_foreign(self, context, base, sysid, pubid): |
172 parser = self.expat.ExternalEntityParserCreate(context) | 175 parser = self.expat.ExternalEntityParserCreate(context) |
173 parser.ParseFile(StringIO(self._external_dtd)) | 176 parser.ParseFile(BytesIO(self._external_dtd)) |
174 return 1 | 177 return 1 |
175 | 178 |
176 def _enqueue(self, kind, data=None, pos=None): | 179 def _enqueue(self, kind, data=None, pos=None): |
177 if pos is None: | 180 if pos is None: |
178 pos = self._getpos() | 181 pos = self._getpos() |
277 This class provides the same interface for generating stream events as | 280 This class provides the same interface for generating stream events as |
278 `XMLParser`, and attempts to automatically balance tags. | 281 `XMLParser`, and attempts to automatically balance tags. |
279 | 282 |
280 The parsing is initiated by iterating over the parser object: | 283 The parsing is initiated by iterating over the parser object: |
281 | 284 |
282 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) | 285 >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8') |
283 >>> for kind, data, pos in parser: | 286 >>> for kind, data, pos in parser: |
284 ... print('%s %s' % (kind, data)) | 287 ... print('%s %s' % (kind, data)) |
285 START (QName('ul'), Attrs([(QName('compact'), u'compact')])) | 288 START (QName('ul'), Attrs([(QName('compact'), u'compact')])) |
286 START (QName('li'), Attrs()) | 289 START (QName('li'), Attrs()) |
287 TEXT Foo | 290 TEXT Foo |
291 | 294 |
292 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | 295 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', |
293 'hr', 'img', 'input', 'isindex', 'link', 'meta', | 296 'hr', 'img', 'input', 'isindex', 'link', 'meta', |
294 'param']) | 297 'param']) |
295 | 298 |
296 def __init__(self, source, filename=None, encoding='utf-8'): | 299 def __init__(self, source, filename=None, encoding=None): |
297 """Initialize the parser for the given HTML input. | 300 """Initialize the parser for the given HTML input. |
298 | 301 |
299 :param source: the HTML text as a file-like object | 302 :param source: the HTML text as a file-like object |
300 :param filename: the name of the file, if known | 303 :param filename: the name of the file, if known |
301 :param encoding: encoding of the file; ignored if the input is unicode | 304 :param encoding: encoding of the file; ignored if the input is unicode |
318 bufsize = 4 * 1024 # 4K | 321 bufsize = 4 * 1024 # 4K |
319 done = False | 322 done = False |
320 while 1: | 323 while 1: |
321 while not done and len(self._queue) == 0: | 324 while not done and len(self._queue) == 0: |
322 data = self.source.read(bufsize) | 325 data = self.source.read(bufsize) |
323 if data == '': # end of data | 326 if not data: # end of data |
324 self.close() | 327 self.close() |
325 done = True | 328 done = True |
326 else: | 329 else: |
330 if not isinstance(data, unicode): | |
331 # bytes | |
332 if self.encoding: | |
333 data = data.decode(self.encoding) | |
334 else: | |
335 raise UnicodeError("source returned bytes, but no encoding specified") | |
327 self.feed(data) | 336 self.feed(data) |
328 for kind, data, pos in self._queue: | 337 for kind, data, pos in self._queue: |
329 yield kind, data, pos | 338 yield kind, data, pos |
330 self._queue = [] | 339 self._queue = [] |
331 if done: | 340 if done: |
401 | 410 |
402 def handle_comment(self, text): | 411 def handle_comment(self, text): |
403 self._enqueue(COMMENT, text) | 412 self._enqueue(COMMENT, text) |
404 | 413 |
405 | 414 |
406 def HTML(text, encoding='utf-8'): | 415 def HTML(text, encoding=None): |
407 """Parse the given HTML source and return a markup stream. | 416 """Parse the given HTML source and return a markup stream. |
408 | 417 |
409 Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be | 418 Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be |
410 iterated over multiple times: | 419 iterated over multiple times: |
411 | 420 |
412 >>> html = HTML('<body><h1>Foo</h1></body>') | 421 >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8') |
413 >>> print(html) | 422 >>> print(html) |
414 <body><h1>Foo</h1></body> | 423 <body><h1>Foo</h1></body> |
415 >>> print(html.select('h1')) | 424 >>> print(html.select('h1')) |
416 <h1>Foo</h1> | 425 <h1>Foo</h1> |
417 >>> print(html.select('h1/text()')) | 426 >>> print(html.select('h1/text()')) |
420 :param text: the HTML source | 429 :param text: the HTML source |
421 :return: the parsed XML event stream | 430 :return: the parsed XML event stream |
422 :raises ParseError: if the HTML text is not well-formed, and error recovery | 431 :raises ParseError: if the HTML text is not well-formed, and error recovery |
423 fails | 432 fails |
424 """ | 433 """ |
425 return Stream(list(HTMLParser(StringIO(text), encoding=encoding))) | 434 if isinstance(text, unicode): |
435 return Stream(list(HTMLParser(StringIO(text), encoding=encoding))) | |
436 return Stream(list(HTMLParser(BytesIO(text), encoding=encoding))) | |
426 | 437 |
427 | 438 |
428 def _coalesce(stream): | 439 def _coalesce(stream): |
429 """Coalesces adjacent TEXT events into a single event.""" | 440 """Coalesces adjacent TEXT events into a single event.""" |
430 textbuf = [] | 441 textbuf = [] |