comparison genshi/input.py @ 932:e53161c2773c

Merge r1140 from py3k: add support for Python 3 to core genshi components (genshi.core, genshi.input and genshi.output): * default input and output encodings changed from UTF-8 to None (i.e. unicode strings) * Namespace and QName objects do not call stringrepr in __repr__ in Python 3 since repr() returns a unicode string there. * track changes to expat parser in Python 3 (mostly it accepts bytes instead of strings)
author hodgestar
date Fri, 18 Mar 2011 09:08:12 +0000
parents fbe34d12acde
children
comparison
equal deleted inserted replaced
931:ade3abe742e9 932:e53161c2773c
16 """ 16 """
17 17
18 from itertools import chain 18 from itertools import chain
19 import htmlentitydefs as entities 19 import htmlentitydefs as entities
20 import HTMLParser as html 20 import HTMLParser as html
21 from StringIO import StringIO
22 from xml.parsers import expat 21 from xml.parsers import expat
23 22
24 from genshi.core import Attrs, QName, Stream, stripentities 23 from genshi.core import Attrs, QName, Stream, stripentities
25 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \ 24 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
26 END_NS, START_CDATA, END_CDATA, PI, COMMENT 25 END_NS, START_CDATA, END_CDATA, PI, COMMENT
26 from genshi.compat import StringIO, BytesIO
27
27 28
28 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] 29 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
29 __docformat__ = 'restructuredtext en' 30 __docformat__ = 'restructuredtext en'
30 31
31 32
88 END root 89 END root
89 """ 90 """
90 91
91 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in 92 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
92 entities.name2codepoint.items()] 93 entities.name2codepoint.items()]
93 _external_dtd = '\n'.join(_entitydefs) 94 _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
94 95
95 def __init__(self, source, filename=None, encoding=None): 96 def __init__(self, source, filename=None, encoding=None):
96 """Initialize the parser for the given XML input. 97 """Initialize the parser for the given XML input.
97 98
98 :param source: the XML text as a file-like object 99 :param source: the XML text as a file-like object
106 self.filename = filename 107 self.filename = filename
107 108
108 # Setup the Expat parser 109 # Setup the Expat parser
109 parser = expat.ParserCreate(encoding, '}') 110 parser = expat.ParserCreate(encoding, '}')
110 parser.buffer_text = True 111 parser.buffer_text = True
111 parser.returns_unicode = True 112 # Python 3 does not have returns_unicode
113 if hasattr(parser, 'returns_unicode'):
114 parser.returns_unicode = True
112 parser.ordered_attributes = True 115 parser.ordered_attributes = True
113 116
114 parser.StartElementHandler = self._handle_start 117 parser.StartElementHandler = self._handle_start
115 parser.EndElementHandler = self._handle_end 118 parser.EndElementHandler = self._handle_end
116 parser.CharacterDataHandler = self._handle_data 119 parser.CharacterDataHandler = self._handle_data
144 bufsize = 4 * 1024 # 4K 147 bufsize = 4 * 1024 # 4K
145 done = False 148 done = False
146 while 1: 149 while 1:
147 while not done and len(self._queue) == 0: 150 while not done and len(self._queue) == 0:
148 data = self.source.read(bufsize) 151 data = self.source.read(bufsize)
149 if data == '': # end of data 152 if not data: # end of data
150 if hasattr(self, 'expat'): 153 if hasattr(self, 'expat'):
151 self.expat.Parse('', True) 154 self.expat.Parse('', True)
152 del self.expat # get rid of circular references 155 del self.expat # get rid of circular references
153 done = True 156 done = True
154 else: 157 else:
168 def __iter__(self): 171 def __iter__(self):
169 return iter(self.parse()) 172 return iter(self.parse())
170 173
171 def _build_foreign(self, context, base, sysid, pubid): 174 def _build_foreign(self, context, base, sysid, pubid):
172 parser = self.expat.ExternalEntityParserCreate(context) 175 parser = self.expat.ExternalEntityParserCreate(context)
173 parser.ParseFile(StringIO(self._external_dtd)) 176 parser.ParseFile(BytesIO(self._external_dtd))
174 return 1 177 return 1
175 178
176 def _enqueue(self, kind, data=None, pos=None): 179 def _enqueue(self, kind, data=None, pos=None):
177 if pos is None: 180 if pos is None:
178 pos = self._getpos() 181 pos = self._getpos()
277 This class provides the same interface for generating stream events as 280 This class provides the same interface for generating stream events as
278 `XMLParser`, and attempts to automatically balance tags. 281 `XMLParser`, and attempts to automatically balance tags.
279 282
280 The parsing is initiated by iterating over the parser object: 283 The parsing is initiated by iterating over the parser object:
281 284
282 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) 285 >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
283 >>> for kind, data, pos in parser: 286 >>> for kind, data, pos in parser:
284 ... print('%s %s' % (kind, data)) 287 ... print('%s %s' % (kind, data))
285 START (QName('ul'), Attrs([(QName('compact'), u'compact')])) 288 START (QName('ul'), Attrs([(QName('compact'), u'compact')]))
286 START (QName('li'), Attrs()) 289 START (QName('li'), Attrs())
287 TEXT Foo 290 TEXT Foo
291 294
292 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', 295 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
293 'hr', 'img', 'input', 'isindex', 'link', 'meta', 296 'hr', 'img', 'input', 'isindex', 'link', 'meta',
294 'param']) 297 'param'])
295 298
296 def __init__(self, source, filename=None, encoding='utf-8'): 299 def __init__(self, source, filename=None, encoding=None):
297 """Initialize the parser for the given HTML input. 300 """Initialize the parser for the given HTML input.
298 301
299 :param source: the HTML text as a file-like object 302 :param source: the HTML text as a file-like object
300 :param filename: the name of the file, if known 303 :param filename: the name of the file, if known
301 :param filename: encoding of the file; ignored if the input is unicode 304 :param filename: encoding of the file; ignored if the input is unicode
318 bufsize = 4 * 1024 # 4K 321 bufsize = 4 * 1024 # 4K
319 done = False 322 done = False
320 while 1: 323 while 1:
321 while not done and len(self._queue) == 0: 324 while not done and len(self._queue) == 0:
322 data = self.source.read(bufsize) 325 data = self.source.read(bufsize)
323 if data == '': # end of data 326 if not data: # end of data
324 self.close() 327 self.close()
325 done = True 328 done = True
326 else: 329 else:
330 if not isinstance(data, unicode):
331 # bytes
332 if self.encoding:
333 data = data.decode(self.encoding)
334 else:
335 raise UnicodeError("source returned bytes, but no encoding specified")
327 self.feed(data) 336 self.feed(data)
328 for kind, data, pos in self._queue: 337 for kind, data, pos in self._queue:
329 yield kind, data, pos 338 yield kind, data, pos
330 self._queue = [] 339 self._queue = []
331 if done: 340 if done:
401 410
402 def handle_comment(self, text): 411 def handle_comment(self, text):
403 self._enqueue(COMMENT, text) 412 self._enqueue(COMMENT, text)
404 413
405 414
406 def HTML(text, encoding='utf-8'): 415 def HTML(text, encoding=None):
407 """Parse the given HTML source and return a markup stream. 416 """Parse the given HTML source and return a markup stream.
408 417
409 Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be 418 Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
410 iterated over multiple times: 419 iterated over multiple times:
411 420
412 >>> html = HTML('<body><h1>Foo</h1></body>') 421 >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
413 >>> print(html) 422 >>> print(html)
414 <body><h1>Foo</h1></body> 423 <body><h1>Foo</h1></body>
415 >>> print(html.select('h1')) 424 >>> print(html.select('h1'))
416 <h1>Foo</h1> 425 <h1>Foo</h1>
417 >>> print(html.select('h1/text()')) 426 >>> print(html.select('h1/text()'))
420 :param text: the HTML source 429 :param text: the HTML source
421 :return: the parsed XML event stream 430 :return: the parsed XML event stream
422 :raises ParseError: if the HTML text is not well-formed, and error recovery 431 :raises ParseError: if the HTML text is not well-formed, and error recovery
423 fails 432 fails
424 """ 433 """
425 return Stream(list(HTMLParser(StringIO(text), encoding=encoding))) 434 if isinstance(text, unicode):
435 return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
436 return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
426 437
427 438
428 def _coalesce(stream): 439 def _coalesce(stream):
429 """Coalesces adjacent TEXT events into a single event.""" 440 """Coalesces adjacent TEXT events into a single event."""
430 textbuf = [] 441 textbuf = []
Copyright (C) 2012-2017 Edgewall Software