genshi/genshi-test: genshi/input.py comparison

comparison genshi/input.py @ 932:e53161c2773c

Merge r1140 from py3k: add support for Python 3 to the core Genshi components (genshi.core, genshi.input and genshi.output):
* Default input and output encodings changed from UTF-8 to None (i.e. unicode strings).
* Namespace and QName objects do not call stringrepr in __repr__ in Python 3, since repr() returns a unicode string there.
* Track changes to the expat parser in Python 3 (mostly, it accepts bytes instead of strings).

author	hodgestar
date	Fri, 18 Mar 2011 09:08:12 +0000
parents	fbe34d12acde
children

comparison

equal deleted inserted replaced

-:ade3abe742e9
+:e53161c2773c
 """
 from itertools import chain
 import htmlentitydefs as entities
 import HTMLParser as html
-from StringIO import StringIO
 from xml.parsers import expat
 from genshi.core import Attrs, QName, Stream, stripentities
 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
 END_NS, START_CDATA, END_CDATA, PI, COMMENT
+from genshi.compat import StringIO, BytesIO
 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
 __docformat__ = 'restructuredtext en'
 END root
 """
 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
 entities.name2codepoint.items()]
-_external_dtd = '\n'.join(_entitydefs)
+_external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
 def __init__(self, source, filename=None, encoding=None):
 """Initialize the parser for the given XML input.
 :param source: the XML text as a file-like object
 self.filename = filename
 # Setup the Expat parser
 parser = expat.ParserCreate(encoding, '}')
 parser.buffer_text = True
-parser.returns_unicode = True
+# Python 3 does not have returns_unicode
+if hasattr(parser, 'returns_unicode'):
+parser.returns_unicode = True
 parser.ordered_attributes = True
 parser.StartElementHandler = self._handle_start
 parser.EndElementHandler = self._handle_end
 parser.CharacterDataHandler = self._handle_data
 bufsize = 4 * 1024 # 4K
 done = False
 while 1:
 while not done and len(self._queue) == 0:
 data = self.source.read(bufsize)
-if data == '': # end of data
+if not data: # end of data
 if hasattr(self, 'expat'):
 self.expat.Parse('', True)
 del self.expat # get rid of circular references
 done = True
 else:
 def __iter__(self):
 return iter(self.parse())
 def _build_foreign(self, context, base, sysid, pubid):
 parser = self.expat.ExternalEntityParserCreate(context)
-parser.ParseFile(StringIO(self._external_dtd))
+parser.ParseFile(BytesIO(self._external_dtd))
 return 1
 def _enqueue(self, kind, data=None, pos=None):
 if pos is None:
 pos = self._getpos()
 This class provides the same interface for generating stream events as
 `XMLParser`, and attempts to automatically balance tags.
 The parsing is initiated by iterating over the parser object:
->>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
+>>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
 >>> for kind, data, pos in parser:
 ...     print('%s %s' % (kind, data))
 START (QName('ul'), Attrs([(QName('compact'), u'compact')]))
 START (QName('li'), Attrs())
 TEXT Foo
 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
 'hr', 'img', 'input', 'isindex', 'link', 'meta',
 'param'])
-def __init__(self, source, filename=None, encoding='utf-8'):
+def __init__(self, source, filename=None, encoding=None):
 """Initialize the parser for the given HTML input.
 :param source: the HTML text as a file-like object
 :param filename: the name of the file, if known
 :param encoding: encoding of the file; ignored if the input is unicode
 bufsize = 4 * 1024 # 4K
 done = False
 while 1:
 while not done and len(self._queue) == 0:
 data = self.source.read(bufsize)
-if data == '': # end of data
+if not data: # end of data
 self.close()
 done = True
 else:
+if not isinstance(data, unicode):
+# bytes
+if self.encoding:
+data = data.decode(self.encoding)
+else:
+raise UnicodeError("source returned bytes, but no encoding specified")
 self.feed(data)
 for kind, data, pos in self._queue:
 yield kind, data, pos
 self._queue = []
 if done:
 def handle_comment(self, text):
 self._enqueue(COMMENT, text)
-def HTML(text, encoding='utf-8'):
+def HTML(text, encoding=None):
 """Parse the given HTML source and return a markup stream.
 Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
 iterated over multiple times:
->>> html = HTML('<body><h1>Foo</h1></body>')
+>>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
 >>> print(html)
 <body><h1>Foo</h1></body>
 >>> print(html.select('h1'))
 <h1>Foo</h1>
 >>> print(html.select('h1/text()'))
 :param text: the HTML source
 :return: the parsed XML event stream
 :raises ParseError: if the HTML text is not well-formed, and error recovery
 fails
 """
-return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+if isinstance(text, unicode):
+return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
 def _coalesce(stream):
 """Coalesces adjacent TEXT events into a single event."""
 textbuf = []

Mercurial > genshi > genshi-test

comparison genshi/input.py @ 932:e53161c2773c