# HG changeset patch
# User hodgestar
# Date 1300439292 0
# Node ID e53161c2773c25f4374ef9cd7c03d03574c5fdf9
# Parent ade3abe742e9d42aeccb3801c7d38e6b8f9247b0
Merge r1140 from py3k:
Add support for Python 3 to the core Genshi components (genshi.core, genshi.input and genshi.output):
* Default input and output encodings changed from UTF-8 to None (i.e. unicode strings)
* Namespace and QName objects do not call stringrepr in __repr__ in Python 3 since repr() returns a unicode string there.
* Track changes to the expat parser in Python 3 (mostly, it now accepts bytes instead of strings)
diff --git a/genshi/core.py b/genshi/core.py
--- a/genshi/core.py
+++ b/genshi/core.py
@@ -17,6 +17,7 @@
reduce # builtin in Python < 3
except NameError:
from functools import reduce
+import sys
from itertools import chain
import operator
@@ -92,7 +93,7 @@
Assume the following stream produced by the `HTML` function:
>>> from genshi.input import HTML
- >>> html = HTML('''
Hello, world!
''')
+ >>> html = HTML('''Hello, world!
''', encoding='utf-8')
>>> print(html)
Hello, world!
@@ -153,7 +154,7 @@
"""
return reduce(operator.or_, (self,) + filters)
- def render(self, method=None, encoding='utf-8', out=None, **kwargs):
+ def render(self, method=None, encoding=None, out=None, **kwargs):
"""Return a string representation of the stream.
Any additional keyword arguments are passed to the serializer, and thus
@@ -187,7 +188,7 @@
XPath expression.
>>> from genshi import HTML
- >>> stream = HTML('foobar')
+ >>> stream = HTML('foobar', encoding='utf-8')
>>> print(stream.select('elem'))
foobar
>>> print(stream.select('elem/text()'))
@@ -667,8 +668,13 @@
def __hash__(self):
return hash(self.uri)
- def __repr__(self):
- return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))
+ if sys.version_info[0] == 2:
+ # Only use stringrepr in python 2
+ def __repr__(self):
+ return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))
+ else:
+ def __repr__(self):
+ return '%s(%r)' % (type(self).__name__, self.uri)
def __str__(self):
return self.uri.encode('utf-8')
@@ -729,5 +735,10 @@
def __getnewargs__(self):
return (self.lstrip('{'),)
- def __repr__(self):
- return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{')))
+ if sys.version_info[0] == 2:
+ # Only use stringrepr in python 2
+ def __repr__(self):
+ return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{')))
+ else:
+ def __repr__(self):
+ return '%s(%r)' % (type(self).__name__, self.lstrip('{'))
diff --git a/genshi/input.py b/genshi/input.py
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -18,12 +18,13 @@
from itertools import chain
import htmlentitydefs as entities
import HTMLParser as html
-from StringIO import StringIO
from xml.parsers import expat
from genshi.core import Attrs, QName, Stream, stripentities
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
END_NS, START_CDATA, END_CDATA, PI, COMMENT
+from genshi.compat import StringIO, BytesIO
+
__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
__docformat__ = 'restructuredtext en'
@@ -90,7 +91,7 @@
_entitydefs = ['' % (name, value) for name, value in
entities.name2codepoint.items()]
- _external_dtd = '\n'.join(_entitydefs)
+ _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
def __init__(self, source, filename=None, encoding=None):
"""Initialize the parser for the given XML input.
@@ -108,7 +109,9 @@
# Setup the Expat parser
parser = expat.ParserCreate(encoding, '}')
parser.buffer_text = True
- parser.returns_unicode = True
+ # Python 3 does not have returns_unicode
+ if hasattr(parser, 'returns_unicode'):
+ parser.returns_unicode = True
parser.ordered_attributes = True
parser.StartElementHandler = self._handle_start
@@ -146,7 +149,7 @@
while 1:
while not done and len(self._queue) == 0:
data = self.source.read(bufsize)
- if data == '': # end of data
+ if not data: # end of data
if hasattr(self, 'expat'):
self.expat.Parse('', True)
del self.expat # get rid of circular references
@@ -170,7 +173,7 @@
def _build_foreign(self, context, base, sysid, pubid):
parser = self.expat.ExternalEntityParserCreate(context)
- parser.ParseFile(StringIO(self._external_dtd))
+ parser.ParseFile(BytesIO(self._external_dtd))
return 1
def _enqueue(self, kind, data=None, pos=None):
@@ -279,7 +282,7 @@
The parsing is initiated by iterating over the parser object:
- >>> parser = HTMLParser(StringIO(''))
+ >>> parser = HTMLParser(BytesIO(u''.encode('utf-8')), encoding='utf-8')
>>> for kind, data, pos in parser:
... print('%s %s' % (kind, data))
START (QName('ul'), Attrs([(QName('compact'), u'compact')]))
@@ -293,7 +296,7 @@
'hr', 'img', 'input', 'isindex', 'link', 'meta',
'param'])
- def __init__(self, source, filename=None, encoding='utf-8'):
+ def __init__(self, source, filename=None, encoding=None):
"""Initialize the parser for the given HTML input.
:param source: the HTML text as a file-like object
@@ -320,10 +323,16 @@
while 1:
while not done and len(self._queue) == 0:
data = self.source.read(bufsize)
- if data == '': # end of data
+ if not data: # end of data
self.close()
done = True
else:
+ if not isinstance(data, unicode):
+ # bytes
+ if self.encoding:
+ data = data.decode(self.encoding)
+ else:
+ raise UnicodeError("source returned bytes, but no encoding specified")
self.feed(data)
for kind, data, pos in self._queue:
yield kind, data, pos
@@ -403,13 +412,13 @@
self._enqueue(COMMENT, text)
-def HTML(text, encoding='utf-8'):
+def HTML(text, encoding=None):
"""Parse the given HTML source and return a markup stream.
Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
iterated over multiple times:
- >>> html = HTML('Foo
')
+ >>> html = HTML('Foo
', encoding='utf-8')
>>> print(html)
Foo
>>> print(html.select('h1'))
@@ -422,7 +431,9 @@
:raises ParseError: if the HTML text is not well-formed, and error recovery
fails
"""
- return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+ if isinstance(text, unicode):
+ return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+ return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
def _coalesce(stream):
diff --git a/genshi/output.py b/genshi/output.py
--- a/genshi/output.py
+++ b/genshi/output.py
@@ -27,7 +27,7 @@
__docformat__ = 'restructuredtext en'
-def encode(iterator, method='xml', encoding='utf-8', out=None):
+def encode(iterator, method='xml', encoding=None, out=None):
"""Encode serializer output into a string.
:param iterator: the iterator returned from serializing a stream (basically
diff --git a/genshi/tests/core.py b/genshi/tests/core.py
--- a/genshi/tests/core.py
+++ b/genshi/tests/core.py
@@ -13,37 +13,34 @@
import doctest
import pickle
-from StringIO import StringIO
-try:
- from cStringIO import StringIO as cStringIO
-except ImportError:
- cStringIO = StringIO
import unittest
from genshi import core
from genshi.core import Markup, Attrs, Namespace, QName, escape, unescape
from genshi.input import XML, ParseError
+from genshi.compat import StringIO, BytesIO
class StreamTestCase(unittest.TestCase):
def test_render_utf8(self):
xml = XML('Über uns')
- self.assertEqual('Über uns', xml.render())
+ self.assertEqual(u'Über uns'.encode('utf-8'), xml.render(encoding='utf-8'))
def test_render_unicode(self):
xml = XML('Über uns')
+ self.assertEqual(u'Über uns', xml.render())
self.assertEqual(u'Über uns', xml.render(encoding=None))
def test_render_ascii(self):
xml = XML('Über uns')
- self.assertEqual('Über uns', xml.render(encoding='ascii'))
+ self.assertEqual(u'Über uns'.encode('ascii'), xml.render(encoding='ascii'))
def test_render_output_stream_utf8(self):
xml = XML('Über uns')
- strio = cStringIO()
- self.assertEqual(None, xml.render(out=strio))
- self.assertEqual('Über uns', strio.getvalue())
+ strio = BytesIO()
+ self.assertEqual(None, xml.render(encoding='utf-8', out=strio))
+ self.assertEqual(u'Über uns'.encode('utf-8'), strio.getvalue())
def test_render_output_stream_unicode(self):
xml = XML('Über uns')
@@ -53,7 +50,7 @@
def test_pickle(self):
xml = XML('Foo')
- buf = StringIO()
+ buf = BytesIO()
pickle.dump(xml, buf, 2)
buf.seek(0)
xml = pickle.load(buf)
@@ -63,8 +60,9 @@
class MarkupTestCase(unittest.TestCase):
def test_new_with_encoding(self):
- markup = Markup('Döner', encoding='utf-8')
- self.assertEquals("", repr(markup))
+ markup = Markup(u'Döner'.encode('utf-8'), encoding='utf-8')
+ # mimic Markup.__repr__ when constructing output for Python 2/3 compatibility
+ self.assertEquals("" % u'D\u00f6ner', repr(markup))
def test_repr(self):
markup = Markup('foo')
@@ -158,7 +156,7 @@
def test_pickle(self):
markup = Markup('foo')
- buf = StringIO()
+ buf = BytesIO()
pickle.dump(markup, buf, 2)
buf.seek(0)
self.assertEquals("", repr(pickle.load(buf)))
@@ -168,7 +166,7 @@
def test_pickle(self):
attrs = Attrs([("attr1", "foo"), ("attr2", "bar")])
- buf = StringIO()
+ buf = BytesIO()
pickle.dump(attrs, buf, 2)
buf.seek(0)
unpickled = pickle.load(buf)
@@ -196,7 +194,7 @@
def test_pickle(self):
ns = Namespace('http://www.example.org/namespace')
- buf = StringIO()
+ buf = BytesIO()
pickle.dump(ns, buf, 2)
buf.seek(0)
unpickled = pickle.load(buf)
@@ -209,7 +207,7 @@
def test_pickle(self):
qname = QName('http://www.example.org/namespace}elem')
- buf = StringIO()
+ buf = BytesIO()
pickle.dump(qname, buf, 2)
buf.seek(0)
unpickled = pickle.load(buf)
diff --git a/genshi/tests/input.py b/genshi/tests/input.py
--- a/genshi/tests/input.py
+++ b/genshi/tests/input.py
@@ -12,12 +12,12 @@
# history and logs, available at http://genshi.edgewall.org/log/.
import doctest
-from StringIO import StringIO
import sys
import unittest
from genshi.core import Attrs, Stream
from genshi.input import XMLParser, HTMLParser, ParseError
+from genshi.compat import StringIO, BytesIO
class XMLParserTestCase(unittest.TestCase):
@@ -59,7 +59,7 @@
def test_latin1_encoded(self):
text = u'\xf6
'.encode('iso-8859-1')
- events = list(XMLParser(StringIO(text), encoding='iso-8859-1'))
+ events = list(XMLParser(BytesIO(text), encoding='iso-8859-1'))
kind, data, pos = events[1]
self.assertEqual(Stream.TEXT, kind)
self.assertEqual(u'\xf6', data)
@@ -68,7 +68,7 @@
text = u"""
\xf6
""".encode('iso-8859-1')
- events = list(XMLParser(StringIO(text)))
+ events = list(XMLParser(BytesIO(text)))
kind, data, pos = events[2]
self.assertEqual(Stream.TEXT, kind)
self.assertEqual(u'\xf6', data)
@@ -116,7 +116,7 @@
class HTMLParserTestCase(unittest.TestCase):
def test_text_node_pos_single_line(self):
- text = 'foo bar'
+ text = u'foo bar'
events = list(HTMLParser(StringIO(text)))
kind, data, pos = events[1]
self.assertEqual(Stream.TEXT, kind)
@@ -124,7 +124,7 @@
self.assertEqual((None, 1, 6), pos)
def test_text_node_pos_multi_line(self):
- text = '''foo
+ text = u'''foo
bar'''
events = list(HTMLParser(StringIO(text)))
kind, data, pos = events[1]
@@ -134,14 +134,14 @@
def test_input_encoding_text(self):
text = u'\xf6
'.encode('iso-8859-1')
- events = list(HTMLParser(StringIO(text), encoding='iso-8859-1'))
+ events = list(HTMLParser(BytesIO(text), encoding='iso-8859-1'))
kind, data, pos = events[1]
self.assertEqual(Stream.TEXT, kind)
self.assertEqual(u'\xf6', data)
def test_input_encoding_attribute(self):
text = u''.encode('iso-8859-1')
- events = list(HTMLParser(StringIO(text), encoding='iso-8859-1'))
+ events = list(HTMLParser(BytesIO(text), encoding='iso-8859-1'))
kind, (tag, attrib), pos = events[0]
self.assertEqual(Stream.START, kind)
self.assertEqual(u'\xf6', attrib.get('title'))
@@ -154,7 +154,7 @@
self.assertEqual(u'\u2013', data)
def test_html_entity_in_attribute(self):
- text = ''
+ text = u''
events = list(HTMLParser(StringIO(text)))
kind, data, pos = events[0]
self.assertEqual(Stream.START, kind)
@@ -163,14 +163,14 @@
self.assertEqual(Stream.END, kind)
def test_html_entity_in_text(self):
- text = '
'
+ text = u'
'
events = list(HTMLParser(StringIO(text)))
kind, data, pos = events[1]
self.assertEqual(Stream.TEXT, kind)
self.assertEqual(u'\xa0', data)
def test_processing_instruction(self):
- text = ''
+ text = u''
events = list(HTMLParser(StringIO(text)))
kind, (target, data), pos = events[0]
self.assertEqual(Stream.PI, kind)
@@ -205,7 +205,7 @@
self.assertEqual(1, standalone)
def test_processing_instruction_trailing_qmark(self):
- text = ''
+ text = u''
events = list(HTMLParser(StringIO(text)))
kind, (target, data), pos = events[0]
self.assertEqual(Stream.PI, kind)
@@ -213,7 +213,7 @@
self.assertEqual('echo "Foobar" ?', data)
def test_out_of_order_tags1(self):
- text = 'Foobar'
+ text = u'Foobar'
events = list(HTMLParser(StringIO(text)))
self.assertEqual(5, len(events))
self.assertEqual((Stream.START, ('span', ())), events[0][:2])
@@ -223,8 +223,8 @@
self.assertEqual((Stream.END, 'span'), events[4][:2])
def test_out_of_order_tags2(self):
- text = 'Foobar'
- events = list(HTMLParser(StringIO(text)))
+ text = u'Foobar'.encode('utf-8')
+ events = list(HTMLParser(BytesIO(text), encoding='utf-8'))
self.assertEqual(7, len(events))
self.assertEqual((Stream.START, ('span', Attrs([('class', 'baz')]))),
events[0][:2])
@@ -236,8 +236,8 @@
self.assertEqual((Stream.END, 'span'), events[6][:2])
def test_out_of_order_tags3(self):
- text = 'Foobar'
- events = list(HTMLParser(StringIO(text)))
+ text = u'Foobar'.encode('utf-8')
+ events = list(HTMLParser(BytesIO(text), encoding='utf-8'))
self.assertEqual(5, len(events))
self.assertEqual((Stream.START, ('span', ())), events[0][:2])
self.assertEqual((Stream.START, ('b', ())), events[1][:2])
@@ -246,7 +246,7 @@
self.assertEqual((Stream.END, 'span'), events[4][:2])
def test_hex_charref(self):
- text = '''
+ text = u'''
events = list(HTMLParser(StringIO(text)))
self.assertEqual(3, len(events))
self.assertEqual((Stream.START, ('span', ())), events[0][:2])
diff --git a/genshi/tests/output.py b/genshi/tests/output.py
--- a/genshi/tests/output.py
+++ b/genshi/tests/output.py
@@ -356,7 +356,7 @@
""", output)
def test_html5_doctype(self):
- stream = HTML('')
+ stream = HTML(u'')
output = stream.render(XHTMLSerializer, doctype=DocType.HTML5,
encoding=None)
self.assertEqual('\n', output)
@@ -427,7 +427,7 @@
""", output)
def test_html5_doctype(self):
- stream = HTML('')
+ stream = HTML(u'')
output = stream.render(HTMLSerializer, doctype=DocType.HTML5,
encoding=None)
self.assertEqual('\n', output)