changeset 915:9fafb35032a1 experimental-py3k

Add support for Python 3 to the core Genshi components (genshi.core, genshi.input and genshi.output): * Default input and output encodings changed from UTF-8 to None (i.e. unicode strings). * Namespace and QName objects do not call stringrepr in __repr__ on Python 3, since repr() returns a unicode string there. * Track changes to the expat parser in Python 3 (mostly, it accepts bytes instead of strings).
author hodgestar
date Sun, 24 Oct 2010 22:08:11 +0000
parents c5faa881d87f
children 872726bac135
files genshi/core.py genshi/input.py genshi/output.py genshi/tests/core.py genshi/tests/input.py genshi/tests/output.py
diffstat 6 files changed, 75 insertions(+), 55 deletions(-) [+]
line wrap: on
line diff
--- a/genshi/core.py
+++ b/genshi/core.py
@@ -17,6 +17,7 @@
     reduce # builtin in Python < 3
 except NameError:
     from functools import reduce
+import sys
 from itertools import chain
 import operator
 
@@ -92,7 +93,7 @@
         Assume the following stream produced by the `HTML` function:
         
         >>> from genshi.input import HTML
-        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
+        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''', encoding='utf-8')
         >>> print(html)
         <p onclick="alert('Whoa')">Hello, world!</p>
         
@@ -153,7 +154,7 @@
         """
         return reduce(operator.or_, (self,) + filters)
 
-    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
+    def render(self, method=None, encoding=None, out=None, **kwargs):
         """Return a string representation of the stream.
         
         Any additional keyword arguments are passed to the serializer, and thus
@@ -187,7 +188,7 @@
         XPath expression.
         
         >>> from genshi import HTML
-        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
+        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>', encoding='utf-8')
         >>> print(stream.select('elem'))
         <elem>foo</elem><elem>bar</elem>
         >>> print(stream.select('elem/text()'))
@@ -667,8 +668,13 @@
     def __hash__(self):
         return hash(self.uri)
 
-    def __repr__(self):
-        return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))
+    if sys.version_info[0] == 2:
+        # Only use stringrepr in python 2
+        def __repr__(self):
+            return '%s(%s)' % (type(self).__name__, stringrepr(self.uri))
+    else:
+        def __repr__(self):
+            return '%s(%r)' % (type(self).__name__, self.uri)
 
     def __str__(self):
         return self.uri.encode('utf-8')
@@ -728,5 +734,10 @@
     def __getnewargs__(self):
         return (self.lstrip('{'),)
 
-    def __repr__(self):
-        return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{')))
+    if sys.version_info[0] == 2:
+        # Only use stringrepr in python 2
+        def __repr__(self):
+            return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{')))
+    else:
+        def __repr__(self):
+            return '%s(%r)' % (type(self).__name__, self.lstrip('{'))
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -18,12 +18,13 @@
 from itertools import chain
 import htmlentitydefs as entities
 import HTMLParser as html
-from StringIO import StringIO
 from xml.parsers import expat
 
 from genshi.core import Attrs, QName, Stream, stripentities
 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \
                         END_NS, START_CDATA, END_CDATA, PI, COMMENT
+from genshi.compat import StringIO, BytesIO
+
 
 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']
 __docformat__ = 'restructuredtext en'
@@ -90,7 +91,7 @@
 
     _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
                    entities.name2codepoint.items()]
-    _external_dtd = '\n'.join(_entitydefs)
+    _external_dtd = u'\n'.join(_entitydefs).encode('utf-8')
 
     def __init__(self, source, filename=None, encoding=None):
         """Initialize the parser for the given XML input.
@@ -108,7 +109,9 @@
         # Setup the Expat parser
         parser = expat.ParserCreate(encoding, '}')
         parser.buffer_text = True
-        parser.returns_unicode = True
+        # Python 3 does not have returns_unicode
+        if hasattr(parser, 'returns_unicode'):
+            parser.returns_unicode = True
         parser.ordered_attributes = True
 
         parser.StartElementHandler = self._handle_start
@@ -146,7 +149,7 @@
                 while 1:
                     while not done and len(self._queue) == 0:
                         data = self.source.read(bufsize)
-                        if data == '': # end of data
+                        if not data: # end of data
                             if hasattr(self, 'expat'):
                                 self.expat.Parse('', True)
                                 del self.expat # get rid of circular references
@@ -170,7 +173,7 @@
 
     def _build_foreign(self, context, base, sysid, pubid):
         parser = self.expat.ExternalEntityParserCreate(context)
-        parser.ParseFile(StringIO(self._external_dtd))
+        parser.ParseFile(BytesIO(self._external_dtd))
         return 1
 
     def _enqueue(self, kind, data=None, pos=None):
@@ -279,7 +282,7 @@
     
     The parsing is initiated by iterating over the parser object:
     
-    >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
+    >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8')
     >>> for kind, data, pos in parser:
     ...     print('%s %s' % (kind, data))
     START (QName('ul'), Attrs([(QName('compact'), u'compact')]))
@@ -293,7 +296,7 @@
                               'hr', 'img', 'input', 'isindex', 'link', 'meta',
                               'param'])
 
-    def __init__(self, source, filename=None, encoding='utf-8'):
+    def __init__(self, source, filename=None, encoding=None):
         """Initialize the parser for the given HTML input.
         
         :param source: the HTML text as a file-like object
@@ -320,10 +323,16 @@
                 while 1:
                     while not done and len(self._queue) == 0:
                         data = self.source.read(bufsize)
-                        if data == '': # end of data
+                        if not data: # end of data
                             self.close()
                             done = True
                         else:
+                            if not isinstance(data, unicode):
+                                # bytes
+                                if self.encoding:
+                                    data = data.decode(self.encoding)
+                                else:
+                                    raise UnicodeError("source returned bytes, but no encoding specified")
                             self.feed(data)
                     for kind, data, pos in self._queue:
                         yield kind, data, pos
@@ -403,13 +412,13 @@
         self._enqueue(COMMENT, text)
 
 
-def HTML(text, encoding='utf-8'):
+def HTML(text, encoding=None):
     """Parse the given HTML source and return a markup stream.
     
     Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be
     iterated over multiple times:
     
-    >>> html = HTML('<body><h1>Foo</h1></body>')
+    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
     >>> print(html)
     <body><h1>Foo</h1></body>
     >>> print(html.select('h1'))
@@ -422,7 +431,9 @@
     :raises ParseError: if the HTML text is not well-formed, and error recovery
                         fails
     """
-    return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+    if isinstance(text, unicode):
+        return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+    return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
 
 
 def _coalesce(stream):
--- a/genshi/output.py
+++ b/genshi/output.py
@@ -27,7 +27,7 @@
 __docformat__ = 'restructuredtext en'
 
 
-def encode(iterator, method='xml', encoding='utf-8', out=None):
+def encode(iterator, method='xml', encoding=None, out=None):
     """Encode serializer output into a string.
     
     :param iterator: the iterator returned from serializing a stream (basically
--- a/genshi/tests/core.py
+++ b/genshi/tests/core.py
@@ -13,37 +13,34 @@
 
 import doctest
 import pickle
-from StringIO import StringIO
-try:
-    from cStringIO import StringIO as cStringIO
-except ImportError:
-    cStringIO = StringIO
 import unittest
 
 from genshi import core
 from genshi.core import Markup, Attrs, Namespace, QName, escape, unescape
 from genshi.input import XML, ParseError
+from genshi.compat import StringIO, BytesIO
 
 
 class StreamTestCase(unittest.TestCase):
 
     def test_render_utf8(self):
         xml = XML('<li>Über uns</li>')
-        self.assertEqual('<li>Über uns</li>', xml.render())
+        self.assertEqual(u'<li>Über uns</li>'.encode('utf-8'), xml.render(encoding='utf-8'))
 
     def test_render_unicode(self):
         xml = XML('<li>Über uns</li>')
+        self.assertEqual(u'<li>Über uns</li>', xml.render())
         self.assertEqual(u'<li>Über uns</li>', xml.render(encoding=None))
 
     def test_render_ascii(self):
         xml = XML('<li>Über uns</li>')
-        self.assertEqual('<li>&#220;ber uns</li>', xml.render(encoding='ascii'))
+        self.assertEqual(u'<li>&#220;ber uns</li>'.encode('ascii'), xml.render(encoding='ascii'))
 
     def test_render_output_stream_utf8(self):
         xml = XML('<li>Über uns</li>')
-        strio = cStringIO()
-        self.assertEqual(None, xml.render(out=strio))
-        self.assertEqual('<li>Über uns</li>', strio.getvalue())
+        strio = BytesIO()
+        self.assertEqual(None, xml.render(encoding='utf-8', out=strio))
+        self.assertEqual(u'<li>Über uns</li>'.encode('utf-8'), strio.getvalue())
 
     def test_render_output_stream_unicode(self):
         xml = XML('<li>Über uns</li>')
@@ -53,7 +50,7 @@
 
     def test_pickle(self):
         xml = XML('<li>Foo</li>')
-        buf = StringIO()
+        buf = BytesIO()
         pickle.dump(xml, buf, 2)
         buf.seek(0)
         xml = pickle.load(buf)
@@ -63,8 +60,9 @@
 class MarkupTestCase(unittest.TestCase):
 
     def test_new_with_encoding(self):
-        markup = Markup('Döner', encoding='utf-8')
-        self.assertEquals("<Markup u'D\\xf6ner'>", repr(markup))
+        markup = Markup(u'Döner'.encode('utf-8'), encoding='utf-8')
+        # mimic Markup.__repr__ when constructing output for Python 2/3 compatibility
+        self.assertEquals("<Markup %r>" % u'D\u00f6ner', repr(markup))
 
     def test_repr(self):
         markup = Markup('foo')
@@ -158,7 +156,7 @@
 
     def test_pickle(self):
         markup = Markup('foo')
-        buf = StringIO()
+        buf = BytesIO()
         pickle.dump(markup, buf, 2)
         buf.seek(0)
         self.assertEquals("<Markup u'foo'>", repr(pickle.load(buf)))
@@ -168,7 +166,7 @@
 
     def test_pickle(self):
         attrs = Attrs([("attr1", "foo"), ("attr2", "bar")])
-        buf = StringIO()
+        buf = BytesIO()
         pickle.dump(attrs, buf, 2)
         buf.seek(0)
         unpickled = pickle.load(buf)
@@ -196,7 +194,7 @@
 
     def test_pickle(self):
         ns = Namespace('http://www.example.org/namespace')
-        buf = StringIO()
+        buf = BytesIO()
         pickle.dump(ns, buf, 2)
         buf.seek(0)
         unpickled = pickle.load(buf)
@@ -209,7 +207,7 @@
 
     def test_pickle(self):
         qname = QName('http://www.example.org/namespace}elem')
-        buf = StringIO()
+        buf = BytesIO()
         pickle.dump(qname, buf, 2)
         buf.seek(0)
         unpickled = pickle.load(buf)
--- a/genshi/tests/input.py
+++ b/genshi/tests/input.py
@@ -12,12 +12,12 @@
 # history and logs, available at http://genshi.edgewall.org/log/.
 
 import doctest
-from StringIO import StringIO
 import sys
 import unittest
 
 from genshi.core import Attrs, Stream
 from genshi.input import XMLParser, HTMLParser, ParseError
+from genshi.compat import StringIO, BytesIO
 
 
 class XMLParserTestCase(unittest.TestCase):
@@ -59,7 +59,7 @@
 
     def test_latin1_encoded(self):
         text = u'<div>\xf6</div>'.encode('iso-8859-1')
-        events = list(XMLParser(StringIO(text), encoding='iso-8859-1'))
+        events = list(XMLParser(BytesIO(text), encoding='iso-8859-1'))
         kind, data, pos = events[1]
         self.assertEqual(Stream.TEXT, kind)
         self.assertEqual(u'\xf6', data)
@@ -68,7 +68,7 @@
         text = u"""<?xml version="1.0" encoding="iso-8859-1" ?>
         <div>\xf6</div>
         """.encode('iso-8859-1')
-        events = list(XMLParser(StringIO(text)))
+        events = list(XMLParser(BytesIO(text)))
         kind, data, pos = events[2]
         self.assertEqual(Stream.TEXT, kind)
         self.assertEqual(u'\xf6', data)
@@ -116,7 +116,7 @@
 class HTMLParserTestCase(unittest.TestCase):
 
     def test_text_node_pos_single_line(self):
-        text = '<elem>foo bar</elem>'
+        text = u'<elem>foo bar</elem>'
         events = list(HTMLParser(StringIO(text)))
         kind, data, pos = events[1]
         self.assertEqual(Stream.TEXT, kind)
@@ -124,7 +124,7 @@
         self.assertEqual((None, 1, 6), pos)
 
     def test_text_node_pos_multi_line(self):
-        text = '''<elem>foo
+        text = u'''<elem>foo
 bar</elem>'''
         events = list(HTMLParser(StringIO(text)))
         kind, data, pos = events[1]
@@ -134,14 +134,14 @@
 
     def test_input_encoding_text(self):
         text = u'<div>\xf6</div>'.encode('iso-8859-1')
-        events = list(HTMLParser(StringIO(text), encoding='iso-8859-1'))
+        events = list(HTMLParser(BytesIO(text), encoding='iso-8859-1'))
         kind, data, pos = events[1]
         self.assertEqual(Stream.TEXT, kind)
         self.assertEqual(u'\xf6', data)
 
     def test_input_encoding_attribute(self):
         text = u'<div title="\xf6"></div>'.encode('iso-8859-1')
-        events = list(HTMLParser(StringIO(text), encoding='iso-8859-1'))
+        events = list(HTMLParser(BytesIO(text), encoding='iso-8859-1'))
         kind, (tag, attrib), pos = events[0]
         self.assertEqual(Stream.START, kind)
         self.assertEqual(u'\xf6', attrib.get('title'))
@@ -154,7 +154,7 @@
         self.assertEqual(u'\u2013', data)
 
     def test_html_entity_in_attribute(self):
-        text = '<p title="&nbsp;"></p>'
+        text = u'<p title="&nbsp;"></p>'
         events = list(HTMLParser(StringIO(text)))
         kind, data, pos = events[0]
         self.assertEqual(Stream.START, kind)
@@ -163,14 +163,14 @@
         self.assertEqual(Stream.END, kind)
 
     def test_html_entity_in_text(self):
-        text = '<p>&nbsp;</p>'
+        text = u'<p>&nbsp;</p>'
         events = list(HTMLParser(StringIO(text)))
         kind, data, pos = events[1]
         self.assertEqual(Stream.TEXT, kind)
         self.assertEqual(u'\xa0', data)
 
     def test_processing_instruction(self):
-        text = '<?php echo "Foobar" ?>'
+        text = u'<?php echo "Foobar" ?>'
         events = list(HTMLParser(StringIO(text)))
         kind, (target, data), pos = events[0]
         self.assertEqual(Stream.PI, kind)
@@ -205,7 +205,7 @@
         self.assertEqual(1, standalone)
 
     def test_processing_instruction_trailing_qmark(self):
-        text = '<?php echo "Foobar" ??>'
+        text = u'<?php echo "Foobar" ??>'
         events = list(HTMLParser(StringIO(text)))
         kind, (target, data), pos = events[0]
         self.assertEqual(Stream.PI, kind)
@@ -213,7 +213,7 @@
         self.assertEqual('echo "Foobar" ?', data)
 
     def test_out_of_order_tags1(self):
-        text = '<span><b>Foobar</span></b>'
+        text = u'<span><b>Foobar</span></b>'
         events = list(HTMLParser(StringIO(text)))
         self.assertEqual(5, len(events))
         self.assertEqual((Stream.START, ('span', ())), events[0][:2])
@@ -223,8 +223,8 @@
         self.assertEqual((Stream.END, 'span'), events[4][:2])
 
     def test_out_of_order_tags2(self):
-        text = '<span class="baz"><b><i>Foobar</span></b></i>'
-        events = list(HTMLParser(StringIO(text)))
+        text = u'<span class="baz"><b><i>Foobar</span></b></i>'.encode('utf-8')
+        events = list(HTMLParser(BytesIO(text), encoding='utf-8'))
         self.assertEqual(7, len(events))
         self.assertEqual((Stream.START, ('span', Attrs([('class', 'baz')]))),
                          events[0][:2])
@@ -236,8 +236,8 @@
         self.assertEqual((Stream.END, 'span'), events[6][:2])
 
     def test_out_of_order_tags3(self):
-        text = '<span><b>Foobar</i>'
-        events = list(HTMLParser(StringIO(text)))
+        text = u'<span><b>Foobar</i>'.encode('utf-8')
+        events = list(HTMLParser(BytesIO(text), encoding='utf-8'))
         self.assertEqual(5, len(events))
         self.assertEqual((Stream.START, ('span', ())), events[0][:2])
         self.assertEqual((Stream.START, ('b', ())), events[1][:2])
@@ -246,7 +246,7 @@
         self.assertEqual((Stream.END, 'span'), events[4][:2])
 
     def test_hex_charref(self):
-        text = '<span>&#x27;</span>'
+        text = u'<span>&#x27;</span>'
         events = list(HTMLParser(StringIO(text)))
         self.assertEqual(3, len(events))
         self.assertEqual((Stream.START, ('span', ())), events[0][:2])
--- a/genshi/tests/output.py
+++ b/genshi/tests/output.py
@@ -356,7 +356,7 @@
         </div>""", output)
 
     def test_html5_doctype(self):
-        stream = HTML('<html></html>')
+        stream = HTML(u'<html></html>')
         output = stream.render(XHTMLSerializer, doctype=DocType.HTML5,
                                encoding=None)
         self.assertEqual('<!DOCTYPE html>\n<html></html>', output)
@@ -427,7 +427,7 @@
         </style>""", output)
 
     def test_html5_doctype(self):
-        stream = HTML('<html></html>')
+        stream = HTML(u'<html></html>')
         output = stream.render(HTMLSerializer, doctype=DocType.HTML5,
                                encoding=None)
         self.assertEqual('<!DOCTYPE html>\n<html></html>', output)
Copyright (C) 2012-2017 Edgewall Software