changeset 965:2bfd8f8d241c trunk

Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
author hodgestar
date Sat, 29 Dec 2012 13:02:20 +0000
parents 99d4c481e4eb
children 570226c48119
files genshi/input.py genshi/tests/input.py
diffstat 2 files changed, 19 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -16,6 +16,7 @@
 """
 
 from itertools import chain
+import codecs
 import htmlentitydefs as entities
 import HTMLParser as html
 from xml.parsers import expat
@@ -317,22 +318,23 @@
         :raises ParseError: if the HTML text is not well formed
         """
         def _generate():
+            if self.encoding:
+                reader = codecs.getreader(self.encoding)
+                source = reader(self.source)
+            else:
+                source = self.source
             try:
                 bufsize = 4 * 1024 # 4K
                 done = False
                 while 1:
                     while not done and len(self._queue) == 0:
-                        data = self.source.read(bufsize)
+                        data = source.read(bufsize)
                         if not data: # end of data
                             self.close()
                             done = True
                         else:
                             if not isinstance(data, unicode):
-                                # bytes
-                                if self.encoding:
-                                    data = data.decode(self.encoding)
-                                else:
-                                    raise UnicodeError("source returned bytes, but no encoding specified")
+                                raise UnicodeError("source returned bytes, but no encoding specified")
                             self.feed(data)
                     for kind, data, pos in self._queue:
                         yield kind, data, pos
@@ -432,7 +434,10 @@
                         fails
     """
     if isinstance(text, unicode):
-        return Stream(list(HTMLParser(StringIO(text), encoding=encoding)))
+        # Text that is already unicode needs no decoding, so the parser is
+        # always given encoding=None. An encoding argument is still accepted
+        # (and ignored) so doctests can run unchanged on Python 2.x and 3.x.
+        return Stream(list(HTMLParser(StringIO(text), encoding=None)))
     return Stream(list(HTMLParser(BytesIO(text), encoding=encoding)))
 
 
--- a/genshi/tests/input.py
+++ b/genshi/tests/input.py
@@ -253,6 +253,13 @@
         self.assertEqual((Stream.TEXT, "'"), events[1][:2])
         self.assertEqual((Stream.END, 'span'), events[2][:2])
 
+    def test_multibyte_character_on_chunk_boundary(self):
+        text = u'a' * ((4 * 1024) - 1) + u'\xe6'
+        events = list(HTMLParser(BytesIO(text.encode('utf-8')),
+                                 encoding='utf-8'))
+        self.assertEqual(1, len(events))
+        self.assertEqual((Stream.TEXT, text), events[0][:2])
+
 
 def suite():
     suite = unittest.TestSuite()
Copyright (C) 2012-2017 Edgewall Software