changeset 293:e17b7459b515 trunk

Fix a bug in the XML parser where HTML entity references inside attribute values were pulled out of the attribute value and instead emitted as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
author cmlenz
date Fri, 13 Oct 2006 13:42:38 +0000
parents b72ec37783eb
children 4a8975768572
files genshi/input.py genshi/tests/input.py
diffstat 2 files changed, 38 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -21,7 +21,7 @@
 import htmlentitydefs
 from StringIO import StringIO
 
-from genshi.core import Attrs, QName, Stream
+from genshi.core import Attrs, QName, Stream, stripentities
 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
                         START_CDATA, END_CDATA, PI, COMMENT
 
@@ -71,6 +71,10 @@
     END root
     """
 
+    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
+                   htmlentitydefs.name2codepoint.items()]
+    _external_dtd = '\n'.join(_entitydefs)
+
     def __init__(self, source, filename=None):
         """Initialize the parser for the given XML text.
         
@@ -100,7 +104,9 @@
         # Tell Expat that we'll handle non-XML entities ourselves
         # (in _handle_other)
         parser.DefaultHandler = self._handle_other
+        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
         parser.UseForeignDTD()
+        parser.ExternalEntityRefHandler = self._build_foreign
 
         # Location reporting is only supported in Python >= 2.4
         if not hasattr(parser, 'CurrentLineNumber'):
@@ -141,6 +147,11 @@
     def __iter__(self):
         return iter(self.parse())
 
+    def _build_foreign(self, context, base, sysid, pubid):
+        parser = self.expat.ExternalEntityParserCreate(context)
+        parser.ParseFile(StringIO(self._external_dtd))
+        return 1
+
     def _enqueue(self, kind, data=None, pos=None):
         if pos is None:
             pos = self._getpos()
@@ -292,7 +303,7 @@
         for name, value in attrib: # Fixup minimized attributes
             if value is None:
                 value = name
-            fixed_attrib.append((name, unicode(value)))
+            fixed_attrib.append((name, unicode(stripentities(value))))
 
         self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
         if tag in self._EMPTY_ELEMS:
--- a/genshi/tests/input.py
+++ b/genshi/tests/input.py
@@ -76,6 +76,15 @@
         self.assertEqual(Stream.TEXT, kind)
         self.assertEqual(u'\xa0', data)
 
+    def test_html_entity_in_attribute(self):
+        text = '<p title="&nbsp;"/>'
+        events = list(XMLParser(StringIO(text)))
+        kind, data, pos = events[0]
+        self.assertEqual(Stream.START, kind)
+        self.assertEqual(u'\xa0', data[1].get('title'))
+        kind, data, pos = events[1]
+        self.assertEqual(Stream.END, kind)
+
     def test_undefined_entity_with_dtd(self):
         text = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
         "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -118,6 +127,22 @@
         self.assertEqual(Stream.TEXT, kind)
         self.assertEqual(u'\u2013', data)
 
+    def test_html_entity_in_attribute(self):
+        text = '<p title="&nbsp;"></p>'
+        events = list(HTMLParser(StringIO(text)))
+        kind, data, pos = events[0]
+        self.assertEqual(Stream.START, kind)
+        self.assertEqual(u'\xa0', data[1].get('title'))
+        kind, data, pos = events[1]
+        self.assertEqual(Stream.END, kind)
+
+    def test_html_entity_in_text(self):
+        text = '<p>&nbsp;</p>'
+        events = list(HTMLParser(StringIO(text)))
+        kind, data, pos = events[1]
+        self.assertEqual(Stream.TEXT, kind)
+        self.assertEqual(u'\xa0', data)
+
 
 def suite():
     suite = unittest.TestSuite()
Copyright (C) 2012-2017 Edgewall Software