diff genshi/input.py @ 293:e17b7459b515 trunk

Fix a bug in the XML parser where HTML entity references inside attribute values would get pulled out of the attribute value and instead be added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
author cmlenz
date Fri, 13 Oct 2006 13:42:38 +0000
parents 94f9f2cc66c8
children 8de1ff534d22
line wrap: on
line diff
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -21,7 +21,7 @@
 import htmlentitydefs
 from StringIO import StringIO
 
-from genshi.core import Attrs, QName, Stream
+from genshi.core import Attrs, QName, Stream, stripentities
 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
                         START_CDATA, END_CDATA, PI, COMMENT
 
@@ -71,6 +71,10 @@
     END root
     """
 
+    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
+                   htmlentitydefs.name2codepoint.items()]
+    _external_dtd = '\n'.join(_entitydefs)
+
     def __init__(self, source, filename=None):
         """Initialize the parser for the given XML text.
         
@@ -100,7 +104,9 @@
         # Tell Expat that we'll handle non-XML entities ourselves
         # (in _handle_other)
         parser.DefaultHandler = self._handle_other
+        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
         parser.UseForeignDTD()
+        parser.ExternalEntityRefHandler = self._build_foreign
 
         # Location reporting is only support in Python >= 2.4
         if not hasattr(parser, 'CurrentLineNumber'):
@@ -141,6 +147,11 @@
     def __iter__(self):
         return iter(self.parse())
 
+    def _build_foreign(self, context, base, sysid, pubid):
+        parser = self.expat.ExternalEntityParserCreate(context)
+        parser.ParseFile(StringIO(self._external_dtd))
+        return 1
+
     def _enqueue(self, kind, data=None, pos=None):
         if pos is None:
             pos = self._getpos()
@@ -292,7 +303,7 @@
         for name, value in attrib: # Fixup minimized attributes
             if value is None:
                 value = name
-            fixed_attrib.append((name, unicode(value)))
+            fixed_attrib.append((name, unicode(stripentities(value))))
 
         self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
         if tag in self._EMPTY_ELEMS:
Copyright (C) 2012-2017 Edgewall Software