# HG changeset patch # User cmlenz # Date 1160746958 0 # Node ID 38adb4aa7df5284f50a8e6ab5e890a343bc0d1e1 # Parent 6de057dfa90556aacf68240934e25ac6bd9470c6 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem]. diff --git a/genshi/input.py b/genshi/input.py --- a/genshi/input.py +++ b/genshi/input.py @@ -21,7 +21,7 @@ import htmlentitydefs from StringIO import StringIO -from genshi.core import Attrs, QName, Stream +from genshi.core import Attrs, QName, Stream, stripentities from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ START_CDATA, END_CDATA, PI, COMMENT @@ -71,6 +71,10 @@ END root """ + _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in + htmlentitydefs.name2codepoint.items()] + _external_dtd = '\n'.join(_entitydefs) + def __init__(self, source, filename=None): """Initialize the parser for the given XML text. 
@@ -100,7 +104,9 @@ # Tell Expat that we'll handle non-XML entities ourselves # (in _handle_other) parser.DefaultHandler = self._handle_other + parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) parser.UseForeignDTD() + parser.ExternalEntityRefHandler = self._build_foreign # Location reporting is only support in Python >= 2.4 if not hasattr(parser, 'CurrentLineNumber'): @@ -141,6 +147,11 @@ def __iter__(self): return iter(self.parse()) + def _build_foreign(self, context, base, sysid, pubid): + parser = self.expat.ExternalEntityParserCreate(context) + parser.ParseFile(StringIO(self._external_dtd)) + return 1 + def _enqueue(self, kind, data=None, pos=None): if pos is None: pos = self._getpos() @@ -292,7 +303,7 @@ for name, value in attrib: # Fixup minimized attributes if value is None: value = name - fixed_attrib.append((name, unicode(value))) + fixed_attrib.append((name, unicode(stripentities(value)))) self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) if tag in self._EMPTY_ELEMS: diff --git a/genshi/tests/input.py b/genshi/tests/input.py --- a/genshi/tests/input.py +++ b/genshi/tests/input.py @@ -76,6 +76,15 @@ self.assertEqual(Stream.TEXT, kind) self.assertEqual(u'\xa0', data) + def test_html_entity_in_attribute(self): + text = '<p title="&nbsp;"/>' + events = list(XMLParser(StringIO(text))) + kind, data, pos = events[0] + self.assertEqual(Stream.START, kind) + self.assertEqual(u'\xa0', data[1].get('title')) + kind, data, pos = events[1] + self.assertEqual(Stream.END, kind) + def test_undefined_entity_with_dtd(self): text = """ @@ -118,6 +127,22 @@ self.assertEqual(Stream.TEXT, kind) self.assertEqual(u'\u2013', data) + def test_html_entity_in_attribute(self): + text = '<p title="&nbsp;"></p>' + events = list(HTMLParser(StringIO(text))) + kind, data, pos = events[0] + self.assertEqual(Stream.START, kind) + self.assertEqual(u'\xa0', data[1].get('title')) + kind, data, pos = events[1] + self.assertEqual(Stream.END, kind) + + def test_html_entity_in_text(self): + text = '<p>&nbsp;</p>' + events = list(HTMLParser(StringIO(text))) + kind, data, pos = events[1] + self.assertEqual(Stream.TEXT, kind) + self.assertEqual(u'\xa0', data) + def suite(): suite = unittest.TestSuite()