# HG changeset patch # User cmlenz # Date 1164303712 0 # Node ID fff4a81ffc56d5e81ad41af80217875f8bb7a3ba # Parent 79a7db5177e9fd38bc23c15796ec4bbe00f0be87 Improve handling of incorrectly nested tags in the HTML parser. diff --git a/ChangeLog b/ChangeLog --- a/ChangeLog +++ b/ChangeLog @@ -29,6 +29,7 @@ subclasses of `tuple` instead of `list`). * `MarkupTemplate`s can now be instantiated from markup streams, in addition to strings and file-like objects (ticket #69). + * Improve handling of incorrectly nested tags in the HTML parser. Version 0.3.5 diff --git a/genshi/input.py b/genshi/input.py --- a/genshi/input.py +++ b/genshi/input.py @@ -327,10 +327,9 @@ if tag not in self._EMPTY_ELEMS: while self._open_tags: open_tag = self._open_tags.pop() + self._enqueue(END, QName(open_tag)) if open_tag.lower() == tag.lower(): break - self._enqueue(END, QName(open_tag)) - self._enqueue(END, QName(tag)) def handle_data(self, text): if not isinstance(text, unicode): diff --git a/genshi/tests/input.py b/genshi/tests/input.py --- a/genshi/tests/input.py +++ b/genshi/tests/input.py @@ -16,7 +16,7 @@ import sys import unittest -from genshi.core import Stream +from genshi.core import Attrs, Stream from genshi.input import XMLParser, HTMLParser, ParseError @@ -189,6 +189,39 @@ self.assertEqual(u'php', target) self.assertEqual(u'echo "Foobar" ?', data) + def test_out_of_order_tags1(self): + text = '<span><b>Foobar</span></b>' + events = list(HTMLParser(StringIO(text))) + self.assertEqual(5, len(events)) + self.assertEqual((Stream.START, ('span', ())), events[0][:2]) + self.assertEqual((Stream.START, ('b', ())), events[1][:2]) + self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2]) + self.assertEqual((Stream.END, 'b'), events[3][:2]) + self.assertEqual((Stream.END, 'span'), events[4][:2]) + + def test_out_of_order_tags2(self): + text = '<span class="baz"><b><i>Foobar</span></b></i>' + events = list(HTMLParser(StringIO(text))) + self.assertEqual(7, len(events)) + self.assertEqual((Stream.START, ('span', Attrs([('class', 'baz')]))), + 
events[0][:2]) + self.assertEqual((Stream.START, ('b', ())), events[1][:2]) + self.assertEqual((Stream.START, ('i', ())), events[2][:2]) + self.assertEqual((Stream.TEXT, 'Foobar'), events[3][:2]) + self.assertEqual((Stream.END, 'i'), events[4][:2]) + self.assertEqual((Stream.END, 'b'), events[5][:2]) + self.assertEqual((Stream.END, 'span'), events[6][:2]) + + def test_out_of_order_tags3(self): + text = '<span><b>Foobar</i>' + events = list(HTMLParser(StringIO(text))) + self.assertEqual(5, len(events)) + self.assertEqual((Stream.START, ('span', ())), events[0][:2]) + self.assertEqual((Stream.START, ('b', ())), events[1][:2]) + self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2]) + self.assertEqual((Stream.END, 'b'), events[3][:2]) + self.assertEqual((Stream.END, 'span'), events[4][:2]) + def suite(): suite = unittest.TestSuite()