# HG changeset patch
# User cmlenz
# Date 1164303712 0
# Node ID 873ca2a7ec05c7f5f0dc96eb4226128477ac2e48
# Parent 9aa6aa18fa3529fdc2bb0999d25c58bcf264c9d6
Improve handling of incorrectly nested tags in the HTML parser.
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -29,6 +29,7 @@
subclasses of `tuple` instead of `list`).
* `MarkupTemplate`s can now be instantiated from markup streams, in addition
to strings and file-like objects (ticket #69).
+ * Improve handling of incorrectly nested tags in the HTML parser.
Version 0.3.5
diff --git a/genshi/input.py b/genshi/input.py
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -327,10 +327,9 @@
if tag not in self._EMPTY_ELEMS:
while self._open_tags:
open_tag = self._open_tags.pop()
+ self._enqueue(END, QName(open_tag))
if open_tag.lower() == tag.lower():
break
- self._enqueue(END, QName(open_tag))
- self._enqueue(END, QName(tag))
def handle_data(self, text):
if not isinstance(text, unicode):
diff --git a/genshi/tests/input.py b/genshi/tests/input.py
--- a/genshi/tests/input.py
+++ b/genshi/tests/input.py
@@ -16,7 +16,7 @@
import sys
import unittest
-from genshi.core import Stream
+from genshi.core import Attrs, Stream
from genshi.input import XMLParser, HTMLParser, ParseError
@@ -189,6 +189,39 @@
self.assertEqual(u'php', target)
self.assertEqual(u'echo "Foobar" ?', data)
+ def test_out_of_order_tags1(self):
+ text = '<span><b>Foobar</span></b>'
+ events = list(HTMLParser(StringIO(text)))
+ self.assertEqual(5, len(events))
+ self.assertEqual((Stream.START, ('span', ())), events[0][:2])
+ self.assertEqual((Stream.START, ('b', ())), events[1][:2])
+ self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2])
+ self.assertEqual((Stream.END, 'b'), events[3][:2])
+ self.assertEqual((Stream.END, 'span'), events[4][:2])
+
+ def test_out_of_order_tags2(self):
+ text = '<span class="baz"><b><i>Foobar</span></b></i>'
+ events = list(HTMLParser(StringIO(text)))
+ self.assertEqual(7, len(events))
+ self.assertEqual((Stream.START, ('span', Attrs([('class', 'baz')]))),
+ events[0][:2])
+ self.assertEqual((Stream.START, ('b', ())), events[1][:2])
+ self.assertEqual((Stream.START, ('i', ())), events[2][:2])
+ self.assertEqual((Stream.TEXT, 'Foobar'), events[3][:2])
+ self.assertEqual((Stream.END, 'i'), events[4][:2])
+ self.assertEqual((Stream.END, 'b'), events[5][:2])
+ self.assertEqual((Stream.END, 'span'), events[6][:2])
+
+ def test_out_of_order_tags3(self):
+ text = '<span><b>Foobar</i>'
+ events = list(HTMLParser(StringIO(text)))
+ self.assertEqual(5, len(events))
+ self.assertEqual((Stream.START, ('span', ())), events[0][:2])
+ self.assertEqual((Stream.START, ('b', ())), events[1][:2])
+ self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2])
+ self.assertEqual((Stream.END, 'b'), events[3][:2])
+ self.assertEqual((Stream.END, 'span'), events[4][:2])
+
def suite():
suite = unittest.TestSuite()