changeset 378:873ca2a7ec05 trunk

Improve handling of incorrectly nested tags in the HTML parser.
author cmlenz
date Thu, 23 Nov 2006 17:41:52 +0000
parents 9aa6aa18fa35
children e1d659c87ddf
files ChangeLog genshi/input.py genshi/tests/input.py
diffstat 3 files changed, 36 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -29,6 +29,7 @@
    subclasses of `tuple` instead of `list`).
  * `MarkupTemplate`s can now be instantiated from markup streams, in addition
    to strings and file-like objects (ticket #69).
+ * Improve handling of incorrectly nested tags in the HTML parser.
 
 
 Version 0.3.5
--- a/genshi/input.py
+++ b/genshi/input.py
@@ -327,10 +327,9 @@
         if tag not in self._EMPTY_ELEMS:
             while self._open_tags:
                 open_tag = self._open_tags.pop()
+                self._enqueue(END, QName(open_tag))
                 if open_tag.lower() == tag.lower():
                     break
-                self._enqueue(END, QName(open_tag))
-            self._enqueue(END, QName(tag))
 
     def handle_data(self, text):
         if not isinstance(text, unicode):
--- a/genshi/tests/input.py
+++ b/genshi/tests/input.py
@@ -16,7 +16,7 @@
 import sys
 import unittest
 
-from genshi.core import Stream
+from genshi.core import Attrs, Stream
 from genshi.input import XMLParser, HTMLParser, ParseError
 
 
@@ -189,6 +189,39 @@
         self.assertEqual(u'php', target)
         self.assertEqual(u'echo "Foobar" ?', data)
 
+    def test_out_of_order_tags1(self):
+        text = '<span><b>Foobar</span></b>'  # </span> arrives before </b>
+        events = list(HTMLParser(StringIO(text)))
+        self.assertEqual(5, len(events))  # the late </b> adds no event
+        self.assertEqual((Stream.START, ('span', ())), events[0][:2])
+        self.assertEqual((Stream.START, ('b', ())), events[1][:2])
+        self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2])
+        self.assertEqual((Stream.END, 'b'), events[3][:2])
+        self.assertEqual((Stream.END, 'span'), events[4][:2])
+
+    def test_out_of_order_tags2(self):
+        text = '<span class="baz"><b><i>Foobar</span></b></i>'
+        events = list(HTMLParser(StringIO(text)))
+        self.assertEqual(7, len(events))  # late </b></i> add no events
+        self.assertEqual((Stream.START, ('span', Attrs([('class', 'baz')]))),
+                         events[0][:2])
+        self.assertEqual((Stream.START, ('b', ())), events[1][:2])
+        self.assertEqual((Stream.START, ('i', ())), events[2][:2])
+        self.assertEqual((Stream.TEXT, 'Foobar'), events[3][:2])
+        self.assertEqual((Stream.END, 'i'), events[4][:2])  # innermost first
+        self.assertEqual((Stream.END, 'b'), events[5][:2])
+        self.assertEqual((Stream.END, 'span'), events[6][:2])
+
+    def test_out_of_order_tags3(self):
+        text = '<span><b>Foobar</i>'  # </i> has no matching open tag
+        events = list(HTMLParser(StringIO(text)))
+        self.assertEqual(5, len(events))  # </i> closes b, then span
+        self.assertEqual((Stream.START, ('span', ())), events[0][:2])
+        self.assertEqual((Stream.START, ('b', ())), events[1][:2])
+        self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2])
+        self.assertEqual((Stream.END, 'b'), events[3][:2])
+        self.assertEqual((Stream.END, 'span'), events[4][:2])
+
 
 def suite():
     suite = unittest.TestSuite()
Copyright (C) 2012-2017 Edgewall Software