annotate genshi/tests/input.py @ 996:0f4b2e892a48 trunk

Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
author hodgestar
date Sat, 26 Jan 2013 17:26:47 +0000
parents 2bfd8f8d241c
children
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
854
4d9bef447df9 More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
3 # Copyright (C) 2006-2009 Edgewall Software
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 209
diff changeset
8 # are also available at http://genshi.edgewall.org/wiki/License.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 209
diff changeset
12 # history and logs, available at http://genshi.edgewall.org/log/.
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
26
3c1a022be04c * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 1
diff changeset
14 import doctest
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
15 import sys
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
16 import unittest
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
17
378
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
18 from genshi.core import Attrs, Stream
230
84168828b074 Renamed Markup to Genshi in repository.
cmlenz
parents: 209
diff changeset
19 from genshi.input import XMLParser, HTMLParser, ParseError
932
18209925c54e Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
20 from genshi.compat import StringIO, BytesIO
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
21
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
22
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
class XMLParserTestCase(unittest.TestCase):
    # Tests for XMLParser. Every test feeds a small document to the parser
    # through a file-like object and inspects individual events, each of
    # which is unpacked as a (kind, data, pos) triple.
    # NOTE(review): reconstructed from an annotate listing that strips
    # leading whitespace; indentation inside multi-line string literals
    # could not be recovered -- confirm against the repository file.

    def test_text_node_pos_single_line(self):
        # The second event (index 1) must be the text node.
        source = '<elem>foo bar</elem>'
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        kind, payload, position = parsed[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual('foo bar', payload)
        # presumably (filename, line, offset) -- TODO confirm
        self.assertEqual((None, 1, 6), position)

    def test_text_node_pos_multi_line(self):
        # Same as the single-line case, but the text spans a line break.
        source = '<elem>foo\nbar</elem>'
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        kind, payload, position = parsed[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual('foo\nbar', payload)
        # The last component is expected to be -1 for multi-line text.
        self.assertEqual((None, 1, -1), position)

    def test_element_attribute_order(self):
        # Attributes must come back in the order written in the markup.
        source = '<elem title="baz" id="foo" class="bar" />'
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        kind, payload, position = parsed[0]
        self.assertEqual(Stream.START, kind)
        tag, attrib = payload
        self.assertEqual('elem', tag)
        expected_pairs = [('title', 'baz'), ('id', 'foo'), ('class', 'bar')]
        for index, pair in enumerate(expected_pairs):
            self.assertEqual(pair, attrib[index])

    def test_unicode_input(self):
        # A unicode document containing a non-ASCII character (U+2013).
        source = u'<div>\u2013</div>'
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        kind, payload, position = parsed[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\u2013', payload)

    def test_latin1_encoded(self):
        # Byte input; the encoding is passed to the parser explicitly.
        raw = u'<div>\xf6</div>'.encode('iso-8859-1')
        stream = XMLParser(BytesIO(raw), encoding='iso-8859-1')
        parsed = list(stream)
        kind, payload, position = parsed[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xf6', payload)

    def test_latin1_encoded_xmldecl(self):
        # Byte input; the encoding is only named in the XML declaration,
        # no ``encoding`` argument is given to the parser.
        raw = (
            u'<?xml version="1.0" encoding="iso-8859-1" ?>\n'
            u'<div>\xf6</div>\n'
        ).encode('iso-8859-1')
        stream = XMLParser(BytesIO(raw))
        parsed = list(stream)
        # The text node is expected at index 2 here.
        kind, payload, position = parsed[2]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xf6', payload)

    def test_html_entity_with_dtd(self):
        # &nbsp; inside a document that declares the XHTML DOCTYPE.
        source = (
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
            '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
            '<html>&nbsp;</html>\n'
        )
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        kind, payload, position = parsed[2]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xa0', payload)

    def test_html_entity_without_dtd(self):
        # &nbsp; must also resolve to U+00A0 when no DOCTYPE is present.
        source = '<html>&nbsp;</html>'
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        kind, payload, position = parsed[1]
        self.assertEqual(Stream.TEXT, kind)
        self.assertEqual(u'\xa0', payload)

    def test_html_entity_in_attribute(self):
        # The entity must end up inside the attribute value: event 0 is
        # the start tag carrying it, event 1 is directly the end tag.
        source = '<p title="&nbsp;"/>'
        stream = XMLParser(StringIO(source))
        parsed = list(stream)
        first_kind, first_data, _ = parsed[0]
        self.assertEqual(Stream.START, first_kind)
        attrs = first_data[1]
        self.assertEqual(u'\xa0', attrs.get('title'))
        second_kind, _, _ = parsed[1]
        self.assertEqual(Stream.END, second_kind)

    def test_undefined_entity_with_dtd(self):
        # An unknown entity must raise ParseError once the stream is
        # consumed, even with the XHTML DOCTYPE declared.
        source = (
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
            '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
            '<html>&junk;</html>\n'
        )
        stream = XMLParser(StringIO(source))
        # The parser is passed unconsumed; list() does the consuming.
        self.assertRaises(ParseError, list, stream)

    def test_undefined_entity_without_dtd(self):
        # Same expectation as above, without any DOCTYPE.
        source = '<html>&junk;</html>'
        stream = XMLParser(StringIO(source))
        self.assertRaises(ParseError, list, stream)
fc6b2fb66518 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
114
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
115
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
116 class HTMLParserTestCase(unittest.TestCase):
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
117
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
def test_text_node_pos_single_line(self):
    # HTMLParserTestCase method: the second event (index 1) must be the
    # text node.
    source = u'<elem>foo bar</elem>'
    stream = HTMLParser(StringIO(source))
    parsed = list(stream)
    kind, payload, position = parsed[1]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual('foo bar', payload)
    # presumably (filename, line, offset) -- TODO confirm
    self.assertEqual((None, 1, 6), position)
134
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
125
d681d2c3cd8d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
def test_text_node_pos_multi_line(self):
    # HTMLParserTestCase method: text spanning a line break is expected
    # to arrive as one text event at index 1.
    source = u'<elem>foo\nbar</elem>'
    stream = HTMLParser(StringIO(source))
    parsed = list(stream)
    kind, payload, position = parsed[1]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual('foo\nbar', payload)
    # Expected position matches the single-line test: (None, 1, 6).
    self.assertEqual((None, 1, 6), position)
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
134
312
cb7326367f91 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
def test_input_encoding_text(self):
    # HTMLParserTestCase method: ISO-8859-1 bytes plus a matching
    # ``encoding`` argument must yield decoded unicode text.
    raw = u'<div>\xf6</div>'.encode('iso-8859-1')
    stream = HTMLParser(BytesIO(raw), encoding='iso-8859-1')
    parsed = list(stream)
    kind, payload, position = parsed[1]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual(u'\xf6', payload)
8de1ff534d22 * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
141
312
cb7326367f91 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
def test_input_encoding_attribute(self):
    # HTMLParserTestCase method: attribute values in ISO-8859-1 byte
    # input must be decoded as well.
    raw = u'<div title="\xf6"></div>'.encode('iso-8859-1')
    stream = HTMLParser(BytesIO(raw), encoding='iso-8859-1')
    parsed = list(stream)
    kind, payload, position = parsed[0]
    # The start event's data is unpacked as a (tag, attributes) pair.
    tag, attrib = payload
    self.assertEqual(Stream.START, kind)
    self.assertEqual(u'\xf6', attrib.get('title'))
cb7326367f91 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
148
207
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
def test_unicode_input(self):
    # HTMLParserTestCase method: a unicode document containing a
    # non-ASCII character (U+2013).
    source = u'<div>\u2013</div>'
    stream = HTMLParser(StringIO(source))
    parsed = list(stream)
    kind, payload, position = parsed[1]
    self.assertEqual(Stream.TEXT, kind)
    self.assertEqual(u'\u2013', payload)
28bfc6aafab7 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
155
293
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
def test_html_entity_in_attribute(self):
    # HTMLParserTestCase method: the entity must end up inside the
    # attribute value. Event 0 is the start tag carrying it, event 1 is
    # directly the end tag.
    source = u'<p title="&nbsp;"></p>'
    stream = HTMLParser(StringIO(source))
    parsed = list(stream)
    first_kind, first_data, _ = parsed[0]
    self.assertEqual(Stream.START, first_kind)
    attrs = first_data[1]
    self.assertEqual(u'\xa0', attrs.get('title'))
    second_kind, _, _ = parsed[1]
    self.assertEqual(Stream.END, second_kind)
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
164
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
def test_html_entity_in_text(self):
    # An HTML entity reference inside element content must be delivered as
    # a TEXT event carrying the decoded character (U+00A0 for &nbsp;).
    markup = u'<p>&nbsp;</p>'
    stream = list(HTMLParser(StringIO(markup)))
    # The test expects the text node to be the second event of the stream.
    event_kind, payload, _pos = stream[1]
    self.assertEqual(Stream.TEXT, event_kind)
    self.assertEqual(u'\xa0', payload)
e17b7459b515 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
171
376
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
def test_processing_instruction(self):
    # The first event for a PHP-style processing instruction must be a PI
    # event whose data is a (target, data) pair, with the surrounding
    # whitespace and the closing '?' stripped from the data.
    source = StringIO(u'<?php echo "Foobar" ?>')
    stream = list(HTMLParser(source))
    event_kind, (pi_target, pi_data), _pos = stream[0]
    self.assertEqual(Stream.PI, event_kind)
    self.assertEqual('php', pi_target)
    self.assertEqual('echo "Foobar"', pi_data)
376
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
179
996
0f4b2e892a48 Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents: 965
diff changeset
def test_processing_instruction_no_data_1(self):
    # A processing instruction that consists of a target only must still
    # produce a PI event; its data part is expected to be the empty string.
    source = StringIO(u'<?foo ?>')
    stream = list(HTMLParser(source))
    event_kind, (pi_target, pi_data), _pos = stream[0]
    self.assertEqual(Stream.PI, event_kind)
    self.assertEqual('foo', pi_target)
    self.assertEqual('', pi_data)
0f4b2e892a48 Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents: 965
diff changeset
187
0f4b2e892a48 Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents: 965
diff changeset
def test_processing_instruction_no_data_2(self):
    # Processing instructions written without data and without the
    # XML-style closing '?' (``<?experiment>``) must each yield a PI event
    # with an empty data string.
    text = u'<?experiment>...<?/experiment>'
    events = list(HTMLParser(StringIO(text)))

    # Opening instruction: first event of the stream.
    kind, (target, data), pos = events[0]
    self.assertEqual(Stream.PI, kind)
    self.assertEqual('experiment', target)
    self.assertEqual('', data)

    # Closing instruction: third event (events[1] sits between the two
    # instructions and is not inspected here). The event kind is verified
    # as well, so that an event of another kind whose data merely happens
    # to unpack into two items cannot satisfy the test.
    kind, (target, data), pos = events[2]
    self.assertEqual(Stream.PI, kind)
    self.assertEqual('/experiment', target)
    self.assertEqual('', data)
0f4b2e892a48 Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents: 965
diff changeset
198
460
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
def test_xmldecl(self):
    # A declaration that carries only a version: the XML_DECL event is
    # expected to report the encoding as None and standalone as -1.
    document = '<?xml version="1.0" ?><root />'
    stream = list(XMLParser(StringIO(document)))
    event_kind, declaration, _pos = stream[0]
    xml_version, xml_encoding, xml_standalone = declaration
    self.assertEqual(Stream.XML_DECL, event_kind)
    self.assertEqual('1.0', xml_version)
    self.assertEqual(None, xml_encoding)
    self.assertEqual(-1, xml_standalone)
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
207
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
def test_xmldecl_encoding(self):
    # A declaration with an explicit encoding: the encoding name must be
    # passed through in the XML_DECL event, standalone stays at -1.
    document = '<?xml version="1.0" encoding="utf-8" ?><root />'
    stream = list(XMLParser(StringIO(document)))
    event_kind, declaration, _pos = stream[0]
    xml_version, xml_encoding, xml_standalone = declaration
    self.assertEqual(Stream.XML_DECL, event_kind)
    self.assertEqual('1.0', xml_version)
    self.assertEqual('utf-8', xml_encoding)
    self.assertEqual(-1, xml_standalone)
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
216
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
def test_xmldecl_standalone(self):
    # A declaration with standalone="yes": the XML_DECL event is expected
    # to report standalone as 1 while the encoding remains None.
    document = '<?xml version="1.0" standalone="yes" ?><root />'
    stream = list(XMLParser(StringIO(document)))
    event_kind, declaration, _pos = stream[0]
    xml_version, xml_encoding, xml_standalone = declaration
    self.assertEqual(Stream.XML_DECL, event_kind)
    self.assertEqual('1.0', xml_version)
    self.assertEqual(None, xml_encoding)
    self.assertEqual(1, xml_standalone)
75425671b437 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
225
376
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
def test_processing_instruction_trailing_qmark(self):
    # Only the final '?' belongs to the PI terminator; a question mark
    # directly before it is part of the instruction data and must be kept.
    source = StringIO(u'<?php echo "Foobar" ??>')
    stream = list(HTMLParser(source))
    event_kind, (pi_target, pi_data), _pos = stream[0]
    self.assertEqual(Stream.PI, event_kind)
    self.assertEqual('php', pi_target)
    self.assertEqual('echo "Foobar" ?', pi_data)
376
0e0952d85d97 Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
233
378
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
def test_out_of_order_tags1(self):
    # </span> arrives while <b> is still open. The expected stream is
    # properly nested: <b> is closed first, then <span>.
    markup = u'<span><b>Foobar</span></b>'
    stream = list(HTMLParser(StringIO(markup)))
    expected = [
        (Stream.START, ('span', ())),
        (Stream.START, ('b', ())),
        (Stream.TEXT, 'Foobar'),
        (Stream.END, 'b'),
        (Stream.END, 'span'),
    ]
    self.assertEqual(5, len(stream))
    # Compare only (kind, data); the position part of each event is ignored.
    for wanted, event in zip(expected, stream):
        self.assertEqual(wanted, event[:2])
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
243
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
def test_out_of_order_tags2(self):
    # Three nested elements closed in the wrong order, fed to the parser
    # as UTF-8 bytes. The expected stream closes them innermost-first.
    raw = u'<span class="baz"><b><i>Foobar</span></b></i>'.encode('utf-8')
    stream = list(HTMLParser(BytesIO(raw), encoding='utf-8'))
    expected = [
        (Stream.START, ('span', Attrs([('class', 'baz')]))),
        (Stream.START, ('b', ())),
        (Stream.START, ('i', ())),
        (Stream.TEXT, 'Foobar'),
        (Stream.END, 'i'),
        (Stream.END, 'b'),
        (Stream.END, 'span'),
    ]
    self.assertEqual(7, len(stream))
    # Compare only (kind, data); the position part of each event is ignored.
    for wanted, event in zip(expected, stream):
        self.assertEqual(wanted, event[:2])
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
256
383
e17b84835b0f Fix duplicate unit test name reported in #83.
cmlenz
parents: 378
diff changeset
def test_out_of_order_tags3(self):
    # The only closing tag (</i>) matches no open element and both <span>
    # and <b> are never closed in the input. The expected stream still
    # contains END events for <b> and <span>, in that order.
    raw = u'<span><b>Foobar</i>'.encode('utf-8')
    stream = list(HTMLParser(BytesIO(raw), encoding='utf-8'))
    expected = [
        (Stream.START, ('span', ())),
        (Stream.START, ('b', ())),
        (Stream.TEXT, 'Foobar'),
        (Stream.END, 'b'),
        (Stream.END, 'span'),
    ]
    self.assertEqual(5, len(stream))
    # Compare only (kind, data); the position part of each event is ignored.
    for wanted, event in zip(expected, stream):
        self.assertEqual(wanted, event[:2])
873ca2a7ec05 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
266
423
56bbe1d94da0 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
def test_hex_charref(self):
    # A hexadecimal character reference (&#x27;) must be decoded to the
    # character it denotes, here an apostrophe, inside a TEXT event.
    markup = u'<span>&#x27;</span>'
    stream = list(HTMLParser(StringIO(markup)))
    expected = [
        (Stream.START, ('span', ())),
        (Stream.TEXT, "'"),
        (Stream.END, 'span'),
    ]
    self.assertEqual(3, len(stream))
    # Compare only (kind, data); the position part of each event is ignored.
    for wanted, event in zip(expected, stream):
        self.assertEqual(wanted, event[:2])
56bbe1d94da0 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
274
965
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
def test_multibyte_character_on_chunk_boundary(self):
    # 4095 ASCII characters followed by u'\xe6', which takes two bytes in
    # UTF-8: the encoded character therefore straddles the 4096-byte mark.
    # The whole input must come back as one TEXT event with the text intact.
    boundary = 4 * 1024
    expected_text = u'a' * (boundary - 1) + u'\xe6'
    encoded = expected_text.encode('utf-8')
    stream = list(HTMLParser(BytesIO(encoded), encoding='utf-8'))
    self.assertEqual(1, len(stream))
    self.assertEqual((Stream.TEXT, expected_text), stream[0][:2])
2bfd8f8d241c Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents: 932
diff changeset
281
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
282
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def suite():
    """Build the complete test suite for this module.

    The suite combines the doctests of the module that defines
    ``XMLParser`` with all ``test*`` methods of ``XMLParserTestCase`` and
    ``HTMLParserTestCase``.

    :return: a ``unittest.TestSuite`` holding the doctests and both test
             cases
    """
    # ``unittest.makeSuite`` was deprecated in Python 3.11 and removed in
    # 3.13. A fresh TestLoader collects the same methods, because its
    # default method prefix is 'test'.
    loader = unittest.TestLoader()
    tests = unittest.TestSuite()
    tests.addTest(doctest.DocTestSuite(XMLParser.__module__))
    tests.addTest(loader.loadTestsFromTestCase(XMLParserTestCase))
    tests.addTest(loader.loadTestsFromTestCase(HTMLParserTestCase))
    return tests
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
289
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
# When the module is executed directly as a script, run the suite built by
# suite() through the standard unittest command-line runner.
if __name__ == '__main__':
    unittest.main(defaultTest='suite')
Copyright (C) 2012-2017 Edgewall Software