annotate genshi/tests/input.py @ 932:e53161c2773c

Merge r1140 from py3k: add support for python 3 to core genshi components (genshi.core, genshi.input and genshi.output): * default input and output encodings changed from UTF-8 to None (i.e. unicode strings) * Namespace and QName objects do not call stringrepr in __repr__ in Python 3 since repr() returns a unicode string there. * track changes to expat parser in Python 3 (mostly it accepts bytes instead of strings)
author hodgestar
date Fri, 18 Mar 2011 09:08:12 +0000
parents 0d9e87c6cf6e
children
rev   line source
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
2 #
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
3 # Copyright (C) 2006-2009 Edgewall Software
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
5 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 209
diff changeset
8 # are also available at http://genshi.edgewall.org/wiki/License.
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
9 #
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 209
diff changeset
12 # history and logs, available at http://genshi.edgewall.org/log/.
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
13
26
039fc5b87405 * Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents: 1
diff changeset
14 import doctest
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
15 import sys
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
16 import unittest
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
17
378
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
18 from genshi.core import Attrs, Stream
230
24757b771651 Renamed Markup to Genshi in repository.
cmlenz
parents: 209
diff changeset
19 from genshi.input import XMLParser, HTMLParser, ParseError
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
20 from genshi.compat import StringIO, BytesIO
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
21
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
22
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
23 class XMLParserTestCase(unittest.TestCase):
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
24
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
25 def test_text_node_pos_single_line(self):
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
26 text = '<elem>foo bar</elem>'
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
27 events = list(XMLParser(StringIO(text)))
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
28 kind, data, pos = events[1]
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
29 self.assertEqual(Stream.TEXT, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
30 self.assertEqual('foo bar', data)
750
d007a0d7ba81 Remove some cruft for supporting Python 2.3.
cmlenz
parents: 460
diff changeset
31 self.assertEqual((None, 1, 6), pos)
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
32
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
33 def test_text_node_pos_multi_line(self):
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
34 text = '''<elem>foo
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
35 bar</elem>'''
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
36 events = list(XMLParser(StringIO(text)))
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
37 kind, data, pos = events[1]
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
38 self.assertEqual(Stream.TEXT, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
39 self.assertEqual('foo\nbar', data)
750
d007a0d7ba81 Remove some cruft for supporting Python 2.3.
cmlenz
parents: 460
diff changeset
40 self.assertEqual((None, 1, -1), pos)
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
41
160
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
42 def test_element_attribute_order(self):
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
43 text = '<elem title="baz" id="foo" class="bar" />'
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
44 events = list(XMLParser(StringIO(text)))
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
45 kind, data, pos = events[0]
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
46 self.assertEqual(Stream.START, kind)
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
47 tag, attrib = data
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
48 self.assertEqual('elem', tag)
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
49 self.assertEqual(('title', 'baz'), attrib[0])
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
50 self.assertEqual(('id', 'foo'), attrib[1])
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
51 self.assertEqual(('class', 'bar'), attrib[2])
160
faea6db52ef1 Attribute order in parsed XML is now preserved.
cmlenz
parents: 141
diff changeset
52
207
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
53 def test_unicode_input(self):
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
54 text = u'<div>\u2013</div>'
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
55 events = list(XMLParser(StringIO(text)))
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
56 kind, data, pos = events[1]
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
57 self.assertEqual(Stream.TEXT, kind)
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
58 self.assertEqual(u'\u2013', data)
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
59
316
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
60 def test_latin1_encoded(self):
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
61 text = u'<div>\xf6</div>'.encode('iso-8859-1')
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
62 events = list(XMLParser(BytesIO(text), encoding='iso-8859-1'))
316
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
63 kind, data, pos = events[1]
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
64 self.assertEqual(Stream.TEXT, kind)
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
65 self.assertEqual(u'\xf6', data)
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
66
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
67 def test_latin1_encoded_xmldecl(self):
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
68 text = u"""<?xml version="1.0" encoding="iso-8859-1" ?>
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
69 <div>\xf6</div>
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
70 """.encode('iso-8859-1')
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
71 events = list(XMLParser(BytesIO(text)))
460
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
72 kind, data, pos = events[2]
316
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
73 self.assertEqual(Stream.TEXT, kind)
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
74 self.assertEqual(u'\xf6', data)
4ab9edf5e83b Configurable encoding of template files, closing #65.
cmlenz
parents: 312
diff changeset
75
209
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
76 def test_html_entity_with_dtd(self):
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
77 text = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
78 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
79 <html>&nbsp;</html>
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
80 """
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
81 events = list(XMLParser(StringIO(text)))
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
82 kind, data, pos = events[2]
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
83 self.assertEqual(Stream.TEXT, kind)
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
84 self.assertEqual(u'\xa0', data)
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
85
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
86 def test_html_entity_without_dtd(self):
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
87 text = '<html>&nbsp;</html>'
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
88 events = list(XMLParser(StringIO(text)))
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
89 kind, data, pos = events[1]
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
90 self.assertEqual(Stream.TEXT, kind)
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
91 self.assertEqual(u'\xa0', data)
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
92
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
93 def test_html_entity_in_attribute(self):
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
94 text = '<p title="&nbsp;"/>'
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
95 events = list(XMLParser(StringIO(text)))
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
96 kind, data, pos = events[0]
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
97 self.assertEqual(Stream.START, kind)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
98 self.assertEqual(u'\xa0', data[1].get('title'))
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
99 kind, data, pos = events[1]
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
100 self.assertEqual(Stream.END, kind)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
101
209
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
102 def test_undefined_entity_with_dtd(self):
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
103 text = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
104 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
105 <html>&junk;</html>
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
106 """
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
107 events = XMLParser(StringIO(text))
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
108 self.assertRaises(ParseError, list, events)
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
109
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
110 def test_undefined_entity_without_dtd(self):
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
111 text = '<html>&junk;</html>'
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
112 events = XMLParser(StringIO(text))
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
113 self.assertRaises(ParseError, list, events)
5b422db07359 * Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents: 207
diff changeset
114
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
115
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
116 class HTMLParserTestCase(unittest.TestCase):
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
117
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
118 def test_text_node_pos_single_line(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
119 text = u'<elem>foo bar</elem>'
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
120 events = list(HTMLParser(StringIO(text)))
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
121 kind, data, pos = events[1]
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
122 self.assertEqual(Stream.TEXT, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
123 self.assertEqual('foo bar', data)
750
d007a0d7ba81 Remove some cruft for supporting Python 2.3.
cmlenz
parents: 460
diff changeset
124 self.assertEqual((None, 1, 6), pos)
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
125
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
126 def test_text_node_pos_multi_line(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
127 text = u'''<elem>foo
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
128 bar</elem>'''
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
129 events = list(HTMLParser(StringIO(text)))
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
130 kind, data, pos = events[1]
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
131 self.assertEqual(Stream.TEXT, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
132 self.assertEqual('foo\nbar', data)
750
d007a0d7ba81 Remove some cruft for supporting Python 2.3.
cmlenz
parents: 460
diff changeset
133 self.assertEqual((None, 1, 6), pos)
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
134
312
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
135 def test_input_encoding_text(self):
311
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
136 text = u'<div>\xf6</div>'.encode('iso-8859-1')
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
137 events = list(HTMLParser(BytesIO(text), encoding='iso-8859-1'))
311
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
138 kind, data, pos = events[1]
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
139 self.assertEqual(Stream.TEXT, kind)
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
140 self.assertEqual(u'\xf6', data)
01e2c48f6dfb * The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents: 293
diff changeset
141
312
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
142 def test_input_encoding_attribute(self):
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
143 text = u'<div title="\xf6"></div>'.encode('iso-8859-1')
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
144 events = list(HTMLParser(BytesIO(text), encoding='iso-8859-1'))
312
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
145 kind, (tag, attrib), pos = events[0]
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
146 self.assertEqual(Stream.START, kind)
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
147 self.assertEqual(u'\xf6', attrib.get('title'))
7e743338a799 Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents: 311
diff changeset
148
207
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
149 def test_unicode_input(self):
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
150 text = u'<div>\u2013</div>'
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
151 events = list(HTMLParser(StringIO(text)))
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
152 kind, data, pos = events[1]
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
153 self.assertEqual(Stream.TEXT, kind)
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
154 self.assertEqual(u'\u2013', data)
0619a27f5e67 The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents: 160
diff changeset
155
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
156 def test_html_entity_in_attribute(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
157 text = u'<p title="&nbsp;"></p>'
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
158 events = list(HTMLParser(StringIO(text)))
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
159 kind, data, pos = events[0]
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
160 self.assertEqual(Stream.START, kind)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
161 self.assertEqual(u'\xa0', data[1].get('title'))
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
162 kind, data, pos = events[1]
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
163 self.assertEqual(Stream.END, kind)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
164
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
165 def test_html_entity_in_text(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
166 text = u'<p>&nbsp;</p>'
293
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
167 events = list(HTMLParser(StringIO(text)))
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
168 kind, data, pos = events[1]
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
169 self.assertEqual(Stream.TEXT, kind)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
170 self.assertEqual(u'\xa0', data)
38adb4aa7df5 Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents: 230
diff changeset
171
376
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
172 def test_processing_instruction(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
173 text = u'<?php echo "Foobar" ?>'
376
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
174 events = list(HTMLParser(StringIO(text)))
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
175 kind, (target, data), pos = events[0]
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
176 self.assertEqual(Stream.PI, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
177 self.assertEqual('php', target)
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
178 self.assertEqual('echo "Foobar"', data)
376
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
179
460
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
180 def test_xmldecl(self):
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
181 text = '<?xml version="1.0" ?><root />'
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
182 events = list(XMLParser(StringIO(text)))
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
183 kind, (version, encoding, standalone), pos = events[0]
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
184 self.assertEqual(Stream.XML_DECL, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
185 self.assertEqual('1.0', version)
460
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
186 self.assertEqual(None, encoding)
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
187 self.assertEqual(-1, standalone)
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
188
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
189 def test_xmldecl_encoding(self):
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
190 text = '<?xml version="1.0" encoding="utf-8" ?><root />'
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
191 events = list(XMLParser(StringIO(text)))
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
192 kind, (version, encoding, standalone), pos = events[0]
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
193 self.assertEqual(Stream.XML_DECL, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
194 self.assertEqual('1.0', version)
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
195 self.assertEqual('utf-8', encoding)
460
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
196 self.assertEqual(-1, standalone)
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
197
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
198 def test_xmldecl_standalone(self):
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
199 text = '<?xml version="1.0" standalone="yes" ?><root />'
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
200 events = list(XMLParser(StringIO(text)))
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
201 kind, (version, encoding, standalone), pos = events[0]
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
202 self.assertEqual(Stream.XML_DECL, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
203 self.assertEqual('1.0', version)
460
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
204 self.assertEqual(None, encoding)
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
205 self.assertEqual(1, standalone)
6b5544bb5a99 Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents: 423
diff changeset
206
376
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
207 def test_processing_instruction_trailing_qmark(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
208 text = u'<?php echo "Foobar" ??>'
376
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
209 events = list(HTMLParser(StringIO(text)))
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
210 kind, (target, data), pos = events[0]
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
211 self.assertEqual(Stream.PI, kind)
854
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
212 self.assertEqual('php', target)
0d9e87c6cf6e More work on reducing the size of the diff produced by 2to3.
cmlenz
parents: 750
diff changeset
213 self.assertEqual('echo "Foobar" ?', data)
376
74b6bf92f0cd Fix parsing of processing instructions in HTML input.
cmlenz
parents: 316
diff changeset
214
378
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
215 def test_out_of_order_tags1(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
216 text = u'<span><b>Foobar</span></b>'
378
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
217 events = list(HTMLParser(StringIO(text)))
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
218 self.assertEqual(5, len(events))
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
219 self.assertEqual((Stream.START, ('span', ())), events[0][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
220 self.assertEqual((Stream.START, ('b', ())), events[1][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
221 self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
222 self.assertEqual((Stream.END, 'b'), events[3][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
223 self.assertEqual((Stream.END, 'span'), events[4][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
224
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
225 def test_out_of_order_tags2(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
226 text = u'<span class="baz"><b><i>Foobar</span></b></i>'.encode('utf-8')
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
227 events = list(HTMLParser(BytesIO(text), encoding='utf-8'))
378
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
228 self.assertEqual(7, len(events))
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
229 self.assertEqual((Stream.START, ('span', Attrs([('class', 'baz')]))),
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
230 events[0][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
231 self.assertEqual((Stream.START, ('b', ())), events[1][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
232 self.assertEqual((Stream.START, ('i', ())), events[2][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
233 self.assertEqual((Stream.TEXT, 'Foobar'), events[3][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
234 self.assertEqual((Stream.END, 'i'), events[4][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
235 self.assertEqual((Stream.END, 'b'), events[5][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
236 self.assertEqual((Stream.END, 'span'), events[6][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
237
383
9f5a34ac1d90 Fix duplicate unit test name reported in #83.
cmlenz
parents: 378
diff changeset
238 def test_out_of_order_tags3(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
239 text = u'<span><b>Foobar</i>'.encode('utf-8')
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
240 events = list(HTMLParser(BytesIO(text), encoding='utf-8'))
378
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
241 self.assertEqual(5, len(events))
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
242 self.assertEqual((Stream.START, ('span', ())), events[0][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
243 self.assertEqual((Stream.START, ('b', ())), events[1][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
244 self.assertEqual((Stream.TEXT, 'Foobar'), events[2][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
245 self.assertEqual((Stream.END, 'b'), events[3][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
246 self.assertEqual((Stream.END, 'span'), events[4][:2])
fff4a81ffc56 Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents: 376
diff changeset
247
423
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
248 def test_hex_charref(self):
932
e53161c2773c Merge r1140 from py3k:
hodgestar
parents: 854
diff changeset
249 text = u'<span>&#x27;</span>'
423
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
250 events = list(HTMLParser(StringIO(text)))
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
251 self.assertEqual(3, len(events))
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
252 self.assertEqual((Stream.START, ('span', ())), events[0][:2])
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
253 self.assertEqual((Stream.TEXT, "'"), events[1][:2])
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
254 self.assertEqual((Stream.END, 'span'), events[2][:2])
7589a0e51001 Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents: 383
diff changeset
255
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
256
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
257 def suite():
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
258 suite = unittest.TestSuite()
141
b3ceaa35fb6b * No escaping of `<script>` or `<style>` tags in HTML output (see #24)
cmlenz
parents: 134
diff changeset
259 suite.addTest(doctest.DocTestSuite(XMLParser.__module__))
134
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
260 suite.addTest(unittest.makeSuite(XMLParserTestCase, 'test'))
df44110ca91d * Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents: 66
diff changeset
261 suite.addTest(unittest.makeSuite(HTMLParserTestCase, 'test'))
1
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
262 return suite
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
263
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
264 if __name__ == '__main__':
821114ec4f69 Initial import.
cmlenz
parents:
diff changeset
265 unittest.main(defaultTest='suite')
Copyright (C) 2012-2017 Edgewall Software