Mercurial > genshi > genshi-test
annotate genshi/input.py @ 312:7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
author | cmlenz |
---|---|
date | Sun, 22 Oct 2006 16:44:18 +0000 |
parents | 01e2c48f6dfb |
children | 4ab9edf5e83b |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
822089ae65ce
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
14 from itertools import chain |
1 | 15 from xml.parsers import expat |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import HTMLParser as html | |
21 import htmlentitydefs | |
22 from StringIO import StringIO | |
23 | |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
24 from genshi.core import Attrs, QName, Stream, stripentities |
230 | 25 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
26 START_CDATA, END_CDATA, PI, COMMENT |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
27 |
290
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
28 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
29 |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    The element and its descendants are walked depth-first. For every
    element the generator produces a `START` event, a `TEXT` event for the
    element text (if any), the events of all child elements, an `END` event
    and finally a `TEXT` event for the tail text (if any).

    @param element: the ElementTree element to convert
    @return: a generator of `(kind, data, pos)` event tuples; every event
        carries the position `(None, -1, -1)`
    """
    # ElementTree writes namespaced tags as "{uri}local"; only the leading
    # brace is dropped before the name is handed to QName.
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs(element.items())

    yield START, (tag_name, attrs), (None, -1, -1)
    if element.text:
        yield TEXT, element.text, (None, -1, -1)
    # Iterate over the element itself instead of calling getchildren(): that
    # method is deprecated and no longer exists in newer ElementTree
    # releases, whereas direct iteration works with every version.
    for child in element:
        for item in ET(child):
            yield item
    yield END, tag_name, (None, -1, -1)
    if element.tail:
        yield TEXT, element.tail, (None, -1, -1)
1 | 44 |
45 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
class ParseError(Exception):
    """Error signalling a fatal syntax problem in the input being parsed.

    In addition to the message, the exception records the name of the file
    and the line number and offset at which the problem was found.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message; also available as `msg`
        @param filename: the name of the file being parsed
        @param lineno: the number of the line on which the error occurred
        @param offset: the offset of the error on that line
        """
        self.msg = message
        self.filename, self.lineno, self.offset = filename, lineno, offset
        Exception.__init__(self, message)
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
56 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
57 |
1 | 58 class XMLParser(object): |
59 """Generator-based XML parser based on roughly equivalent code in | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
60 Kid/ElementTree. |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
61 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
62 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
63 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
64 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
65 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
66 ... print kind, data |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
67 START (u'root', [(u'id', u'2')]) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
68 START (u'child', []) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
69 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
70 END child |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
71 END root |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
72 """ |
1 | 73 |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
# One DTD entity declaration per named HTML entity, mapping the entity name
# to the numeric character reference of its code point.
_entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
               htmlentitydefs.name2codepoint.items()]
# All declarations joined into a single text; _build_foreign() parses it
# whenever Expat asks for an external entity.
_external_dtd = '\n'.join(_entitydefs)
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
77 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
def __init__(self, source, filename=None):
    """Initialize the parser for the given XML input.

    An Expat parser is created and wired up so that its callbacks end up in
    the `_handle_*` methods of this object, which append events to an
    internal queue.

    @param source: the XML text as a file-like object
    @param filename: the name of the file, if appropriate
    """
    self.source = source
    self.filename = filename

    # Setup the Expat parser
    # ('}' is passed as the namespace separator)
    parser = expat.ParserCreate('utf-8', '}')
    parser.buffer_text = True
    parser.returns_unicode = True
    # Attributes are delivered as a flat list, which _handle_start() pairs
    # up into (name, value) tuples
    parser.ordered_attributes = True

    parser.StartElementHandler = self._handle_start
    parser.EndElementHandler = self._handle_end
    parser.CharacterDataHandler = self._handle_data
    parser.StartDoctypeDeclHandler = self._handle_doctype
    parser.StartNamespaceDeclHandler = self._handle_start_ns
    parser.EndNamespaceDeclHandler = self._handle_end_ns
    parser.StartCdataSectionHandler = self._handle_start_cdata
    parser.EndCdataSectionHandler = self._handle_end_cdata
    parser.ProcessingInstructionHandler = self._handle_pi
    parser.CommentHandler = self._handle_comment

    # Tell Expat that we'll handle non-XML entities ourselves
    # (in _handle_other)
    parser.DefaultHandler = self._handle_other
    parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    parser.UseForeignDTD()
    # External entity references are answered by _build_foreign()
    parser.ExternalEntityRefHandler = self._build_foreign

    # Location reporting is only supported in Python >= 2.4
    if not hasattr(parser, 'CurrentLineNumber'):
        self._getpos = self._getpos_unknown

    self.expat = parser
    # Events produced by the handlers, waiting to be yielded by parse()
    self._queue = []
1 | 117 |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
def parse(self):
    """Parse the XML source and return the resulting stream of events.

    The source is read in chunks of 4K and fed to Expat; the events that
    the handler methods put on the queue in response are yielded as they
    become available. The generator is wrapped in a `Stream` to which the
    `_coalesce` filter (defined elsewhere in this module) is applied.

    @return: the `Stream` of `(kind, data, pos)` events
    @raise ParseError: if Expat reports an error in the input; the error is
        raised while the stream is being consumed
    """
    def _generate():
        try:
            bufsize = 4 * 1024 # 4K
            done = False
            while 1:
                # Keep feeding Expat until at least one event is queued or
                # the source is exhausted
                while not done and len(self._queue) == 0:
                    data = self.source.read(bufsize)
                    if data == '': # end of data
                        if hasattr(self, 'expat'):
                            # Signal the end of the document to Expat
                            self.expat.Parse('', True)
                            del self.expat # get rid of circular references
                        done = True
                    else:
                        # Unicode chunks are encoded as UTF-8 before being
                        # handed to Expat
                        if isinstance(data, unicode):
                            data = data.encode('utf-8')
                        self.expat.Parse(data, False)
                for event in self._queue:
                    yield event
                self._queue = []
                if done:
                    break
        except expat.ExpatError, e:
            # Re-raise as ParseError, adding the file name to the message
            # when one is known
            msg = str(e)
            if self.filename:
                msg += ', in ' + self.filename
            raise ParseError(msg, self.filename, e.lineno, e.offset)
    return Stream(_generate()).filter(_coalesce)
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
146 |
1 | 147 def __iter__(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
148 return iter(self.parse()) |
1 | 149 |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
150 def _build_foreign(self, context, base, sysid, pubid): |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
151 parser = self.expat.ExternalEntityParserCreate(context) |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
152 parser.ParseFile(StringIO(self._external_dtd)) |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
153 return 1 |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
154 |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _enqueue(self, kind, data=None, pos=None):
    """Append an event to the queue of pending events.

    @param kind: the kind of the event
    @param data: the data of the event, if any
    @param pos: the `(filename, lineno, offset)` position of the event; if
        omitted, the current position of the parser is used
    """
    if pos is None:
        pos = self._getpos()
    if kind is TEXT:
        # Expat reports the *end* of the text event as current position. We
        # try to fix that up here as much as possible. Unfortunately, the
        # offset is only valid for single-line text. For multi-line text,
        # it is apparently not possible to determine at what offset it
        # started
        newlines = data.count('\n')
        if newlines:
            # Step back one line per line break contained in the text.
            # Counting the breaks (rather than the items returned by
            # splitlines()) also gives the right line for text that ends
            # with a newline, and is not thrown off by the additional
            # separators splitlines() recognizes in unicode strings.
            lineno = pos[1] - newlines
            offset = -1
        else:
            lineno = pos[1]
            offset = pos[2] - len(data)
        pos = (pos[0], lineno, offset)
    self._queue.append((kind, data, pos))
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
173 |
1 | 174 def _getpos_unknown(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
175 return (self.filename, -1, -1) |
1 | 176 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
177 def _getpos(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
178 return (self.filename, self.expat.CurrentLineNumber, |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
179 self.expat.CurrentColumnNumber) |
1 | 180 |
def _handle_start(self, tag, attrib):
    """Expat callback for an element start tag; queues a `START` event.

    @param tag: the name of the element
    @param attrib: the attributes as a flat list of alternating names and
        values, which is paired up into `(name, value)` tuples here
    """
    names = attrib[0::2]
    values = attrib[1::2]
    self._enqueue(START, (QName(tag), Attrs(zip(names, values))))
1 | 183 |
def _handle_end(self, tag):
    """Expat callback for an element end tag; queues an `END` event.

    @param tag: the name of the element
    """
    name = QName(tag)
    self._enqueue(END, name)
1 | 186 |
def _handle_data(self, text):
    """Expat callback for character data; queues a `TEXT` event.

    @param text: the character data
    """
    event = (TEXT, text)
    self._enqueue(*event)
1 | 189 |
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    """Expat callback for a document type declaration.

    Queues a `DOCTYPE` event whose data is the tuple `(name, pubid, sysid)`;
    `has_internal_subset` is not used.
    """
    doctype = (name, pubid, sysid)
    self._enqueue(DOCTYPE, doctype)
1 | 192 |
def _handle_start_ns(self, prefix, uri):
    """Expat callback for the start of a namespace declaration; queues a
    `START_NS` event with the data `(prefix, uri)`.

    A missing prefix is reported as an empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(START_NS, (prefix, uri))
1 | 195 |
def _handle_end_ns(self, prefix):
    """Expat callback for the end of a namespace declaration; queues an
    `END_NS` event with the prefix as data.

    A missing prefix is reported as an empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(END_NS, prefix)
1 | 198 |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _handle_start_cdata(self):
    """Expat callback for the start of a CDATA section; queues a
    `START_CDATA` event without data.
    """
    marker = START_CDATA
    self._enqueue(marker)
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
201 |
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _handle_end_cdata(self):
    """Queue an `END_CDATA` event; the event carries no data argument."""
    self._enqueue(END_CDATA)
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
204 |
def _handle_pi(self, target, data):
    """Queue a `PI` event with the data `(target, data)`."""
    instruction = (target, data)
    self._enqueue(PI, instruction)
1 | 207 |
def _handle_comment(self, text):
    """Queue the comment text as a `COMMENT` event."""
    self._enqueue(COMMENT, text)
1 | 210 |
def _handle_other(self, text):
    """Resolve an entity reference against the HTML entity table.

    Input that does not start with an ampersand is ignored. Otherwise the
    name between the leading `&` and the last character is looked up in
    `htmlentitydefs.name2codepoint` and the resulting character is queued
    as a `TEXT` event.

    @param text: the raw text, e.g. `&nbsp;`
    @raise expat.error: if the lookup fails with a `KeyError`; the error
        carries the `XML_ERROR_UNDEFINED_ENTITY` code together with the
        current line number and offset
    """
    if not text.startswith('&'):
        return

    try:
        codepoint = htmlentitydefs.name2codepoint[text[1:-1]]
        text = unichr(codepoint)
        self._enqueue(TEXT, text)
    except KeyError:
        # Not a known HTML entity: report it as an expat-style error
        unused_filename, lineno, offset = self._getpos()
        message = 'undefined entity "%s": line %d, column %d' % (
            text, lineno, offset
        )
        error = expat.error(message)
        error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
        error.lineno = lineno
        error.offset = offset
        raise error
1 | 225 |
226 | |
def XML(text):
    """Parse the XML string `text` and return the events as a `Stream`.

    The input is parsed completely before returning; the resulting events
    are held in a list that backs the returned stream.
    """
    events = list(XMLParser(StringIO(text)))
    return Stream(events)
229 | |
230 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
231 class HTMLParser(html.HTMLParser, object): |
1 | 232 """Parser for HTML input based on the Python `HTMLParser` module. |
233 | |
234 This class provides the same interface for generating stream events as | |
235 `XMLParser`, and attempts to automatically balance tags. | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
236 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
237 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
238 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
239 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
240 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
241 ... print kind, data |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
242 START (u'ul', [(u'compact', u'compact')]) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
243 START (u'li', []) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
244 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
245 END li |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
246 END ul |
1 | 247 """ |
248 | |
249 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
250 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
251 'param']) | |
252 | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
253 def __init__(self, source, filename=None, encoding='utf-8'): |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
254 """Initialize the parser for the given HTML input. |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
255 |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
256 @param source: the HTML text as a file-like object |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
257 @param filename: the name of the file, if known |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
258 @param filename: encoding of the file; ignored if the input is unicode |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
259 """ |
1 | 260 html.HTMLParser.__init__(self) |
261 self.source = source | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
262 self.filename = filename |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
263 self.encoding = encoding |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
264 self._queue = [] |
1 | 265 self._open_tags = [] |
266 | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
267 def parse(self): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
268 def _generate(): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
269 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
270 bufsize = 4 * 1024 # 4K |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
271 done = False |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
272 while 1: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
273 while not done and len(self._queue) == 0: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
274 data = self.source.read(bufsize) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
275 if data == '': # end of data |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
276 self.close() |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
277 done = True |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
278 else: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
279 self.feed(data) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
280 for kind, data, pos in self._queue: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
281 yield kind, data, pos |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
282 self._queue = [] |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
283 if done: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
284 open_tags = self._open_tags |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
285 open_tags.reverse() |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
286 for tag in open_tags: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
287 yield END, QName(tag), pos |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
288 break |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
289 except html.HTMLParseError, e: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
290 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
291 if self.filename: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
292 msg += ', in %s' % self.filename |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
293 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 294 return Stream(_generate()).filter(_coalesce) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
295 |
1 | 296 def __iter__(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
297 return iter(self.parse()) |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
298 |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
299 def _enqueue(self, kind, data, pos=None): |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
300 if pos is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
301 pos = self._getpos() |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
302 self._queue.append((kind, data, pos)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
303 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
304 def _getpos(self): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
305 lineno, column = self.getpos() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
306 return (self.filename, lineno, column) |
1 | 307 |
308 def handle_starttag(self, tag, attrib): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
309 fixed_attrib = [] |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
310 for name, value in attrib: # Fixup minimized attributes |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
311 if value is None: |
312
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
312 value = unicode(name) |
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
313 elif not isinstance(value, unicode): |
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
314 value = value.decode(self.encoding, 'replace') |
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
315 fixed_attrib.append((name, stripentities(value))) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
316 |
182
41db0260ebb1
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
317 self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) |
1 | 318 if tag in self._EMPTY_ELEMS: |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
319 self._enqueue(END, QName(tag)) |
1 | 320 else: |
321 self._open_tags.append(tag) | |
322 | |
323 def handle_endtag(self, tag): | |
324 if tag not in self._EMPTY_ELEMS: | |
325 while self._open_tags: | |
326 open_tag = self._open_tags.pop() | |
327 if open_tag.lower() == tag.lower(): | |
328 break | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
329 self._enqueue(END, QName(open_tag)) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
330 self._enqueue(END, QName(tag)) |
1 | 331 |
332 def handle_data(self, text): | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
333 if not isinstance(text, unicode): |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
334 text = text.decode(self.encoding, 'replace') |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
335 self._enqueue(TEXT, text) |
1 | 336 |
337 def handle_charref(self, name): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
338 text = unichr(int(name)) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
339 self._enqueue(TEXT, text) |
1 | 340 |
341 def handle_entityref(self, name): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
342 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
343 text = unichr(htmlentitydefs.name2codepoint[name]) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
344 except KeyError: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
345 text = '&%s;' % name |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
346 self._enqueue(TEXT, text) |
1 | 347 |
348 def handle_pi(self, data): | |
349 target, data = data.split(maxsplit=1) | |
350 data = data.rstrip('?') | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
351 self._enqueue(PI, (target.strip(), data.strip())) |
1 | 352 |
353 def handle_comment(self, text): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
354 self._enqueue(COMMENT, text) |
1 | 355 |
356 | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
def HTML(text, encoding='utf-8'):
    """Parse the HTML string `text` and return the events as a `Stream`.

    The input is parsed completely before returning; the resulting events
    are held in a list that backs the returned stream.

    @param text: the HTML text to parse
    @param encoding: passed on to `HTMLParser` as its `encoding` argument
    """
    parser = HTMLParser(StringIO(text), encoding=encoding)
    return Stream(list(parser))
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
359 |
def _coalesce(stream):
    """Merge every run of consecutive TEXT events into one TEXT event.

    The merged event carries the concatenated text and the first position
    in the run that is not `None`. All other events are passed through
    unchanged.
    """
    pending = []
    start = None
    # The trailing sentinel flushes text buffered at the very end of the
    # stream; it is never yielded itself because its kind is false.
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if start is None:
                start = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), start
            pending = []
            start = None
        if kind:
            yield kind, data, pos
375 yield kind, data, pos |