Mercurial > genshi > mirror
annotate genshi/input.py @ 1034:e02843c0fecc trunk
Add missing boolean attributes to XHTML and HTML serializers (fixes #570).
author | hodgestar |
---|---|
date | Wed, 19 Mar 2014 14:22:22 +0000 |
parents | 0f4b2e892a48 |
children |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
854
4d9bef447df9
More work on reducing the size of the diff produced by 2to3.
cmlenz
parents:
853
diff
changeset
|
3 # Copyright (C) 2006-2009 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
425
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
14 """Support for constructing markup streams from files, strings, or other |
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
15 sources. |
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
16 """ |
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
17 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
18 from itertools import chain |
965
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
19 import codecs |
859 | 20 import htmlentitydefs as entities |
21 import HTMLParser as html | |
1 | 22 from xml.parsers import expat |
23 | |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
24 from genshi.core import Attrs, QName, Stream, stripentities |
859 | 25 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, \ |
26 END_NS, START_CDATA, END_CDATA, PI, COMMENT | |
932 | 27 from genshi.compat import StringIO, BytesIO |
28 | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
29 |
290
94f9f2cc66c8
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
30 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
425
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
31 __docformat__ = 'restructuredtext en' |
290
94f9f2cc66c8
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
32 |
859 | 33 |
290
94f9f2cc66c8
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    The element is walked depth-first: a START event, the element's leading
    text (if any), the events of every child in document order, an END
    event, and finally the element's tail text (if any). No source location
    is available for ElementTree input, so every event carries the position
    ``(None, -1, -1)``.

    :param element: an ElementTree element
    :return: a markup stream
    """
    # ElementTree spells qualified names as "{uri}local"; dropping the
    # leading brace leaves "uri}local", which is what gets passed to QName.
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs([(QName(attr.lstrip('{')), value)
                   for attr, value in element.items()])

    yield START, (tag_name, attrs), (None, -1, -1)
    if element.text:
        yield TEXT, element.text, (None, -1, -1)
    # Iterate the element itself rather than calling getchildren(): that
    # method is deprecated and no longer exists as of Python 3.9, while
    # direct iteration yields the same children on every version.
    for child in element:
        for item in ET(child):
            yield item
    yield END, tag_name, (None, -1, -1)
    if element.tail:
        yield TEXT, element.tail, (None, -1, -1)
1 | 53 |
54 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
class ParseError(Exception):
    """Raised when the input being parsed contains a fatal syntax error."""

    def __init__(self, message, filename=None, lineno=-1, offset=-1):
        """Exception initializer.

        :param message: the error message from the parser
        :param filename: the path to the file that was parsed
        :param lineno: the number of the line on which the error was
                       encountered
        :param offset: the column number where the error was encountered
        """
        # `msg` keeps the bare parser message; only the text handed to the
        # base class gets the file name appended.
        self.msg = message
        full_message = message
        if filename:
            full_message = message + ', in ' + filename
        Exception.__init__(self, full_message)

        self.lineno = lineno
        self.offset = offset
        # Input without a file name is reported as coming from '<string>'.
        self.filename = filename or '<string>'
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
75 |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
76 |
1 | 77 class XMLParser(object): |
78 """Generator-based XML parser based on roughly equivalent code in | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
79 Kid/ElementTree. |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
80 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
81 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
82 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
83 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
84 >>> for kind, data, pos in parser: |
853
f33ecf3c319e
Convert a bunch of print statements to py3k compatible syntax.
cmlenz
parents:
852
diff
changeset
|
85 ... print('%s %s' % (kind, data)) |
857
129e54866a98
Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents:
856
diff
changeset
|
86 START (QName('root'), Attrs([(QName('id'), u'2')])) |
129e54866a98
Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents:
856
diff
changeset
|
87 START (QName('child'), Attrs()) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
88 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
89 END child |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
90 END root |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
91 """ |
1 | 92 |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
93 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in |
856 | 94 entities.name2codepoint.items()] |
932 | 95 _external_dtd = u'\n'.join(_entitydefs).encode('utf-8') |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
96 |
316
a946edefac40
Configurable encoding of template files, closing #65.
cmlenz
parents:
312
diff
changeset
|
def __init__(self, source, filename=None, encoding=None):
    """Initialize the parser for the given XML input.

    :param source: the XML text as a file-like object
    :param filename: the name of the file, if appropriate
    :param encoding: the encoding of the file; if not specified, the
                     encoding is assumed to be ASCII, UTF-8, or UTF-16, or
                     whatever the encoding specified in the XML declaration
                     (if any)
    """
    self.source = source
    self.filename = filename

    # Set up the Expat parser; '}' is handed to Expat as the namespace
    # separator.
    xp = expat.ParserCreate(encoding, '}')
    xp.buffer_text = True
    # Python 3 does not have returns_unicode
    if hasattr(xp, 'returns_unicode'):
        xp.returns_unicode = True
    xp.ordered_attributes = True

    # Expat callback attribute -> bound method that handles it.
    callbacks = (
        ('StartElementHandler', self._handle_start),
        ('EndElementHandler', self._handle_end),
        ('CharacterDataHandler', self._handle_data),
        ('StartDoctypeDeclHandler', self._handle_doctype),
        ('StartNamespaceDeclHandler', self._handle_start_ns),
        ('EndNamespaceDeclHandler', self._handle_end_ns),
        ('StartCdataSectionHandler', self._handle_start_cdata),
        ('EndCdataSectionHandler', self._handle_end_cdata),
        ('ProcessingInstructionHandler', self._handle_pi),
        ('XmlDeclHandler', self._handle_xml_decl),
        ('CommentHandler', self._handle_comment),
        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        ('DefaultHandler', self._handle_other),
    )
    for attr_name, callback in callbacks:
        setattr(xp, attr_name, callback)

    # Have Expat request a foreign DTD, which _build_foreign supplies.
    xp.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    xp.UseForeignDTD()
    xp.ExternalEntityRefHandler = self._build_foreign

    self.expat = xp
    # Events produced by the callbacks and not yet handed to the consumer.
    self._queue = []
1 | 139 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
140 def parse(self): |
433 | 141 """Generator that parses the XML source, yielding markup events. |
142 | |
143 :return: a markup event stream | |
144 :raises ParseError: if the XML text is not well formed | |
145 """ | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
146 def _generate(): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
147 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
148 bufsize = 4 * 1024 # 4K |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
149 done = False |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
150 while 1: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
151 while not done and len(self._queue) == 0: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
152 data = self.source.read(bufsize) |
932 | 153 if not data: # end of data |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
154 if hasattr(self, 'expat'): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
155 self.expat.Parse('', True) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
156 del self.expat # get rid of circular references |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
157 done = True |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
158 else: |
207
28bfc6aafab7
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
159 if isinstance(data, unicode): |
28bfc6aafab7
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
160 data = data.encode('utf-8') |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
161 self.expat.Parse(data, False) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
162 for event in self._queue: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
163 yield event |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
164 self._queue = [] |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
165 if done: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
166 break |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
167 except expat.ExpatError, e: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
168 msg = str(e) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
169 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 170 return Stream(_generate()).filter(_coalesce) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
171 |
1 | 172 def __iter__(self): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
173 return iter(self.parse()) |
1 | 174 |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
175 def _build_foreign(self, context, base, sysid, pubid): |
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
176 parser = self.expat.ExternalEntityParserCreate(context) |
932 | 177 parser.ParseFile(BytesIO(self._external_dtd)) |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
178 return 1 |
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
179 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _enqueue(self, kind, data=None, pos=None):
    """Append an event to the queue of pending events.

    :param kind: the event kind
    :param data: the event data, if any
    :param pos: the ``(filename, lineno, offset)`` position of the event;
                when omitted, the parser's current position is used
    """
    if pos is None:
        pos = self._getpos()
    if kind is TEXT:
        # Expat reports the *end* of the text event as current position, so
        # step back to where the text began. The column can only be
        # recovered for single-line text; for multi-line text it is
        # reported as -1 because the starting offset cannot be determined.
        # NOTE(review): splitlines() yields no trailing empty item, so text
        # that ends in a line break is counted as one line shorter than the
        # number of breaks it contains -- confirm whether the reported line
        # is one too high in that case.
        if '\n' in data:
            start_line = pos[1] - len(data.splitlines()) + 1
            start_col = -1
        else:
            start_line = pos[1]
            start_col = pos[2] - len(data)
        pos = (pos[0], start_line, start_col)
    self._queue.append((kind, data, pos))
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
198 |
1 | 199 def _getpos_unknown(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
200 return (self.filename, -1, -1) |
1 | 201 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
202 def _getpos(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
203 return (self.filename, self.expat.CurrentLineNumber, |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
204 self.expat.CurrentColumnNumber) |
1 | 205 |
def _handle_start(self, tag, attrib):
    """Expat start-element callback: queue a START event.

    :param tag: the element name
    :param attrib: the attributes as a flat sequence in which each name is
                   immediately followed by its value
    """
    # Drawing twice from one shared iterator pairs up consecutive items:
    # (name, value), (name, value), ...
    flat = iter(attrib)
    attrs = Attrs([(QName(name), value) for name, value in zip(flat, flat)])
    self._enqueue(START, (QName(tag), attrs))
1 | 210 |
def _handle_end(self, tag):
    """Expat end-element callback: queue an END event for ``tag``."""
    qname = QName(tag)
    self._enqueue(END, qname)
1 | 213 |
def _handle_data(self, text):
    """Expat character-data callback: queue ``text`` as a TEXT event."""
    event_data = text
    self._enqueue(TEXT, event_data)
1 | 216 |
460
75425671b437
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
def _handle_xml_decl(self, version, encoding, standalone):
    """Expat XML-declaration callback: queue an XML_DECL event whose data
    is the tuple ``(version, encoding, standalone)``.
    """
    declaration = (version, encoding, standalone)
    self._enqueue(XML_DECL, declaration)
75425671b437
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
219 |
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    """Expat doctype-declaration callback: queue a DOCTYPE event.

    The event data is ``(name, pubid, sysid)`` -- the public identifier
    comes before the system identifier, the reverse of the order in which
    this callback receives them. ``has_internal_subset`` is not used.
    """
    doctype = (name, pubid, sysid)
    self._enqueue(DOCTYPE, doctype)
1 | 222 |
def _handle_start_ns(self, prefix, uri):
    """Queue a ``START_NS`` event for a namespace declaration.

    A missing (falsy) prefix is reported as the empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(START_NS, (prefix, uri))
1 | 225 |
def _handle_end_ns(self, prefix):
    """Queue an ``END_NS`` event for a namespace going out of scope.

    A missing (falsy) prefix is reported as the empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(END_NS, prefix)
1 | 228 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _handle_start_cdata(self):
    """Queue a ``START_CDATA`` event marking the start of a CDATA section."""
    kind = START_CDATA
    self._enqueue(kind)
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
231 |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _handle_end_cdata(self):
    """Queue an ``END_CDATA`` event marking the end of a CDATA section."""
    kind = END_CDATA
    self._enqueue(kind)
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
234 |
def _handle_pi(self, target, data):
    """Queue a ``PI`` event whose data is the tuple ``(target, data)``."""
    instruction = (target, data)
    self._enqueue(PI, instruction)
1 | 237 |
def _handle_comment(self, text):
    """Queue a ``COMMENT`` event carrying the comment text, unmodified."""
    kind = COMMENT
    self._enqueue(kind, text)
1 | 240 |
def _handle_other(self, text):
    """Handle input that no other handler consumed.

    Only text starting with ``&`` is processed; anything else is ignored.
    Such text is treated as an entity reference: the name between the
    leading ``&`` and the final character is looked up in
    ``entities.name2codepoint`` and, if found, queued as a ``TEXT`` event
    containing the corresponding character.

    :raises expat.error: if the lookup fails with a `KeyError`; the error
                         has its ``code``, ``lineno`` and ``offset``
                         attributes set
    """
    if not text.startswith('&'):
        return
    # deal with undefined entities
    try:
        text = unichr(entities.name2codepoint[text[1:-1]])
        self._enqueue(TEXT, text)
    except KeyError:
        _, line, column = self._getpos()
        message = ('undefined entity "%s": line %d, column %d'
                   % (text, line, column))
        err = expat.error(message)
        err.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
        err.lineno = line
        err.offset = column
        raise err
1 | 255 |
256 | |
def XML(text):
    """Parse the given XML source and return a markup stream.

    The events are collected into a list up front, so unlike a stream
    obtained from `XMLParser` directly, the result can be iterated over
    multiple times:

    >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>')
    >>> print(xml)
    <doc><elem>Foo</elem><elem>Bar</elem></doc>
    >>> print(xml.select('elem'))
    <elem>Foo</elem><elem>Bar</elem>
    >>> print(xml.select('elem/text()'))
    FooBar

    :param text: the XML source
    :return: the parsed XML event stream
    :raises ParseError: if the XML text is not well-formed
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
1 | 276 |
277 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
278 class HTMLParser(html.HTMLParser, object): |
1 | 279 """Parser for HTML input based on the Python `HTMLParser` module. |
280 | |
281 This class provides the same interface for generating stream events as | |
282 `XMLParser`, and attempts to automatically balance tags. | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
283 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
284 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
285 |
932 | 286 >>> parser = HTMLParser(BytesIO(u'<UL compact><LI>Foo</UL>'.encode('utf-8')), encoding='utf-8') |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
287 >>> for kind, data, pos in parser: |
853
f33ecf3c319e
Convert a bunch of print statements to py3k compatible syntax.
cmlenz
parents:
852
diff
changeset
|
288 ... print('%s %s' % (kind, data)) |
857
129e54866a98
Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents:
856
diff
changeset
|
289 START (QName('ul'), Attrs([(QName('compact'), u'compact')])) |
129e54866a98
Avoid unicode literals in `repr`s of `QName` and `Namespace` when not necessary.
cmlenz
parents:
856
diff
changeset
|
290 START (QName('li'), Attrs()) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
291 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
292 END li |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
293 END ul |
1 | 294 """ |
295 | |
296 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
297 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
298 'param']) | |
299 | |
932 | 300 def __init__(self, source, filename=None, encoding=None): |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
301 """Initialize the parser for the given HTML input. |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
302 |
425
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
303 :param source: the HTML text as a file-like object |
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
304 :param filename: the name of the file, if known |
073640758a42
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
305 :param filename: encoding of the file; ignored if the input is unicode |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
306 """ |
1 | 307 html.HTMLParser.__init__(self) |
308 self.source = source | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
309 self.filename = filename |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
310 self.encoding = encoding |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
311 self._queue = [] |
1 | 312 self._open_tags = [] |
313 | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
314 def parse(self): |
433 | 315 """Generator that parses the HTML source, yielding markup events. |
316 | |
317 :return: a markup event stream | |
318 :raises ParseError: if the HTML text is not well formed | |
319 """ | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
320 def _generate(): |
965
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
321 if self.encoding: |
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
322 reader = codecs.getreader(self.encoding) |
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
323 source = reader(self.source) |
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
324 else: |
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
325 source = self.source |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
326 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
327 bufsize = 4 * 1024 # 4K |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
328 done = False |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
329 while 1: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
330 while not done and len(self._queue) == 0: |
965
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
331 data = source.read(bufsize) |
932 | 332 if not data: # end of data |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
333 self.close() |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
334 done = True |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
335 else: |
932 | 336 if not isinstance(data, unicode): |
965
2bfd8f8d241c
Fix parsing of multi-byte characters that occur on 4K boundaries of HTML files (fixes #538).
hodgestar
parents:
932
diff
changeset
|
337 raise UnicodeError("source returned bytes, but no encoding specified") |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
338 self.feed(data) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
339 for kind, data, pos in self._queue: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
340 yield kind, data, pos |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
341 self._queue = [] |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
342 if done: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
343 open_tags = self._open_tags |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
344 open_tags.reverse() |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
345 for tag in open_tags: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
346 yield END, QName(tag), pos |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
347 break |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
348 except html.HTMLParseError, e: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
349 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
350 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 351 return Stream(_generate()).filter(_coalesce) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
352 |
1 | 353 def __iter__(self): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
354 return iter(self.parse()) |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
355 |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
356 def _enqueue(self, kind, data, pos=None): |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
357 if pos is None: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
358 pos = self._getpos() |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
359 self._queue.append((kind, data, pos)) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
360 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
361 def _getpos(self): |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
362 lineno, column = self.getpos() |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
363 return (self.filename, lineno, column) |
1 | 364 |
365 def handle_starttag(self, tag, attrib): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
366 fixed_attrib = [] |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
367 for name, value in attrib: # Fixup minimized attributes |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
368 if value is None: |
989 | 369 value = name |
403
228907abb726
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
378
diff
changeset
|
370 fixed_attrib.append((QName(name), stripentities(value))) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
371 |
182
2f30ce3fb85e
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
372 self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) |
1 | 373 if tag in self._EMPTY_ELEMS: |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
374 self._enqueue(END, QName(tag)) |
1 | 375 else: |
376 self._open_tags.append(tag) | |
377 | |
378 def handle_endtag(self, tag): | |
379 if tag not in self._EMPTY_ELEMS: | |
380 while self._open_tags: | |
381 open_tag = self._open_tags.pop() | |
378
873ca2a7ec05
Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents:
376
diff
changeset
|
382 self._enqueue(END, QName(open_tag)) |
1 | 383 if open_tag.lower() == tag.lower(): |
384 break | |
385 | |
386 def handle_data(self, text): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
387 self._enqueue(TEXT, text) |
1 | 388 |
389 def handle_charref(self, name): | |
423
56bbe1d94da0
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
390 if name.lower().startswith('x'): |
56bbe1d94da0
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
391 text = unichr(int(name[1:], 16)) |
56bbe1d94da0
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
392 else: |
56bbe1d94da0
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
393 text = unichr(int(name)) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
394 self._enqueue(TEXT, text) |
1 | 395 |
396 def handle_entityref(self, name): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
397 try: |
856 | 398 text = unichr(entities.name2codepoint[name]) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
399 except KeyError: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
400 text = '&%s;' % name |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
401 self._enqueue(TEXT, text) |
1 | 402 |
403 def handle_pi(self, data): | |
376
0e0952d85d97
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
404 if data.endswith('?'): |
0e0952d85d97
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
405 data = data[:-1] |
996
0f4b2e892a48
Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents:
989
diff
changeset
|
406 try: |
0f4b2e892a48
Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents:
989
diff
changeset
|
407 target, data = data.split(None, 1) |
0f4b2e892a48
Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents:
989
diff
changeset
|
408 except ValueError: |
0f4b2e892a48
Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents:
989
diff
changeset
|
409 # PI with no data |
0f4b2e892a48
Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents:
989
diff
changeset
|
410 target = data |
0f4b2e892a48
Fix handling of processing instructions that don't have data attached (patch from Neil Muller, fixes #368).
hodgestar
parents:
989
diff
changeset
|
411 data = '' |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
412 self._enqueue(PI, (target.strip(), data.strip())) |
1 | 413 |
414 def handle_comment(self, text): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
415 self._enqueue(COMMENT, text) |
1 | 416 |
417 | |
def HTML(text, encoding=None):
    """Parse the given HTML source and return a markup stream.

    The events are collected into a list up front, so unlike a stream
    obtained from `HTMLParser` directly, the result can be iterated over
    multiple times:

    >>> html = HTML('<body><h1>Foo</h1></body>', encoding='utf-8')
    >>> print(html)
    <body><h1>Foo</h1></body>
    >>> print(html.select('h1'))
    <h1>Foo</h1>
    >>> print(html.select('h1/text()'))
    Foo

    :param text: the HTML source
    :param encoding: the encoding passed on to `HTMLParser` when `text` is
                     not unicode; ignored if `text` is unicode
    :return: the parsed XML event stream
    :raises ParseError: if the HTML text is not well-formed, and error recovery
                        fails
    """
    if isinstance(text, unicode):
        # If it's unicode text the encoding should be set to None.
        # The option to pass in an incorrect encoding is for ease
        # of writing doctests that work in both Python 2.x and 3.x.
        source, encoding = StringIO(text), None
    else:
        source = BytesIO(text)
    parser = HTMLParser(source, encoding=encoding)
    return Stream(list(parser))
859 | 443 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
444 |
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    A run of consecutive ``TEXT`` events is replaced by one ``TEXT`` event
    whose data is the concatenation of the run and whose position is the
    first non-``None`` position seen in the run.  Other events are passed
    through, except that events with a falsy kind are dropped.
    """
    pending = []
    start = None
    for kind, data, pos in stream:
        if kind is TEXT:
            if start is None:
                start = pos
            pending.append(data)
            continue
        if pending:
            # A non-text event ends the current run: flush it first.
            yield TEXT, ''.join(pending), start
            pending = []
            start = None
        if kind:
            yield kind, data, pos
    if pending:
        # Flush a run of text that extends to the end of the stream.
        yield TEXT, ''.join(pending), start