Mercurial > genshi > genshi-test
annotate genshi/input.py @ 621:d218020fb92a
Use system default date/time format in templates.
author | cmlenz |
---|---|
date | Thu, 30 Aug 2007 19:09:28 +0000 |
parents | 6b5544bb5a99 |
children | d007a0d7ba81 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
408 | 3 # Copyright (C) 2006-2007 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
425
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
14 """Support for constructing markup streams from files, strings, or other |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
15 sources. |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
16 """ |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
17 |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
18 from itertools import chain |
1 | 19 from xml.parsers import expat |
20 try: | |
21 frozenset | |
22 except NameError: | |
23 from sets import ImmutableSet as frozenset | |
24 import HTMLParser as html | |
25 import htmlentitydefs | |
26 from StringIO import StringIO | |
27 | |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
28 from genshi.core import Attrs, QName, Stream, stripentities |
460
6b5544bb5a99
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
29 from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, START_NS, END_NS, \ |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
30 START_CDATA, END_CDATA, PI, COMMENT |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
31 |
290
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
32 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
425
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
33 __docformat__ = 'restructuredtext en' |
290
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
34 |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
35 def ET(element): |
433 | 36 """Convert a given ElementTree element to a markup stream. |
37 | |
38 :param element: an ElementTree element | |
39 :return: a markup stream | |
40 """ | |
290
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
41 tag_name = QName(element.tag.lstrip('{')) |
458
160f787cc818
The `ET()` function now correctly handles attributes with a namespace.
cmlenz
parents:
434
diff
changeset
|
42 attrs = Attrs([(QName(attr.lstrip('{')), value) |
160f787cc818
The `ET()` function now correctly handles attributes with a namespace.
cmlenz
parents:
434
diff
changeset
|
43 for attr, value in element.items()]) |
290
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
44 |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
45 yield START, (tag_name, attrs), (None, -1, -1) |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
46 if element.text: |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
47 yield TEXT, element.text, (None, -1, -1) |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
48 for child in element.getchildren(): |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
49 for item in ET(child): |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
50 yield item |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
51 yield END, tag_name, (None, -1, -1) |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
52 if element.tail: |
a6738047c85e
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
53 yield TEXT, element.tail, (None, -1, -1) |
1 | 54 |
55 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
56 class ParseError(Exception): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
57 """Exception raised when fatal syntax errors are found in the input being |
433 | 58 parsed. |
59 """ | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
60 |
422
95089b6e37ca
More work to include absolute file paths in exceptions.
cmlenz
parents:
419
diff
changeset
|
61 def __init__(self, message, filename=None, lineno=-1, offset=-1): |
433 | 62 """Exception initializer. |
63 | |
64 :param message: the error message from the parser | |
65 :param filename: the path to the file that was parsed | |
66 :param lineno: the number of the line on which the error was encountered | |
67 :param offset: the column number where the error was encountered | |
68 """ | |
422
95089b6e37ca
More work to include absolute file paths in exceptions.
cmlenz
parents:
419
diff
changeset
|
69 self.msg = message |
95089b6e37ca
More work to include absolute file paths in exceptions.
cmlenz
parents:
419
diff
changeset
|
70 if filename: |
434
e065d7906b68
* Better method to propagate the full path to the template file on parse errors. Supersedes r513.
cmlenz
parents:
433
diff
changeset
|
71 message += ', in ' + filename |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
72 Exception.__init__(self, message) |
422
95089b6e37ca
More work to include absolute file paths in exceptions.
cmlenz
parents:
419
diff
changeset
|
73 self.filename = filename or '<string>' |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
74 self.lineno = lineno |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
75 self.offset = offset |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
76 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
77 |
1 | 78 class XMLParser(object): |
79 """Generator-based XML parser based on roughly equivalent code in | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
80 Kid/ElementTree. |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
81 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
82 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
83 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
84 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
85 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
86 ... print kind, data |
326
08ada6b4b767
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
87 START (QName(u'root'), Attrs([(QName(u'id'), u'2')])) |
08ada6b4b767
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
88 START (QName(u'child'), Attrs()) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
89 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
90 END child |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
91 END root |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
92 """ |
1 | 93 |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
94 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
95 htmlentitydefs.name2codepoint.items()] |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
96 _external_dtd = '\n'.join(_entitydefs) |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
97 |
316
4ab9edf5e83b
Configurable encoding of template files, closing #65.
cmlenz
parents:
312
diff
changeset
|
98 def __init__(self, source, filename=None, encoding=None): |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
99 """Initialize the parser for the given XML input. |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
100 |
425
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
101 :param source: the XML text as a file-like object |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
102 :param filename: the name of the file, if appropriate |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
103 :param encoding: the encoding of the file; if not specified, the |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
104 encoding is assumed to be ASCII, UTF-8, or UTF-16, or |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
105 whatever the encoding specified in the XML declaration |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
106 (if any) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
107 """ |
1 | 108 self.source = source |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
109 self.filename = filename |
1 | 110 |
111 # Set up the Expat parser | |
316
4ab9edf5e83b
Configurable encoding of template files, closing #65.
cmlenz
parents:
312
diff
changeset
|
112 parser = expat.ParserCreate(encoding, '}') |
1 | 113 parser.buffer_text = True |
114 parser.returns_unicode = True | |
160 | 115 parser.ordered_attributes = True |
116 | |
1 | 117 parser.StartElementHandler = self._handle_start |
118 parser.EndElementHandler = self._handle_end | |
119 parser.CharacterDataHandler = self._handle_data | |
120 parser.StartDoctypeDeclHandler = self._handle_doctype | |
121 parser.StartNamespaceDeclHandler = self._handle_start_ns | |
122 parser.EndNamespaceDeclHandler = self._handle_end_ns | |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
123 parser.StartCdataSectionHandler = self._handle_start_cdata |
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
124 parser.EndCdataSectionHandler = self._handle_end_cdata |
1 | 125 parser.ProcessingInstructionHandler = self._handle_pi |
460
6b5544bb5a99
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
126 parser.XmlDeclHandler = self._handle_xml_decl |
1 | 127 parser.CommentHandler = self._handle_comment |
209
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
128 |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
129 # Tell Expat that we'll handle non-XML entities ourselves |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
130 # (in _handle_other) |
1 | 131 parser.DefaultHandler = self._handle_other |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
132 parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS) |
209
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
133 parser.UseForeignDTD() |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
134 parser.ExternalEntityRefHandler = self._build_foreign |
1 | 135 |
136 # Location reporting is only supported in Python >= 2.4 | |
137 if not hasattr(parser, 'CurrentLineNumber'): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
138 self._getpos = self._getpos_unknown |
1 | 139 |
140 self.expat = parser | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
141 self._queue = [] |
1 | 142 |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
143 def parse(self): |
433 | 144 """Generator that parses the XML source, yielding markup events. |
145 | |
146 :return: a markup event stream | |
147 :raises ParseError: if the XML text is not well formed | |
148 """ | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
149 def _generate(): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
150 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
151 bufsize = 4 * 1024 # 4K |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
152 done = False |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
153 while 1: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
154 while not done and len(self._queue) == 0: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
155 data = self.source.read(bufsize) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
156 if data == '': # end of data |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
157 if hasattr(self, 'expat'): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
158 self.expat.Parse('', True) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
159 del self.expat # get rid of circular references |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
160 done = True |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
161 else: |
207
0619a27f5e67
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
162 if isinstance(data, unicode): |
0619a27f5e67
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
163 data = data.encode('utf-8') |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
164 self.expat.Parse(data, False) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
165 for event in self._queue: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
166 yield event |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
167 self._queue = [] |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
168 if done: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
169 break |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
170 except expat.ExpatError, e: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
171 msg = str(e) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
172 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 173 return Stream(_generate()).filter(_coalesce) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
174 |
1 | 175 def __iter__(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
176 return iter(self.parse()) |
1 | 177 |
293
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
178 def _build_foreign(self, context, base, sysid, pubid): |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
179 parser = self.expat.ExternalEntityParserCreate(context) |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
180 parser.ParseFile(StringIO(self._external_dtd)) |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
181 return 1 |
38adb4aa7df5
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
182 |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
183 def _enqueue(self, kind, data=None, pos=None): |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
184 if pos is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
185 pos = self._getpos() |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
186 if kind is TEXT: |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
187 # Expat reports the *end* of the text event as current position. We |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
188 # try to fix that up here as much as possible. Unfortunately, the |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
189 # offset is only valid for single-line text. For multi-line text, |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
190 # it is apparently not possible to determine at what offset it |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
191 # started |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
192 if '\n' in data: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
193 lines = data.splitlines() |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
194 lineno = pos[1] - len(lines) + 1 |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
195 offset = -1 |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
196 else: |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
197 lineno = pos[1] |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
198 offset = pos[2] - len(data) |
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
199 pos = (pos[0], lineno, offset) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
200 self._queue.append((kind, data, pos)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
201 |
1 | 202 def _getpos_unknown(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
203 return (self.filename, -1, -1) |
1 | 204 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
205 def _getpos(self): |
134
df44110ca91d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
206 return (self.filename, self.expat.CurrentLineNumber, |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
207 self.expat.CurrentColumnNumber) |
1 | 208 |
209 def _handle_start(self, tag, attrib): | |
403
32b283e1d310
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
378
diff
changeset
|
210 attrs = Attrs([(QName(name), value) for name, value in |
32b283e1d310
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
378
diff
changeset
|
211 zip(*[iter(attrib)] * 2)]) |
32b283e1d310
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
378
diff
changeset
|
212 self._enqueue(START, (QName(tag), attrs)) |
1 | 213 |
214 def _handle_end(self, tag): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
215 self._enqueue(END, QName(tag)) |
1 | 216 |
217 def _handle_data(self, text): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
218 self._enqueue(TEXT, text) |
1 | 219 |
460
6b5544bb5a99
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
220 def _handle_xml_decl(self, version, encoding, standalone): |
6b5544bb5a99
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
221 self._enqueue(XML_DECL, (version, encoding, standalone)) |
6b5544bb5a99
Apply patch by Alec Thomas for processing XML declarations (#111). Thanks!
cmlenz
parents:
458
diff
changeset
|
222 |
1 | 223 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
224 self._enqueue(DOCTYPE, (name, pubid, sysid)) |
1 | 225 |
226 def _handle_start_ns(self, prefix, uri): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
227 self._enqueue(START_NS, (prefix or '', uri)) |
1 | 228 |
229 def _handle_end_ns(self, prefix): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
230 self._enqueue(END_NS, prefix or '') |
1 | 231 |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
232 def _handle_start_cdata(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
233 self._enqueue(START_CDATA) |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
234 |
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
235 def _handle_end_cdata(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
236 self._enqueue(END_CDATA) |
143
ef761afcedff
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
237 |
1 | 238 def _handle_pi(self, target, data): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
239 self._enqueue(PI, (target, data)) |
1 | 240 |
241 def _handle_comment(self, text): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
242 self._enqueue(COMMENT, text) |
1 | 243 |
244 def _handle_other(self, text): | |
245 if text.startswith('&'): | |
246 # deal with undefined entities | |
247 try: | |
248 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
249 self._enqueue(TEXT, text) |
1 | 250 except KeyError: |
209
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
251 filename, lineno, offset = self._getpos() |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
252 error = expat.error('undefined entity "%s": line %d, column %d' |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
253 % (text, lineno, offset)) |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
254 error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
255 error.lineno = lineno |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
256 error.offset = offset |
5b422db07359
* Fix bug in handling of undefined entities. Thanks to Arnar for reporting the issue on IRC.
cmlenz
parents:
207
diff
changeset
|
257 raise error |
1 | 258 |
259 | |
260 def XML(text): | |
433 | 261 """Parse the given XML source and return a markup stream. |
262 | |
263 Unlike with `XMLParser`, the returned stream is reusable, meaning it can be | |
264 iterated over multiple times: | |
265 | |
266 >>> xml = XML('<doc><elem>Foo</elem><elem>Bar</elem></doc>') | |
267 >>> print xml | |
268 <doc><elem>Foo</elem><elem>Bar</elem></doc> | |
269 >>> print xml.select('elem') | |
270 <elem>Foo</elem><elem>Bar</elem> | |
271 >>> print xml.select('elem/text()') | |
272 FooBar | |
273 | |
274 :param text: the XML source | |
275 :return: the parsed XML event stream | |
276 :raises ParseError: if the XML text is not well-formed | |
277 """ | |
1 | 278 return Stream(list(XMLParser(StringIO(text)))) |
279 | |
280 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
281 class HTMLParser(html.HTMLParser, object): |
1 | 282 """Parser for HTML input based on the Python `HTMLParser` module. |
283 | |
284 This class provides the same interface for generating stream events as | |
285 `XMLParser`, and attempts to automatically balance tags. | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
286 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
287 The parsing is initiated by iterating over the parser object: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
288 |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
289 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
290 >>> for kind, data, pos in parser: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
291 ... print kind, data |
326
08ada6b4b767
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
292 START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')])) |
08ada6b4b767
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
293 START (QName(u'li'), Attrs()) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
294 TEXT Foo |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
295 END li |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
296 END ul |
1 | 297 """ |
298 | |
299 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
300 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
301 'param']) | |
302 | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
303 def __init__(self, source, filename=None, encoding='utf-8'): |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
304 """Initialize the parser for the given HTML input. |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
305 |
425
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
306 :param source: the HTML text as a file-like object |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
307 :param filename: the name of the file, if known |
5b248708bbed
Try to use proper reStructuredText for docstrings throughout.
cmlenz
parents:
423
diff
changeset
|
308 :param encoding: encoding of the file; ignored if the input is unicode |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
309 """ |
1 | 310 html.HTMLParser.__init__(self) |
311 self.source = source | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
312 self.filename = filename |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
313 self.encoding = encoding |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
314 self._queue = [] |
1 | 315 self._open_tags = [] |
316 | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
317 def parse(self): |
433 | 318 """Generator that parses the HTML source, yielding markup events. |
319 | |
320 :return: a markup event stream | |
321 :raises ParseError: if the HTML text is not well formed | |
322 """ | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
323 def _generate(): |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
324 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
325 bufsize = 4 * 1024 # 4K |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
326 done = False |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
327 while 1: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
328 while not done and len(self._queue) == 0: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
329 data = self.source.read(bufsize) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
330 if data == '': # end of data |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
331 self.close() |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
332 done = True |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
333 else: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
334 self.feed(data) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
335 for kind, data, pos in self._queue: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
336 yield kind, data, pos |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
337 self._queue = [] |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
338 if done: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
339 open_tags = self._open_tags |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
340 open_tags.reverse() |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
341 for tag in open_tags: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
342 yield END, QName(tag), pos |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
343 break |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
344 except html.HTMLParseError, e: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
345 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
346 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 347 return Stream(_generate()).filter(_coalesce) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
348 |
1 | 349 def __iter__(self): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
350 return iter(self.parse()) |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
351 |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
352 def _enqueue(self, kind, data, pos=None): |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
353 if pos is None: |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
354 pos = self._getpos() |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
355 self._queue.append((kind, data, pos)) |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
356 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
357 def _getpos(self): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
358 lineno, column = self.getpos() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
359 return (self.filename, lineno, column) |
1 | 360 |
361 def handle_starttag(self, tag, attrib): | |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
362 fixed_attrib = [] |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
363 for name, value in attrib: # Fixup minimized attributes |
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
364 if value is None: |
312
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
365 value = unicode(name) |
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
366 elif not isinstance(value, unicode): |
7e743338a799
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
367 value = value.decode(self.encoding, 'replace') |
403
32b283e1d310
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
378
diff
changeset
|
368 fixed_attrib.append((QName(name), stripentities(value))) |
26
039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
369 |
182
41db0260ebb1
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
370 self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) |
1 | 371 if tag in self._EMPTY_ELEMS: |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
372 self._enqueue(END, QName(tag)) |
1 | 373 else: |
374 self._open_tags.append(tag) | |
375 | |
376 def handle_endtag(self, tag): | |
377 if tag not in self._EMPTY_ELEMS: | |
378 while self._open_tags: | |
379 open_tag = self._open_tags.pop() | |
378
fff4a81ffc56
Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents:
376
diff
changeset
|
380 self._enqueue(END, QName(open_tag)) |
1 | 381 if open_tag.lower() == tag.lower(): |
382 break | |
383 | |
384 def handle_data(self, text): | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
385 if not isinstance(text, unicode): |
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
386 text = text.decode(self.encoding, 'replace') |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
387 self._enqueue(TEXT, text) |
1 | 388 |
389 def handle_charref(self, name): | |
423
7589a0e51001
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
390 if name.lower().startswith('x'): |
7589a0e51001
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
391 text = unichr(int(name[1:], 16)) |
7589a0e51001
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
392 else: |
7589a0e51001
Applied patch for #106 (handling of hex charrefs in HTML parser).
cmlenz
parents:
422
diff
changeset
|
393 text = unichr(int(name)) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
394 self._enqueue(TEXT, text) |
1 | 395 |
396 def handle_entityref(self, name): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
397 try: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
398 text = unichr(htmlentitydefs.name2codepoint[name]) |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
399 except KeyError: |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
400 text = '&%s;' % name |
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
401 self._enqueue(TEXT, text) |
1 | 402 |
403 def handle_pi(self, data): | |
376
74b6bf92f0cd
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
404 target, data = data.split(None, 1) |
74b6bf92f0cd
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
405 if data.endswith('?'): |
74b6bf92f0cd
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
406 data = data[:-1] |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
407 self._enqueue(PI, (target.strip(), data.strip())) |
1 | 408 |
409 def handle_comment(self, text): | |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
410 self._enqueue(COMMENT, text) |
1 | 411 |
412 | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
413 def HTML(text, encoding='utf-8'): |
433 | 414 """Parse the given HTML source and return a markup stream. |
415 | |
416 Unlike with `HTMLParser`, the returned stream is reusable, meaning it can be | |
417 iterated over multiple times: | |
418 | |
419 >>> html = HTML('<body><h1>Foo</h1></body>') | |
420 >>> print html | |
421 <body><h1>Foo</h1></body> | |
422 >>> print html.select('h1') | |
423 <h1>Foo</h1> | |
424 >>> print html.select('h1/text()') | |
425 Foo | |
426 | |
427 :param text: the HTML source | |
428 :return: the parsed HTML event stream | |
429 :raises ParseError: if the HTML text is not well-formed, and error recovery | |
430 fails | |
431 """ | |
311
01e2c48f6dfb
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
432 return Stream(list(HTMLParser(StringIO(text), encoding=encoding))) |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
433 |
146 | 434 def _coalesce(stream): |
144
28b56f09a7e1
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
435 """Coalesces adjacent TEXT events into a single event.""" |
146 | 436 textbuf = [] |
437 textpos = None | |
438 for kind, data, pos in chain(stream, [(None, None, None)]): | |
439 if kind is TEXT: | |
440 textbuf.append(data) | |
441 if textpos is None: | |
442 textpos = pos | |
443 else: | |
444 if textbuf: | |
445 yield TEXT, u''.join(textbuf), textpos | |
446 del textbuf[:] | |
447 textpos = None | |
448 if kind: | |
449 yield kind, data, pos |