Mercurial > genshi > mirror
annotate genshi/input.py @ 403:228907abb726 trunk
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
author | cmlenz |
---|---|
date | Wed, 21 Feb 2007 09:51:43 +0000 |
parents | 873ca2a7ec05 |
children | 4675d5cf6c67 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
66
59eb24184e9c
Switch copyright to Edgewall and URLs to markup.edgewall.org.
cmlenz
parents:
27
diff
changeset
|
3 # Copyright (C) 2006 Edgewall Software |
1 | 4 # All rights reserved. |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
230 | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
1 | 9 # |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
230 | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
1 | 13 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
14 from itertools import chain |
1 | 15 from xml.parsers import expat |
16 try: | |
17 frozenset | |
18 except NameError: | |
19 from sets import ImmutableSet as frozenset | |
20 import HTMLParser as html | |
21 import htmlentitydefs | |
22 from StringIO import StringIO | |
23 | |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
24 from genshi.core import Attrs, QName, Stream, stripentities |
230 | 25 from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \ |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
26 START_CDATA, END_CDATA, PI, COMMENT |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
27 |
290
94f9f2cc66c8
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
28 __all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML'] |
94f9f2cc66c8
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
29 |
94f9f2cc66c8
Move the ElementTree ''element-to-stream'' adaptation function `ET()` into the `genshi.input` module.
cmlenz
parents:
230
diff
changeset
|
def ET(element):
    """Convert a given ElementTree element to a markup stream.

    The element, its text, its children (recursively) and its tail text are
    emitted in document order as `(kind, data, pos)` event tuples. No
    location information is taken from the element, so every event carries
    the position `(None, -1, -1)`.

    @param element: the ElementTree element to convert
    @return: a generator yielding the markup events
    """
    pos = (None, -1, -1)

    # ElementTree writes qualified names as "{namespace}local". Dropping the
    # leading brace gives the "namespace}local" form that the Expat-based
    # parser in this module (created with '}' as namespace separator) also
    # passes to `QName`.
    tag_name = QName(element.tag.lstrip('{'))
    # `Attrs` does not wrap attribute names in `QName` on its own, so convert
    # them explicitly here, the same way `XMLParser._handle_start` does.
    attrs = Attrs([(QName(name.lstrip('{')), value)
                   for name, value in element.items()])

    yield START, (tag_name, attrs), pos
    if element.text:
        yield TEXT, element.text, pos
    for child in element.getchildren():
        for item in ET(child):
            yield item
    yield END, tag_name, pos
    # The tail is the text that follows the end tag of this element
    if element.tail:
        yield TEXT, element.tail, pos
1 | 44 |
45 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        @param message: the error message
        @param filename: the name of the file in which the error occurred;
            an empty value (such as `None`) is replaced by `'<string>'`
        @param lineno: the number of the line on which the error occurred,
            or -1 if unknown
        @param offset: the column at which the error occurred, or -1 if
            unknown
        """
        Exception.__init__(self, message)
        self.msg = message
        # Callers may pass `None` explicitly when they have no file name (the
        # parser's own `filename` defaults to `None`), which would otherwise
        # bypass the '<string>' placeholder.
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
56 |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
57 |
1 | 58 class XMLParser(object): |
59 """Generator-based XML parser based on roughly equivalent code in | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
60 Kid/ElementTree. |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
61 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
62 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
63 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
64 >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
65 >>> for kind, data, pos in parser: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
66 ... print kind, data |
326
f999da894391
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
67 START (QName(u'root'), Attrs([(QName(u'id'), u'2')])) |
f999da894391
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
68 START (QName(u'child'), Attrs()) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
69 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
70 END child |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
71 END root |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
72 """ |
1 | 73 |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
74 _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in |
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
75 htmlentitydefs.name2codepoint.items()] |
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
76 _external_dtd = '\n'.join(_entitydefs) |
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
77 |
316
a946edefac40
Configurable encoding of template files, closing #65.
cmlenz
parents:
312
diff
changeset
|
def __init__(self, source, filename=None, encoding=None):
    """Set up the parser for the given XML input.

    @param source: the XML text as a file-like object
    @param filename: the name of the file, if appropriate
    @param encoding: the encoding of the file; if not specified, the
        encoding is assumed to be ASCII, UTF-8, or UTF-16, or whatever the
        encoding specified in the XML declaration (if any)
    """
    self.source = source
    self.filename = filename
    self._queue = []

    # Create the Expat parser, using '}' as the namespace separator
    xp = expat.ParserCreate(encoding, '}')
    for option in ('buffer_text', 'returns_unicode', 'ordered_attributes'):
        setattr(xp, option, True)

    # Route the Expat callbacks to the corresponding methods of this object
    callbacks = [
        ('StartElementHandler', self._handle_start),
        ('EndElementHandler', self._handle_end),
        ('CharacterDataHandler', self._handle_data),
        ('StartDoctypeDeclHandler', self._handle_doctype),
        ('StartNamespaceDeclHandler', self._handle_start_ns),
        ('EndNamespaceDeclHandler', self._handle_end_ns),
        ('StartCdataSectionHandler', self._handle_start_cdata),
        ('EndCdataSectionHandler', self._handle_end_cdata),
        ('ProcessingInstructionHandler', self._handle_pi),
        ('CommentHandler', self._handle_comment),
        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        ('DefaultHandler', self._handle_other),
    ]
    for name, callback in callbacks:
        setattr(xp, name, callback)

    # Have Expat ask `_build_foreign` for an external DTD subset
    xp.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
    xp.UseForeignDTD()
    xp.ExternalEntityRefHandler = self._build_foreign

    # Location reporting is only supported in Python >= 2.4
    if not hasattr(xp, 'CurrentLineNumber'):
        self._getpos = self._getpos_unknown

    self.expat = xp
1 | 120 |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
121 def parse(self): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
122 def _generate(): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
123 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
124 bufsize = 4 * 1024 # 4K |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
125 done = False |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
126 while 1: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
127 while not done and len(self._queue) == 0: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
128 data = self.source.read(bufsize) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
129 if data == '': # end of data |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
130 if hasattr(self, 'expat'): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
131 self.expat.Parse('', True) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
132 del self.expat # get rid of circular references |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
133 done = True |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
134 else: |
207
28bfc6aafab7
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
135 if isinstance(data, unicode): |
28bfc6aafab7
The `XMLParser` now correctly handles unicode input. Closes #43.
cmlenz
parents:
182
diff
changeset
|
136 data = data.encode('utf-8') |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
137 self.expat.Parse(data, False) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
138 for event in self._queue: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
139 yield event |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
140 self._queue = [] |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
141 if done: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
142 break |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
143 except expat.ExpatError, e: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
144 msg = str(e) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
145 if self.filename: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
146 msg += ', in ' + self.filename |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
147 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 148 return Stream(_generate()).filter(_coalesce) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
149 |
def __iter__(self):
    """Start parsing and return an iterator over the resulting events."""
    stream = self.parse()
    return iter(stream)
1 | 152 |
293
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
def _build_foreign(self, context, base, sysid, pubid):
    """Expat callback for external entity references.

    Creates a sub-parser for the given context and lets it parse the
    entity declarations in `_external_dtd`. The `base`, `sysid` and `pubid`
    arguments are not used.

    @return: always 1
    """
    dtd_source = StringIO(self._external_dtd)
    subparser = self.expat.ExternalEntityParserCreate(context)
    subparser.ParseFile(dtd_source)
    return 1
e17b7459b515
Fix a bug in the XML parser, where attributes containing HTML entity references would get pulled out of the attribute value, and instead added as a text node just before the associated start tag. Thanks to Hamish Lawson for [http://groups.google.com/group/genshi/browse_thread/thread/c64eb48676b0ff96/0e6ce786e8820f3d pointing out the problem].
cmlenz
parents:
290
diff
changeset
|
157 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _enqueue(self, kind, data=None, pos=None):
    """Append an event to the queue of pending events.

    @param kind: the kind of event
    @param data: the data associated with the event, if any
    @param pos: the `(filename, lineno, offset)` position of the event; if
        not given, the current position of the parser is used
    """
    if pos is None:
        pos = self._getpos()

    if kind is TEXT:
        # Expat reports the *end* of the text event as current position. We
        # try to fix that up here as much as possible. Unfortunately, the
        # offset is only valid for single-line text. For multi-line text,
        # it is apparently not possible to determine at what offset it
        # started
        if '\n' in data:
            lineno = pos[1] - len(data.splitlines()) + 1
            offset = -1
        else:
            lineno, offset = pos[1], pos[2] - len(data)
        pos = (pos[0], lineno, offset)

    event = (kind, data, pos)
    self._queue.append(event)
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
176 |
1 | 177 def _getpos_unknown(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
178 return (self.filename, -1, -1) |
1 | 179 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
180 def _getpos(self): |
134
d681d2c3cd8d
* Improve the accuracy of line numbers for text nodes, so that reported errors about syntax or evaluation errors in expressions point to the right line (not quite perfect yet, though).
cmlenz
parents:
69
diff
changeset
|
181 return (self.filename, self.expat.CurrentLineNumber, |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
182 self.expat.CurrentColumnNumber) |
1 | 183 |
def _handle_start(self, tag, attrib):
    """Expat callback for a start tag: queue a START event.

    @param tag: the name of the element
    @param attrib: the attributes as a flat sequence of alternating names
        and values (the parser is created with `ordered_attributes` set)
    """
    # Drawing from the same iterator twice per step pairs every name with
    # the value that follows it
    flat = iter(attrib)
    pairs = [(QName(name), value) for name, value in zip(flat, flat)]
    self._enqueue(START, (QName(tag), Attrs(pairs)))
1 | 188 |
def _handle_end(self, tag):
    """Expat callback for an end tag: queue an END event for the tag."""
    tag_name = QName(tag)
    self._enqueue(END, tag_name)
1 | 191 |
def _handle_data(self, text):
    """Expat callback for character data: queue a TEXT event."""
    self._enqueue(TEXT, text, None)
1 | 194 |
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
    """Expat callback for a document type declaration: queue a DOCTYPE
    event.

    Note that the event data lists the public ID *before* the system ID,
    the reverse of the order in which Expat passes them in. The
    `has_internal_subset` flag is not used.
    """
    doctype = (name, pubid, sysid)
    self._enqueue(DOCTYPE, doctype)
1 | 197 |
def _handle_start_ns(self, prefix, uri):
    """Expat callback for the start of a namespace declaration: queue a
    START_NS event.

    An empty prefix (such as `None`) is reported as the empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(START_NS, (prefix, uri))
1 | 200 |
def _handle_end_ns(self, prefix):
    """Queue an END_NS event for a namespace going out of scope.

    A false prefix (such as `None`) is replaced by the empty string.
    """
    if not prefix:
        prefix = ''
    self._enqueue(END_NS, prefix)
1 | 203 |
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _handle_start_cdata(self):
    """Queue a START_CDATA event marking the start of a CDATA section."""
    enqueue = self._enqueue
    enqueue(START_CDATA)
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
206 |
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
def _handle_end_cdata(self):
    """Queue an END_CDATA event marking the end of a CDATA section."""
    enqueue = self._enqueue
    enqueue(END_CDATA)
143
3d4c214c979a
CDATA sections in XML input now appear as CDATA sections in the output. This should address the problem with escaping the contents of `<style>` and `<script>` elements, which would only get interpreted correctly if the output was served as `application/xhtml+xml`. Closes #24.
cmlenz
parents:
140
diff
changeset
|
209 |
def _handle_pi(self, target, data):
    """Queue a PI event whose data is the ``(target, data)`` tuple."""
    instruction = (target, data)
    self._enqueue(PI, instruction)
1 | 212 |
def _handle_comment(self, text):
    """Queue the given comment text as a COMMENT event."""
    kind = COMMENT
    self._enqueue(kind, text)
1 | 215 |
def _handle_other(self, text):
    """Resolve an entity reference against the HTML entity table.

    Text that does not start with ``&`` is ignored. Otherwise the name
    between the leading ``&`` and the final character is looked up in
    `htmlentitydefs.name2codepoint` and, if found, the corresponding
    character is queued as a TEXT event.

    @param text: the raw text, e.g. ``&nbsp;``
    @raise expat.error: if the entity name is unknown; the error carries
        `code`, `lineno` and `offset` attributes
    """
    if not text.startswith('&'):
        return

    # deal with undefined entities
    entity_name = text[1:-1]
    try:
        text = unichr(htmlentitydefs.name2codepoint[entity_name])
        self._enqueue(TEXT, text)
    except KeyError:
        filename, lineno, offset = self._getpos()
        message = 'undefined entity "%s": line %d, column %d' \
                  % (text, lineno, offset)
        error = expat.error(message)
        error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
        error.lineno = lineno
        error.offset = offset
        raise error
1 | 230 |
231 | |
def XML(text):
    """Parse the given XML text and return the events as a `Stream`.

    The text is wrapped in a `StringIO` and parsed eagerly: all events
    are collected into a list before the stream is created.
    """
    source = StringIO(text)
    events = list(XMLParser(source))
    return Stream(events)
234 | |
235 | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
236 class HTMLParser(html.HTMLParser, object): |
1 | 237 """Parser for HTML input based on the Python `HTMLParser` module. |
238 | |
239 This class provides the same interface for generating stream events as | |
240 `XMLParser`, and attempts to automatically balance tags. | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
241 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
242 The parsing is initiated by iterating over the parser object: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
243 |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
244 >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>')) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
245 >>> for kind, data, pos in parser: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
246 ... print kind, data |
326
f999da894391
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
247 START (QName(u'ul'), Attrs([(QName(u'compact'), u'compact')])) |
f999da894391
Fixed `__repr__` of the `QName`, `Attrs`, and `Expression` classes so that the output can be used as code to instantiate the object again.
cmlenz
parents:
316
diff
changeset
|
248 START (QName(u'li'), Attrs()) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
249 TEXT Foo |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
250 END li |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
251 END ul |
1 | 252 """ |
253 | |
254 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
255 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
256 'param']) | |
257 | |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
258 def __init__(self, source, filename=None, encoding='utf-8'): |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
259 """Initialize the parser for the given HTML input. |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
260 |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
261 @param source: the HTML text as a file-like object |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
262 @param filename: the name of the file, if known |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
263 @param filename: encoding of the file; ignored if the input is unicode |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
264 """ |
1 | 265 html.HTMLParser.__init__(self) |
266 self.source = source | |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
267 self.filename = filename |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
268 self.encoding = encoding |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
269 self._queue = [] |
1 | 270 self._open_tags = [] |
271 | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
272 def parse(self): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
273 def _generate(): |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
274 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
275 bufsize = 4 * 1024 # 4K |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
276 done = False |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
277 while 1: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
278 while not done and len(self._queue) == 0: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
279 data = self.source.read(bufsize) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
280 if data == '': # end of data |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
281 self.close() |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
282 done = True |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
283 else: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
284 self.feed(data) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
285 for kind, data, pos in self._queue: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
286 yield kind, data, pos |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
287 self._queue = [] |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
288 if done: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
289 open_tags = self._open_tags |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
290 open_tags.reverse() |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
291 for tag in open_tags: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
292 yield END, QName(tag), pos |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
293 break |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
294 except html.HTMLParseError, e: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
295 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
296 if self.filename: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
297 msg += ', in %s' % self.filename |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
298 raise ParseError(msg, self.filename, e.lineno, e.offset) |
146 | 299 return Stream(_generate()).filter(_coalesce) |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
300 |
1 | 301 def __iter__(self): |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
302 return iter(self.parse()) |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
303 |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
304 def _enqueue(self, kind, data, pos=None): |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
305 if pos is None: |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
306 pos = self._getpos() |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
307 self._queue.append((kind, data, pos)) |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
308 |
21
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
309 def _getpos(self): |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
310 lineno, column = self.getpos() |
b4d17897d053
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
311 return (self.filename, lineno, column) |
1 | 312 |
313 def handle_starttag(self, tag, attrib): | |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
314 fixed_attrib = [] |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
315 for name, value in attrib: # Fixup minimized attributes |
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
316 if value is None: |
312
cb7326367f91
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
317 value = unicode(name) |
cb7326367f91
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
318 elif not isinstance(value, unicode): |
cb7326367f91
Follow-up to [385]: also decode attribute values in the `HTMLParser`.
cmlenz
parents:
311
diff
changeset
|
319 value = value.decode(self.encoding, 'replace') |
403
228907abb726
Remove some magic/overhead from `Attrs` creation and manipulation by not automatically wrapping attribute names in `QName`.
cmlenz
parents:
378
diff
changeset
|
320 fixed_attrib.append((QName(name), stripentities(value))) |
26
3c1a022be04c
* Split out the XPath tests into a separate `unittest`-based file.
cmlenz
parents:
21
diff
changeset
|
321 |
182
2f30ce3fb85e
Renamed `Attributes` to `Attrs` to reduce the verbosity.
cmlenz
parents:
160
diff
changeset
|
322 self._enqueue(START, (QName(tag), Attrs(fixed_attrib))) |
1 | 323 if tag in self._EMPTY_ELEMS: |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
324 self._enqueue(END, QName(tag)) |
1 | 325 else: |
326 self._open_tags.append(tag) | |
327 | |
328 def handle_endtag(self, tag): | |
329 if tag not in self._EMPTY_ELEMS: | |
330 while self._open_tags: | |
331 open_tag = self._open_tags.pop() | |
378
873ca2a7ec05
Improve handling of incorrectly nested tags in the HTML parser.
cmlenz
parents:
376
diff
changeset
|
332 self._enqueue(END, QName(open_tag)) |
1 | 333 if open_tag.lower() == tag.lower(): |
334 break | |
335 | |
336 def handle_data(self, text): | |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
337 if not isinstance(text, unicode): |
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
338 text = text.decode(self.encoding, 'replace') |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
339 self._enqueue(TEXT, text) |
1 | 340 |
341 def handle_charref(self, name): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
342 text = unichr(int(name)) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
343 self._enqueue(TEXT, text) |
1 | 344 |
345 def handle_entityref(self, name): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
346 try: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
347 text = unichr(htmlentitydefs.name2codepoint[name]) |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
348 except KeyError: |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
349 text = '&%s;' % name |
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
350 self._enqueue(TEXT, text) |
1 | 351 |
352 def handle_pi(self, data): | |
376
0e0952d85d97
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
353 target, data = data.split(None, 1) |
0e0952d85d97
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
354 if data.endswith('?'): |
0e0952d85d97
Fix parsing of processing instructions in HTML input.
cmlenz
parents:
326
diff
changeset
|
355 data = data[:-1] |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
356 self._enqueue(PI, (target.strip(), data.strip())) |
1 | 357 |
358 def handle_comment(self, text): | |
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
359 self._enqueue(COMMENT, text) |
1 | 360 |
361 | |
311
8de1ff534d22
* The `HTMLParser` class and the `HTML` function now accept an `encoding` parameter to properly deal with bytestring input (defaults to UTF-8).
cmlenz
parents:
293
diff
changeset
|
def HTML(text, encoding='utf-8'):
    """Parse the given HTML text and return the events as a `Stream`.

    The text is wrapped in a `StringIO` and parsed eagerly: all events
    are collected into a list before the stream is created.

    @param text: the HTML text to parse
    @param encoding: passed on to `HTMLParser` as its `encoding` argument
    """
    source = StringIO(text)
    parser = HTMLParser(source, encoding=encoding)
    return Stream(list(parser))
144
d1ce85a7f296
* Coalesce adjacent text events that the parsers would produce when text crossed the buffer boundaries. Fixes #26.
cmlenz
parents:
143
diff
changeset
|
364 |
def _coalesce(stream):
    """Coalesces adjacent TEXT events into a single event.

    The merged event carries the position of the first TEXT event of the
    run; all other events are passed through unchanged.
    """
    pending = []
    first_pos = None
    # The trailing sentinel makes sure a run of text at the very end of the
    # stream is flushed; it is never yielded because its kind is None.
    sentinel = (None, None, None)
    for kind, data, pos in chain(stream, [sentinel]):
        if kind is TEXT:
            if first_pos is None:
                first_pos = pos
            pending.append(data)
            continue
        if pending:
            yield TEXT, u''.join(pending), first_pos
            del pending[:]
            first_pos = None
        if kind:
            yield kind, data, pos