Mercurial > genshi > genshi-test
annotate markup/input.py @ 24:547e36f7ec94
Cosmetic (mostly whitespace) changes.
author | cmlenz |
---|---|
date | Mon, 26 Jun 2006 17:54:00 +0000 |
parents | eca77129518a |
children | 039fc5b87405 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
3 # Copyright (C) 2006 Christopher Lenz | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://trac.edgewall.com/license.html. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://projects.edgewall.com/trac/. | |
13 | |
14 from xml.parsers import expat | |
15 try: | |
16 frozenset | |
17 except NameError: | |
18 from sets import ImmutableSet as frozenset | |
19 import HTMLParser as html | |
20 import htmlentitydefs | |
21 import re | |
22 from StringIO import StringIO | |
23 | |
24 from markup.core import Attributes, Markup, QName, Stream | |
25 | |
26 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed.

    In addition to the message, instances carry the location of the error in
    the `filename`, `lineno` and `offset` attributes.
    """

    def __init__(self, message, filename='<string>', lineno=-1, offset=-1):
        """Create the exception.

        `message` is the error text passed on to `Exception`; `filename`,
        `lineno` and `offset` describe where the error occurred and are
        stored as attributes of the same names.
        """
        Exception.__init__(self, message)
        # The parsers in this module pass their own `filename` attribute,
        # which is None when no file name was given; fall back to the
        # documented default instead of storing None.
        self.filename = filename or '<string>'
        self.lineno = lineno
        self.offset = offset
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
36 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
37 |
1 | 38 class XMLParser(object): |
39 """Generator-based XML parser based on roughly equivalent code in | |
40 Kid/ElementTree.""" | |
41 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
42 def __init__(self, source, filename=None): |
1 | 43 self.source = source |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
44 self.filename = filename |
1 | 45 |
46 # Setup the Expat parser | |
47 parser = expat.ParserCreate('utf-8', '}') | |
48 parser.buffer_text = True | |
49 parser.returns_unicode = True | |
50 parser.StartElementHandler = self._handle_start | |
51 parser.EndElementHandler = self._handle_end | |
52 parser.CharacterDataHandler = self._handle_data | |
53 parser.XmlDeclHandler = self._handle_prolog | |
54 parser.StartDoctypeDeclHandler = self._handle_doctype | |
55 parser.StartNamespaceDeclHandler = self._handle_start_ns | |
56 parser.EndNamespaceDeclHandler = self._handle_end_ns | |
57 parser.ProcessingInstructionHandler = self._handle_pi | |
58 parser.CommentHandler = self._handle_comment | |
59 parser.DefaultHandler = self._handle_other | |
60 | |
61 # Location reporting is only support in Python >= 2.4 | |
62 if not hasattr(parser, 'CurrentLineNumber'): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
63 self._getpos = self._getpos_unknown |
1 | 64 |
65 self.expat = parser | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
66 self._queue = [] |
1 | 67 |
68 def __iter__(self): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
69 try: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
70 bufsize = 4 * 1024 # 4K |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
71 done = False |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
72 while True: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
73 while not done and len(self._queue) == 0: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
74 data = self.source.read(bufsize) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
75 if data == '': # end of data |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
76 if hasattr(self, 'expat'): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
77 self.expat.Parse('', True) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
78 del self.expat # get rid of circular references |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
79 done = True |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
80 else: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
81 self.expat.Parse(data, False) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
82 for event in self._queue: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
83 yield event |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
84 self._queue = [] |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
85 if done: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
86 break |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
87 except expat.ExpatError, e: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
88 msg = str(e) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
89 if self.filename: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
90 msg += ', in ' + self.filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
91 raise ParseError(msg, self.filename, e.lineno, e.offset) |
1 | 92 |
93 def _getpos_unknown(self): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
94 return (self.filename or '<string>', -1, -1) |
1 | 95 |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
96 def _getpos(self): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
97 return (self.filename or '<string>', self.expat.CurrentLineNumber, |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
98 self.expat.CurrentColumnNumber) |
1 | 99 |
100 def _handle_start(self, tag, attrib): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
101 self._queue.append((Stream.START, (QName(tag), Attributes(attrib.items())), |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
102 self._getpos())) |
1 | 103 |
104 def _handle_end(self, tag): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
105 self._queue.append((Stream.END, QName(tag), self._getpos())) |
1 | 106 |
107 def _handle_data(self, text): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
108 self._queue.append((Stream.TEXT, text, self._getpos())) |
1 | 109 |
110 def _handle_prolog(self, version, encoding, standalone): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
111 self._queue.append((Stream.PROLOG, (version, encoding, standalone), |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
112 self._getpos())) |
1 | 113 |
114 def _handle_doctype(self, name, sysid, pubid, has_internal_subset): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
115 self._queue.append((Stream.DOCTYPE, (name, pubid, sysid), self._getpos())) |
1 | 116 |
117 def _handle_start_ns(self, prefix, uri): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
118 self._queue.append((Stream.START_NS, (prefix or '', uri), self._getpos())) |
1 | 119 |
120 def _handle_end_ns(self, prefix): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
121 self._queue.append((Stream.END_NS, prefix or '', self._getpos())) |
1 | 122 |
123 def _handle_pi(self, target, data): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
124 self._queue.append((Stream.PI, (target, data), self._getpos())) |
1 | 125 |
126 def _handle_comment(self, text): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
127 self._queue.append((Stream.COMMENT, text, self._getpos())) |
1 | 128 |
129 def _handle_other(self, text): | |
130 if text.startswith('&'): | |
131 # deal with undefined entities | |
132 try: | |
133 text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
134 self._queue.append((Stream.TEXT, text, self._getpos())) |
1 | 135 except KeyError: |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
136 lineno, offset = self._getpos() |
1 | 137 raise expat.error("undefined entity %s: line %d, column %d" % |
138 (text, lineno, offset)) | |
139 | |
140 | |
def XML(text):
    """Parse the given XML text and return the resulting events wrapped in a
    `Stream`."""
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
143 | |
144 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
145 class HTMLParser(html.HTMLParser, object): |
1 | 146 """Parser for HTML input based on the Python `HTMLParser` module. |
147 | |
148 This class provides the same interface for generating stream events as | |
149 `XMLParser`, and attempts to automatically balance tags. | |
150 """ | |
151 | |
152 _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', | |
153 'hr', 'img', 'input', 'isindex', 'link', 'meta', | |
154 'param']) | |
155 | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
156 def __init__(self, source, filename=None): |
1 | 157 html.HTMLParser.__init__(self) |
158 self.source = source | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
159 self.filename = filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
160 self._queue = [] |
1 | 161 self._open_tags = [] |
162 | |
163 def __iter__(self): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
164 try: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
165 bufsize = 4 * 1024 # 4K |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
166 done = False |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
167 while True: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
168 while not done and len(self._queue) == 0: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
169 data = self.source.read(bufsize) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
170 if data == '': # end of data |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
171 self.close() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
172 done = True |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
173 else: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
174 self.feed(data) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
175 for kind, data, pos in self._queue: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
176 yield kind, data, pos |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
177 self._queue = [] |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
178 if done: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
179 open_tags = self._open_tags |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
180 open_tags.reverse() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
181 for tag in open_tags: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
182 yield Stream.END, QName(tag), pos |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
183 break |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
184 except html.HTMLParseError, e: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
185 msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
186 if self.filename: |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
187 msg += ', in %s' % self.filename |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
188 raise ParseError(msg, self.filename, e.lineno, e.offset) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
189 |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
190 def _getpos(self): |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
191 lineno, column = self.getpos() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
192 return (self.filename, lineno, column) |
1 | 193 |
194 def handle_starttag(self, tag, attrib): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
195 pos = self._getpos() |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
196 self._queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos)) |
1 | 197 if tag in self._EMPTY_ELEMS: |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
198 self._queue.append((Stream.END, QName(tag), pos)) |
1 | 199 else: |
200 self._open_tags.append(tag) | |
201 | |
202 def handle_endtag(self, tag): | |
203 if tag not in self._EMPTY_ELEMS: | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
204 pos = self._getpos() |
1 | 205 while self._open_tags: |
206 open_tag = self._open_tags.pop() | |
207 if open_tag.lower() == tag.lower(): | |
208 break | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
209 self._queue.append((Stream.END, QName(open_tag), pos)) |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
210 self._queue.append((Stream.END, QName(tag), pos)) |
1 | 211 |
212 def handle_data(self, text): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
213 self._queue.append((Stream.TEXT, text, self._getpos())) |
1 | 214 |
215 def handle_charref(self, name): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
216 self._queue.append((Stream.TEXT, Markup('&#%s;' % name), self._getpos())) |
1 | 217 |
218 def handle_entityref(self, name): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
219 self._queue.append((Stream.TEXT, Markup('&%s;' % name), self._getpos())) |
1 | 220 |
221 def handle_pi(self, data): | |
222 target, data = data.split(maxsplit=1) | |
223 data = data.rstrip('?') | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
224 self._queue.append((Stream.PI, (target.strip(), data.strip()), |
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
225 self._getpos())) |
1 | 226 |
227 def handle_comment(self, text): | |
21
eca77129518a
* Include paths are now interpreted relative to the path of the including template. Closes #3.
cmlenz
parents:
1
diff
changeset
|
228 self._queue.append((Stream.COMMENT, text, self._getpos())) |
1 | 229 |
230 | |
def HTML(text):
    """Parse the given HTML text and return the resulting events wrapped in a
    `Stream`."""
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)