Mercurial > genshi > genshi-test
comparison markup/input.py @ 1:821114ec4f69
Initial import.
author | cmlenz |
---|---|
date | Sat, 03 Jun 2006 07:16:01 +0000 |
parents | |
children | eca77129518a |
comparison
equal
deleted
inserted
replaced
0:20f3417d4171 | 1:821114ec4f69 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 # | |
3 # Copyright (C) 2006 Christopher Lenz | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://trac.edgewall.com/license.html. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://projects.edgewall.com/trac/. | |
13 | |
14 from xml.parsers import expat | |
15 try: | |
16 frozenset | |
17 except NameError: | |
18 from sets import ImmutableSet as frozenset | |
19 import HTMLParser as html | |
20 import htmlentitydefs | |
21 import re | |
22 from StringIO import StringIO | |
23 | |
24 from markup.core import Attributes, Markup, QName, Stream | |
25 | |
26 | |
class XMLParser(object):
    """Pull-style XML parser built on top of Expat.

    The parser reads from a file-like `source` in chunks and, when iterated,
    yields ``(kind, data, position)`` event tuples. It is based on roughly
    equivalent code in Kid/ElementTree.
    """

    def __init__(self, source):
        """Create a parser reading from `source`.

        `source` must provide a ``read(size)`` method that returns an empty
        string once the input is exhausted.
        """
        self.source = source
        self.queue = []

        # Set up the Expat parser; '}' separates namespace URI and local name
        parser = expat.ParserCreate('utf-8', '}')
        parser.buffer_text = True
        parser.returns_unicode = True
        callbacks = (
            ('StartElementHandler', self._handle_start),
            ('EndElementHandler', self._handle_end),
            ('CharacterDataHandler', self._handle_data),
            ('XmlDeclHandler', self._handle_prolog),
            ('StartDoctypeDeclHandler', self._handle_doctype),
            ('StartNamespaceDeclHandler', self._handle_start_ns),
            ('EndNamespaceDeclHandler', self._handle_end_ns),
            ('ProcessingInstructionHandler', self._handle_pi),
            ('CommentHandler', self._handle_comment),
            ('DefaultHandler', self._handle_other),
        )
        for attr_name, callback in callbacks:
            setattr(parser, attr_name, callback)

        # Location reporting is only supported in Python >= 2.4; fall back
        # to a dummy position on older versions
        if not hasattr(parser, 'CurrentLineNumber'):
            self.getpos = self._getpos_unknown

        self.expat = parser

    def __iter__(self):
        """Feed the source to Expat chunk by chunk, yielding queued events."""
        chunk_size = 4 * 1024 # 4K
        finished = False
        while True:
            # Keep parsing until at least one event is available or the
            # input has been exhausted
            while not (finished or self.queue):
                chunk = self.source.read(chunk_size)
                if chunk != '':
                    self.expat.Parse(chunk, False)
                    continue
                # end of data
                if hasattr(self, 'expat'):
                    self.expat.Parse('', True)
                    del self.expat # get rid of circular references
                finished = True
            for event in self.queue:
                yield event
            self.queue = []
            if finished:
                break

    def _getpos_unknown(self):
        """Return the placeholder position used when Expat cannot report
        locations."""
        return (-1, -1)

    def getpos(self):
        """Return the current ``(line, column)`` as reported by Expat."""
        parser = self.expat
        return parser.CurrentLineNumber, parser.CurrentColumnNumber

    def _enqueue(self, kind, data):
        """Append an event of the given kind, stamped with the current
        position, to the queue."""
        self.queue.append((kind, data, self.getpos()))

    def _handle_start(self, tag, attrib):
        self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))

    def _handle_end(self, tag):
        self._enqueue(Stream.END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(Stream.TEXT, text)

    def _handle_prolog(self, version, encoding, standalone):
        self._enqueue(Stream.PROLOG, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note that the event carries the public ID before the system ID
        self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(Stream.START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(Stream.END_NS, prefix or '')

    def _handle_pi(self, target, data):
        self._enqueue(Stream.PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(Stream.COMMENT, text)

    def _handle_other(self, text):
        """Resolve entity references that Expat does not know about.

        Anything passed to the default handler that does not start with an
        ampersand is ignored. Entity names are looked up in the HTML entity
        table; unknown names raise `expat.error`.
        """
        if not text.startswith('&'):
            return
        # deal with undefined entities
        try:
            codepoint = htmlentitydefs.name2codepoint[text[1:-1]]
        except KeyError:
            lineno, offset = self.getpos()
            raise expat.error("undefined entity %s: line %d, column %d" %
                              (text, lineno, offset))
        self._enqueue(Stream.TEXT, unichr(codepoint))
120 | |
121 | |
def XML(text):
    """Parse the XML in the string `text` and return the resulting events
    wrapped in a `Stream`."""
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
124 | |
125 | |
class HTMLParser(html.HTMLParser):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags: elements that
    are still open when an enclosing element is closed, or when the input
    ends, get an END event generated for them.
    """

    # Elements that never have content; they are closed as soon as they are
    # opened and their end tags (if any) are ignored
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source):
        """Create a parser reading from `source`, a file-like object
        providing a ``read(size)`` method."""
        html.HTMLParser.__init__(self)
        self.source = source
        self.queue = []
        self._open_tags = []

    def __iter__(self):
        """Feed the source to the parser chunk by chunk, yielding
        ``(kind, data, position)`` events as they become available."""
        bufsize = 4 * 1024 # 4K
        done = False
        # Position of the most recently yielded event; reused for the END
        # events generated for elements left open at the end of the input
        pos = self.getpos()
        while True:
            while not done and not self.queue:
                data = self.source.read(bufsize)
                if data:
                    self.feed(data)
                else: # end of data
                    self.close()
                    done = True
            for kind, data, pos in self.queue:
                yield kind, data, pos
            self.queue = []
            if done:
                # Close whatever is still open, innermost element first. The
                # list is detached first so that the events are not emitted
                # a second time if the parser is iterated over again.
                open_tags = self._open_tags
                self._open_tags = []
                open_tags.reverse()
                for tag in open_tags:
                    yield Stream.END, QName(tag), pos
                break

    def handle_starttag(self, tag, attrib):
        pos = self.getpos()
        self.queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
        if tag in self._EMPTY_ELEMS:
            # Empty elements are closed immediately
            self.queue.append((Stream.END, QName(tag), pos))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        """Close the innermost open element matching `tag`, first closing
        any elements that were opened inside it and never closed.

        End tags of empty elements and end tags that do not match any open
        element are ignored, so that the generated events stay balanced.
        """
        if tag in self._EMPTY_ELEMS:
            return
        wanted = tag.lower()
        # Search from the innermost open element outwards
        for depth in range(len(self._open_tags) - 1, -1, -1):
            if self._open_tags[depth].lower() == wanted:
                break
        else:
            # Stray end tag: closing it would emit an END event without a
            # corresponding START
            return
        pos = self.getpos()
        unclosed = self._open_tags[depth + 1:]
        del self._open_tags[depth:]
        unclosed.reverse()
        for open_tag in unclosed:
            self.queue.append((Stream.END, QName(open_tag), pos))
        self.queue.append((Stream.END, QName(tag), pos))

    def handle_data(self, text):
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def handle_charref(self, name):
        # Passed through as markup so that the reference is kept as-is
        self.queue.append((Stream.TEXT, Markup('&#%s;' % name), self.getpos()))

    def handle_entityref(self, name):
        # Passed through as markup so that the reference is kept as-is
        self.queue.append((Stream.TEXT, Markup('&%s;' % name), self.getpos()))

    def handle_pi(self, data):
        """Split the raw processing instruction into target and data.

        Trailing question marks (left over from the ``?>`` terminator) are
        removed. A processing instruction consisting of a target only yields
        an empty data string; one without any target is ignored.
        """
        # Positional arguments: `str.split` does not accept keyword
        # arguments on Python 2
        parts = data.rstrip('?').split(None, 1)
        if not parts:
            return
        target = parts[0]
        if len(parts) > 1:
            data = parts[1]
        else:
            data = ''
        self.queue.append((Stream.PI, (target.strip(), data.strip()),
                           self.getpos()))

    def handle_comment(self, text):
        self.queue.append((Stream.COMMENT, text, self.getpos()))
199 | |
200 | |
def HTML(text):
    """Parse the HTML in the string `text` and return the resulting events
    wrapped in a `Stream`."""
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)