1
|
1 # -*- coding: utf-8 -*-
|
|
2 #
|
|
3 # Copyright (C) 2006 Christopher Lenz
|
|
4 # All rights reserved.
|
|
5 #
|
|
6 # This software is licensed as described in the file COPYING, which
|
|
7 # you should have received as part of this distribution. The terms
|
|
8 # are also available at http://trac.edgewall.com/license.html.
|
|
9 #
|
|
10 # This software consists of voluntary contributions made by many
|
|
11 # individuals. For the exact contribution history, see the revision
|
|
12 # history and logs, available at http://projects.edgewall.com/trac/.
|
|
13
|
|
14 from xml.parsers import expat
|
|
15 try:
|
|
16 frozenset
|
|
17 except NameError:
|
|
18 from sets import ImmutableSet as frozenset
|
|
19 import HTMLParser as html
|
|
20 import htmlentitydefs
|
|
21 import re
|
|
22 from StringIO import StringIO
|
|
23
|
|
24 from markup.core import Attributes, Markup, QName, Stream
|
|
25
|
|
26
|
|
class XMLParser(object):
    """Generator-based XML parser built on top of Expat.

    Based on roughly equivalent code in Kid/ElementTree. The Expat callbacks
    registered in the constructor append `(kind, data, pos)` tuples to
    `self.queue`; iterating over the parser feeds `source` to Expat chunk by
    chunk and yields the queued events.
    """

    def __init__(self, source):
        """Create a parser that reads its input from `source`.

        `source` must provide a `read(size)` method.
        """
        self.source = source
        self.queue = []

        # Set up the Expat parser; '}' is passed as the namespace separator
        expat_parser = expat.ParserCreate('utf-8', '}')
        expat_parser.buffer_text = True
        expat_parser.returns_unicode = True
        callbacks = (
            ('StartElementHandler', self._handle_start),
            ('EndElementHandler', self._handle_end),
            ('CharacterDataHandler', self._handle_data),
            ('XmlDeclHandler', self._handle_prolog),
            ('StartDoctypeDeclHandler', self._handle_doctype),
            ('StartNamespaceDeclHandler', self._handle_start_ns),
            ('EndNamespaceDeclHandler', self._handle_end_ns),
            ('ProcessingInstructionHandler', self._handle_pi),
            ('CommentHandler', self._handle_comment),
            ('DefaultHandler', self._handle_other),
        )
        for hook, callback in callbacks:
            setattr(expat_parser, hook, callback)

        # Location reporting is only supported in Python >= 2.4; without it
        # every event is reported at the placeholder position (-1, -1)
        if not hasattr(expat_parser, 'CurrentLineNumber'):
            self.getpos = self._getpos_unknown

        self.expat = expat_parser

    def __iter__(self):
        """Feed the source to Expat in 4K chunks and yield queued events."""
        chunk_size = 4 * 1024 # 4K
        finished = False
        while True:
            # Keep reading until the handlers produced events or input ends
            while not (finished or self.queue):
                chunk = self.source.read(chunk_size)
                if chunk != '':
                    self.expat.Parse(chunk, False)
                    continue
                # end of data: finalize the parse once, then drop the parser
                if hasattr(self, 'expat'):
                    self.expat.Parse('', True)
                    del self.expat # get rid of circular references
                finished = True
            for event in self.queue:
                yield event
            self.queue = []
            if finished:
                return

    def _getpos_unknown(self):
        """Return the placeholder position used when none is available."""
        return (-1, -1)

    def getpos(self):
        """Return the current `(line, column)` as reported by Expat."""
        parser = self.expat
        return parser.CurrentLineNumber, parser.CurrentColumnNumber

    def _enqueue(self, kind, data):
        """Append an event of the given kind, stamped with the position."""
        self.queue.append((kind, data, self.getpos()))

    def _handle_start(self, tag, attrib):
        self._enqueue(Stream.START,
                      (QName(tag), Attributes(attrib.items())))

    def _handle_end(self, tag):
        self._enqueue(Stream.END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(Stream.TEXT, text)

    def _handle_prolog(self, version, encoding, standalone):
        self._enqueue(Stream.PROLOG, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note the event carries pubid before sysid
        self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(Stream.START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(Stream.END_NS, prefix or '')

    def _handle_pi(self, target, data):
        self._enqueue(Stream.PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(Stream.COMMENT, text)

    def _handle_other(self, text):
        """Resolve entity references that Expat passes to the default handler.

        Text that does not start with '&' is ignored. A name found in
        `htmlentitydefs.name2codepoint` becomes a TEXT event; any other name
        raises `expat.error`.
        """
        if not text.startswith('&'):
            return
        # deal with undefined entities: strip the leading '&' and last char
        entity_name = text[1:-1]
        try:
            codepoint = htmlentitydefs.name2codepoint[entity_name]
        except KeyError:
            lineno, offset = self.getpos()
            raise expat.error("undefined entity %s: line %d, column %d" %
                              (text, lineno, offset))
        self._enqueue(Stream.TEXT, unichr(codepoint))
|
|
120
|
|
121
|
|
def XML(text):
    """Parse the XML string `text` and return its events as a `Stream`.

    The text is wrapped in a `StringIO`, run through `XMLParser`, and the
    resulting events are collected into a list before being wrapped.
    """
    source = StringIO(text)
    events = list(XMLParser(source))
    return Stream(events)
|
|
124
|
|
125
|
|
class HTMLParser(html.HTMLParser):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags: elements that
    are still open when an enclosing element is closed, or when the input
    ends, get an END event generated for them.
    """

    # Elements that never have content; their END event is generated
    # together with the START event and explicit end tags are ignored.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source):
        """Create a parser that reads its input from `source`.

        `source` must provide a `read(size)` method.
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.queue = []
        self._open_tags = []

    def __iter__(self):
        """Feed the source to the parser in 4K chunks and yield the events.

        Once the input is exhausted, an END event is yielded for every
        element that is still open, innermost first, using the position of
        the last event that was produced.
        """
        bufsize = 4 * 1024 # 4K
        done = False
        # Tracked explicitly rather than relying on a leaked loop variable
        pos = self.getpos()
        while True:
            while not done and len(self.queue) == 0:
                data = self.source.read(bufsize)
                if data == '': # end of data
                    self.close()
                    done = True
                else:
                    self.feed(data)
            for event in self.queue:
                pos = event[2]
                yield event
            self.queue = []
            if done:
                # Popping drains the stack, so iterating again cannot emit
                # the same END events a second time
                while self._open_tags:
                    yield Stream.END, QName(self._open_tags.pop()), pos
                break

    def handle_starttag(self, tag, attrib):
        """Queue a START event; empty elements are closed immediately."""
        pos = self.getpos()
        self.queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
        if tag in self._EMPTY_ELEMS:
            self.queue.append((Stream.END, QName(tag), pos))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        """Queue END events for `tag` and any elements left open inside it.

        End tags of empty elements are ignored because their END event was
        already generated by `handle_starttag`. A stray end tag that matches
        no open element is ignored as well: closing every open element and
        emitting an END for an element that was never started would leave
        the stream unbalanced.
        """
        if tag in self._EMPTY_ELEMS:
            return
        wanted = tag.lower()
        if wanted not in [name.lower() for name in self._open_tags]:
            return
        pos = self.getpos()
        while self._open_tags:
            open_tag = self._open_tags.pop()
            if open_tag.lower() == wanted:
                break
            # implicitly close an element nested inside the one being closed
            self.queue.append((Stream.END, QName(open_tag), pos))
        self.queue.append((Stream.END, QName(tag), pos))

    def handle_data(self, text):
        """Queue a TEXT event for character data."""
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def handle_charref(self, name):
        """Queue a numeric character reference as `Markup` text."""
        self.queue.append((Stream.TEXT, Markup('&#%s;' % name), self.getpos()))

    def handle_entityref(self, name):
        """Queue a named entity reference as `Markup` text."""
        self.queue.append((Stream.TEXT, Markup('&%s;' % name), self.getpos()))

    def _split_pi(self, data):
        """Split raw processing instruction text into `(target, data)`.

        Only the single '?' that belongs to the closing '?>' is removed.
        A PI without data yields an empty data string.
        """
        if data.endswith('?'):
            data = data[:-1]
        # positional arguments: str.split() accepts no keywords on Python 2
        parts = data.split(None, 1)
        while len(parts) < 2:
            parts.append('')
        return parts[0].strip(), parts[1].strip()

    def handle_pi(self, data):
        """Queue a PI event for a processing instruction."""
        self.queue.append((Stream.PI, self._split_pi(data), self.getpos()))

    def handle_comment(self, text):
        """Queue a COMMENT event."""
        self.queue.append((Stream.COMMENT, text, self.getpos()))
|
|
199
|
|
200
|
|
def HTML(text):
    """Parse the HTML string `text` and return its events as a `Stream`.

    The text is wrapped in a `StringIO`, run through `HTMLParser`, and the
    resulting events are collected into a list before being wrapped.
    """
    source = StringIO(text)
    events = list(HTMLParser(source))
    return Stream(events)
|