comparison markup/input.py @ 1:5479aae32f5a trunk

Initial import.
author cmlenz
date Sat, 03 Jun 2006 07:16:01 +0000
parents
children b4d17897d053
comparison
equal deleted inserted replaced
0:5f9862282a9d 1:5479aae32f5a
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Christopher Lenz
4 # All rights reserved.
5 #
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://trac.edgewall.com/license.html.
9 #
10 # This software consists of voluntary contributions made by many
11 # individuals. For the exact contribution history, see the revision
12 # history and logs, available at http://projects.edgewall.com/trac/.
13
14 from xml.parsers import expat
15 try:
16 frozenset
17 except NameError:
18 from sets import ImmutableSet as frozenset
19 import HTMLParser as html
20 import htmlentitydefs
21 import re
22 from StringIO import StringIO
23
24 from markup.core import Attributes, Markup, QName, Stream
25
26
class XMLParser(object):
    """Generator-based XML parser built on Expat, modelled on roughly
    equivalent code in Kid/ElementTree.

    Iterating over an instance reads `source` in chunks and yields
    `(kind, data, pos)` event tuples, where `pos` is whatever `getpos()`
    returned at the moment the event was recorded.
    """

    def __init__(self, source):
        """Create a parser that reads from `source`.

        `source` must provide a `read(size)` method that returns `''` once
        the input is exhausted.
        """
        self.source = source
        self.queue = []

        # Create the Expat parser ('}' is passed as the namespace separator)
        # and route each of its callbacks to the matching handler method
        backend = expat.ParserCreate('utf-8', '}')
        backend.buffer_text = True
        backend.returns_unicode = True
        callbacks = [
            ('StartElementHandler', self._handle_start),
            ('EndElementHandler', self._handle_end),
            ('CharacterDataHandler', self._handle_data),
            ('XmlDeclHandler', self._handle_prolog),
            ('StartDoctypeDeclHandler', self._handle_doctype),
            ('StartNamespaceDeclHandler', self._handle_start_ns),
            ('EndNamespaceDeclHandler', self._handle_end_ns),
            ('ProcessingInstructionHandler', self._handle_pi),
            ('CommentHandler', self._handle_comment),
            ('DefaultHandler', self._handle_other),
        ]
        for hook, method in callbacks:
            setattr(backend, hook, method)

        # Location reporting is only supported in Python >= 2.4
        if not hasattr(backend, 'CurrentLineNumber'):
            self.getpos = self._getpos_unknown

        self.expat = backend

    def __iter__(self):
        """Feed the source to Expat chunk by chunk, yielding queued events."""
        chunk_size = 4 * 1024 # 4K
        exhausted = False
        while not exhausted:
            # Keep parsing until at least one event is available or the
            # input has run out
            while not (exhausted or self.queue):
                chunk = self.source.read(chunk_size)
                if chunk != '':
                    self.expat.Parse(chunk, False)
                    continue
                # End of data: finalize the parse exactly once
                if hasattr(self, 'expat'):
                    self.expat.Parse('', True)
                    del self.expat # get rid of circular references
                exhausted = True
            for item in self.queue:
                yield item
            self.queue = []

    def _getpos_unknown(self):
        """Stand-in for `getpos` when Expat cannot report locations."""
        return (-1, -1)

    def getpos(self):
        """Return the current `(line, column)` as reported by Expat."""
        parser = self.expat
        return parser.CurrentLineNumber, parser.CurrentColumnNumber

    def _enqueue(self, kind, data):
        """Append a `(kind, data, pos)` event for the current position."""
        self.queue.append((kind, data, self.getpos()))

    def _handle_start(self, tag, attrib):
        self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))

    def _handle_end(self, tag):
        self._enqueue(Stream.END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(Stream.TEXT, text)

    def _handle_prolog(self, version, encoding, standalone):
        self._enqueue(Stream.PROLOG, (version, encoding, standalone))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        # Note the reordering: the event carries (name, pubid, sysid)
        self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(Stream.START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(Stream.END_NS, prefix or '')

    def _handle_pi(self, target, data):
        self._enqueue(Stream.PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(Stream.COMMENT, text)

    def _handle_other(self, text):
        """Resolve entity references Expat did not handle itself.

        Text that does not start with '&' is dropped. Names found in the
        HTML entity table become `TEXT` events; any other name raises
        `expat.error`.
        """
        if not text.startswith('&'):
            return
        # deal with undefined entities
        try:
            codepoint = htmlentitydefs.name2codepoint[text[1:-1]]
        except KeyError:
            lineno, offset = self.getpos()
            raise expat.error("undefined entity %s: line %d, column %d" %
                              (text, lineno, offset))
        self._enqueue(Stream.TEXT, unichr(codepoint))
120
121
def XML(text):
    """Parse the XML string `text` and return its events as a `Stream`."""
    events = list(XMLParser(StringIO(text)))
    return Stream(events)
124
125
class HTMLParser(html.HTMLParser):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags: elements still
    open when an enclosing element is closed, or when the input ends, get an
    `END` event synthesized for them, and end tags that match no open
    element are ignored.
    """

    # Elements that never have content; their END event is generated
    # together with the START event, and explicit end tags are ignored
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source):
        """Create a parser that reads from `source`.

        `source` must provide a `read(size)` method that returns an empty
        string once the input is exhausted.
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.queue = []
        self._open_tags = []

    def __iter__(self):
        """Feed the source to the parser in chunks, yielding
        `(kind, data, pos)` events, then close any elements left open."""
        bufsize = 4 * 1024 # 4K
        done = False
        # Position used for END events synthesized at end of input; it is
        # updated to the position of the most recently yielded event
        pos = self.getpos()
        while True:
            while not done and not self.queue:
                chunk = self.source.read(bufsize)
                if not chunk: # end of data
                    self.close()
                    done = True
                else:
                    self.feed(chunk)
            for kind, data, pos in self.queue:
                yield kind, data, pos
            self.queue = []
            if done:
                # Close whatever is still open, innermost element first.
                # Popping (rather than reversing in place) leaves the stack
                # empty, so nothing is emitted twice on a later iteration.
                while self._open_tags:
                    yield Stream.END, QName(self._open_tags.pop()), pos
                break

    def handle_starttag(self, tag, attrib):
        pos = self.getpos()
        self.queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
        if tag in self._EMPTY_ELEMS:
            # Empty elements are closed right away and never tracked
            self.queue.append((Stream.END, QName(tag), pos))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if tag in self._EMPTY_ELEMS:
            # Already closed by handle_starttag
            return
        wanted = tag.lower()
        if wanted not in [open_tag.lower() for open_tag in self._open_tags]:
            # Stray end tag: emitting END for an element that was never
            # opened (and closing everything else on the way) would produce
            # an unbalanced stream, so ignore it
            return
        pos = self.getpos()
        # Implicitly close every element nested inside the one being closed
        while self._open_tags:
            open_tag = self._open_tags.pop()
            if open_tag.lower() == wanted:
                break
            self.queue.append((Stream.END, QName(open_tag), pos))
        self.queue.append((Stream.END, QName(tag), pos))

    def handle_data(self, text):
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def handle_charref(self, name):
        # Character references are passed through unresolved, as markup
        self.queue.append((Stream.TEXT, Markup('&#%s;' % name), self.getpos()))

    def handle_entityref(self, name):
        # Entity references are passed through unresolved, as markup
        self.queue.append((Stream.TEXT, Markup('&%s;' % name), self.getpos()))

    def handle_pi(self, data):
        """Queue a `PI` event with the instruction split into target and data.

        The data part is an empty string when the processing instruction
        consists of a target only.
        """
        # Drop the '?' of a closing '?>' and split on the first whitespace
        # run. `str.split` takes its arguments positionally only on
        # Python 2, so `maxsplit` must not be passed as a keyword.
        parts = data.rstrip('?').split(None, 1)
        if len(parts) == 2:
            target, data = parts
        elif parts:
            # PI with a target but no data
            target, data = parts[0], ''
        else:
            target, data = '', ''
        self.queue.append((Stream.PI, (target.strip(), data.strip()),
                           self.getpos()))

    def handle_comment(self, text):
        self.queue.append((Stream.COMMENT, text, self.getpos()))
199
200
def HTML(text):
    """Parse the HTML string `text` and return its events as a `Stream`."""
    events = list(HTMLParser(StringIO(text)))
    return Stream(events)
Copyright (C) 2012-2017 Edgewall Software