annotate markup/input.py @ 1:5479aae32f5a trunk

Initial import.
author cmlenz
date Sat, 03 Jun 2006 07:16:01 +0000
parents
children b4d17897d053
rev   line source
1
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
1 # -*- coding: utf-8 -*-
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
2 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
3 # Copyright (C) 2006 Christopher Lenz
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
4 # All rights reserved.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
5 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
6 # This software is licensed as described in the file COPYING, which
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
7 # you should have received as part of this distribution. The terms
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
8 # are also available at http://trac.edgewall.com/license.html.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
9 #
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
10 # This software consists of voluntary contributions made by many
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
11 # individuals. For the exact contribution history, see the revision
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
12 # history and logs, available at http://projects.edgewall.com/trac/.
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
13
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
14 from xml.parsers import expat
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
15 try:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
16 frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
17 except NameError:
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
18 from sets import ImmutableSet as frozenset
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
19 import HTMLParser as html
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
20 import htmlentitydefs
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
21 import re
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
22 from StringIO import StringIO
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
23
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
24 from markup.core import Attributes, Markup, QName, Stream
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
25
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
26
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    The parser wraps an Expat parser. Iterating over an instance reads the
    `source` file-like object in chunks, feeds them to Expat, and yields the
    events that the Expat callbacks have queued up. Every event is a tuple of
    the form `(kind, data, pos)`, where `pos` is whatever `getpos()` returned
    when the callback ran.
    """

    def __init__(self, source):
        """Set up an Expat parser for the given input.

        `source` is an object with a `read(size)` method that returns an
        empty string once the input is exhausted.
        """
        self.source = source

        # Create the Expat parser; '}' separates namespace URI and local name
        # in the qualified names Expat reports
        parser = expat.ParserCreate('utf-8', '}')
        parser.buffer_text = True
        parser.returns_unicode = True

        # Route every Expat callback to the corresponding method
        callbacks = [
            ('StartElementHandler', self._handle_start),
            ('EndElementHandler', self._handle_end),
            ('CharacterDataHandler', self._handle_data),
            ('XmlDeclHandler', self._handle_prolog),
            ('StartDoctypeDeclHandler', self._handle_doctype),
            ('StartNamespaceDeclHandler', self._handle_start_ns),
            ('EndNamespaceDeclHandler', self._handle_end_ns),
            ('ProcessingInstructionHandler', self._handle_pi),
            ('CommentHandler', self._handle_comment),
            ('DefaultHandler', self._handle_other),
        ]
        for attr_name, callback in callbacks:
            setattr(parser, attr_name, callback)

        # Location reporting is only supported in Python >= 2.4
        if not hasattr(parser, 'CurrentLineNumber'):
            self.getpos = self._getpos_unknown

        self.expat = parser
        self.queue = []

    def __iter__(self):
        """Parse the source incrementally and yield the resulting events."""
        chunk_size = 4 * 1024 # 4K
        finished = False
        while True:
            # Keep feeding input until at least one event is available or the
            # source has run dry
            while not (finished or self.queue):
                chunk = self.source.read(chunk_size)
                if chunk != '':
                    self.expat.Parse(chunk, False)
                    continue
                # End of data: finalize the parse exactly once
                if hasattr(self, 'expat'):
                    self.expat.Parse('', True)
                    del self.expat # get rid of circular references
                finished = True
            for event in self.queue:
                yield event
            self.queue = []
            if finished:
                break

    def _getpos_unknown(self):
        """Stand-in for `getpos()` when Expat cannot report locations."""
        return (-1, -1)

    def getpos(self):
        """Return the `(line, column)` the Expat parser is currently at."""
        parser = self.expat
        return parser.CurrentLineNumber, parser.CurrentColumnNumber

    def _handle_start(self, tag, attrib):
        """Queue a `START` event carrying the tag name and its attributes."""
        data = (QName(tag), Attributes(attrib.items()))
        self.queue.append((Stream.START, data, self.getpos()))

    def _handle_end(self, tag):
        """Queue an `END` event for the given tag name."""
        self.queue.append((Stream.END, QName(tag), self.getpos()))

    def _handle_data(self, text):
        """Queue a `TEXT` event for character data."""
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def _handle_prolog(self, version, encoding, standalone):
        """Queue a `PROLOG` event for the XML declaration."""
        data = (version, encoding, standalone)
        self.queue.append((Stream.PROLOG, data, self.getpos()))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        """Queue a `DOCTYPE` event.

        Note that the event data is ordered `(name, pubid, sysid)`, which
        differs from the order of the callback arguments.
        """
        data = (name, pubid, sysid)
        self.queue.append((Stream.DOCTYPE, data, self.getpos()))

    def _handle_start_ns(self, prefix, uri):
        """Queue a `START_NS` event; a missing prefix becomes ''."""
        data = (prefix or '', uri)
        self.queue.append((Stream.START_NS, data, self.getpos()))

    def _handle_end_ns(self, prefix):
        """Queue an `END_NS` event; a missing prefix becomes ''."""
        self.queue.append((Stream.END_NS, prefix or '', self.getpos()))

    def _handle_pi(self, target, data):
        """Queue a `PI` event for a processing instruction."""
        self.queue.append((Stream.PI, (target, data), self.getpos()))

    def _handle_comment(self, text):
        """Queue a `COMMENT` event."""
        self.queue.append((Stream.COMMENT, text, self.getpos()))

    def _handle_other(self, text):
        """Handle input that no other callback has claimed.

        Only text starting with '&' is processed: it is looked up as an HTML
        entity name and queued as a `TEXT` event containing the corresponding
        character. Unknown names raise `expat.error`. Anything else is
        silently dropped.
        """
        if not text.startswith('&'):
            return
        # deal with undefined entities
        try:
            # Strip the leading '&' and the trailing character to get the name
            codepoint = htmlentitydefs.name2codepoint[text[1:-1]]
            text = unichr(codepoint)
            self.queue.append((Stream.TEXT, text, self.getpos()))
        except KeyError:
            lineno, offset = self.getpos()
            message = "undefined entity %s: line %d, column %d" % (
                text, lineno, offset)
            raise expat.error(message)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
120
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
121
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def XML(text):
    """Parse the XML string `text` and return its events as a `Stream`.

    The whole input is parsed eagerly with `XMLParser` before the stream is
    created.
    """
    parser = XMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
124
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
125
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
class HTMLParser(html.HTMLParser):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.

    Iterating over an instance reads the `source` file-like object in chunks
    and yields `(kind, data, pos)` tuples. Elements that are still open when
    the input ends are closed implicitly, and end tags that do not match any
    open element are ignored.
    """

    # Elements that never have content; their END event is generated together
    # with the START event, and explicit end tags for them are ignored
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source):
        """Initialize the parser for the given input.

        `source` is an object with a `read(size)` method that returns an
        empty string once the input is exhausted.
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.queue = []
        self._open_tags = [] # names of the currently open elements

    def __iter__(self):
        """Parse the source incrementally and yield the resulting events."""
        bufsize = 4 * 1024 # 4K
        done = False
        # Position of the most recently yielded event; the implicit END
        # events generated at the end of the input reuse it
        pos = self.getpos()
        while True:
            # Keep feeding input until at least one event is available or the
            # source has run dry
            while not done and len(self.queue) == 0:
                data = self.source.read(bufsize)
                if not data: # end of data
                    self.close()
                    done = True
                else:
                    self.feed(data)
            for event in self.queue:
                pos = event[2]
                yield event
            self.queue = []
            if done:
                # Close whatever is still open, innermost element first.
                # Popping also leaves the stack empty, so the same END events
                # cannot be produced a second time.
                while self._open_tags:
                    yield Stream.END, QName(self._open_tags.pop()), pos
                break

    def handle_starttag(self, tag, attrib):
        """Queue a `START` event, plus the `END` event for empty elements."""
        pos = self.getpos()
        self.queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
        if tag in self._EMPTY_ELEMS:
            self.queue.append((Stream.END, QName(tag), pos))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        """Queue `END` events for `tag` and any elements left open inside it.

        End tags of empty elements and end tags that do not correspond to an
        open element are ignored. Treating such a stray end tag as a real one
        would close every open element and emit an `END` event that has no
        matching `START`.
        """
        if tag in self._EMPTY_ELEMS:
            return
        wanted = tag.lower()
        if not [name for name in self._open_tags if name.lower() == wanted]:
            return # stray end tag, nothing to close
        pos = self.getpos()
        while self._open_tags:
            open_tag = self._open_tags.pop()
            # Use the name recorded for the start tag so that the START and
            # END events of an element always carry the same name
            self.queue.append((Stream.END, QName(open_tag), pos))
            if open_tag.lower() == wanted:
                break

    def handle_data(self, text):
        """Queue a `TEXT` event for character data."""
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def handle_charref(self, name):
        """Queue a numeric character reference as a `Markup` text event."""
        self.queue.append((Stream.TEXT, Markup('&#%s;' % name), self.getpos()))

    def handle_entityref(self, name):
        """Queue a named entity reference as a `Markup` text event."""
        self.queue.append((Stream.TEXT, Markup('&%s;' % name), self.getpos()))

    def handle_pi(self, data):
        """Queue a `PI` event with the data split into target and content.

        Trailing '?' characters, as found in XML-style processing
        instructions, are removed. A processing instruction that consists of
        a target only produces an empty content string.
        """
        data = data.rstrip('?')
        # `str.split()` does not accept keyword arguments in Python 2, so the
        # separator and the split limit are passed positionally
        parts = data.split(None, 1)
        if len(parts) == 2:
            target, data = parts
        elif parts:
            target, data = parts[0], ''
        else:
            target, data = '', ''
        self.queue.append((Stream.PI, (target.strip(), data.strip()),
                           self.getpos()))

    def handle_comment(self, text):
        """Queue a `COMMENT` event."""
        self.queue.append((Stream.COMMENT, text, self.getpos()))
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
199
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
200
5479aae32f5a Initial import.
cmlenz
parents:
diff changeset
def HTML(text):
    """Parse the HTML string `text` and return its events as a `Stream`.

    The whole input is parsed eagerly with `HTMLParser` before the stream is
    created.
    """
    parser = HTMLParser(StringIO(text))
    events = list(parser)
    return Stream(events)
Copyright (C) 2012-2017 Edgewall Software