1
|
1 # -*- coding: utf-8 -*-
|
|
2 #
|
|
3 # Copyright (C) 2006 Christopher Lenz
|
|
4 # All rights reserved.
|
|
5 #
|
|
6 # This software is licensed as described in the file COPYING, which
|
|
7 # you should have received as part of this distribution. The terms
|
|
8 # are also available at http://trac.edgewall.com/license.html.
|
|
9 #
|
|
10 # This software consists of voluntary contributions made by many
|
|
11 # individuals. For the exact contribution history, see the revision
|
|
12 # history and logs, available at http://projects.edgewall.com/trac/.
|
|
13
|
|
14 """Core classes for markup processing."""
|
|
15
|
|
16 import htmlentitydefs
|
|
17 import re
|
|
18 from StringIO import StringIO
|
|
19
|
|
# Public names exported by a star-import of this module.
__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']
|
|
21
|
|
22
|
|
class StreamEventKind(object):
    """Marker object identifying one kind of event in a markup stream.

    An instance carries nothing but a short name, which is also what
    `repr()` of the instance returns.
    """

    __slots__ = ['name']

    def __init__(self, name):
        """Remember the label of this event kind.

        @param name: the short label of the event kind (such as `'start'`)
        """
        self.name = name

    def __repr__(self):
        """Return the bare name that was given at construction time."""
        return self.name
|
|
33
|
|
34
|
|
class Stream(object):
    """Represents a stream of markup events.

    This class is basically an iterator over the events.

    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.

    Stream events are tuples of the form:

      (kind, data, position)

    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
    depends on the kind of event, and `position` is a `(line, offset)` tuple
    that contains the location of the original element or text in the input.
    """
    __slots__ = ['events']

    START = StreamEventKind('start') # a start tag
    END = StreamEventKind('end') # an end tag
    TEXT = StreamEventKind('text') # literal text
    EXPR = StreamEventKind('expr') # an expression
    SUB = StreamEventKind('sub') # a "subprogram"
    PROLOG = StreamEventKind('prolog') # XML prolog
    DOCTYPE = StreamEventKind('doctype') # doctype declaration
    START_NS = StreamEventKind('start-ns') # start namespace mapping
    END_NS = StreamEventKind('end-ns') # end namespace mapping
    PI = StreamEventKind('pi') # processing instruction
    COMMENT = StreamEventKind('comment') # comment

    def __init__(self, events):
        """Initialize the stream with a sequence of markup events.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Return a string representation of the stream.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        retval = u''.join(self.serialize(method=method, **kwargs))
        if encoding is not None:
            # Use the encoding the caller asked for rather than a fixed one.
            return retval.encode(encoding)
        return retval

    def select(self, path):
        """Return a new stream that contains the events matching the given
        XPath expression.

        @param path: a string containing the XPath expression
        """
        # Imported lazily, inside the method, as in the rest of this class.
        from markup.path import Path
        return Path(path).select(self)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        Unlike the `render()` method, this method returns the serialized
        output incrementally (whatever iterable the serializer produces), as
        opposed to returning a single string.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass

        Any additional keyword arguments are passed to the serializer's
        constructor. An unknown method name raises `KeyError`.
        """
        from markup import output
        cls = method
        if isinstance(method, basestring):
            cls = {'xml': output.XMLSerializer,
                   'html': output.HTMLSerializer}[method]
        else:
            # NOTE(review): assumes the serializer base class is exposed as
            # `markup.output.Serializer`, next to the XML/HTML serializers
            # looked up above -- confirm against that module.
            assert issubclass(cls, output.Serializer)
        serializer = cls(**kwargs)
        return serializer.serialize(self)

    def __str__(self):
        """Return the stream rendered with the default method and encoding."""
        return self.render()

    def __unicode__(self):
        """Return the stream rendered as an unencoded `unicode` object."""
        return self.render(encoding=None)
|
|
129
|
|
130
|
|
class Attributes(list):
    """Ordered list of `(name, value)` attribute pairs.

    Names passed to the constructor, or added through `set()`, are converted
    to `QName` instances. Lookups compare the stored names against the given
    name using `==`.
    """

    def __init__(self, attrib=None):
        """Initialize the list from `(name, value)` pairs.

        @param attrib: an iterable of `(name, value)` pairs, or `None` to
                       create an empty attribute list
        """
        list.__init__(self, [(QName(name), value)
                             for name, value in (attrib or [])])

    def __contains__(self, name):
        """Return whether an attribute with the given name is present."""
        # Stop at the first hit instead of building a list of all names.
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def get(self, name, default=None):
        """Return the value of the first attribute with the given name.

        @param name: the attribute name to look for
        @param default: the value to return if there is no such attribute
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def set(self, name, value):
        """Set the value of an attribute.

        The first existing attribute with a matching name is updated in
        place, keeping its position and its stored name; if there is none,
        a new `(QName(name), value)` pair is appended.

        @param name: the attribute name
        @param value: the new attribute value
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (attr, value)
                break
        else:
            self.append((QName(name), value))
|
|
152
|
|
153
|
|
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.
    """
    def __new__(cls, text='', *args):
        """Create the markup string, optionally interpolating escaped values.

        @param text: the markup text; used as a `%` format string when any
                     `args` are given
        @param args: values that are escaped and then substituted into `text`
        """
        if args:
            text %= tuple([Markup.escape(arg) for arg in args])
        return unicode.__new__(cls, text)

    def __add__(self, other):
        """Return this markup followed by the escaped form of `other`."""
        return Markup(unicode(self) + Markup.escape(other))

    def __mod__(self, args):
        """Apply `%` formatting, escaping every argument first.

        @param args: a list or tuple of values, or a single value
        """
        if not isinstance(args, (list, tuple)):
            args = [args]
        return Markup(unicode.__mod__(self,
                                      tuple([Markup.escape(arg)
                                             for arg in args])))

    def __mul__(self, num):
        """Return the markup repeated `num` times."""
        return Markup(unicode(self) * num)

    def join(self, seq):
        """Join the escaped items of `seq`, using this markup as separator."""
        return Markup(unicode(self).join([Markup.escape(item) for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent Unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (`&amp;`, `&apos;`, `&gt;`, `&lt;` and `&quot;`)
        are left intact.

        Numeric references that do not denote a valid character are left
        untouched. Unknown named entities are reduced to their bare name, or,
        with `keepxmlentities`, kept with the ampersand escaped.
        """
        def _replace_entity(match):
            if match.group(1): # numeric entity
                ref = match.group(1)
                try:
                    # The pattern accepts both `&#x..;` and `&#X..;`.
                    if ref[0] in 'xX':
                        codepoint = int(ref[1:], 16)
                    else:
                        codepoint = int(ref, 10)
                    return unichr(codepoint)
                except (ValueError, OverflowError):
                    # Not a representable character: keep the reference.
                    return match.group(0)
            else: # character entity
                ref = match.group(2)
                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                    return '&%s;' % ref
                try:
                    codepoint = htmlentitydefs.name2codepoint[ref]
                    return unichr(codepoint)
                except KeyError:
                    if keepxmlentities:
                        # Escape the ampersand so the unknown entity does not
                        # end up as an undefined reference in the output.
                        return '&amp;%s;' % ref
                    else:
                        return ref
        return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                             _replace_entity, self))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(re.sub(r'<[^>]*?>', '', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and ").

        If the `quotes` parameter is set to `False`, the " character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        A value that already is an instance of this class is returned
        unchanged.
        """
        if isinstance(text, cls):
            return text
        text = unicode(text)
        if not text:
            return cls()
        # The ampersand must be replaced first, so that the entities produced
        # by the later replacements are not escaped a second time.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse the replacements made by `escape()` and return the result
        as a plain `unicode` object.
        """
        if not self:
            return u''
        # `&amp;` is handled last so that text such as `&amp;lt;` comes out
        # as `&lt;` rather than `<`.
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Return the text as a `unicode` with all entities and tags removed.

        @param keeplinebreaks: if false, every newline is replaced by a space
        """
        text = unicode(self.striptags().stripentities())
        if not keeplinebreaks:
            text = text.replace('\n', ' ')
        return text

    def sanitize(self):
        """Parse the markup as HTML and return a `Stream` of the events that
        result from passing it through `HTMLSanitizer`.

        Entities other than the core XML ones are resolved before parsing.
        """
        from markup.filters import HTMLSanitizer
        from markup.input import HTMLParser
        sanitize = HTMLSanitizer()
        text = self.stripentities(keepxmlentities=True)
        return Stream(sanitize(HTMLParser(StringIO(text)), None))
|
|
255
|
|
256
|
|
# Module-level shortcut for the `Markup.escape` class method.
escape = Markup.escape
|
|
258
|
|
def unescape(text):
    """Undo the escaping of a `Markup` instance.

    A `Markup` instance is converted through its own `unescape()` method;
    any other value is handed back untouched.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
|
|
264
|
|
265
|
|
class Namespace(object):
    """Factory for qualified names that share one namespace URI.

    Both item access (`ns['name']`) and attribute access (`ns.name`) produce
    a `QName` built from the namespace URI and the given local name.
    """

    def __init__(self, uri):
        """Store the namespace URI.

        @param uri: the namespace URI
        """
        self.uri = uri

    def __getitem__(self, name):
        """Return the `QName` for the given local name in this namespace."""
        qualified = self.uri + '}' + name
        return QName(qualified)

    # Attribute access uses the very same lookup. `__getattr__` is only
    # consulted for attributes that are not found the normal way, so the
    # `uri` attribute set in the constructor is not affected.
    __getattr__ = __getitem__

    def __repr__(self):
        """Return a debugging representation that shows the URI."""
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        """Return the namespace URI as stored."""
        return self.uri

    def __unicode__(self):
        """Return the namespace URI converted to `unicode`."""
        return unicode(self.uri)
|
|
284
|
|
285
|
|
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the qualified name.

        @param qname: the name, either as `namespace}localname`, in the
                      canonical `{namespace}localname` form, or as a plain
                      local name without namespace; an existing `QName` is
                      returned as is
        """
        if isinstance(qname, QName):
            return qname

        # Tolerate the leading brace of the canonical form, so that the
        # string value of a QName can be turned back into an equal QName
        # instead of gaining a second brace.
        stripped = qname.lstrip('{')
        parts = stripped.split('}', 1)
        if stripped.find('}') > 0:
            self = unicode.__new__(cls, '{' + stripped)
            self.namespace = parts[0]
            self.localname = parts[1]
        else:
            # No namespace part: the whole value is the local name.
            self = unicode.__new__(cls, qname)
            self.namespace = None
            self.localname = qname
        return self
|