comparison markup/core.py @ 1:5479aae32f5a trunk

Initial import.
author cmlenz
date Sat, 03 Jun 2006 07:16:01 +0000
parents
children dbb08edbc615
comparison
equal deleted inserted replaced
0:5f9862282a9d 1:5479aae32f5a
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Christopher Lenz
4 # All rights reserved.
5 #
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://trac.edgewall.com/license.html.
9 #
10 # This software consists of voluntary contributions made by many
11 # individuals. For the exact contribution history, see the revision
12 # history and logs, available at http://projects.edgewall.com/trac/.
13
14 """Core classes for markup processing."""
15
16 import htmlentitydefs
17 import re
18 from StringIO import StringIO
19
20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']
21
22
class StreamEventKind(object):
    """Marker object identifying one kind of event on an XML stream.

    Instances carry nothing but a human-readable `name`, which is also what
    `repr()` shows.
    """

    # Only the name is stored; no per-instance `__dict__` is created.
    __slots__ = ['name']

    def __init__(self, name):
        """Remember the display name of this event kind.

        @param name: the string shown when the kind is printed
        """
        self.name = name

    def __repr__(self):
        """Return the bare name given at construction time."""
        return self.name
33
34
class Stream(object):
    """Represents a stream of markup events.

    This class is basically an iterator over the events.

    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.

    Stream events are tuples of the form:

      (kind, data, position)

    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
    depends on the kind of event, and `position` is a `(line, offset)` tuple
    that contains the location of the original element or text in the input.
    """
    __slots__ = ['events']

    START = StreamEventKind('start') # a start tag
    END = StreamEventKind('end') # an end tag
    TEXT = StreamEventKind('text') # literal text
    EXPR = StreamEventKind('expr') # an expression
    SUB = StreamEventKind('sub') # a "subprogram"
    PROLOG = StreamEventKind('prolog') # XML prolog
    DOCTYPE = StreamEventKind('doctype') # doctype declaration
    START_NS = StreamEventKind('start-ns') # start namespace mapping
    END_NS = StreamEventKind('end-ns') # end namespace mapping
    PI = StreamEventKind('pi') # processing instruction
    COMMENT = StreamEventKind('comment') # comment

    def __init__(self, events):
        """Initialize the stream with a sequence of markup events.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Return a string representation of the stream.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        retval = u''.join(self.serialize(method=method, **kwargs))
        if encoding is not None:
            # Honour the requested encoding rather than hard-coding UTF-8.
            return retval.encode(encoding)
        return retval

    def select(self, path):
        """Return a new stream that contains the events matching the given
        XPath expression.

        @param path: a string containing the XPath expression
        """
        # Imported lazily, inside the method, as in the rest of this class.
        from markup.path import Path
        path = Path(path)
        return path.select(self)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        Unlike the `render()` method, this method returns the serialized
        output incrementally (whatever the serializer's own `serialize()`
        yields), as opposed to returning a single string.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass

        Any additional keyword arguments are passed to the serializer's
        constructor. An unknown method name raises `KeyError`.
        """
        from markup import output
        cls = method
        if isinstance(method, basestring):
            cls = {'xml': output.XMLSerializer,
                   'html': output.HTMLSerializer}[method]
        else:
            # Custom serializers are checked against the base class of the
            # `output` module imported above.
            assert issubclass(cls, output.Serializer)
        serializer = cls(**kwargs)
        return serializer.serialize(self)

    def __str__(self):
        """Return the stream rendered with the default method and encoding."""
        return self.render()

    def __unicode__(self):
        """Return the stream rendered as an unencoded `unicode` object."""
        return self.render(encoding=None)
129
130
class Attributes(list):
    """A list of `(name, value)` attribute pairs with lookup by name.

    Names passed to the constructor are converted to `QName` instances; the
    order of the pairs is preserved.
    """

    def __init__(self, attrib=None):
        """Fill the list from an iterable of `(name, value)` pairs.

        @param attrib: the initial pairs; `None` or an empty value gives an
                       empty attribute list
        """
        pairs = [(QName(key), val) for key, val in (attrib or [])]
        list.__init__(self, pairs)

    def __contains__(self, name):
        """Tell whether an attribute with the given name is present."""
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def get(self, name, default=None):
        """Return the value of the first attribute called `name`.

        @param name: the attribute name to look for
        @param default: what to return when no such attribute exists
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def set(self, name, value):
        """Assign `value` to the attribute `name`.

        The first existing pair with that name is updated in place, keeping
        its position; when there is none, a new pair is appended.
        """
        for position, (attr, _) in enumerate(self):
            if attr == name:
                self[position] = (attr, value)
                return
        self.append((QName(name), value))
152
153
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.
    """

    def __new__(cls, text='', *args):
        """Create the markup string.

        Any additional positional arguments are escaped and then interpolated
        into `text` with the `%` operator.
        """
        if args:
            text %= tuple([Markup.escape(arg) for arg in args])
        return unicode.__new__(cls, text)

    def __add__(self, other):
        """Concatenate, escaping `other` unless it already is `Markup`."""
        return Markup(unicode(self) + Markup.escape(other))

    def __mod__(self, args):
        """Interpolate `args` into the markup, escaping each argument first.

        A single non-sequence argument is treated as a one-element sequence.
        """
        if not isinstance(args, (list, tuple)):
            args = [args]
        return Markup(unicode.__mod__(self,
                                      tuple([Markup.escape(arg)
                                             for arg in args])))

    def __mul__(self, num):
        """Repeat the markup `num` times, keeping the `Markup` type."""
        return Markup(unicode(self) * num)

    def join(self, seq):
        """Join the items of `seq` with this markup as separator; every item
        is escaped unless it already is `Markup`.
        """
        return Markup(unicode(self).join([Markup.escape(item) for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent Unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left
        intact, and unknown named entities are kept as they are; otherwise
        unknown named entities are reduced to their bare name.

        Numeric references that do not denote a representable code point are
        left untouched.
        """
        def _replace_entity(match):
            if match.group(1): # numeric entity
                ref = match.group(1)
                try:
                    # The pattern admits both `&#x..;` and `&#X..;`.
                    if ref[0] in 'xX':
                        codepoint = int(ref[1:], 16)
                    else:
                        codepoint = int(ref, 10)
                    return unichr(codepoint)
                except (ValueError, OverflowError):
                    # Out-of-range reference: pass it through unchanged
                    # instead of aborting the whole substitution.
                    return match.group(0)
            else: # character entity
                ref = match.group(2)
                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                    return '&%s;' % ref
                try:
                    codepoint = htmlentitydefs.name2codepoint[ref]
                    return unichr(codepoint)
                except KeyError:
                    if keepxmlentities:
                        return '&%s;' % ref
                    else:
                        return ref
        return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                             _replace_entity, self))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(re.sub(r'<[^>]*?>', '', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        Text that already is an instance of this class is returned unchanged.
        """
        if isinstance(text, cls):
            return text
        text = unicode(text)
        if not text:
            return cls()
        # The ampersand must be replaced first so that the entities produced
        # by the later replacements are not escaped a second time.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
        if not self:
            return u''
        # `&amp;` is handled last so that e.g. `&amp;lt;` becomes `&lt;`
        # rather than `<`.
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Returns the text as a `unicode` with all entities and tags removed.

        If `keeplinebreaks` is false, newline characters are replaced by
        spaces.
        """
        text = unicode(self.striptags().stripentities())
        if not keeplinebreaks:
            text = text.replace('\n', ' ')
        return text

    def sanitize(self):
        """Return a `Stream` of the markup parsed as HTML and passed through
        an `HTMLSanitizer` filter.

        Entities other than the core XML ones are resolved before parsing.
        """
        from markup.filters import HTMLSanitizer
        from markup.input import HTMLParser
        sanitize = HTMLSanitizer()
        text = self.stripentities(keepxmlentities=True)
        return Stream(sanitize(HTMLParser(StringIO(text)), None))
255
256
# Module-level shortcut for the `Markup.escape` class method.
escape = Markup.escape


def unescape(text):
    """Undo the escaping of &, <, > and \" for `Markup` input.

    Anything that is not a `Markup` instance is handed back untouched;
    `Markup` instances are converted through their own `unescape()` method.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
264
265
class Namespace(object):
    """Factory for qualified names that share one namespace URI.

    Both `ns['name']` and `ns.name` return `QName(uri + '}' + name)`.
    """

    def __init__(self, uri):
        """Remember the namespace URI.

        @param uri: the namespace URI as a string
        """
        self.uri = uri

    def __getitem__(self, name):
        """Return the `QName` for `name` within this namespace."""
        return QName(self.uri + '}' + name)

    def __getattr__(self, name):
        """Attribute-style shortcut for `__getitem__`.

        Special (double-underscore) names and `uri` itself are not turned
        into qualified names. `__getattr__` is only reached when normal
        lookup has failed, so answering those would make protocol probes
        such as `__getstate__`/`__setstate__` appear to be supported and
        would recurse endlessly when `uri` has not been set yet.
        """
        if name == 'uri' or (name.startswith('__') and name.endswith('__')):
            raise AttributeError(name)
        return self[name]

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri

    def __unicode__(self):
        return unicode(self.uri)
284
285
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create a qualified name.

        @param qname: either a plain local name, or a namespaced name given
                      as `namespace}localname` or `{namespace}localname`;
                      an existing `QName` is returned as is
        """
        if isinstance(qname, QName):
            return qname

        # Accept the documented `{namespace}localname` spelling too; without
        # this the value would end up with a doubled opening brace.
        qname = qname.lstrip('{')
        parts = qname.split('}', 1)
        if len(parts) > 1 and parts[0]:
            # A closing brace preceded by a non-empty namespace URI.
            self = unicode.__new__(cls, '{' + qname)
            self.namespace = parts[0]
            self.localname = parts[1]
        else:
            self = unicode.__new__(cls, qname)
            self.namespace = None
            self.localname = qname
        return self
Copyright (C) 2012-2017 Edgewall Software