Mercurial > genshi > mirror
comparison markup/core.py @ 1:5479aae32f5a trunk
Initial import.
author | cmlenz |
---|---|
date | Sat, 03 Jun 2006 07:16:01 +0000 |
parents | |
children | dbb08edbc615 |
comparison
equal
deleted
inserted
replaced
0:5f9862282a9d | 1:5479aae32f5a |
---|---|
1 # -*- coding: utf-8 -*- | |
2 # | |
3 # Copyright (C) 2006 Christopher Lenz | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://trac.edgewall.com/license.html. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://projects.edgewall.com/trac/. | |
13 | |
14 """Core classes for markup processing.""" | |
15 | |
16 import htmlentitydefs | |
17 import re | |
18 from StringIO import StringIO | |
19 | |
20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName'] | |
21 | |
22 | |
class StreamEventKind(object):
    """Marker object identifying one kind of event on an XML stream.

    An instance stores nothing but a short name, and that name is also what
    `repr()` returns for it.
    """

    __slots__ = ['name']

    def __init__(self, name):
        """Remember the label of this event kind.

        @param name: the short name of the event kind, for example `start`
        """
        self.name = name

    def __repr__(self):
        # The representation is just the bare name given at construction.
        return self.name
33 | |
34 | |
class Stream(object):
    """Represents a stream of markup events.

    This class is basically an iterator over the events.

    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.

    Stream events are tuples of the form:

      (kind, data, position)

    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
    depends on the kind of event, and `position` is a `(line, offset)` tuple
    that contains the location of the original element or text in the input.
    """
    __slots__ = ['events']

    START = StreamEventKind('start')        # a start tag
    END = StreamEventKind('end')            # an end tag
    TEXT = StreamEventKind('text')          # literal text
    EXPR = StreamEventKind('expr')          # an expression
    SUB = StreamEventKind('sub')            # a "subprogram"
    PROLOG = StreamEventKind('prolog')      # XML prolog
    DOCTYPE = StreamEventKind('doctype')    # doctype declaration
    START_NS = StreamEventKind('start-ns')  # start namespace mapping
    END_NS = StreamEventKind('end-ns')      # end namespace mapping
    PI = StreamEventKind('pi')              # processing instruction
    COMMENT = StreamEventKind('comment')    # comment

    def __init__(self, events):
        """Initialize the stream with a sequence of markup events.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Return a string representation of the stream.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        retval = u''.join(self.serialize(method=method, **kwargs))
        if encoding is not None:
            # Honour the requested encoding instead of always using UTF-8.
            return retval.encode(encoding)
        return retval

    def select(self, path):
        """Return a new stream that contains the events matching the given
        XPath expression.

        @param path: a string containing the XPath expression
        """
        # Imported here rather than at module level, as in the original.
        from markup.path import Path
        return Path(path).select(self)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        Unlike the `render()` method, this method returns the serialized
        output incrementally, as opposed to returning a single string.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass

        Any additional keyword arguments are passed to the serializer's
        constructor. An unknown method name raises `KeyError`.
        """
        from markup import output
        cls = method
        if isinstance(method, basestring):
            cls = {'xml': output.XMLSerializer,
                   'html': output.HTMLSerializer}[method]
        else:
            # NOTE(review): the original referenced an undefined name
            # `serializers` here; the base class is assumed to live in
            # `markup.output` next to the concrete serializers -- confirm.
            assert issubclass(cls, output.Serializer)
        serializer = cls(**kwargs)
        return serializer.serialize(self)

    def __str__(self):
        """Return the stream rendered with the default method and encoding."""
        return self.render()

    def __unicode__(self):
        """Return the stream rendered as an unencoded `unicode` object."""
        return self.render(encoding=None)
129 | |
130 | |
class Attributes(list):
    """Ordered list of `(name, value)` attribute pairs.

    Names passed to the constructor or added through `set()` are converted
    to `QName` instances.
    """

    def __init__(self, attrib=None):
        """Create the list from an optional sequence of attribute pairs.

        @param attrib: a sequence of `(name, value)` tuples, or `None` for an
                       empty attribute list
        """
        list.__init__(self, [(QName(name), value)
                             for name, value in (attrib or [])])

    def __contains__(self, name):
        """Return whether an attribute with the given name is present."""
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def get(self, name, default=None):
        """Return the value of the first attribute with the given name.

        @param name: the attribute name to look up
        @param default: the value returned when no such attribute exists
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def set(self, name, value):
        """Set the value of the attribute with the given name.

        The first attribute with that name is replaced in place; if there is
        none, a new `(QName(name), value)` pair is appended.

        @param name: the attribute name
        @param value: the new attribute value
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (attr, value)
                break
        else:
            # The loop finished without a match: add a new attribute.
            self.append((QName(name), value))
152 | |
153 | |
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.
    """

    def __new__(cls, text='', *args):
        """Create the markup string, optionally interpolating arguments.

        @param text: the markup text; used as a `%` format string when any
                     `args` are given
        @param args: values that are escaped and then interpolated into `text`
        """
        if args:
            text %= tuple([Markup.escape(arg) for arg in args])
        return unicode.__new__(cls, text)

    def __add__(self, other):
        """Concatenate with `other`, escaping it unless it is `Markup`."""
        return Markup(unicode(self) + Markup.escape(other))

    def __mod__(self, args):
        """Interpolate the escaped arguments into this markup string.

        @param args: a single value, or a list or tuple of values
        """
        if not isinstance(args, (list, tuple)):
            args = [args]
        return Markup(unicode.__mod__(self,
                                      tuple([Markup.escape(arg)
                                             for arg in args])))

    def __mul__(self, num):
        """Return this markup repeated `num` times."""
        return Markup(unicode(self) * num)

    def join(self, seq):
        """Join the items of `seq` with this markup as the separator.

        Items that are not already `Markup` instances are escaped first.
        """
        return Markup(unicode(self).join([Markup.escape(item)
                                          for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent Unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left
        intact.

        @param keepxmlentities: whether the core XML entities are preserved
        """
        def _replace_entity(match):
            if match.group(1): # numeric entity
                ref = match.group(1)
                # The pattern accepts both `x` and `X` as the hex marker.
                if ref[0] in 'xX':
                    ref = int(ref[1:], 16)
                else:
                    ref = int(ref, 10)
                return unichr(ref)
            else: # character entity
                ref = match.group(2)
                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
                                               'quot'):
                    return '&%s;' % ref
                try:
                    codepoint = htmlentitydefs.name2codepoint[ref]
                    return unichr(codepoint)
                except KeyError:
                    # Unknown entity name: keep the reference when XML
                    # entities are preserved, otherwise emit the bare name.
                    if keepxmlentities:
                        return '&%s;' % ref
                    else:
                        return ref
        return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                             _replace_entity, self))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(re.sub(r'<[^>]*?>', '', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        @param text: the value to escape; instances of this class are returned
                     unchanged, anything else is converted to `unicode` first
        @param quotes: whether double quotes are escaped as well
        """
        if isinstance(text, cls):
            return text
        text = unicode(text)
        if not text:
            return cls()
        # The ampersand must be replaced first so that the entities produced
        # by the following replacements are not escaped a second time.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&quot;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
        if not self:
            return u''
        # The ampersand is restored last so that text such as `&amp;lt;`
        # becomes `&lt;` rather than `<`.
        return unicode(self).replace('&quot;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Returns the text as a `unicode` with all entities and tags removed.

        @param keeplinebreaks: if false, newlines are replaced by spaces
        """
        text = unicode(self.striptags().stripentities())
        if not keeplinebreaks:
            text = text.replace('\n', ' ')
        return text

    def sanitize(self):
        """Return a `Stream` of this markup parsed as HTML and passed through
        the `HTMLSanitizer` filter.
        """
        from markup.filters import HTMLSanitizer
        from markup.input import HTMLParser
        sanitize = HTMLSanitizer()
        text = self.stripentities(keepxmlentities=True)
        return Stream(sanitize(HTMLParser(StringIO(text)), None))
255 | |
256 | |
escape = Markup.escape


def unescape(text):
    """Reverse-escapes &, <, > and \" and returns a `unicode` object.

    Only `Markup` instances are processed; any other value is handed back
    unchanged.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
264 | |
265 | |
class Namespace(object):
    """Builds qualified names that all share one namespace URI.

    Both item access (`ns['name']`) and attribute access (`ns.name`) produce
    a `QName` for the given local name.
    """

    def __init__(self, uri):
        """Store the namespace URI.

        @param uri: the namespace URI
        """
        self.uri = uri

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri

    def __unicode__(self):
        return unicode(self.uri)

    def __getitem__(self, name):
        """Return the `QName` for the local name `name` in this namespace."""
        qualified = self.uri + '}' + name
        return QName(qualified)

    # Attribute lookups that are not resolved otherwise build a QName too.
    __getattr__ = __getitem__
284 | |
285 | |
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create a qualified name.

        @param qname: an existing `QName` (returned as is), a plain local
                      name, or a namespaced name given either as
                      `namespace}localname` or in the full
                      `{namespace}localname` form
        """
        if isinstance(qname, QName):
            return qname

        # Accept the full `{namespace}localname` form as well, so that the
        # unicode value of a QName can be turned back into an equal QName.
        if qname[:1] == '{' and qname.find('}') > 1:
            qname = qname[1:]

        # A closing brace at index 0 would mean an empty namespace; such
        # input is treated as a plain local name, as before.
        pos = qname.find('}')
        if pos > 0:
            self = unicode.__new__(cls, '{' + qname)
            self.namespace = qname[:pos]
            self.localname = qname[pos + 1:]
        else:
            self = unicode.__new__(cls, qname)
            self.namespace = None
            self.localname = qname
        return self