comparison genshi/core.py @ 230:24757b771651

Renamed Markup to Genshi in repository.
author cmlenz
date Mon, 11 Sep 2006 15:07:07 +0000
parents markup/core.py@e4dad1145f84
children 8de2620504b9
comparison
equal deleted inserted replaced
229:110d69dbbda3 230:24757b771651
1 # -*- coding: utf-8 -*-
2 #
3 # Copyright (C) 2006 Edgewall Software
4 # All rights reserved.
5 #
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://genshi.edgewall.org/wiki/License.
9 #
10 # This software consists of voluntary contributions made by many
11 # individuals. For the exact contribution history, see the revision
12 # history and logs, available at http://genshi.edgewall.org/log/.
13
14 """Core classes for markup processing."""
15
16 import htmlentitydefs
17 import operator
18 import re
19
20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']
21
22
class StreamEventKind(str):
    """A kind of event on an XML stream.

    Instances are plain strings; the subclass only exists so that event kinds
    have a distinct type.
    """
    # Event kinds carry no per-instance state, so avoid allocating a
    # `__dict__` for each of them (same convention as the other string
    # subclasses in this module).
    __slots__ = []
25
26
class Stream(object):
    """A sequence of markup events, wrapped for convenient processing.

    Iterating over a `Stream` simply iterates over the underlying events.

    A stream can be turned into text in two ways: `serialize()` hands back an
    iterator over the generated strings, whereas `render()` produces the
    complete output as one string. Both take parameters that influence how the
    stream is serialized.

    Every event on the stream is a tuple of the form:

      (kind, data, position)

    Here `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), the
    content of `data` depends on the kind of event, and `position` is a
    `(filename, line, offset)` tuple pointing at the location of the original
    element or text in the input. When that location is unknown, `position` is
    `(None, -1, -1)`.
    """
    __slots__ = ['events']

    START = StreamEventKind('START') # a start tag
    END = StreamEventKind('END') # an end tag
    TEXT = StreamEventKind('TEXT') # literal text
    DOCTYPE = StreamEventKind('DOCTYPE') # doctype declaration
    START_NS = StreamEventKind('START_NS') # start namespace mapping
    END_NS = StreamEventKind('END_NS') # end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') # start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') # end CDATA section
    PI = StreamEventKind('PI') # processing instruction
    COMMENT = StreamEventKind('COMMENT') # comment

    def __init__(self, events):
        """Create the stream from a sequence of markup events.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        """Return an iterator over the wrapped events."""
        return iter(self.events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream using the "bitwise or"
        operator, giving a syntax similar to pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print html
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print html | sanitizer
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print html | sanitizer | uppercase
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print html | sanitizer | uppercase | output
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.
        """
        # Whatever the callable produces is normalized into proper events and
        # wrapped again, so that the result can be piped further.
        filtered = function(self)
        return Stream(_ensure(filtered))

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that accepts the stream object as
        parameter and returns the filtered stream.

        The call:

            stream.filter(filter1, filter2)

        is equivalent to:

            stream | filter1 | filter2
        """
        stream = self
        for func in filters:
            stream = stream | func
        return stream

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Serialize the stream and return the result as one string.

        @param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        chunks = list(self.serialize(method=method, **kwargs))
        text = u''.join(chunks)
        if encoding is None:
            return text
        # Characters the target encoding cannot represent become character
        # references in markup output; for the "text" method they are replaced
        # by the codec's replacement marker instead.
        if method != 'text':
            errors = 'xmlcharrefreplace'
        else:
            errors = 'replace'
        return text.encode(encoding, errors)

    def select(self, path):
        """Return a new stream containing the events that match the given
        XPath expression.

        @param path: a string containing the XPath expression
        """
        from genshi.path import Path
        xpath = Path(path)
        return xpath.select(self)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        In contrast to `render()`, the serialized output is produced
        incrementally rather than being returned as one single string.

        @param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        from genshi import output
        if isinstance(method, basestring):
            # Map the well-known method names to their serializer classes.
            serializers = {'xml': output.XMLSerializer,
                           'xhtml': output.XHTMLSerializer,
                           'html': output.HTMLSerializer,
                           'text': output.TextSerializer}
            serializer_cls = serializers[method]
        else:
            serializer_cls = method
        serializer = serializer_cls(**kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        """Return the stream rendered with the default settings."""
        return self.render()

    def __unicode__(self):
        """Return the stream rendered as an unencoded `unicode` object."""
        return self.render(encoding=None)
187
188
# Module-level aliases for the event kinds defined on `Stream`, so that code
# in this module (and code importing from it) can refer to them directly.
START, END = Stream.START, Stream.END
TEXT = Stream.TEXT
DOCTYPE = Stream.DOCTYPE
START_NS, END_NS = Stream.START_NS, Stream.END_NS
START_CDATA, END_CDATA = Stream.START_CDATA, Stream.END_CDATA
PI, COMMENT = Stream.PI, Stream.COMMENT
199
200 def _ensure(stream):
201 """Ensure that every item on the stream is actually a markup event."""
202 for event in stream:
203 if type(event) is not tuple:
204 if hasattr(event, 'totuple'):
205 event = event.totuple()
206 else:
207 event = TEXT, unicode(event), (None, -1, -1)
208 yield event
209
210
class Attrs(list):
    """Sequence type that stores the attributes of an element.

    The order of the attributes is preserved, while accessing and manipulating
    attributes by name is also supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    [(u'href', '#'), (u'title', 'Foo')]

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False

    >>> attrs.get(u'title')
    'Foo'
    >>> attrs.set(u'title', 'Bar')
    >>> attrs
    [(u'href', '#'), (u'title', 'Bar')]
    >>> attrs.remove(u'title')
    >>> attrs
    [(u'href', '#')]

    New attributes added using the `set()` method are appended to the end of
    the list:

    >>> attrs.set(u'accesskey', 'k')
    >>> attrs
    [(u'href', '#'), (u'accesskey', 'k')]

    An `Attrs` instance can also be initialized with keyword arguments.

    >>> attrs = Attrs(class_='bar', href='#', title='Foo')
    >>> attrs.get('class')
    'bar'
    >>> attrs.get('href')
    '#'
    >>> attrs.get('title')
    'Foo'

    Reserved words can be used by appending a trailing underscore to the name,
    and any other underscore is replaced by a dash:

    >>> attrs = Attrs(class_='bar', accept_charset='utf-8')
    >>> attrs.get('class')
    'bar'
    >>> attrs.get('accept-charset')
    'utf-8'

    Thus this shorthand can not be used if attribute names should contain
    actual underscore characters.
    """
    __slots__ = []

    def __init__(self, attrib=None, **kwargs):
        """Create the `Attrs` instance.

        If the `attrib` parameter is provided, it is expected to be a sequence
        of `(name, value)` tuples. Attributes given as keyword arguments are
        added afterwards through `set()`.
        """
        if attrib is None:
            attrib = []
        list.__init__(self, [(QName(name), value) for name, value in attrib])
        for name, value in kwargs.items():
            # Trailing underscores are dropped (to allow reserved words such
            # as `class_`), remaining underscores become dashes.
            self.set(name.rstrip('_').replace('_', '-'), value)

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Always answer with a real boolean, also when called directly.
        return False

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def remove(self, name):
        """Remove the attribute with the specified name.

        If no such attribute is found, this method does nothing.
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                del self[idx]
                break

    def set(self, name, value):
        """Set the specified attribute to the given value.

        If an attribute with the specified name is already in the list, the
        value of the existing entry is updated. Otherwise, a new attribute is
        appended to the end of the list.
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (QName(attr), value)
                break
        else:
            self.append((QName(name), value))

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a TEXT event, the data is the value of all
        attributes joined together.
        """
        return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
326
327
def plaintext(text, keeplinebreaks=True):
    """Return the text as a `unicode` string from which all tags and entities
    have been stripped.

    Unless `keeplinebreaks` is true, every newline in the result is replaced
    by a single space.
    """
    stripped = stripentities(striptags(text))
    if keeplinebreaks:
        return stripped
    return stripped.replace(u'\n', u' ')
336
def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent Unicode characters.

    Numeric references may be decimal (`&#65;`) or hexadecimal with either a
    lower- or uppercase prefix (`&#x41;`, `&#X41;`). A numeric reference whose
    code point cannot be represented is left in the text unchanged.

    If the `keepxmlentities` parameter is provided and evaluates to `True`,
    the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
    stripped, and unknown named entities have their ampersand escaped. If it
    is false, unknown named entities are reduced to their bare name.
    """
    def _replace_entity(match):
        if match.group(1): # numeric entity
            ref = match.group(1)
            # The pattern accepts both `x` and `X` as hexadecimal marker
            if ref[0] in 'xX':
                ref = int(ref[1:], 16)
            else:
                ref = int(ref, 10)
            try:
                return unichr(ref)
            except (ValueError, OverflowError):
                # Code point out of range: keep the reference as written
                # rather than failing on the whole text
                return match.group(0)
        else: # character entity
            ref = match.group(2)
            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                return '&%s;' % ref
            try:
                codepoint = htmlentitydefs.name2codepoint[ref]
                return unichr(codepoint)
            except KeyError:
                if keepxmlentities:
                    return '&amp;%s;' % ref
                else:
                    return ref
    return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                  _replace_entity, text)
367
def striptags(text):
    """Return a copy of `text` from which every XML/HTML tag, that is every
    span running from a `<` to the next `>`, has been removed.
    """
    tag_pattern = re.compile(r'<[^>]*?>')
    return tag_pattern.sub('', text)
371
372
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Values combined with a `Markup` instance through concatenation, string
    formatting or `join()` are escaped first, and the result is a `Markup`
    instance again.
    """
    __slots__ = []

    def __new__(cls, text='', *args):
        """Create the instance from `text`.

        If additional positional arguments are given, they are escaped and
        interpolated into `text` using the `%` operator.
        """
        if args:
            text %= tuple(map(escape, args))
        return unicode.__new__(cls, text)

    def __add__(self, other):
        """Return this markup followed by the escaped `other`."""
        return Markup(unicode(self) + unicode(escape(other)))

    def __radd__(self, other):
        """Return the escaped `other` followed by this markup."""
        return Markup(unicode(escape(other)) + unicode(self))

    def __mod__(self, args):
        """Interpolate the escaped arguments into this markup.

        `args` may be a mapping (for `%(name)s` style formats, the values are
        escaped), a list or tuple of values, or a single value.
        """
        if isinstance(args, dict):
            # Named placeholders need a mapping on the right-hand side;
            # only the values are escaped, the keys are used for lookup.
            args = dict([(key, escape(value)) for key, value in args.items()])
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = (escape(args),)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        """Return this markup repeated `num` times."""
        return Markup(unicode(self) * num)

    def __rmul__(self, num):
        """Return this markup repeated `num` times."""
        return Markup(num * unicode(self))

    def __repr__(self):
        return '<%s "%s">' % (self.__class__.__name__, self)

    def join(self, seq, escape_quotes=True):
        """Return the items of `seq` joined with this markup as separator.

        Every item is escaped first; quotes are only escaped if
        `escape_quotes` is true.
        """
        return Markup(unicode(self).join([escape(item, quotes=escape_quotes)
                                          for item in seq]))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        Values that evaluate to false yield an empty instance, and objects
        that already are of this exact class are returned unchanged.
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        # The ampersand has to be replaced first, otherwise the entities
        # produced by the following replacements would be escaped again.
        text = unicode(text).replace('&', '&amp;') \
                            .replace('<', '&lt;') \
                            .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
        if not self:
            return u''
        # The ampersand is restored last, so that an escaped ampersand
        # followed by e.g. "lt;" is not mistaken for an escaped "<".
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent Unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
        stripped.
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(striptags(self))


# Module-level shortcut; the methods of `Markup` use it to escape operands.
escape = Markup.escape
453
def unescape(text):
    """Reverse-escapes &, <, > and \" and returns a `unicode` object.

    Only `Markup` instances are unescaped; any other object is handed back
    unchanged.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
459
460
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    <Namespace "http://www.w3.org/1999/xhtml">
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    u'{http://www.w3.org/1999/xhtml}body'
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    u'{http://www.w3.org/1999/xhtml}body'

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the `in` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        """Return `uri` itself if it already is an instance of this exact
        class, otherwise a new instance.
        """
        if type(uri) is cls:
            return uri
        # `object.__new__` takes no further arguments; the URI is stored by
        # `__init__`.
        return object.__new__(cls)

    def __init__(self, uri):
        """Store the unicode representation of `uri`."""
        self.uri = unicode(uri)

    def __contains__(self, qname):
        """Return whether the namespace of `qname` is this namespace."""
        return qname.namespace == self.uri

    def __eq__(self, other):
        """Compare by URI, either with another `Namespace` or with a string
        holding the URI.
        """
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __ne__(self, other):
        """Inverse of `__eq__`, so that `!=` agrees with `==`."""
        return not self == other

    def __hash__(self):
        """Hash by URI, so that objects comparing equal also hash equal."""
        return hash(self.uri)

    def __getitem__(self, name):
        """Return the `QName` for `name` in this namespace."""
        return QName(self.uri + u'}' + name)
    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri


# The namespace used by attributes such as xml:lang and xml:space
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
533
534
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    u'foo'
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    u'{http://www.w3.org/1999/xhtml}body'
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    The name may also be given with the leading curly brace, that is, in the
    same form as the resulting value:

    >>> qname = QName('{http://www.w3.org/1999/xhtml}body')
    >>> qname
    u'{http://www.w3.org/1999/xhtml}body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the qualified name from a string of the form `localname`,
        `namespace}localname` or `{namespace}localname`.

        If `qname` already is an instance of this exact class, it is returned
        as is.
        """
        if type(qname) is cls:
            return qname

        # Drop an opening brace so that the value of an existing qualified
        # name can be parsed again without doubling the brace.
        if qname.startswith(u'{'):
            qname = qname[1:]
        parts = qname.split(u'}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, u'{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self
Copyright (C) 2012-2017 Edgewall Software