comparison genshi/core.py @ 397:31742fe6d47e trunk

* Moved some utility functions from `genshi.core` to `genshi.util` (backwards compatibility preserved via imports)
* Minor performance tweaks for the serializers
author cmlenz
date Tue, 02 Jan 2007 17:48:06 +0000
parents 2682dabbcd04
children 228907abb726
comparison
equal deleted inserted replaced
396:7016f404b915 397:31742fe6d47e
11 # individuals. For the exact contribution history, see the revision 11 # individuals. For the exact contribution history, see the revision
12 # history and logs, available at http://genshi.edgewall.org/log/. 12 # history and logs, available at http://genshi.edgewall.org/log/.
13 13
14 """Core classes for markup processing.""" 14 """Core classes for markup processing."""
15 15
16 import htmlentitydefs
17 import operator 16 import operator
18 import re 17
18 from genshi.util import plaintext, stripentities, striptags
19 19
20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', 20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
21 'QName'] 21 'QName']
22 22
23 23
24 class StreamEventKind(str): 24 class StreamEventKind(str):
25 """A kind of event on an XML stream.""" 25 """A kind of event on a markup stream."""
26 __slots__ = [] 26 __slots__ = []
27 _instances = {} 27 _instances = {}
28 28
29 def __new__(cls, val): 29 def __new__(cls, val):
30 return cls._instances.setdefault(val, str.__new__(cls, val)) 30 return cls._instances.setdefault(val, str.__new__(cls, val))
32 32
33 class Stream(object): 33 class Stream(object):
34 """Represents a stream of markup events. 34 """Represents a stream of markup events.
35 35
36 This class is basically an iterator over the events. 36 This class is basically an iterator over the events.
37
38 Stream events are tuples of the form:
39
40 (kind, data, position)
41
42 where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
43 depends on the kind of event, and `position` is a `(filename, line, offset)`
44 tuple that contains the location of the original element or text in the
45 input. If the original location is unknown, `position` is `(None, -1, -1)`.
37 46
38 Also provided are ways to serialize the stream to text. The `serialize()` 47 Also provided are ways to serialize the stream to text. The `serialize()`
39 method will return an iterator over generated strings, while `render()` 48 method will return an iterator over generated strings, while `render()`
40 returns the complete generated text at once. Both accept various parameters 49 returns the complete generated text at once. Both accept various parameters
41 that impact the way the stream is serialized. 50 that impact the way the stream is serialized.
42
43 Stream events are tuples of the form:
44
45 (kind, data, position)
46
47 where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
48 depends on the kind of event, and `position` is a `(filename, line, offset)`
49 tuple that contains the location of the original element or text in the
50 input. If the original location is unknown, `position` is `(None, -1, -1)`.
51 """ 51 """
52 __slots__ = ['events'] 52 __slots__ = ['events']
53 53
54 START = StreamEventKind('START') # a start tag 54 START = StreamEventKind('START') # a start tag
55 END = StreamEventKind('END') # an end tag 55 END = StreamEventKind('END') # an end tag
90 >>> sanitizer = HTMLSanitizer() 90 >>> sanitizer = HTMLSanitizer()
91 >>> print html | sanitizer 91 >>> print html | sanitizer
92 <p>Hello, world!</p> 92 <p>Hello, world!</p>
93 93
94 Filters can be any function that accepts and produces a stream (where 94 Filters can be any function that accepts and produces a stream (where
95 a stream is anything that iterators over events): 95 a stream is anything that iterates over events):
96 96
97 >>> def uppercase(stream): 97 >>> def uppercase(stream):
98 ... for kind, data, pos in stream: 98 ... for kind, data, pos in stream:
99 ... if kind is TEXT: 99 ... if kind is TEXT:
100 ... data = data.upper() 100 ... data = data.upper()
322 322
323 The returned event is a TEXT event, the data is the value of all 323 The returned event is a TEXT event, the data is the value of all
324 attributes joined together. 324 attributes joined together.
325 """ 325 """
326 return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) 326 return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
327
328
329 def plaintext(text, keeplinebreaks=True):
330 """Returns the text as a `unicode` string with all entities and tags
331 removed.
332 """
333 text = stripentities(striptags(text))
334 if not keeplinebreaks:
335 text = text.replace(u'\n', u' ')
336 return text
337
338 def stripentities(text, keepxmlentities=False):
339 """Return a copy of the given text with any character or numeric entities
340 replaced by the equivalent UTF-8 characters.
341
342 If the `keepxmlentities` parameter is provided and evaluates to `True`,
343 the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
344 stripped.
345 """
346 def _replace_entity(match):
347 if match.group(1): # numeric entity
348 ref = match.group(1)
349 if ref.startswith('x'):
350 ref = int(ref[1:], 16)
351 else:
352 ref = int(ref, 10)
353 return unichr(ref)
354 else: # character entity
355 ref = match.group(2)
356 if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
357 return '&%s;' % ref
358 try:
359 codepoint = htmlentitydefs.name2codepoint[ref]
360 return unichr(codepoint)
361 except KeyError:
362 if keepxmlentities:
363 return '&amp;%s;' % ref
364 else:
365 return ref
366 return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
367 _replace_entity, text)
368
369 def striptags(text):
370 """Return a copy of the text with all XML/HTML tags removed."""
371 return re.sub(r'<[^>]*?>', '', text)
372 327
373 328
374 class Markup(unicode): 329 class Markup(unicode):
375 """Marks a string as being safe for inclusion in HTML/XML output without 330 """Marks a string as being safe for inclusion in HTML/XML output without
376 needing to be escaped. 331 needing to be escaped.
Copyright (C) 2012-2017 Edgewall Software