Mercurial > genshi > mirror
comparison genshi/core.py @ 397:31742fe6d47e trunk
* Moved some utility functions from `genshi.core` to `genshi.util` (backwards compatibility preserved via imports)
* Minor performance tweaks for the serializers
author | cmlenz |
---|---|
date | Tue, 02 Jan 2007 17:48:06 +0000 |
parents | 2682dabbcd04 |
children | 228907abb726 |
comparison
equal
deleted
inserted
replaced
396:7016f404b915 | 397:31742fe6d47e |
---|---|
11 # individuals. For the exact contribution history, see the revision | 11 # individuals. For the exact contribution history, see the revision |
12 # history and logs, available at http://genshi.edgewall.org/log/. | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
13 | 13 |
14 """Core classes for markup processing.""" | 14 """Core classes for markup processing.""" |
15 | 15 |
16 import htmlentitydefs | |
17 import operator | 16 import operator |
18 import re | 17 |
18 from genshi.util import plaintext, stripentities, striptags | |
19 | 19 |
20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', | 20 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', |
21 'QName'] | 21 'QName'] |
22 | 22 |
23 | 23 |
24 class StreamEventKind(str): | 24 class StreamEventKind(str): |
25 """A kind of event on an XML stream.""" | 25 """A kind of event on a markup stream.""" |
26 __slots__ = [] | 26 __slots__ = [] |
27 _instances = {} | 27 _instances = {} |
28 | 28 |
29 def __new__(cls, val): | 29 def __new__(cls, val): |
30 return cls._instances.setdefault(val, str.__new__(cls, val)) | 30 return cls._instances.setdefault(val, str.__new__(cls, val)) |
32 | 32 |
33 class Stream(object): | 33 class Stream(object): |
34 """Represents a stream of markup events. | 34 """Represents a stream of markup events. |
35 | 35 |
36 This class is basically an iterator over the events. | 36 This class is basically an iterator over the events. |
37 | |
38 Stream events are tuples of the form: | |
39 | |
40 (kind, data, position) | |
41 | |
42 where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data` | |
43 depends on the kind of event, and `position` is a `(filename, line, offset)` | |
44 tuple that contains the location of the original element or text in the | |
45 input. If the original location is unknown, `position` is `(None, -1, -1)`. | |
37 | 46 |
38 Also provided are ways to serialize the stream to text. The `serialize()` | 47 Also provided are ways to serialize the stream to text. The `serialize()` |
39 method will return an iterator over generated strings, while `render()` | 48 method will return an iterator over generated strings, while `render()` |
40 returns the complete generated text at once. Both accept various parameters | 49 returns the complete generated text at once. Both accept various parameters |
41 that impact the way the stream is serialized. | 50 that impact the way the stream is serialized. |
42 | |
43 Stream events are tuples of the form: | |
44 | |
45 (kind, data, position) | |
46 | |
47 where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data` | |
48 depends on the kind of event, and `position` is a `(filename, line, offset)` | |
49 tuple that contains the location of the original element or text in the | |
50 input. If the original location is unknown, `position` is `(None, -1, -1)`. | |
51 """ | 51 """ |
52 __slots__ = ['events'] | 52 __slots__ = ['events'] |
53 | 53 |
54 START = StreamEventKind('START') # a start tag | 54 START = StreamEventKind('START') # a start tag |
55 END = StreamEventKind('END') # an end tag | 55 END = StreamEventKind('END') # an end tag |
90 >>> sanitizer = HTMLSanitizer() | 90 >>> sanitizer = HTMLSanitizer() |
91 >>> print html | sanitizer | 91 >>> print html | sanitizer |
92 <p>Hello, world!</p> | 92 <p>Hello, world!</p> |
93 | 93 |
94 Filters can be any function that accepts and produces a stream (where | 94 Filters can be any function that accepts and produces a stream (where |
95 a stream is anything that iterators over events): | 95 a stream is anything that iterates over events): |
96 | 96 |
97 >>> def uppercase(stream): | 97 >>> def uppercase(stream): |
98 ... for kind, data, pos in stream: | 98 ... for kind, data, pos in stream: |
99 ... if kind is TEXT: | 99 ... if kind is TEXT: |
100 ... data = data.upper() | 100 ... data = data.upper() |
322 | 322 |
323 The returned event is a TEXT event, the data is the value of all | 323 The returned event is a TEXT event, the data is the value of all |
324 attributes joined together. | 324 attributes joined together. |
325 """ | 325 """ |
326 return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) | 326 return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) |
327 | |
328 | |
329 def plaintext(text, keeplinebreaks=True): | |
330 """Returns the text as a `unicode` string with all entities and tags | |
331 removed. | |
332 """ | |
333 text = stripentities(striptags(text)) | |
334 if not keeplinebreaks: | |
335 text = text.replace(u'\n', u' ') | |
336 return text | |
337 | |
338 def stripentities(text, keepxmlentities=False): | |
339 """Return a copy of the given text with any character or numeric entities | |
340 replaced by the equivalent UTF-8 characters. | |
341 | |
342 If the `keepxmlentities` parameter is provided and evaluates to `True`, | |
343 the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not | |
344 stripped. | |
345 """ | |
346 def _replace_entity(match): | |
347 if match.group(1): # numeric entity | |
348 ref = match.group(1) | |
349 if ref.startswith('x'): | |
350 ref = int(ref[1:], 16) | |
351 else: | |
352 ref = int(ref, 10) | |
353 return unichr(ref) | |
354 else: # character entity | |
355 ref = match.group(2) | |
356 if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'): | |
357 return '&%s;' % ref | |
358 try: | |
359 codepoint = htmlentitydefs.name2codepoint[ref] | |
360 return unichr(codepoint) | |
361 except KeyError: | |
362 if keepxmlentities: | |
363 return '&%s;' % ref | |
364 else: | |
365 return ref | |
366 return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)', | |
367 _replace_entity, text) | |
368 | |
369 def striptags(text): | |
370 """Return a copy of the text with all XML/HTML tags removed.""" | |
371 return re.sub(r'<[^>]*?>', '', text) | |
372 | 327 |
373 | 328 |
374 class Markup(unicode): | 329 class Markup(unicode): |
375 """Marks a string as being safe for inclusion in HTML/XML output without | 330 """Marks a string as being safe for inclusion in HTML/XML output without |
376 needing to be escaped. | 331 needing to be escaped. |