comparison genshi/output.py @ 820:1837f39efd6f experimental-inline

Sync (old) experimental inline branch with trunk@1027.
author cmlenz
date Wed, 11 Mar 2009 17:51:06 +0000
parents 0742f421caba
children de82830f8816
comparison
equal deleted inserted replaced
500:0742f421caba 820:1837f39efd6f
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 # 2 #
3 # Copyright (C) 2006-2007 Edgewall Software 3 # Copyright (C) 2006-2008 Edgewall Software
4 # All rights reserved. 4 # All rights reserved.
5 # 5 #
6 # This software is licensed as described in the file COPYING, which 6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms 7 # you should have received as part of this distribution. The terms
8 # are also available at http://genshi.edgewall.org/wiki/License. 8 # are also available at http://genshi.edgewall.org/wiki/License.
14 """This module provides different kinds of serialization methods for XML event 14 """This module provides different kinds of serialization methods for XML event
15 streams. 15 streams.
16 """ 16 """
17 17
18 from itertools import chain 18 from itertools import chain
19 try:
20 frozenset
21 except NameError:
22 from sets import ImmutableSet as frozenset
23 import re 19 import re
24 20
25 from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind 21 from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind
26 from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \ 22 from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \
27 START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE 23 START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE
28 24
29 __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer', 25 __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer',
30 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] 26 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer']
31 __docformat__ = 'restructuredtext en' 27 __docformat__ = 'restructuredtext en'
32 28
33 def encode(iterator, method='xml', encoding='utf-8'): 29 def encode(iterator, method='xml', encoding='utf-8', out=None):
34 """Encode serializer output into a string. 30 """Encode serializer output into a string.
35 31
36 :param iterator: the iterator returned from serializing a stream (basically 32 :param iterator: the iterator returned from serializing a stream (basically
37 any iterator that yields unicode objects) 33 any iterator that yields unicode objects)
38 :param method: the serialization method; determines how characters not 34 :param method: the serialization method; determines how characters not
39 representable in the specified encoding are treated 35 representable in the specified encoding are treated
40 :param encoding: how the output string should be encoded; if set to `None`, 36 :param encoding: how the output string should be encoded; if set to `None`,
41 this method returns a `unicode` object 37 this method returns a `unicode` object
42 :return: a string or unicode object (depending on the `encoding` parameter) 38 :param out: a file-like object that the output should be written to
39 instead of being returned as one big string; note that if
40 this is a file or socket (or similar), the `encoding` must
41 not be `None` (that is, the output must be encoded)
42 :return: a `str` or `unicode` object (depending on the `encoding`
43 parameter), or `None` if the `out` parameter is provided
44
43 :since: version 0.4.1 45 :since: version 0.4.1
44 """ 46 :note: Changed in 0.5: added the `out` parameter
45 output = u''.join(list(iterator)) 47 """
46 if encoding is not None: 48 if encoding is not None:
47 errors = 'replace' 49 errors = 'replace'
48 if method != 'text' and not isinstance(method, TextSerializer): 50 if method != 'text' and not isinstance(method, TextSerializer):
49 errors = 'xmlcharrefreplace' 51 errors = 'xmlcharrefreplace'
50 return output.encode(encoding, errors) 52 _encode = lambda string: string.encode(encoding, errors)
51 return output 53 else:
54 _encode = lambda string: string
55 if out is None:
56 return _encode(u''.join(list(iterator)))
57 for chunk in iterator:
58 out.write(_encode(chunk))
52 59
53 def get_serializer(method='xml', **kwargs): 60 def get_serializer(method='xml', **kwargs):
54 """Return a serializer object for the given method. 61 """Return a serializer object for the given method.
55 62
56 :param method: the serialization method; can be either "xml", "xhtml", 63 :param method: the serialization method; can be either "xml", "xhtml",
100 XHTML_FRAMESET = ( 107 XHTML_FRAMESET = (
101 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN', 108 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
102 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd' 109 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
103 ) 110 )
104 XHTML = XHTML_STRICT 111 XHTML = XHTML_STRICT
112
113 XHTML11 = (
114 'html', '-//W3C//DTD XHTML 1.1//EN',
115 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
116 )
117
118 SVG_FULL = (
119 'svg', '-//W3C//DTD SVG 1.1//EN',
120 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd'
121 )
122 SVG_BASIC = (
123 'svg', '-//W3C//DTD SVG Basic 1.1//EN',
124 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd'
125 )
126 SVG_TINY = (
127 'svg', '-//W3C//DTD SVG Tiny 1.1//EN',
128 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd'
129 )
130 SVG = SVG_FULL
105 131
106 def get(cls, name): 132 def get(cls, name):
107 """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE`` 133 """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
108 declaration for the specified name. 134 declaration for the specified name.
109 135
110 The following names are recognized in this version: 136 The following names are recognized in this version:
111 * "html" or "html-strict" for the HTML 4.01 strict DTD 137 * "html" or "html-strict" for the HTML 4.01 strict DTD
112 * "html-transitional" for the HTML 4.01 transitional DTD 138 * "html-transitional" for the HTML 4.01 transitional DTD
113 * "html-transitional" for the HTML 4.01 frameset DTD 139 * "html-frameset" for the HTML 4.01 frameset DTD
114 * "html5" for the ``DOCTYPE`` proposed for HTML5 140 * "html5" for the ``DOCTYPE`` proposed for HTML5
115 * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD 141 * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
116 * "xhtml-transitional" for the XHTML 1.0 transitional DTD 142 * "xhtml-transitional" for the XHTML 1.0 transitional DTD
117 * "xhtml-frameset" for the XHTML 1.0 frameset DTD 143 * "xhtml-frameset" for the XHTML 1.0 frameset DTD
144 * "xhtml11" for the XHTML 1.1 DTD
145 * "svg" or "svg-full" for the SVG 1.1 DTD
146 * "svg-basic" for the SVG Basic 1.1 DTD
147 * "svg-tiny" for the SVG Tiny 1.1 DTD
118 148
119 :param name: the name of the ``DOCTYPE`` 149 :param name: the name of the ``DOCTYPE``
120 :return: the ``(name, pubid, sysid)`` tuple for the requested 150 :return: the ``(name, pubid, sysid)`` tuple for the requested
121 ``DOCTYPE``, or ``None`` if the name is not recognized 151 ``DOCTYPE``, or ``None`` if the name is not recognized
122 :since: version 0.4.1 152 :since: version 0.4.1
127 'html-frameset': DocType.HTML_FRAMESET, 157 'html-frameset': DocType.HTML_FRAMESET,
128 'html5': cls.HTML5, 158 'html5': cls.HTML5,
129 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT, 159 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
130 'xhtml-transitional': cls.XHTML_TRANSITIONAL, 160 'xhtml-transitional': cls.XHTML_TRANSITIONAL,
131 'xhtml-frameset': cls.XHTML_FRAMESET, 161 'xhtml-frameset': cls.XHTML_FRAMESET,
162 'xhtml11': cls.XHTML11,
163 'svg': cls.SVG, 'svg-full': cls.SVG_FULL,
164 'svg-basic': cls.SVG_BASIC,
165 'svg-tiny': cls.SVG_TINY
132 }.get(name.lower()) 166 }.get(name.lower())
133 get = classmethod(get) 167 get = classmethod(get)
134 168
135 169
136 class XMLSerializer(object): 170 class XMLSerializer(object):
154 defined in `DocType.get` 188 defined in `DocType.get`
155 :param strip_whitespace: whether extraneous whitespace should be 189 :param strip_whitespace: whether extraneous whitespace should be
156 stripped from the output 190 stripped from the output
157 :note: Changed in 0.4.2: The `doctype` parameter can now be a string. 191 :note: Changed in 0.4.2: The `doctype` parameter can now be a string.
158 """ 192 """
159 self.preamble = []
160 if doctype:
161 if isinstance(doctype, basestring):
162 doctype = DocType.get(doctype)
163 self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
164 self.filters = [EmptyTagFilter()] 193 self.filters = [EmptyTagFilter()]
165 if strip_whitespace: 194 if strip_whitespace:
166 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) 195 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
167 self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) 196 self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
197 if doctype:
198 self.filters.append(DocTypeInserter(doctype))
168 199
169 def __call__(self, stream): 200 def __call__(self, stream):
170 have_decl = have_doctype = False 201 have_decl = have_doctype = False
171 in_cdata = False 202 in_cdata = False
172 203
173 stream = chain(self.preamble, stream)
174 for filter_ in self.filters: 204 for filter_ in self.filters:
175 stream = filter_(stream) 205 stream = filter_(stream)
176 for kind, data, pos in stream: 206 for kind, data, pos in stream:
177 207
178 if kind is START or kind is EMPTY: 208 if kind is START or kind is EMPTY:
215 elif sysid: 245 elif sysid:
216 buf.append(' SYSTEM') 246 buf.append(' SYSTEM')
217 if sysid: 247 if sysid:
218 buf.append(' "%s"') 248 buf.append(' "%s"')
219 buf.append('>\n') 249 buf.append('>\n')
220 yield Markup(u''.join(buf), *filter(None, data)) 250 yield Markup(u''.join(buf)) % filter(None, data)
221 have_doctype = True 251 have_doctype = True
222 252
223 elif kind is START_CDATA: 253 elif kind is START_CDATA:
224 yield Markup('<![CDATA[') 254 yield Markup('<![CDATA[')
225 in_cdata = True 255 in_cdata = True
251 QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), 281 QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
252 QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') 282 QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
253 ]) 283 ])
254 284
255 def __init__(self, doctype=None, strip_whitespace=True, 285 def __init__(self, doctype=None, strip_whitespace=True,
256 namespace_prefixes=None): 286 namespace_prefixes=None, drop_xml_decl=True):
257 super(XHTMLSerializer, self).__init__(doctype, False) 287 super(XHTMLSerializer, self).__init__(doctype, False)
258 self.filters = [EmptyTagFilter()] 288 self.filters = [EmptyTagFilter()]
259 if strip_whitespace: 289 if strip_whitespace:
260 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) 290 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
261 namespace_prefixes = namespace_prefixes or {} 291 namespace_prefixes = namespace_prefixes or {}
262 namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' 292 namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''
263 self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) 293 self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
294 if doctype:
295 self.filters.append(DocTypeInserter(doctype))
296 self.drop_xml_decl = drop_xml_decl
264 297
265 def __call__(self, stream): 298 def __call__(self, stream):
266 boolean_attrs = self._BOOLEAN_ATTRS 299 boolean_attrs = self._BOOLEAN_ATTRS
267 empty_elems = self._EMPTY_ELEMS 300 empty_elems = self._EMPTY_ELEMS
268 have_doctype = False 301 drop_xml_decl = self.drop_xml_decl
302 have_decl = have_doctype = False
269 in_cdata = False 303 in_cdata = False
270 304
271 stream = chain(self.preamble, stream)
272 for filter_ in self.filters: 305 for filter_ in self.filters:
273 stream = filter_(stream) 306 stream = filter_(stream)
274 for kind, data, pos in stream: 307 for kind, data, pos in stream:
275 308
276 if kind is START or kind is EMPTY: 309 if kind is START or kind is EMPTY:
277 tag, attrib = data 310 tag, attrib = data
278 buf = ['<', tag] 311 buf = ['<', tag]
279 for attr, value in attrib: 312 for attr, value in attrib:
280 if attr in boolean_attrs: 313 if attr in boolean_attrs:
281 value = attr 314 value = attr
315 elif attr == u'xml:lang' and u'lang' not in attrib:
316 buf += [' lang="', escape(value), '"']
317 elif attr == u'xml:space':
318 continue
282 buf += [' ', attr, '="', escape(value), '"'] 319 buf += [' ', attr, '="', escape(value), '"']
283 if kind is EMPTY: 320 if kind is EMPTY:
284 if tag in empty_elems: 321 if tag in empty_elems:
285 buf.append(' />') 322 buf.append(' />')
286 else: 323 else:
309 elif sysid: 346 elif sysid:
310 buf.append(' SYSTEM') 347 buf.append(' SYSTEM')
311 if sysid: 348 if sysid:
312 buf.append(' "%s"') 349 buf.append(' "%s"')
313 buf.append('>\n') 350 buf.append('>\n')
314 yield Markup(u''.join(buf), *filter(None, data)) 351 yield Markup(u''.join(buf)) % filter(None, data)
315 have_doctype = True 352 have_doctype = True
353
354 elif kind is XML_DECL and not have_decl and not drop_xml_decl:
355 version, encoding, standalone = data
356 buf = ['<?xml version="%s"' % version]
357 if encoding:
358 buf.append(' encoding="%s"' % encoding)
359 if standalone != -1:
360 standalone = standalone and 'yes' or 'no'
361 buf.append(' standalone="%s"' % standalone)
362 buf.append('?>\n')
363 yield Markup(u''.join(buf))
364 have_decl = True
316 365
317 elif kind is START_CDATA: 366 elif kind is START_CDATA:
318 yield Markup('<![CDATA[') 367 yield Markup('<![CDATA[')
319 in_cdata = True 368 in_cdata = True
320 369
352 super(HTMLSerializer, self).__init__(doctype, False) 401 super(HTMLSerializer, self).__init__(doctype, False)
353 self.filters = [EmptyTagFilter()] 402 self.filters = [EmptyTagFilter()]
354 if strip_whitespace: 403 if strip_whitespace:
355 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, 404 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
356 self._NOESCAPE_ELEMS)) 405 self._NOESCAPE_ELEMS))
357 self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml')) 406 self.filters.append(NamespaceFlattener(prefixes={
407 'http://www.w3.org/1999/xhtml': ''
408 }))
409 if doctype:
410 self.filters.append(DocTypeInserter(doctype))
358 411
359 def __call__(self, stream): 412 def __call__(self, stream):
360 boolean_attrs = self._BOOLEAN_ATTRS 413 boolean_attrs = self._BOOLEAN_ATTRS
361 empty_elems = self._EMPTY_ELEMS 414 empty_elems = self._EMPTY_ELEMS
362 noescape_elems = self._NOESCAPE_ELEMS 415 noescape_elems = self._NOESCAPE_ELEMS
363 have_doctype = False 416 have_doctype = False
364 noescape = False 417 noescape = False
365 418
366 stream = chain(self.preamble, stream)
367 for filter_ in self.filters: 419 for filter_ in self.filters:
368 stream = filter_(stream) 420 stream = filter_(stream)
369 for kind, data, pos in stream: 421 for kind, data, pos in stream:
370 422
371 if kind is START or kind is EMPTY: 423 if kind is START or kind is EMPTY:
373 buf = ['<', tag] 425 buf = ['<', tag]
374 for attr, value in attrib: 426 for attr, value in attrib:
375 if attr in boolean_attrs: 427 if attr in boolean_attrs:
376 if value: 428 if value:
377 buf += [' ', attr] 429 buf += [' ', attr]
378 else: 430 elif ':' in attr:
431 if attr == 'xml:lang' and u'lang' not in attrib:
432 buf += [' lang="', escape(value), '"']
433 elif attr != 'xmlns':
379 buf += [' ', attr, '="', escape(value), '"'] 434 buf += [' ', attr, '="', escape(value), '"']
380 buf.append('>') 435 buf.append('>')
381 if kind is EMPTY: 436 if kind is EMPTY:
382 if tag not in empty_elems: 437 if tag not in empty_elems:
383 buf.append('</%s>' % tag) 438 buf.append('</%s>' % tag)
406 elif sysid: 461 elif sysid:
407 buf.append(' SYSTEM') 462 buf.append(' SYSTEM')
408 if sysid: 463 if sysid:
409 buf.append(' "%s"') 464 buf.append(' "%s"')
410 buf.append('>\n') 465 buf.append('>\n')
411 yield Markup(u''.join(buf), *filter(None, data)) 466 yield Markup(u''.join(buf)) % filter(None, data)
412 have_doctype = True 467 have_doctype = True
413 468
414 elif kind is PI: 469 elif kind is PI:
415 yield Markup('<?%s %s?>' % data) 470 yield Markup('<?%s %s?>' % data)
416 471
427 <div><a href="foo">&lt;Hello!&gt;</a><br/></div> 482 <div><a href="foo">&lt;Hello!&gt;</a><br/></div>
428 >>> print ''.join(TextSerializer()(elem.generate())) 483 >>> print ''.join(TextSerializer()(elem.generate()))
429 <Hello!> 484 <Hello!>
430 485
431 If text events contain literal markup (instances of the `Markup` class), 486 If text events contain literal markup (instances of the `Markup` class),
432 tags or entities are stripped from the output: 487 that markup is by default passed through unchanged:
433 488
434 >>> elem = tag.div(Markup('<a href="foo">Hello!</a><br/>')) 489 >>> elem = tag.div(Markup('<a href="foo">Hello &amp; Bye!</a><br/>'))
435 >>> print elem 490 >>> print elem.generate().render(TextSerializer)
436 <div><a href="foo">Hello!</a><br/></div> 491 <a href="foo">Hello &amp; Bye!</a><br/>
437 >>> print ''.join(TextSerializer()(elem.generate())) 492
438 Hello! 493 You can use the ``strip_markup`` option to change this behavior, so that tags and
439 """ 494 entities are stripped from the output (or in the case of entities,
495 replaced with the equivalent character):
496
497 >>> print elem.generate().render(TextSerializer, strip_markup=True)
498 Hello & Bye!
499 """
500
501 def __init__(self, strip_markup=False):
502 """Create the serializer.
503
504 :param strip_markup: whether markup (tags and encoded characters) found
505 in the text should be removed
506 """
507 self.strip_markup = strip_markup
440 508
441 def __call__(self, stream): 509 def __call__(self, stream):
510 strip_markup = self.strip_markup
442 for event in stream: 511 for event in stream:
443 if event[0] is TEXT: 512 if event[0] is TEXT:
444 data = event[1] 513 data = event[1]
445 if type(data) is Markup: 514 if strip_markup and type(data) is Markup:
446 data = data.striptags().stripentities() 515 data = data.striptags().stripentities()
447 yield unicode(data) 516 yield unicode(data)
448 517
449 518
450 class EmptyTagFilter(object): 519 class EmptyTagFilter(object):
582 if attr in ns_attrs: 651 if attr in ns_attrs:
583 ns_attrs.remove(attr) 652 ns_attrs.remove(attr)
584 653
585 else: 654 else:
586 yield kind, data, pos 655 yield kind, data, pos
587
588
589 class NamespaceStripper(object):
590 r"""Stream filter that removes all namespace information from a stream, and
591 optionally strips out all tags not in a given namespace.
592
593 :param namespace: the URI of the namespace that should not be stripped. If
594 not set, only elements with no namespace are included in
595 the output.
596
597 >>> from genshi.input import XML
598 >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
599 ... <two:item/>
600 ... </doc>''')
601 >>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml):
602 ... print kind, repr(data)
603 START (u'doc', Attrs())
604 TEXT u'\n '
605 TEXT u'\n'
606 END u'doc'
607 """
608
609 def __init__(self, namespace=None):
610 if namespace is not None:
611 self.namespace = Namespace(namespace)
612 else:
613 self.namespace = {}
614
615 def __call__(self, stream):
616 namespace = self.namespace
617
618 for kind, data, pos in stream:
619
620 if kind is START or kind is EMPTY:
621 tag, attrs = data
622 if tag.namespace and tag not in namespace:
623 continue
624
625 new_attrs = []
626 for attr, value in attrs:
627 if not attr.namespace or attr in namespace:
628 new_attrs.append((attr, value))
629
630 data = tag.localname, Attrs(new_attrs)
631
632 elif kind is END:
633 if data.namespace and data not in namespace:
634 continue
635 data = data.localname
636
637 elif kind is START_NS or kind is END_NS:
638 continue
639
640 yield kind, data, pos
641 656
642 657
643 class WhitespaceFilter(object): 658 class WhitespaceFilter(object):
644 """A filter that removes extraneous ignorable white space from the 659 """A filter that removes extraneous ignorable white space from the
645 stream. 660 stream.
712 elif kind is END_CDATA: 727 elif kind is END_CDATA:
713 noescape = False 728 noescape = False
714 729
715 if kind: 730 if kind:
716 yield kind, data, pos 731 yield kind, data, pos
732
733
734 class DocTypeInserter(object):
735 """A filter that inserts the DOCTYPE declaration in the correct location,
736 after the XML declaration.
737 """
738 def __init__(self, doctype):
739 """Initialize the filter.
740
741 :param doctype: DOCTYPE as a name string (resolved via `DocType.get`) or a ``(name, pubid, sysid)`` tuple.
742 """
743 if isinstance(doctype, basestring):
744 doctype = DocType.get(doctype)
745 self.doctype_event = (DOCTYPE, doctype, (None, -1, -1))
746
747 def __call__(self, stream):
748 doctype_inserted = False
749 for kind, data, pos in stream:
750 if not doctype_inserted:
751 doctype_inserted = True
752 if kind is XML_DECL:
753 yield (kind, data, pos)
754 yield self.doctype_event
755 continue
756 yield self.doctype_event
757
758 yield (kind, data, pos)
759
760 if not doctype_inserted:
761 yield self.doctype_event
Copyright (C) 2012-2017 Edgewall Software