cmlenz@1: # -*- coding: utf-8 -*-
cmlenz@1: #
cmlenz@854: # Copyright (C) 2006-2009 Edgewall Software
cmlenz@1: # All rights reserved.
cmlenz@1: #
cmlenz@1: # This software is licensed as described in the file COPYING, which
cmlenz@1: # you should have received as part of this distribution. The terms
cmlenz@230: # are also available at http://genshi.edgewall.org/wiki/License.
cmlenz@1: #
cmlenz@1: # This software consists of voluntary contributions made by many
cmlenz@1: # individuals. For the exact contribution history, see the revision
cmlenz@230: # history and logs, available at http://genshi.edgewall.org/log/.
cmlenz@1: 
cmlenz@1: """This module provides different kinds of serialization methods for XML event
cmlenz@1: streams.
cmlenz@1: """
cmlenz@1: 
cmlenz@123: from itertools import chain
cmlenz@123: import re
cmlenz@1: 
cmlenz@410: from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind
cmlenz@460: from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \
cmlenz@402:                         START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE
cmlenz@1: 
# Public names of this module (what ``import *`` exposes).
__all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer',
           'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer']
# Docstrings in this module are written in reStructuredText.
__docformat__ = 'restructuredtext en'
cmlenz@1: 
cmlenz@863: 
def encode(iterator, method='xml', encoding=None, out=None):
    """Turn the chunks produced by a serializer into a single string, or
    write them to a file-like object.

    :param iterator: the iterator returned from serializing a stream (basically
                     any iterator that yields unicode objects)
    :param method: the serialization method; determines how characters not
                   representable in the specified encoding are treated
    :param encoding: how the output string should be encoded; if set to `None`,
                     this method returns a `unicode` object
    :param out: a file-like object that the output should be written to
                instead of being returned as one big string; note that if
                this is a file or socket (or similar), the `encoding` must
                not be `None` (that is, the output must be encoded)
    :return: a `str` or `unicode` object (depending on the `encoding`
             parameter), or `None` if the `out` parameter is provided

    :since: version 0.4.1
    :note: Changed in 0.5: added the `out` parameter
    """
    if encoding is None:
        # No target encoding: chunks are passed through untouched.
        def _convert(text):
            return text
    else:
        # Plain-text output replaces unencodable characters, every other
        # method falls back to XML character references.
        if method == 'text' or isinstance(method, TextSerializer):
            error_mode = 'replace'
        else:
            error_mode = 'xmlcharrefreplace'

        def _convert(text):
            return text.encode(encoding, error_mode)

    if out is not None:
        # Streaming mode: encode and write chunk by chunk.
        for chunk in iterator:
            out.write(_convert(chunk))
        return None

    # Collect all chunks first, then join and convert them in one go.
    chunks = list(iterator)
    return _convert(''.join(chunks))
cmlenz@462: 
cmlenz@863: 
def get_serializer(method='xml', **kwargs):
    """Instantiate the serializer that corresponds to the given method.

    :param method: the serialization method; can be either "xml", "xhtml",
                   "html", "text", or a custom serializer class

    Any additional keyword arguments are passed to the serializer, and thus
    depend on the `method` parameter value.

    :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer`
    :since: version 0.4.1
    """
    if not isinstance(method, basestring):
        # Anything that is not a string is used as the factory directly.
        factory = method
    else:
        # Method names are matched case-insensitively; an unknown name
        # raises `KeyError`.
        by_name = {
            'xml': XMLSerializer,
            'xhtml': XHTMLSerializer,
            'html': HTMLSerializer,
            'text': TextSerializer,
        }
        factory = by_name[method.lower()]
    return factory(**kwargs)
cmlenz@462: 
cmlenz@1: 
class DocType(object):
    """Defines a number of commonly used DOCTYPE declarations as constants.

    Every constant is a ``(name, pubid, sysid)`` tuple.
    """

    HTML_STRICT = (
        'html', '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd'
    )
    HTML_TRANSITIONAL = (
        'html', '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd'
    )
    HTML_FRAMESET = (
        'html', '-//W3C//DTD HTML 4.01 Frameset//EN',
        'http://www.w3.org/TR/html4/frameset.dtd'
    )
    HTML = HTML_STRICT

    HTML5 = ('html', None, None)

    XHTML_STRICT = (
        'html', '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
    )
    XHTML_TRANSITIONAL = (
        'html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
    )
    XHTML_FRAMESET = (
        'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
    )
    XHTML = XHTML_STRICT

    XHTML11 = (
        'html', '-//W3C//DTD XHTML 1.1//EN',
        'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
    )

    SVG_FULL = (
        'svg', '-//W3C//DTD SVG 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd'
    )
    SVG_BASIC = (
        'svg', '-//W3C//DTD SVG Basic 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd'
    )
    SVG_TINY = (
        'svg', '-//W3C//DTD SVG Tiny 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd'
    )
    SVG = SVG_FULL

    @classmethod
    def get(cls, name):
        """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
        declaration for the specified name.

        The name is matched case-insensitively. The following names are
        recognized in this version:
         * "html" or "html-strict" for the HTML 4.01 strict DTD
         * "html-transitional" for the HTML 4.01 transitional DTD
         * "html-frameset" for the HTML 4.01 frameset DTD
         * "html5" for the ``DOCTYPE`` proposed for HTML5
         * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
         * "xhtml-transitional" for the XHTML 1.0 transitional DTD
         * "xhtml-frameset" for the XHTML 1.0 frameset DTD
         * "xhtml11" for the XHTML 1.1 DTD
         * "svg" or "svg-full" for the SVG 1.1 DTD
         * "svg-basic" for the SVG Basic 1.1 DTD
         * "svg-tiny" for the SVG Tiny 1.1 DTD

        :param name: the name of the ``DOCTYPE``
        :return: the ``(name, pubid, sysid)`` tuple for the requested
                 ``DOCTYPE``, or ``None`` if the name is not recognized
        :since: version 0.4.1
        """
        # Every constant is looked up through `cls` (never through the
        # hard-coded base class) so that overrides in a subclass are honoured
        # for all names alike.
        return {
            'html': cls.HTML, 'html-strict': cls.HTML_STRICT,
            'html-transitional': cls.HTML_TRANSITIONAL,
            'html-frameset': cls.HTML_FRAMESET,
            'html5': cls.HTML5,
            'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
            'xhtml-transitional': cls.XHTML_TRANSITIONAL,
            'xhtml-frameset': cls.XHTML_FRAMESET,
            'xhtml11': cls.XHTML11,
            'svg': cls.SVG, 'svg-full': cls.SVG_FULL,
            'svg-basic': cls.SVG_BASIC,
            'svg-tiny': cls.SVG_TINY
        }.get(name.lower())
cmlenz@448: 
cmlenz@85: 
class XMLSerializer(object):
    """Produces XML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print(''.join(XMLSerializer()(elem.generate())))
    <div><a href="foo"/><br/><hr noshade="True"/></div>
    """

    _PRESERVE_SPACE = frozenset()

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None, cache=True):
        """Initialize the XML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output, or the name of a DOCTYPE as
                        defined in `DocType.get`
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param namespace_prefixes: optional mapping of namespace URIs to
                                   prefixes, handed to the
                                   `NamespaceFlattener` filter
        :param cache: whether to cache the text output per event, which
                      improves performance for repetitive markup
        :note: Changed in 0.4.2: The  `doctype` parameter can now be a string.
        :note: Changed in 0.6: The `cache` parameter was added
        """
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes,
                                               cache=cache))
        if doctype:
            self.filters.append(DocTypeInserter(doctype))
        self.cache = cache

    def __call__(self, stream):
        """Serialize the given event stream.

        :param stream: the markup event stream to serialize
        :return: a generator yielding the serialized output chunk by chunk
        """
        have_decl = have_doctype = False
        in_cdata = False

        # Per-call memo of rendered events, keyed on ``(kind, data)``.
        cache = {}
        cache_get = cache.get
        if self.cache:
            def _emit(kind, input, output):
                cache[kind, input] = output
                return output
        else:
            def _emit(kind, input, output):
                return output

        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:
            if kind is TEXT and (in_cdata or isinstance(data, Markup)):
                # The cache key does not record whether text was rendered
                # inside a CDATA section or whether it was `Markup`, so such
                # text must neither be stored in nor served from the cache:
                # otherwise equal plain text elsewhere in the stream could be
                # emitted unescaped (or the other way round).
                if in_cdata:
                    yield data
                else:
                    yield escape(data, quotes=False)
                continue

            cached = cache_get((kind, data))
            if cached is not None:
                yield cached

            elif kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    buf += [' ', attr, '="', escape(value), '"']
                if kind is EMPTY:
                    buf.append('/>')
                else:
                    buf.append('>')
                yield _emit(kind, data, Markup(''.join(buf)))

            elif kind is END:
                yield _emit(kind, data, Markup('</%s>' % data))

            elif kind is TEXT:
                # Plain text outside of CDATA: escape, and remember the result.
                yield _emit(kind, data, escape(data, quotes=False))

            elif kind is COMMENT:
                yield _emit(kind, data, Markup('<!--%s-->' % data))

            elif kind is XML_DECL and not have_decl:
                # Only the first XML declaration is written.
                version, encoding, standalone = data
                buf = ['<?xml version="%s"' % version]
                if encoding:
                    buf.append(' encoding="%s"' % encoding)
                if standalone != -1:
                    if standalone:
                        standalone = 'yes'
                    else:
                        standalone = 'no'
                    buf.append(' standalone="%s"' % standalone)
                buf.append('?>\n')
                yield Markup(''.join(buf))
                have_decl = True

            elif kind is DOCTYPE and not have_doctype:
                # Only the first DOCTYPE is written. The template gets one
                # ``%s`` slot per non-empty part of the declaration.
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(''.join(buf)) % tuple([p for p in data if p])
                have_doctype = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield _emit(kind, data, Markup('<?%s %s?>' % data))
cmlenz@1: 
class XHTMLSerializer(XMLSerializer):
    """Produces XHTML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print(''.join(XHTMLSerializer()(elem.generate())))
    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
    """

    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])
    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                'defer', 'disabled', 'ismap', 'multiple',
                                'nohref', 'noresize', 'noshade', 'nowrap'])
    _PRESERVE_SPACE = frozenset([
        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
    ])

    def __init__(self, doctype=None, strip_whitespace=True,
                 namespace_prefixes=None, drop_xml_decl=True, cache=True):
        """Initialize the XHTML serializer.

        :param doctype: the DOCTYPE declaration to include at the top of the
                        generated output (passed on to `DocTypeInserter`)
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param namespace_prefixes: optional mapping of namespace URIs to
                                   prefixes; the XHTML namespace is always
                                   mapped to the empty prefix
        :param drop_xml_decl: whether XML declarations found in the stream
                              should be left out of the output
        :param cache: whether to cache the text output per event, which
                      improves performance for repetitive markup
        """
        super(XHTMLSerializer, self).__init__(doctype, False)
        # The filter chain set up by the base class is replaced wholesale.
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
        # Work on a copy so that the mapping passed in by the caller is not
        # modified as a side effect of creating the serializer.
        prefixes = dict(namespace_prefixes or {})
        prefixes['http://www.w3.org/1999/xhtml'] = ''
        self.filters.append(NamespaceFlattener(prefixes=prefixes,
                                               cache=cache))
        if doctype:
            self.filters.append(DocTypeInserter(doctype))
        self.drop_xml_decl = drop_xml_decl
        self.cache = cache

    def __call__(self, stream):
        """Serialize the given event stream.

        :param stream: the markup event stream to serialize
        :return: a generator yielding the serialized output chunk by chunk
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        drop_xml_decl = self.drop_xml_decl
        have_decl = have_doctype = False
        in_cdata = False

        # Per-call memo of rendered events, keyed on ``(kind, data)``.
        cache = {}
        cache_get = cache.get
        if self.cache:
            def _emit(kind, input, output):
                cache[kind, input] = output
                return output
        else:
            def _emit(kind, input, output):
                return output

        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, pos in stream:
            if kind is TEXT and (in_cdata or isinstance(data, Markup)):
                # The cache key does not record whether text was rendered
                # inside a CDATA section or whether it was `Markup`, so such
                # text must neither be stored in nor served from the cache:
                # otherwise equal plain text elsewhere in the stream could be
                # emitted unescaped (or the other way round).
                if in_cdata:
                    yield data
                else:
                    yield escape(data, quotes=False)
                continue

            cached = cache_get((kind, data))
            if cached is not None:
                yield cached

            elif kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        # Boolean attributes are written as name="name".
                        value = attr
                    elif attr == 'xml:lang' and 'lang' not in attrib:
                        # Emit a plain `lang` attribute in addition to the
                        # `xml:lang` one written below.
                        buf += [' lang="', escape(value), '"']
                    elif attr == 'xml:space':
                        continue
                    buf += [' ', attr, '="', escape(value), '"']
                if kind is EMPTY:
                    if tag in empty_elems:
                        buf.append(' />')
                    else:
                        # Other elements get an explicit end tag.
                        buf.append('></%s>' % tag)
                else:
                    buf.append('>')
                yield _emit(kind, data, Markup(''.join(buf)))

            elif kind is END:
                yield _emit(kind, data, Markup('</%s>' % data))

            elif kind is TEXT:
                # Plain text outside of CDATA: escape, and remember the result.
                yield _emit(kind, data, escape(data, quotes=False))

            elif kind is COMMENT:
                yield _emit(kind, data, Markup('<!--%s-->' % data))

            elif kind is DOCTYPE and not have_doctype:
                # Only the first DOCTYPE is written. The template gets one
                # ``%s`` slot per non-empty part of the declaration.
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(''.join(buf)) % tuple([p for p in data if p])
                have_doctype = True

            elif kind is XML_DECL and not have_decl and not drop_xml_decl:
                # Only the first XML declaration is written, and only when
                # the serializer was told not to drop it.
                version, encoding, standalone = data
                buf = ['<?xml version="%s"' % version]
                if encoding:
                    buf.append(' encoding="%s"' % encoding)
                if standalone != -1:
                    if standalone:
                        standalone = 'yes'
                    else:
                        standalone = 'no'
                    buf.append(' standalone="%s"' % standalone)
                buf.append('?>\n')
                yield Markup(''.join(buf))
                have_decl = True

            elif kind is START_CDATA:
                yield Markup('<![CDATA[')
                in_cdata = True

            elif kind is END_CDATA:
                yield Markup(']]>')
                in_cdata = False

            elif kind is PI:
                yield _emit(kind, data, Markup('<?%s %s?>' % data))
cmlenz@105: 
cmlenz@96: 
class HTMLSerializer(XHTMLSerializer):
    """Produces HTML text from an event stream.

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print(''.join(HTMLSerializer()(elem.generate())))
    <div><a href="foo"></a><br><hr noshade></div>
    """

    _NOESCAPE_ELEMS = frozenset([
        QName('script'), QName('http://www.w3.org/1999/xhtml}script'),
        QName('style'), QName('http://www.w3.org/1999/xhtml}style')
    ])

    def __init__(self, doctype=None, strip_whitespace=True, cache=True):
        """Initialize the HTML serializer.

        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
                        DOCTYPE declaration that should be included at the top
                        of the generated output
        :param strip_whitespace: whether extraneous whitespace should be
                                 stripped from the output
        :param cache: whether to cache the text output per event, which
                      improves performance for repetitive markup
        :note: Changed in 0.6: The `cache` parameter was added
        """
        super(HTMLSerializer, self).__init__(doctype, False)
        # The filter chain set up by the base class is replaced wholesale.
        self.filters = [EmptyTagFilter()]
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
                                                 self._NOESCAPE_ELEMS))
        self.filters.append(NamespaceFlattener(prefixes={
            'http://www.w3.org/1999/xhtml': ''
        }, cache=cache))
        if doctype:
            self.filters.append(DocTypeInserter(doctype))
        # Honour the caller's choice instead of unconditionally enabling the
        # output cache.
        self.cache = cache

    def __call__(self, stream):
        """Serialize the given event stream.

        :param stream: the markup event stream to serialize
        :return: a generator yielding the serialized output chunk by chunk
        """
        boolean_attrs = self._BOOLEAN_ATTRS
        empty_elems = self._EMPTY_ELEMS
        noescape_elems = self._NOESCAPE_ELEMS
        have_doctype = False
        # True while inside an element whose text is written unescaped.
        noescape = False

        # Per-call memo of rendered events, keyed on ``(kind, data)``.
        cache = {}
        cache_get = cache.get
        if self.cache:
            def _emit(kind, input, output):
                cache[kind, input] = output
                return output
        else:
            def _emit(kind, input, output):
                return output

        for filter_ in self.filters:
            stream = filter_(stream)
        for kind, data, _ in stream:
            if kind is TEXT and (noescape or isinstance(data, Markup)):
                # The cache key does not record whether text was rendered
                # inside a no-escape element or whether it was `Markup`, so
                # such text must neither be stored in nor served from the
                # cache: otherwise equal plain text elsewhere in the stream
                # could be emitted unescaped (or the other way round).
                if noescape:
                    yield data
                else:
                    yield escape(data, quotes=False)
                continue

            output = cache_get((kind, data))
            if output is not None:
                yield output
                # A cache hit must still update the escaping state.
                if (kind is START or kind is EMPTY) \
                        and data[0] in noescape_elems:
                    noescape = True
                elif kind is END:
                    noescape = False

            elif kind is START or kind is EMPTY:
                tag, attrib = data
                buf = ['<', tag]
                for attr, value in attrib:
                    if attr in boolean_attrs:
                        # Boolean attributes are minimized to their bare name
                        # and left out entirely when the value is false.
                        if value:
                            buf += [' ', attr]
                    elif ':' in attr:
                        # Prefixed attributes are dropped; `xml:lang` is
                        # turned into `lang` unless that is already present.
                        if attr == 'xml:lang' and 'lang' not in attrib:
                            buf += [' lang="', escape(value), '"']
                    elif attr != 'xmlns':
                        buf += [' ', attr, '="', escape(value), '"']
                buf.append('>')
                if kind is EMPTY:
                    if tag not in empty_elems:
                        buf.append('</%s>' % tag)
                yield _emit(kind, data, Markup(''.join(buf)))
                if tag in noescape_elems:
                    noescape = True

            elif kind is END:
                yield _emit(kind, data, Markup('</%s>' % data))
                noescape = False

            elif kind is TEXT:
                # Plain text in a regular element: escape, and remember the
                # result.
                yield _emit(kind, data, escape(data, quotes=False))

            elif kind is COMMENT:
                yield _emit(kind, data, Markup('<!--%s-->' % data))

            elif kind is DOCTYPE and not have_doctype:
                # Only the first DOCTYPE is written. The template gets one
                # ``%s`` slot per non-empty part of the declaration.
                name, pubid, sysid = data
                buf = ['<!DOCTYPE %s']
                if pubid:
                    buf.append(' PUBLIC "%s"')
                elif sysid:
                    buf.append(' SYSTEM')
                if sysid:
                    buf.append(' "%s"')
                buf.append('>\n')
                yield Markup(''.join(buf)) % tuple([p for p in data if p])
                have_doctype = True

            elif kind is PI:
                yield _emit(kind, data, Markup('<?%s %s?>' % data))
cmlenz@105: 
cmlenz@1: 
class TextSerializer(object):
    """Produces plain text from an event stream.

    Only text events are included in the output. Unlike the other serializer,
    special XML characters are not escaped:

    >>> from genshi.builder import tag
    >>> elem = tag.div(tag.a('<Hello!>', href='foo'), tag.br)
    >>> print(elem)
    <div><a href="foo">&lt;Hello!&gt;</a><br/></div>
    >>> print(''.join(TextSerializer()(elem.generate())))
    <Hello!>

    If text events contain literal markup (instances of the `Markup` class),
    that markup is by default passed through unchanged:

    >>> elem = tag.div(Markup('<a href="foo">Hello &amp; Bye!</a><br/>'))
    >>> print(elem.generate().render(TextSerializer, encoding=None))
    <a href="foo">Hello &amp; Bye!</a><br/>

    You can use the ``strip_markup`` to change this behavior, so that tags and
    entities are stripped from the output (or in the case of entities,
    replaced with the equivalent character):

    >>> print(elem.generate().render(TextSerializer, strip_markup=True,
    ...                              encoding=None))
    Hello & Bye!
    """

    def __init__(self, strip_markup=False):
        """Create the serializer.

        :param strip_markup: whether markup (tags and encoded characters) found
                             in the text should be removed
        """
        self.strip_markup = strip_markup

    def __call__(self, stream):
        """Yield the content of every text event in the stream as unicode.

        :param stream: the markup event stream to serialize
        :return: a generator yielding one unicode string per text event
        """
        strip = self.strip_markup
        for event in stream:
            # Everything that is not text is ignored.
            if event[0] is not TEXT:
                continue
            text = event[1]
            if strip and type(text) is Markup:
                # Remove tags first, then resolve the entities.
                text = text.striptags().stripentities()
            yield unicode(text)
cmlenz@200: 
cmlenz@200: 
class EmptyTagFilter(object):
    """Stream filter that merges a `START` event that is immediately followed
    by an `END` event into a single `EMPTY` event.
    """

    EMPTY = StreamEventKind('EMPTY')

    def __call__(self, stream):
        """Filter the given stream.

        :param stream: the markup event stream to filter
        :return: a generator over the events, with each `START` directly
                 followed by an `END` replaced by one `EMPTY` event
        """
        # A START event is withheld until the event after it is known.
        pending = None
        for event in stream:
            if pending is not None:
                held, pending = pending, None
                if event[0] is END:
                    # START directly followed by END: emit one EMPTY event
                    # carrying the data and position of the START event.
                    yield EMPTY, held[1], held[2]
                    continue
                yield held
            if event[0] is START:
                pending = event
            else:
                yield event
cmlenz@212: 
cmlenz@212: 
# Module-level alias, so that code in this module can test ``kind is EMPTY``.
EMPTY = EmptyTagFilter.EMPTY
cmlenz@212: 
cmlenz@212: 
cmlenz@410: class NamespaceFlattener(object):
cmlenz@410:     r"""Output stream filter that removes namespace information from the stream,
cmlenz@410:     instead adding namespace attributes and prefixes as needed.
cmlenz@410:     
cmlenz@425:     :param prefixes: optional mapping of namespace URIs to prefixes
cmlenz@410:     
cmlenz@410:     >>> from genshi.input import XML
cmlenz@410:     >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
cmlenz@410:     ...   <two:item/>
cmlenz@410:     ... </doc>''')
cmlenz@410:     >>> for kind, data, pos in NamespaceFlattener()(xml):
cmlenz@853:     ...     print('%s %r' % (kind, data))
cmlenz@852:     START (u'doc', Attrs([('xmlns', u'NS1'), (u'xmlns:two', u'NS2')]))
cmlenz@410:     TEXT u'\n  '
cmlenz@410:     START (u'two:item', Attrs())
cmlenz@410:     END u'two:item'
cmlenz@410:     TEXT u'\n'
cmlenz@410:     END u'doc'
cmlenz@410:     """
cmlenz@410: 
cmlenz@829:     def __init__(self, prefixes=None, cache=True):
cmlenz@410:         self.prefixes = {XML_NAMESPACE.uri: 'xml'}
cmlenz@410:         if prefixes is not None:
cmlenz@410:             self.prefixes.update(prefixes)
cmlenz@829:         self.cache = cache
cmlenz@410: 
cmlenz@410:     def __call__(self, stream):
cmlenz@829:         cache = {}
cmlenz@829:         cache_get = cache.get
cmlenz@829:         if self.cache:
cmlenz@829:             def _emit(kind, input, output, pos):
cmlenz@829:                 cache[kind, input] = output
cmlenz@829:                 return kind, output, pos
cmlenz@829:         else:
cmlenz@829:             def _emit(kind, input, output, pos):
cmlenz@829:                 return output
cmlenz@829: 
cmlenz@410:         prefixes = dict([(v, [k]) for k, v in self.prefixes.items()])
cmlenz@410:         namespaces = {XML_NAMESPACE.uri: ['xml']}
cmlenz@410:         def _push_ns(prefix, uri):
cmlenz@410:             namespaces.setdefault(uri, []).append(prefix)
cmlenz@410:             prefixes.setdefault(prefix, []).append(uri)
cmlenz@829:             cache.clear()
cmlenz@829:         def _pop_ns(prefix):
cmlenz@829:             uris = prefixes.get(prefix)
cmlenz@829:             uri = uris.pop()
cmlenz@829:             if not uris:
cmlenz@829:                 del prefixes[prefix]
cmlenz@829:             if uri not in uris or uri != uris[-1]:
cmlenz@829:                 uri_prefixes = namespaces[uri]
cmlenz@829:                 uri_prefixes.pop()
cmlenz@829:                 if not uri_prefixes:
cmlenz@829:                     del namespaces[uri]
cmlenz@829:             cache.clear()
cmlenz@829:             return uri
cmlenz@410: 
cmlenz@410:         ns_attrs = []
cmlenz@410:         _push_ns_attr = ns_attrs.append
cmlenz@437:         def _make_ns_attr(prefix, uri):
cmlenz@852:             return 'xmlns%s' % (prefix and ':%s' % prefix or ''), uri
cmlenz@410: 
cmlenz@410:         def _gen_prefix():
cmlenz@410:             val = 0
cmlenz@410:             while 1:
cmlenz@410:                 val += 1
cmlenz@410:                 yield 'ns%d' % val
cmlenz@410:         _gen_prefix = _gen_prefix().next
cmlenz@410: 
cmlenz@410:         for kind, data, pos in stream:
cmlenz@829:             output = cache_get((kind, data))
cmlenz@829:             if output is not None:
cmlenz@829:                 yield kind, output, pos
cmlenz@410: 
cmlenz@829:             elif kind is START or kind is EMPTY:
cmlenz@410:                 tag, attrs = data
cmlenz@410: 
cmlenz@410:                 tagname = tag.localname
cmlenz@410:                 tagns = tag.namespace
cmlenz@410:                 if tagns:
cmlenz@410:                     if tagns in namespaces:
cmlenz@410:                         prefix = namespaces[tagns][-1]
cmlenz@410:                         if prefix:
cmlenz@852:                             tagname = '%s:%s' % (prefix, tagname)
cmlenz@410:                     else:
cmlenz@852:                         _push_ns_attr(('xmlns', tagns))
cmlenz@410:                         _push_ns('', tagns)
cmlenz@410: 
cmlenz@410:                 new_attrs = []
cmlenz@410:                 for attr, value in attrs:
cmlenz@410:                     attrname = attr.localname
cmlenz@410:                     attrns = attr.namespace
cmlenz@410:                     if attrns:
cmlenz@410:                         if attrns not in namespaces:
cmlenz@410:                             prefix = _gen_prefix()
cmlenz@410:                             _push_ns(prefix, attrns)
cmlenz@412:                             _push_ns_attr(('xmlns:%s' % prefix, attrns))
cmlenz@410:                         else:
cmlenz@410:                             prefix = namespaces[attrns][-1]
cmlenz@410:                         if prefix:
cmlenz@852:                             attrname = '%s:%s' % (prefix, attrname)
cmlenz@410:                     new_attrs.append((attrname, value))
cmlenz@410: 
cmlenz@829:                 yield _emit(kind, data, (tagname, Attrs(ns_attrs + new_attrs)), pos)
cmlenz@410:                 del ns_attrs[:]
cmlenz@410: 
cmlenz@410:             elif kind is END:
cmlenz@410:                 tagname = data.localname
cmlenz@410:                 tagns = data.namespace
cmlenz@410:                 if tagns:
cmlenz@410:                     prefix = namespaces[tagns][-1]
cmlenz@410:                     if prefix:
cmlenz@852:                         tagname = '%s:%s' % (prefix, tagname)
cmlenz@829:                 yield _emit(kind, data, tagname, pos)
cmlenz@410: 
cmlenz@410:             elif kind is START_NS:
cmlenz@410:                 prefix, uri = data
cmlenz@410:                 if uri not in namespaces:
cmlenz@410:                     prefix = prefixes.get(uri, [prefix])[-1]
cmlenz@437:                     _push_ns_attr(_make_ns_attr(prefix, uri))
cmlenz@410:                 _push_ns(prefix, uri)
cmlenz@410: 
cmlenz@410:             elif kind is END_NS:
cmlenz@410:                 if data in prefixes:
cmlenz@829:                     uri = _pop_ns(data)
cmlenz@437:                     if ns_attrs:
cmlenz@437:                         attr = _make_ns_attr(data, uri)
cmlenz@437:                         if attr in ns_attrs:
cmlenz@437:                             ns_attrs.remove(attr)
cmlenz@410: 
cmlenz@410:             else:
cmlenz@410:                 yield kind, data, pos
cmlenz@410: 
cmlenz@410: 
class WhitespaceFilter(object):
    """Stream filter that strips extraneous ignorable white space.

    Consecutive text events are merged into a single event. Unless white
    space is being preserved, spaces and tabs directly in front of a line
    break are removed and runs of line breaks are collapsed into one.
    """

    def __init__(self, preserve=None, noescape=None):
        """Initialize the filter.

        :param preserve: a set or sequence of tag names for which white-space
                         should be preserved
        :param noescape: a set or sequence of tag names for which text content
                         should not be escaped

        Elements named in `noescape` are expected to have no child elements
        (for example ``<style>`` or ``<script>`` in HTML documents).
        """
        if preserve is not None:
            self.preserve = frozenset(preserve)
        else:
            self.preserve = frozenset()
        if noescape is not None:
            self.noescape = frozenset(noescape)
        else:
            self.noescape = frozenset()

    def __call__(self, stream, ctxt=None, space=XML_NAMESPACE['space'],
                 trim_trailing_space=re.compile('[ \t]+(?=\n)').sub,
                 collapse_lines=re.compile('\n{2,}').sub):
        """Apply the filter to the given stream.

        :param stream: the event stream to process
        :param ctxt: not used by this filter
        :param space: the attribute name checked for the value ``preserve``
        :param trim_trailing_space: callable removing spaces and tabs that
                                    precede a line break
        :param collapse_lines: callable collapsing runs of line breaks
        :return: a generator of ``(kind, data, pos)`` events
        """
        join_markup = Markup('').join
        keep_space_tags = self.preserve
        raw_text_tags = self.noescape

        # Nesting depth below an element that asked for preserved white space
        keep_depth = 0
        # Whether text is currently taken over without escaping
        raw = False
        # Text collected since the last non-text event
        pending = []

        # The trailing dummy event makes sure buffered text gets flushed
        for kind, data, pos in chain(stream, [(None, None, None)]):

            if kind is TEXT:
                if raw:
                    data = Markup(data)
                pending.append(data)
                continue

            if pending:
                if len(pending) == 1:
                    text = escape(pending.pop(), quotes=False)
                else:
                    text = join_markup(pending, escape_quotes=False)
                    del pending[:]
                if not keep_depth:
                    text = trim_trailing_space('', text)
                    text = collapse_lines('\n', text)
                yield TEXT, Markup(text), pos

            if kind is START:
                tag, attrs = data
                if (keep_depth or tag in keep_space_tags
                        or attrs.get(space) == 'preserve'):
                    keep_depth += 1
                raw = raw or tag in raw_text_tags

            elif kind is END:
                raw = False
                if keep_depth:
                    keep_depth -= 1

            elif kind is START_CDATA:
                raw = True

            elif kind is END_CDATA:
                raw = False

            # The dummy end-of-stream event has no kind and is not passed on
            if kind:
                yield kind, data, pos
athomas@671: 
athomas@671: 
class DocTypeInserter(object):
    """Stream filter that puts the DOCTYPE declaration in the right place:
    directly after the XML declaration when the stream starts with one, and
    at the very beginning of the stream otherwise.
    """
    def __init__(self, doctype):
        """Initialize the filter.

        :param doctype: DOCTYPE as a string or DocType object.
        """
        resolved = doctype
        if isinstance(resolved, basestring):
            # A string is resolved through `DocType.get`
            resolved = DocType.get(resolved)
        self.doctype_event = (DOCTYPE, resolved, (None, -1, -1))

    def __call__(self, stream):
        """Yield the events of `stream` with the DOCTYPE event inserted.

        :param stream: the event stream to process
        :return: a generator of ``(kind, data, pos)`` events
        """
        awaiting_first = True
        for kind, data, pos in stream:
            event = (kind, data, pos)
            if not awaiting_first:
                yield event
            else:
                # Only the very first event decides where the DOCTYPE goes
                awaiting_first = False
                if kind is XML_DECL:
                    yield event
                    yield self.doctype_event
                else:
                    yield self.doctype_event
                    yield event

        # An empty stream still gets the DOCTYPE
        if awaiting_first:
            yield self.doctype_event