def encode(iterator, method='xml', encoding='utf-8', out=None):
    """Encode serializer output into a string.

    :param iterator: the iterator returned from serializing a stream (basically
                     any iterator that yields unicode objects)
    :param method: the serialization method; determines how characters not
                   representable in the specified encoding are treated. May be
                   the method name, a serializer instance, or a serializer
                   class
    :param encoding: how the output string should be encoded; if set to `None`,
                     this method returns a `unicode` object
    :param out: a file-like object that the output should be written to
                instead of being returned as one big string; note that if
                this is a file or socket (or similar), the `encoding` must
                not be `None` (that is, the output must be encoded)
    :return: a `str` or `unicode` object (depending on the `encoding`
             parameter), or `None` if the `out` parameter is provided

    :since: version 0.4.1
    :note: Changed in 0.5: added the `out` parameter
    """
    if encoding is not None:
        # The plain string check comes first so that the common case never
        # needs to consult the serializer class at all.
        is_text = method == 'text'
        if not is_text:
            is_text = isinstance(method, TextSerializer)
        if not is_text:
            # A serializer may also be given as a class rather than as an
            # instance; `issubclass` raises `TypeError` for anything that is
            # not a class (method names, serializer instances).
            try:
                is_text = issubclass(method, TextSerializer)
            except TypeError:
                is_text = False

        # Plain text has no character references, so unencodable characters
        # are replaced; markup output keeps them as numeric references.
        if is_text:
            errors = 'replace'
        else:
            errors = 'xmlcharrefreplace'

        def _encode(string):
            return string.encode(encoding, errors)
    else:
        def _encode(string):
            return string

    if out is None:
        # `join` consumes the iterator directly; no intermediate list needed.
        return _encode(u''.join(iterator))
    for chunk in iterator:
        out.write(_encode(chunk))
class DocType(object):
    """Defines a number of commonly used DOCTYPE declarations as constants.

    Every constant is a ``(name, pubid, sysid)`` tuple. Use `DocType.get` to
    look one up by its short name.
    """

    HTML_STRICT = (
        'html', '-//W3C//DTD HTML 4.01//EN',
        'http://www.w3.org/TR/html4/strict.dtd'
    )
    HTML_TRANSITIONAL = (
        'html', '-//W3C//DTD HTML 4.01 Transitional//EN',
        'http://www.w3.org/TR/html4/loose.dtd'
    )
    HTML_FRAMESET = (
        'html', '-//W3C//DTD HTML 4.01 Frameset//EN',
        'http://www.w3.org/TR/html4/frameset.dtd'
    )
    HTML = HTML_STRICT

    HTML5 = ('html', None, None)

    XHTML_STRICT = (
        'html', '-//W3C//DTD XHTML 1.0 Strict//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
    )
    XHTML_TRANSITIONAL = (
        'html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
    )
    XHTML_FRAMESET = (
        'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
    )
    XHTML = XHTML_STRICT

    SVG_FULL = (
        'svg', '-//W3C//DTD SVG 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd'
    )
    SVG_BASIC = (
        'svg', '-//W3C//DTD SVG Basic 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd'
    )
    SVG_TINY = (
        'svg', '-//W3C//DTD SVG Tiny 1.1//EN',
        'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd'
    )
    SVG = SVG_FULL

    def get(cls, name):
        """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
        declaration for the specified name.

        The following names are recognized in this version:
         * "html" or "html-strict" for the HTML 4.01 strict DTD
         * "html-transitional" for the HTML 4.01 transitional DTD
         * "html-frameset" for the HTML 4.01 frameset DTD
         * "html5" for the ``DOCTYPE`` proposed for HTML5
         * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
         * "xhtml-transitional" for the XHTML 1.0 transitional DTD
         * "xhtml-frameset" for the XHTML 1.0 frameset DTD
         * "svg" or "svg-full" for the SVG 1.1 DTD
         * "svg-basic" for the SVG Basic 1.1 DTD
         * "svg-tiny" for the SVG Tiny 1.1 DTD

        The lookup is case-insensitive.

        :param name: the name of the ``DOCTYPE``
        :return: the ``(name, pubid, sysid)`` tuple for the requested
                 ``DOCTYPE``, or ``None`` if the name is not recognized
        :since: version 0.4.1
        """
        # Every entry is resolved through `cls` so that a subclass overriding
        # one of the constants is honoured for all names alike.
        return {
            'html': cls.HTML, 'html-strict': cls.HTML_STRICT,
            'html-transitional': cls.HTML_TRANSITIONAL,
            'html-frameset': cls.HTML_FRAMESET,
            'html5': cls.HTML5,
            'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
            'xhtml-transitional': cls.XHTML_TRANSITIONAL,
            'xhtml-frameset': cls.XHTML_FRAMESET,
            'svg': cls.SVG, 'svg-full': cls.SVG_FULL,
            'svg-basic': cls.SVG_BASIC,
            'svg-tiny': cls.SVG_TINY
        }.get(name.lower())
    # Kept in the pre-decorator form used by the rest of the file.
    get = classmethod(get)
cmlenz@1: cmlenz@230: >>> from genshi.builder import tag cmlenz@20: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@123: >>> print ''.join(XMLSerializer()(elem.generate())) cmlenz@1:
cmlenz@1: """ cmlenz@123: cmlenz@123: _PRESERVE_SPACE = frozenset() cmlenz@123: cmlenz@410: def __init__(self, doctype=None, strip_whitespace=True, cmlenz@410: namespace_prefixes=None): cmlenz@85: """Initialize the XML serializer. cmlenz@85: cmlenz@425: :param doctype: a ``(name, pubid, sysid)`` tuple that represents the cmlenz@425: DOCTYPE declaration that should be included at the top cmlenz@494: of the generated output, or the name of a DOCTYPE as cmlenz@494: defined in `DocType.get` cmlenz@425: :param strip_whitespace: whether extraneous whitespace should be cmlenz@425: stripped from the output cmlenz@494: :note: Changed in 0.4.2: The `doctype` parameter can now be a string. cmlenz@85: """ cmlenz@212: self.filters = [EmptyTagFilter()] cmlenz@123: if strip_whitespace: cmlenz@123: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) cmlenz@410: self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) athomas@671: if doctype: athomas@671: self.filters.append(DocTypeInserter(doctype)) cmlenz@1: cmlenz@123: def __call__(self, stream): cmlenz@460: have_decl = have_doctype = False cmlenz@143: in_cdata = False cmlenz@1: cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@1: for kind, data, pos in stream: cmlenz@1: cmlenz@212: if kind is START or kind is EMPTY: cmlenz@1: tag, attrib = data cmlenz@410: buf = ['<', tag] cmlenz@397: for attr, value in attrib: cmlenz@410: buf += [' ', attr, '="', escape(value), '"'] cmlenz@397: buf.append(kind is EMPTY and '/>' or '>') cmlenz@397: yield Markup(u''.join(buf)) cmlenz@1: cmlenz@69: elif kind is END: cmlenz@410: yield Markup('%s>' % data) cmlenz@1: cmlenz@69: elif kind is TEXT: cmlenz@143: if in_cdata: cmlenz@143: yield data cmlenz@143: else: cmlenz@143: yield escape(data, quotes=False) cmlenz@1: cmlenz@89: elif kind is COMMENT: cmlenz@89: yield Markup('' % data) cmlenz@89: cmlenz@460: elif kind is XML_DECL and not have_decl: cmlenz@460: version, encoding, standalone = data 
cmlenz@460: buf = ['\n') cmlenz@460: yield Markup(u''.join(buf)) cmlenz@460: have_decl = True cmlenz@460: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@713: yield Markup(u''.join(buf)) % filter(None, data) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@143: elif kind is START_CDATA: cmlenz@143: yield Markup('') cmlenz@143: in_cdata = False cmlenz@143: cmlenz@105: elif kind is PI: cmlenz@105: yield Markup('%s %s?>' % data) cmlenz@105: cmlenz@1: cmlenz@96: class XHTMLSerializer(XMLSerializer): cmlenz@96: """Produces XHTML text from an event stream. cmlenz@1: cmlenz@230: >>> from genshi.builder import tag cmlenz@20: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@123: >>> print ''.join(XHTMLSerializer()(elem.generate())) cmlenz@96: cmlenz@1: """ cmlenz@1: cmlenz@1: _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', cmlenz@1: 'hr', 'img', 'input', 'isindex', 'link', 'meta', cmlenz@1: 'param']) cmlenz@1: _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', cmlenz@1: 'defer', 'disabled', 'ismap', 'multiple', cmlenz@1: 'nohref', 'noresize', 'noshade', 'nowrap']) cmlenz@346: _PRESERVE_SPACE = frozenset([ cmlenz@346: QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), cmlenz@346: QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') cmlenz@346: ]) cmlenz@1: cmlenz@410: def __init__(self, doctype=None, strip_whitespace=True, cmlenz@410: namespace_prefixes=None): cmlenz@410: super(XHTMLSerializer, self).__init__(doctype, False) cmlenz@410: self.filters = [EmptyTagFilter()] cmlenz@410: if strip_whitespace: cmlenz@410: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) cmlenz@410: namespace_prefixes = namespace_prefixes or {} cmlenz@410: namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' cmlenz@410: self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) athomas@671: if doctype: 
athomas@671: self.filters.append(DocTypeInserter(doctype)) cmlenz@410: cmlenz@123: def __call__(self, stream): cmlenz@136: boolean_attrs = self._BOOLEAN_ATTRS cmlenz@136: empty_elems = self._EMPTY_ELEMS cmlenz@85: have_doctype = False cmlenz@143: in_cdata = False cmlenz@1: cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@1: for kind, data, pos in stream: cmlenz@1: cmlenz@212: if kind is START or kind is EMPTY: cmlenz@1: tag, attrib = data cmlenz@410: buf = ['<', tag] cmlenz@397: for attr, value in attrib: cmlenz@410: if attr in boolean_attrs: cmlenz@410: value = attr cmlenz@524: elif attr == u'xml:lang' and u'lang' not in attrib: cmlenz@524: buf += [' lang="', escape(value), '"'] cmlenz@689: elif attr == u'xml:space': cmlenz@689: continue cmlenz@410: buf += [' ', attr, '="', escape(value), '"'] cmlenz@212: if kind is EMPTY: cmlenz@410: if tag in empty_elems: cmlenz@397: buf.append(' />') cmlenz@96: else: cmlenz@410: buf.append('>%s>' % tag) cmlenz@141: else: cmlenz@397: buf.append('>') cmlenz@397: yield Markup(u''.join(buf)) cmlenz@96: cmlenz@96: elif kind is END: cmlenz@410: yield Markup('%s>' % data) cmlenz@96: cmlenz@96: elif kind is TEXT: cmlenz@143: if in_cdata: cmlenz@143: yield data cmlenz@143: else: cmlenz@143: yield escape(data, quotes=False) cmlenz@96: cmlenz@96: elif kind is COMMENT: cmlenz@96: yield Markup('' % data) cmlenz@96: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@713: yield Markup(u''.join(buf)) % filter(None, data) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@143: elif kind is START_CDATA: cmlenz@143: yield Markup('') cmlenz@143: in_cdata = False cmlenz@143: cmlenz@105: elif kind is PI: cmlenz@105: yield Markup('%s %s?>' % data) cmlenz@105: cmlenz@96: cmlenz@96: class HTMLSerializer(XHTMLSerializer): cmlenz@96: """Produces HTML text from an event stream. 
cmlenz@96: cmlenz@230: >>> from genshi.builder import tag cmlenz@96: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@123: >>> print ''.join(HTMLSerializer()(elem.generate())) cmlenz@96: cmlenz@96: """ cmlenz@96: cmlenz@410: _NOESCAPE_ELEMS = frozenset([ cmlenz@410: QName('script'), QName('http://www.w3.org/1999/xhtml}script'), cmlenz@410: QName('style'), QName('http://www.w3.org/1999/xhtml}style') cmlenz@410: ]) cmlenz@141: cmlenz@141: def __init__(self, doctype=None, strip_whitespace=True): cmlenz@141: """Initialize the HTML serializer. cmlenz@141: cmlenz@425: :param doctype: a ``(name, pubid, sysid)`` tuple that represents the cmlenz@425: DOCTYPE declaration that should be included at the top cmlenz@425: of the generated output cmlenz@425: :param strip_whitespace: whether extraneous whitespace should be cmlenz@425: stripped from the output cmlenz@141: """ cmlenz@141: super(HTMLSerializer, self).__init__(doctype, False) cmlenz@410: self.filters = [EmptyTagFilter()] cmlenz@141: if strip_whitespace: cmlenz@141: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, cmlenz@305: self._NOESCAPE_ELEMS)) cmlenz@524: self.filters.append(NamespaceFlattener(prefixes={ cmlenz@524: 'http://www.w3.org/1999/xhtml': '' cmlenz@524: })) athomas@671: if doctype: athomas@671: self.filters.append(DocTypeInserter(doctype)) cmlenz@141: cmlenz@123: def __call__(self, stream): cmlenz@136: boolean_attrs = self._BOOLEAN_ATTRS cmlenz@136: empty_elems = self._EMPTY_ELEMS cmlenz@141: noescape_elems = self._NOESCAPE_ELEMS cmlenz@96: have_doctype = False cmlenz@141: noescape = False cmlenz@96: cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@96: for kind, data, pos in stream: cmlenz@96: cmlenz@212: if kind is START or kind is EMPTY: cmlenz@96: tag, attrib = data cmlenz@410: buf = ['<', tag] cmlenz@410: for attr, value in attrib: cmlenz@410: if attr in boolean_attrs: cmlenz@410: if value: cmlenz@410: buf += [' ', attr] 
cmlenz@524: elif ':' in attr: cmlenz@524: if attr == 'xml:lang' and u'lang' not in attrib: cmlenz@524: buf += [' lang="', escape(value), '"'] cmlenz@524: elif attr != 'xmlns': cmlenz@410: buf += [' ', attr, '="', escape(value), '"'] cmlenz@410: buf.append('>') cmlenz@410: if kind is EMPTY: cmlenz@410: if tag not in empty_elems: cmlenz@410: buf.append('%s>' % tag) cmlenz@410: yield Markup(u''.join(buf)) cmlenz@410: if tag in noescape_elems: cmlenz@410: noescape = True cmlenz@141: cmlenz@69: elif kind is END: cmlenz@410: yield Markup('%s>' % data) cmlenz@141: noescape = False cmlenz@141: cmlenz@69: elif kind is TEXT: cmlenz@141: if noescape: cmlenz@141: yield data cmlenz@141: else: cmlenz@141: yield escape(data, quotes=False) cmlenz@1: cmlenz@89: elif kind is COMMENT: cmlenz@89: yield Markup('' % data) cmlenz@89: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@713: yield Markup(u''.join(buf)) % filter(None, data) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@105: elif kind is PI: cmlenz@105: yield Markup('%s %s?>' % data) cmlenz@105: cmlenz@1: cmlenz@200: class TextSerializer(object): cmlenz@200: """Produces plain text from an event stream. cmlenz@200: cmlenz@200: Only text events are included in the output. Unlike the other serializer, cmlenz@200: special XML characters are not escaped: cmlenz@200: cmlenz@230: >>> from genshi.builder import tag cmlenz@200: >>> elem = tag.div(tag.a('