Mercurial > genshi > genshi-test
view genshi/output.py @ 682:75795aadf920
Assigning to a variable named `data` in a Python code block no longer breaks context lookup. We now use the name `__data__` for internal data, hoping that that name is not as commonly used in templates.
author | cmlenz |
---|---|
date | Wed, 06 Feb 2008 12:18:02 +0000 |
parents | e69d95e80231 |
children | d4c9fa1af3ba 88814cc26d2b 3a4f2fd6f5e2 |
line wrap: on
line source
# -*- coding: utf-8 -*- # # Copyright (C) 2006-2007 Edgewall Software # All rights reserved. # # This software is licensed as described in the file COPYING, which # you should have received as part of this distribution. The terms # are also available at http://genshi.edgewall.org/wiki/License. # # This software consists of voluntary contributions made by many # individuals. For the exact contribution history, see the revision # history and logs, available at http://genshi.edgewall.org/log/. """This module provides different kinds of serialization methods for XML event streams. """ from itertools import chain try: frozenset except NameError: from sets import ImmutableSet as frozenset import re from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \ START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] __docformat__ = 'restructuredtext en' def encode(iterator, method='xml', encoding='utf-8'): """Encode serializer output into a string. :param iterator: the iterator returned from serializing a stream (basically any iterator that yields unicode objects) :param method: the serialization method; determines how characters not representable in the specified encoding are treated :param encoding: how the output string should be encoded; if set to `None`, this method returns a `unicode` object :return: a string or unicode object (depending on the `encoding` parameter) :since: version 0.4.1 """ output = u''.join(list(iterator)) if encoding is not None: errors = 'replace' if method != 'text' and not isinstance(method, TextSerializer): errors = 'xmlcharrefreplace' return output.encode(encoding, errors) return output def get_serializer(method='xml', **kwargs): """Return a serializer object for the given method. :param method: the serialization method; can be either "xml", "xhtml", "html", "text", or a custom serializer class Any additional keyword arguments are passed to the serializer, and thus depend on the `method` parameter value. :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer` :since: version 0.4.1 """ if isinstance(method, basestring): method = {'xml': XMLSerializer, 'xhtml': XHTMLSerializer, 'html': HTMLSerializer, 'text': TextSerializer}[method.lower()] return method(**kwargs) class DocType(object): """Defines a number of commonly used DOCTYPE declarations as constants.""" HTML_STRICT = ( 'html', '-//W3C//DTD HTML 4.01//EN', 'http://www.w3.org/TR/html4/strict.dtd' ) HTML_TRANSITIONAL = ( 'html', '-//W3C//DTD HTML 4.01 Transitional//EN', 'http://www.w3.org/TR/html4/loose.dtd' ) HTML_FRAMESET = ( 'html', '-//W3C//DTD HTML 4.01 Frameset//EN', 'http://www.w3.org/TR/html4/frameset.dtd' ) HTML = HTML_STRICT HTML5 = ('html', None, None) XHTML_STRICT = ( 'html', '-//W3C//DTD XHTML 1.0 Strict//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' ) XHTML_TRANSITIONAL = ( 'html', '-//W3C//DTD XHTML 1.0 Transitional//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' ) XHTML_FRAMESET = ( 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd' ) XHTML = XHTML_STRICT SVG_FULL = ( 'svg', '-//W3C//DTD SVG 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd' ) SVG_BASIC = ( 'svg', '-//W3C//DTD SVG Basic 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd' ) SVG_TINY = ( 'svg', '-//W3C//DTD SVG Tiny 1.1//EN', 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd' ) SVG = SVG_FULL def get(cls, name): """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE`` declaration for the specified name. The following names are recognized in this version: * "html" or "html-strict" for the HTML 4.01 strict DTD * "html-transitional" for the HTML 4.01 transitional DTD * "html-transitional" for the HTML 4.01 frameset DTD * "html5" for the ``DOCTYPE`` proposed for HTML5 * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD * "xhtml-transitional" for the XHTML 1.0 transitional DTD * "xhtml-frameset" for the XHTML 1.0 frameset DTD * "svg" or "svg-full" for the SVG 1.1 DTD * "svg-basic" for the SVG Basic 1.1 DTD * "svg-tiny" for the SVG Tiny 1.1 DTD :param name: the name of the ``DOCTYPE`` :return: the ``(name, pubid, sysid)`` tuple for the requested ``DOCTYPE``, or ``None`` if the name is not recognized :since: version 0.4.1 """ return { 'html': cls.HTML, 'html-strict': cls.HTML_STRICT, 'html-transitional': DocType.HTML_TRANSITIONAL, 'html-frameset': DocType.HTML_FRAMESET, 'html5': cls.HTML5, 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT, 'xhtml-transitional': cls.XHTML_TRANSITIONAL, 'xhtml-frameset': cls.XHTML_FRAMESET, 'svg': cls.SVG, 'svg-full': cls.SVG_FULL, 'svg-basic': cls.SVG_BASIC, 'svg-tiny': cls.SVG_TINY }.get(name.lower()) get = classmethod(get) class XMLSerializer(object): """Produces XML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print ''.join(XMLSerializer()(elem.generate())) <div><a href="foo"/><br/><hr noshade="True"/></div> """ _PRESERVE_SPACE = frozenset() def __init__(self, doctype=None, strip_whitespace=True, namespace_prefixes=None): """Initialize the XML serializer. :param doctype: a ``(name, pubid, sysid)`` tuple that represents the DOCTYPE declaration that should be included at the top of the generated output, or the name of a DOCTYPE as defined in `DocType.get` :param strip_whitespace: whether extraneous whitespace should be stripped from the output :note: Changed in 0.4.2: The `doctype` parameter can now be a string. """ self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) if doctype: self.filters.append(DocTypeInserter(doctype)) def __call__(self, stream): have_decl = have_doctype = False in_cdata = False for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: if kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: buf += [' ', attr, '="', escape(value), '"'] buf.append(kind is EMPTY and '/>' or '>') yield Markup(u''.join(buf)) elif kind is END: yield Markup('</%s>' % data) elif kind is TEXT: if in_cdata: yield data else: yield escape(data, quotes=False) elif kind is COMMENT: yield Markup('<!--%s-->' % data) elif kind is XML_DECL and not have_decl: version, encoding, standalone = data buf = ['<?xml version="%s"' % version] if encoding: buf.append(' encoding="%s"' % encoding) if standalone != -1: standalone = standalone and 'yes' or 'no' buf.append(' standalone="%s"' % standalone) buf.append('?>\n') yield Markup(u''.join(buf)) have_decl = True elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['<!DOCTYPE %s'] if pubid: buf.append(' PUBLIC "%s"') elif sysid: buf.append(' SYSTEM') if sysid: buf.append(' "%s"') buf.append('>\n') yield Markup(u''.join(buf), *filter(None, data)) have_doctype = True elif kind is START_CDATA: yield Markup('<![CDATA[') in_cdata = True elif kind is END_CDATA: yield Markup(']]>') in_cdata = False elif kind is PI: yield Markup('<?%s %s?>' % data) class XHTMLSerializer(XMLSerializer): """Produces XHTML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print ''.join(XHTMLSerializer()(elem.generate())) <div><a href="foo"></a><br /><hr noshade="noshade" /></div> """ _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param']) _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', 'defer', 'disabled', 'ismap', 'multiple', 'nohref', 'noresize', 'noshade', 'nowrap']) _PRESERVE_SPACE = frozenset([ QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') ]) def __init__(self, doctype=None, strip_whitespace=True, namespace_prefixes=None): super(XHTMLSerializer, self).__init__(doctype, False) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) namespace_prefixes = namespace_prefixes or {} namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) if doctype: self.filters.append(DocTypeInserter(doctype)) def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS have_doctype = False in_cdata = False for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: if kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: if attr in boolean_attrs: value = attr elif attr == u'xml:lang' and u'lang' not in attrib: buf += [' lang="', escape(value), '"'] buf += [' ', attr, '="', escape(value), '"'] if kind is EMPTY: if tag in empty_elems: buf.append(' />') else: buf.append('></%s>' % tag) else: buf.append('>') yield Markup(u''.join(buf)) elif kind is END: yield Markup('</%s>' % data) elif kind is TEXT: if in_cdata: yield data else: yield escape(data, quotes=False) elif kind is COMMENT: yield Markup('<!--%s-->' % data) elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['<!DOCTYPE %s'] if pubid: buf.append(' PUBLIC "%s"') elif sysid: buf.append(' SYSTEM') if sysid: buf.append(' "%s"') buf.append('>\n') yield Markup(u''.join(buf), *filter(None, data)) have_doctype = True elif kind is START_CDATA: yield Markup('<![CDATA[') in_cdata = True elif kind is END_CDATA: yield Markup(']]>') in_cdata = False elif kind is PI: yield Markup('<?%s %s?>' % data) class HTMLSerializer(XHTMLSerializer): """Produces HTML text from an event stream. >>> from genshi.builder import tag >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) >>> print ''.join(HTMLSerializer()(elem.generate())) <div><a href="foo"></a><br><hr noshade></div> """ _NOESCAPE_ELEMS = frozenset([ QName('script'), QName('http://www.w3.org/1999/xhtml}script'), QName('style'), QName('http://www.w3.org/1999/xhtml}style') ]) def __init__(self, doctype=None, strip_whitespace=True): """Initialize the HTML serializer. :param doctype: a ``(name, pubid, sysid)`` tuple that represents the DOCTYPE declaration that should be included at the top of the generated output :param strip_whitespace: whether extraneous whitespace should be stripped from the output """ super(HTMLSerializer, self).__init__(doctype, False) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, self._NOESCAPE_ELEMS)) self.filters.append(NamespaceFlattener(prefixes={ 'http://www.w3.org/1999/xhtml': '' })) if doctype: self.filters.append(DocTypeInserter(doctype)) def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS noescape_elems = self._NOESCAPE_ELEMS have_doctype = False noescape = False for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: if kind is START or kind is EMPTY: tag, attrib = data buf = ['<', tag] for attr, value in attrib: if attr in boolean_attrs: if value: buf += [' ', attr] elif ':' in attr: if attr == 'xml:lang' and u'lang' not in attrib: buf += [' lang="', escape(value), '"'] elif attr != 'xmlns': buf += [' ', attr, '="', escape(value), '"'] buf.append('>') if kind is EMPTY: if tag not in empty_elems: buf.append('</%s>' % tag) yield Markup(u''.join(buf)) if tag in noescape_elems: noescape = True elif kind is END: yield Markup('</%s>' % data) noescape = False elif kind is TEXT: if noescape: yield data else: yield escape(data, quotes=False) elif kind is COMMENT: yield Markup('<!--%s-->' % data) elif kind is DOCTYPE and not have_doctype: name, pubid, sysid = data buf = ['<!DOCTYPE %s'] if pubid: buf.append(' PUBLIC "%s"') elif sysid: buf.append(' SYSTEM') if sysid: buf.append(' "%s"') buf.append('>\n') yield Markup(u''.join(buf), *filter(None, data)) have_doctype = True elif kind is PI: yield Markup('<?%s %s?>' % data) class TextSerializer(object): """Produces plain text from an event stream. Only text events are included in the output. Unlike the other serializer, special XML characters are not escaped: >>> from genshi.builder import tag >>> elem = tag.div(tag.a('<Hello!>', href='foo'), tag.br) >>> print elem <div><a href="foo"><Hello!></a><br/></div> >>> print ''.join(TextSerializer()(elem.generate())) <Hello!> If text events contain literal markup (instances of the `Markup` class), that markup is by default passed through unchanged: >>> elem = tag.div(Markup('<a href="foo">Hello & Bye!</a><br/>')) >>> print elem.generate().render(TextSerializer) <a href="foo">Hello & Bye!</a><br/> You can use the `strip_markup` to change this behavior, so that tags and entities are stripped from the output (or in the case of entities, replaced with the equivalent character): >>> print elem.generate().render(TextSerializer, strip_markup=True) Hello & Bye! """ def __init__(self, strip_markup=False): self.strip_markup = strip_markup def __call__(self, stream): strip_markup = self.strip_markup for event in stream: if event[0] is TEXT: data = event[1] if strip_markup and type(data) is Markup: data = data.striptags().stripentities() yield unicode(data) class EmptyTagFilter(object): """Combines `START` and `STOP` events into `EMPTY` events for elements that have no contents. """ EMPTY = StreamEventKind('EMPTY') def __call__(self, stream): prev = (None, None, None) for ev in stream: if prev[0] is START: if ev[0] is END: prev = EMPTY, prev[1], prev[2] yield prev continue else: yield prev if ev[0] is not START: yield ev prev = ev EMPTY = EmptyTagFilter.EMPTY class NamespaceFlattener(object): r"""Output stream filter that removes namespace information from the stream, instead adding namespace attributes and prefixes as needed. :param prefixes: optional mapping of namespace URIs to prefixes >>> from genshi.input import XML >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2"> ... <two:item/> ... </doc>''') >>> for kind, data, pos in NamespaceFlattener()(xml): ... print kind, repr(data) START (u'doc', Attrs([(u'xmlns', u'NS1'), (u'xmlns:two', u'NS2')])) TEXT u'\n ' START (u'two:item', Attrs()) END u'two:item' TEXT u'\n' END u'doc' """ def __init__(self, prefixes=None): self.prefixes = {XML_NAMESPACE.uri: 'xml'} if prefixes is not None: self.prefixes.update(prefixes) def __call__(self, stream): prefixes = dict([(v, [k]) for k, v in self.prefixes.items()]) namespaces = {XML_NAMESPACE.uri: ['xml']} def _push_ns(prefix, uri): namespaces.setdefault(uri, []).append(prefix) prefixes.setdefault(prefix, []).append(uri) ns_attrs = [] _push_ns_attr = ns_attrs.append def _make_ns_attr(prefix, uri): return u'xmlns%s' % (prefix and ':%s' % prefix or ''), uri def _gen_prefix(): val = 0 while 1: val += 1 yield 'ns%d' % val _gen_prefix = _gen_prefix().next for kind, data, pos in stream: if kind is START or kind is EMPTY: tag, attrs = data tagname = tag.localname tagns = tag.namespace if tagns: if tagns in namespaces: prefix = namespaces[tagns][-1] if prefix: tagname = u'%s:%s' % (prefix, tagname) else: _push_ns_attr((u'xmlns', tagns)) _push_ns('', tagns) new_attrs = [] for attr, value in attrs: attrname = attr.localname attrns = attr.namespace if attrns: if attrns not in namespaces: prefix = _gen_prefix() _push_ns(prefix, attrns) _push_ns_attr(('xmlns:%s' % prefix, attrns)) else: prefix = namespaces[attrns][-1] if prefix: attrname = u'%s:%s' % (prefix, attrname) new_attrs.append((attrname, value)) yield kind, (tagname, Attrs(ns_attrs + new_attrs)), pos del ns_attrs[:] elif kind is END: tagname = data.localname tagns = data.namespace if tagns: prefix = namespaces[tagns][-1] if prefix: tagname = u'%s:%s' % (prefix, tagname) yield kind, tagname, pos elif kind is START_NS: prefix, uri = data if uri not in namespaces: prefix = prefixes.get(uri, [prefix])[-1] _push_ns_attr(_make_ns_attr(prefix, uri)) _push_ns(prefix, uri) elif kind is END_NS: if data in prefixes: uris = prefixes.get(data) uri = uris.pop() if not uris: del prefixes[data] if uri not in uris or uri != uris[-1]: uri_prefixes = namespaces[uri] uri_prefixes.pop() if not uri_prefixes: del namespaces[uri] if ns_attrs: attr = _make_ns_attr(data, uri) if attr in ns_attrs: ns_attrs.remove(attr) else: yield kind, data, pos class WhitespaceFilter(object): """A filter that removes extraneous ignorable white space from the stream. """ def __init__(self, preserve=None, noescape=None): """Initialize the filter. :param preserve: a set or sequence of tag names for which white-space should be preserved :param noescape: a set or sequence of tag names for which text content should not be escaped The `noescape` set is expected to refer to elements that cannot contain further child elements (such as ``<style>`` or ``<script>`` in HTML documents). """ if preserve is None: preserve = [] self.preserve = frozenset(preserve) if noescape is None: noescape = [] self.noescape = frozenset(noescape) def __call__(self, stream, ctxt=None, space=XML_NAMESPACE['space'], trim_trailing_space=re.compile('[ \t]+(?=\n)').sub, collapse_lines=re.compile('\n{2,}').sub): mjoin = Markup('').join preserve_elems = self.preserve preserve = 0 noescape_elems = self.noescape noescape = False textbuf = [] push_text = textbuf.append pop_text = textbuf.pop for kind, data, pos in chain(stream, [(None, None, None)]): if kind is TEXT: if noescape: data = Markup(data) push_text(data) else: if textbuf: if len(textbuf) > 1: text = mjoin(textbuf, escape_quotes=False) del textbuf[:] else: text = escape(pop_text(), quotes=False) if not preserve: text = collapse_lines('\n', trim_trailing_space('', text)) yield TEXT, Markup(text), pos if kind is START: tag, attrs = data if preserve or (tag in preserve_elems or attrs.get(space) == 'preserve'): preserve += 1 if not noescape and tag in noescape_elems: noescape = True elif kind is END: noescape = False if preserve: preserve -= 1 elif kind is START_CDATA: noescape = True elif kind is END_CDATA: noescape = False if kind: yield kind, data, pos class DocTypeInserter(object): """A filter that inserts the DOCTYPE declaration in the correct location, after the XML declaration. """ def __init__(self, doctype): """Initialize the filter. :param doctype: DOCTYPE as a string or DocType object. """ if isinstance(doctype, basestring): doctype = DocType.get(doctype) self.doctype_event = (DOCTYPE, doctype, (None, -1, -1)) def __call__(self, stream): doctype_inserted = False for kind, data, pos in stream: if not doctype_inserted: doctype_inserted = True if kind is XML_DECL: yield (kind, data, pos) yield self.doctype_event continue yield self.doctype_event yield (kind, data, pos) if not doctype_inserted: yield self.doctype_event