cmlenz@1: # -*- coding: utf-8 -*- cmlenz@1: # cmlenz@66: # Copyright (C) 2006 Edgewall Software cmlenz@1: # All rights reserved. cmlenz@1: # cmlenz@1: # This software is licensed as described in the file COPYING, which cmlenz@1: # you should have received as part of this distribution. The terms cmlenz@230: # are also available at http://genshi.edgewall.org/wiki/License. cmlenz@1: # cmlenz@1: # This software consists of voluntary contributions made by many cmlenz@1: # individuals. For the exact contribution history, see the revision cmlenz@230: # history and logs, available at http://genshi.edgewall.org/log/. cmlenz@1: cmlenz@1: """This module provides different kinds of serialization methods for XML event cmlenz@1: streams. cmlenz@1: """ cmlenz@1: cmlenz@123: from itertools import chain cmlenz@1: try: cmlenz@1: frozenset cmlenz@1: except NameError: cmlenz@1: from sets import ImmutableSet as frozenset cmlenz@123: import re cmlenz@1: cmlenz@230: from genshi.core import escape, Markup, Namespace, QName, StreamEventKind cmlenz@230: from genshi.core import DOCTYPE, START, END, START_NS, TEXT, START_CDATA, \ cmlenz@145: END_CDATA, PI, COMMENT, XML_NAMESPACE cmlenz@1: cmlenz@200: __all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer', cmlenz@200: 'TextSerializer'] cmlenz@1: cmlenz@1: cmlenz@85: class DocType(object): cmlenz@85: """Defines a number of commonly used DOCTYPE declarations as constants.""" cmlenz@85: cmlenz@85: HTML_STRICT = ('html', '-//W3C//DTD HTML 4.01//EN', cmlenz@85: 'http://www.w3.org/TR/html4/strict.dtd') cmlenz@85: HTML_TRANSITIONAL = ('html', '-//W3C//DTD HTML 4.01 Transitional//EN', cmlenz@85: 'http://www.w3.org/TR/html4/loose.dtd') cmlenz@85: HTML = HTML_STRICT cmlenz@85: cmlenz@85: XHTML_STRICT = ('html', '-//W3C//DTD XHTML 1.0 Strict//EN', cmlenz@85: 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd') cmlenz@85: XHTML_TRANSITIONAL = ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN', cmlenz@85: 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd') cmlenz@85: XHTML = XHTML_STRICT cmlenz@85: cmlenz@85: cmlenz@123: class XMLSerializer(object): cmlenz@1: """Produces XML text from an event stream. cmlenz@1: cmlenz@230: >>> from genshi.builder import tag cmlenz@20: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@123: >>> print ''.join(XMLSerializer()(elem.generate())) cmlenz@1:

cmlenz@1: """ cmlenz@123: cmlenz@123: _PRESERVE_SPACE = frozenset() cmlenz@123: cmlenz@123: def __init__(self, doctype=None, strip_whitespace=True): cmlenz@85: """Initialize the XML serializer. cmlenz@85: cmlenz@85: @param doctype: a `(name, pubid, sysid)` tuple that represents the cmlenz@85: DOCTYPE declaration that should be included at the top of the cmlenz@85: generated output cmlenz@123: @param strip_whitespace: whether extraneous whitespace should be cmlenz@123: stripped from the output cmlenz@85: """ cmlenz@85: self.preamble = [] cmlenz@85: if doctype: cmlenz@85: self.preamble.append((DOCTYPE, doctype, (None, -1, -1))) cmlenz@212: self.filters = [EmptyTagFilter()] cmlenz@123: if strip_whitespace: cmlenz@123: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) cmlenz@1: cmlenz@123: def __call__(self, stream): cmlenz@1: ns_attrib = [] cmlenz@141: ns_mapping = {XML_NAMESPACE.uri: 'xml'} cmlenz@143: have_doctype = False cmlenz@143: in_cdata = False cmlenz@1: cmlenz@123: stream = chain(self.preamble, stream) cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@1: for kind, data, pos in stream: cmlenz@1: cmlenz@212: if kind is START or kind is EMPTY: cmlenz@1: tag, attrib = data cmlenz@1: cmlenz@1: tagname = tag.localname cmlenz@123: namespace = tag.namespace cmlenz@123: if namespace: cmlenz@123: if namespace in ns_mapping: cmlenz@123: prefix = ns_mapping[namespace] cmlenz@1: if prefix: cmlenz@123: tagname = '%s:%s' % (prefix, tagname) cmlenz@123: else: cmlenz@123: ns_attrib.append((QName('xmlns'), namespace)) cmlenz@136: buf = ['<', tagname] cmlenz@1: cmlenz@397: if ns_attrib: cmlenz@397: attrib += tuple(ns_attrib) cmlenz@397: for attr, value in attrib: cmlenz@1: attrname = attr.localname cmlenz@397: attrns = attr.namespace cmlenz@397: if attrns: cmlenz@397: prefix = ns_mapping.get(attrns) cmlenz@1: if prefix: cmlenz@69: attrname = '%s:%s' % (prefix, attrname) cmlenz@136: buf += [' ', attrname, '="', escape(value), '"'] cmlenz@123: ns_attrib = [] cmlenz@1: cmlenz@397: buf.append(kind is EMPTY and '/>' or '>') cmlenz@1: cmlenz@397: yield Markup(u''.join(buf)) cmlenz@1: cmlenz@69: elif kind is END: cmlenz@1: tag = data cmlenz@1: tagname = tag.localname cmlenz@1: if tag.namespace: cmlenz@26: prefix = ns_mapping.get(tag.namespace) cmlenz@26: if prefix: cmlenz@69: tagname = '%s:%s' % (prefix, tag.localname) cmlenz@1: yield Markup('' % tagname) cmlenz@1: cmlenz@69: elif kind is TEXT: cmlenz@143: if in_cdata: cmlenz@143: yield data cmlenz@143: else: cmlenz@143: yield escape(data, quotes=False) cmlenz@1: cmlenz@89: elif kind is COMMENT: cmlenz@89: yield Markup('' % data) cmlenz@89: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@397: yield Markup(u''.join(buf), *filter(None, data)) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@109: elif kind is START_NS: cmlenz@109: prefix, uri = data cmlenz@109: if uri not in ns_mapping: cmlenz@109: ns_mapping[uri] = prefix cmlenz@109: if not prefix: cmlenz@109: ns_attrib.append((QName('xmlns'), uri)) cmlenz@109: else: cmlenz@109: ns_attrib.append((QName('xmlns:%s' % prefix), uri)) cmlenz@109: cmlenz@143: elif kind is START_CDATA: cmlenz@143: yield Markup('') cmlenz@143: in_cdata = False cmlenz@143: cmlenz@105: elif kind is PI: cmlenz@105: yield Markup('' % data) cmlenz@105: cmlenz@1: cmlenz@96: class XHTMLSerializer(XMLSerializer): cmlenz@96: """Produces XHTML text from an event stream. cmlenz@1: cmlenz@230: >>> from genshi.builder import tag cmlenz@20: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@123: >>> print ''.join(XHTMLSerializer()(elem.generate())) cmlenz@96:

cmlenz@1: """ cmlenz@1: cmlenz@18: NAMESPACE = Namespace('http://www.w3.org/1999/xhtml') cmlenz@1: cmlenz@1: _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', cmlenz@1: 'hr', 'img', 'input', 'isindex', 'link', 'meta', cmlenz@1: 'param']) cmlenz@1: _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', cmlenz@1: 'defer', 'disabled', 'ismap', 'multiple', cmlenz@1: 'nohref', 'noresize', 'noshade', 'nowrap']) cmlenz@346: _PRESERVE_SPACE = frozenset([ cmlenz@346: QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), cmlenz@346: QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') cmlenz@346: ]) cmlenz@1: cmlenz@123: def __call__(self, stream): cmlenz@136: namespace = self.NAMESPACE cmlenz@141: ns_attrib = [] cmlenz@141: ns_mapping = {XML_NAMESPACE.uri: 'xml'} cmlenz@136: boolean_attrs = self._BOOLEAN_ATTRS cmlenz@136: empty_elems = self._EMPTY_ELEMS cmlenz@85: have_doctype = False cmlenz@143: in_cdata = False cmlenz@1: cmlenz@123: stream = chain(self.preamble, stream) cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@1: for kind, data, pos in stream: cmlenz@1: cmlenz@212: if kind is START or kind is EMPTY: cmlenz@1: tag, attrib = data cmlenz@96: cmlenz@141: tagname = tag.localname cmlenz@177: tagns = tag.namespace cmlenz@177: if tagns: cmlenz@177: if tagns in ns_mapping: cmlenz@177: prefix = ns_mapping[tagns] cmlenz@141: if prefix: cmlenz@141: tagname = '%s:%s' % (prefix, tagname) cmlenz@141: else: cmlenz@177: ns_attrib.append((QName('xmlns'), tagns)) cmlenz@141: buf = ['<', tagname] cmlenz@136: cmlenz@397: if ns_attrib: cmlenz@397: attrib += tuple(ns_attrib) cmlenz@397: for attr, value in attrib: cmlenz@141: attrname = attr.localname cmlenz@397: attrns = attr.namespace cmlenz@397: if attrns: cmlenz@397: prefix = ns_mapping.get(attrns) cmlenz@141: if prefix: cmlenz@141: attrname = '%s:%s' % (prefix, attrname) cmlenz@141: if attrname in boolean_attrs: cmlenz@141: if value: cmlenz@141: buf += [' ', attrname, '="', attrname, '"'] cmlenz@141: else: cmlenz@141: buf += [' ', attrname, '="', escape(value), '"'] cmlenz@141: ns_attrib = [] cmlenz@141: cmlenz@212: if kind is EMPTY: cmlenz@212: if (tagns and tagns != namespace.uri) \ cmlenz@397: or tagname in empty_elems: cmlenz@397: buf.append(' />') cmlenz@96: else: cmlenz@397: buf.append('>' % tagname) cmlenz@141: else: cmlenz@397: buf.append('>') cmlenz@96: cmlenz@397: yield Markup(u''.join(buf)) cmlenz@96: cmlenz@96: elif kind is END: cmlenz@96: tag = data cmlenz@141: tagname = tag.localname cmlenz@141: if tag.namespace: cmlenz@141: prefix = ns_mapping.get(tag.namespace) cmlenz@141: if prefix: cmlenz@177: tagname = '%s:%s' % (prefix, tagname) cmlenz@141: yield Markup('' % tagname) cmlenz@96: cmlenz@96: elif kind is TEXT: cmlenz@143: if in_cdata: cmlenz@143: yield data cmlenz@143: else: cmlenz@143: yield escape(data, quotes=False) cmlenz@96: cmlenz@96: elif kind is COMMENT: cmlenz@96: yield Markup('' % data) cmlenz@96: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@397: yield Markup(u''.join(buf), *filter(None, data)) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@141: elif kind is START_NS: cmlenz@141: prefix, uri = data cmlenz@141: if uri not in ns_mapping: cmlenz@141: ns_mapping[uri] = prefix cmlenz@141: if not prefix: cmlenz@141: ns_attrib.append((QName('xmlns'), uri)) cmlenz@141: else: cmlenz@141: ns_attrib.append((QName('xmlns:%s' % prefix), uri)) cmlenz@109: cmlenz@143: elif kind is START_CDATA: cmlenz@143: yield Markup('') cmlenz@143: in_cdata = False cmlenz@143: cmlenz@105: elif kind is PI: cmlenz@105: yield Markup('' % data) cmlenz@105: cmlenz@96: cmlenz@96: class HTMLSerializer(XHTMLSerializer): cmlenz@96: """Produces HTML text from an event stream. cmlenz@96: cmlenz@230: >>> from genshi.builder import tag cmlenz@96: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@123: >>> print ''.join(HTMLSerializer()(elem.generate())) cmlenz@96:

cmlenz@96: """ cmlenz@96: cmlenz@280: _NOESCAPE_ELEMS = frozenset([QName('script'), cmlenz@280: QName('http://www.w3.org/1999/xhtml}script'), cmlenz@280: QName('style'), cmlenz@280: QName('http://www.w3.org/1999/xhtml}style')]) cmlenz@141: cmlenz@141: def __init__(self, doctype=None, strip_whitespace=True): cmlenz@141: """Initialize the HTML serializer. cmlenz@141: cmlenz@141: @param doctype: a `(name, pubid, sysid)` tuple that represents the cmlenz@141: DOCTYPE declaration that should be included at the top of the cmlenz@141: generated output cmlenz@141: @param strip_whitespace: whether extraneous whitespace should be cmlenz@141: stripped from the output cmlenz@141: """ cmlenz@141: super(HTMLSerializer, self).__init__(doctype, False) cmlenz@141: if strip_whitespace: cmlenz@141: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, cmlenz@305: self._NOESCAPE_ELEMS)) cmlenz@141: cmlenz@123: def __call__(self, stream): cmlenz@136: namespace = self.NAMESPACE cmlenz@136: ns_mapping = {} cmlenz@136: boolean_attrs = self._BOOLEAN_ATTRS cmlenz@136: empty_elems = self._EMPTY_ELEMS cmlenz@141: noescape_elems = self._NOESCAPE_ELEMS cmlenz@96: have_doctype = False cmlenz@141: noescape = False cmlenz@96: cmlenz@123: stream = chain(self.preamble, stream) cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@96: for kind, data, pos in stream: cmlenz@96: cmlenz@212: if kind is START or kind is EMPTY: cmlenz@96: tag, attrib = data cmlenz@136: if not tag.namespace or tag in namespace: cmlenz@136: tagname = tag.localname cmlenz@136: buf = ['<', tagname] cmlenz@96: cmlenz@136: for attr, value in attrib: cmlenz@136: attrname = attr.localname cmlenz@141: if not attr.namespace or attr in namespace: cmlenz@136: if attrname in boolean_attrs: cmlenz@136: if value: cmlenz@136: buf += [' ', attrname] cmlenz@136: else: cmlenz@136: buf += [' ', attrname, '="', escape(value), '"'] cmlenz@1: cmlenz@397: buf.append('>') cmlenz@1: cmlenz@212: if kind is EMPTY: cmlenz@212: if tagname not in empty_elems: cmlenz@397: buf.append('' % tagname) cmlenz@212: cmlenz@397: yield Markup(u''.join(buf)) cmlenz@1: cmlenz@141: if tagname in noescape_elems: cmlenz@141: noescape = True cmlenz@141: cmlenz@69: elif kind is END: cmlenz@1: tag = data cmlenz@136: if not tag.namespace or tag in namespace: cmlenz@136: yield Markup('' % tag.localname) cmlenz@1: cmlenz@141: noescape = False cmlenz@141: cmlenz@69: elif kind is TEXT: cmlenz@141: if noescape: cmlenz@141: yield data cmlenz@141: else: cmlenz@141: yield escape(data, quotes=False) cmlenz@1: cmlenz@89: elif kind is COMMENT: cmlenz@89: yield Markup('' % data) cmlenz@89: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@397: yield Markup(u''.join(buf), *filter(None, data)) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@136: elif kind is START_NS and data[1] not in ns_mapping: cmlenz@136: ns_mapping[data[1]] = data[0] cmlenz@109: cmlenz@105: elif kind is PI: cmlenz@105: yield Markup('' % data) cmlenz@105: cmlenz@1: cmlenz@200: class TextSerializer(object): cmlenz@200: """Produces plain text from an event stream. cmlenz@200: cmlenz@200: Only text events are included in the output. Unlike the other serializer, cmlenz@200: special XML characters are not escaped: cmlenz@200: cmlenz@230: >>> from genshi.builder import tag cmlenz@200: >>> elem = tag.div(tag.a('', href='foo'), tag.br) cmlenz@200: >>> print elem cmlenz@200:

<Hello!>

cmlenz@200: >>> print ''.join(TextSerializer()(elem.generate())) cmlenz@200: cmlenz@200: cmlenz@200: If text events contain literal markup (instances of the `Markup` class), cmlenz@200: tags or entities are stripped from the output: cmlenz@200: cmlenz@200: >>> elem = tag.div(Markup('Hello!
')) cmlenz@200: >>> print elem cmlenz@200:

Hello!

cmlenz@200: >>> print ''.join(TextSerializer()(elem.generate())) cmlenz@200: Hello! cmlenz@200: """ cmlenz@200: cmlenz@200: def __call__(self, stream): cmlenz@200: for kind, data, pos in stream: cmlenz@200: if kind is TEXT: cmlenz@200: if type(data) is Markup: cmlenz@200: data = data.striptags().stripentities() cmlenz@201: yield unicode(data) cmlenz@200: cmlenz@200: cmlenz@212: class EmptyTagFilter(object): cmlenz@212: """Combines `START` and `STOP` events into `EMPTY` events for elements that cmlenz@212: have no contents. cmlenz@212: """ cmlenz@212: cmlenz@212: EMPTY = StreamEventKind('EMPTY') cmlenz@212: cmlenz@212: def __call__(self, stream): cmlenz@212: prev = (None, None, None) cmlenz@212: for kind, data, pos in stream: cmlenz@212: if prev[0] is START: cmlenz@212: if kind is END: cmlenz@212: prev = EMPTY, prev[1], prev[2] cmlenz@212: yield prev cmlenz@212: continue cmlenz@212: else: cmlenz@212: yield prev cmlenz@212: if kind is not START: cmlenz@212: yield kind, data, pos cmlenz@212: prev = kind, data, pos cmlenz@212: cmlenz@212: cmlenz@212: EMPTY = EmptyTagFilter.EMPTY cmlenz@212: cmlenz@212: cmlenz@123: class WhitespaceFilter(object): cmlenz@123: """A filter that removes extraneous ignorable white space from the cmlenz@123: stream.""" cmlenz@123: cmlenz@305: def __init__(self, preserve=None, noescape=None): cmlenz@123: """Initialize the filter. cmlenz@123: cmlenz@141: @param preserve: a set or sequence of tag names for which white-space cmlenz@397: should be preserved cmlenz@141: @param noescape: a set or sequence of tag names for which text content cmlenz@141: should not be escaped cmlenz@141: cmlenz@346: The `noescape` set is expected to refer to elements that cannot contain cmlenz@346: further child elements (such as