cmlenz@1: # -*- coding: utf-8 -*- cmlenz@1: # cmlenz@902: # Copyright (C) 2006-2009 Edgewall Software cmlenz@1: # All rights reserved. cmlenz@1: # cmlenz@1: # This software is licensed as described in the file COPYING, which cmlenz@1: # you should have received as part of this distribution. The terms cmlenz@230: # are also available at http://genshi.edgewall.org/wiki/License. cmlenz@1: # cmlenz@1: # This software consists of voluntary contributions made by many cmlenz@1: # individuals. For the exact contribution history, see the revision cmlenz@230: # history and logs, available at http://genshi.edgewall.org/log/. cmlenz@1: cmlenz@1: """This module provides different kinds of serialization methods for XML event cmlenz@1: streams. cmlenz@1: """ cmlenz@1: cmlenz@123: from itertools import chain cmlenz@123: import re cmlenz@1: cmlenz@500: from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind cmlenz@500: from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \ cmlenz@500: START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE cmlenz@1: cmlenz@500: __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer', cmlenz@500: 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] cmlenz@500: __docformat__ = 'restructuredtext en' cmlenz@500: cmlenz@902: cmlenz@820: def encode(iterator, method='xml', encoding='utf-8', out=None): cmlenz@500: """Encode serializer output into a string. cmlenz@500: cmlenz@500: :param iterator: the iterator returned from serializing a stream (basically cmlenz@500: any iterator that yields unicode objects) cmlenz@500: :param method: the serialization method; determines how characters not cmlenz@500: representable in the specified encoding are treated cmlenz@500: :param encoding: how the output string should be encoded; if set to `None`, cmlenz@500: this method returns a `unicode` object cmlenz@820: :param out: a file-like object that the output should be written to cmlenz@820: instead of being returned as one big string; note that if cmlenz@820: this is a file or socket (or similar), the `encoding` must cmlenz@820: not be `None` (that is, the output must be encoded) cmlenz@820: :return: a `str` or `unicode` object (depending on the `encoding` cmlenz@820: parameter), or `None` if the `out` parameter is provided cmlenz@820: cmlenz@500: :since: version 0.4.1 cmlenz@820: :note: Changed in 0.5: added the `out` parameter cmlenz@500: """ cmlenz@500: if encoding is not None: cmlenz@500: errors = 'replace' cmlenz@500: if method != 'text' and not isinstance(method, TextSerializer): cmlenz@500: errors = 'xmlcharrefreplace' cmlenz@820: _encode = lambda string: string.encode(encoding, errors) cmlenz@820: else: cmlenz@820: _encode = lambda string: string cmlenz@820: if out is None: cmlenz@902: return _encode(''.join(list(iterator))) cmlenz@820: for chunk in iterator: cmlenz@820: out.write(_encode(chunk)) cmlenz@500: cmlenz@902: cmlenz@500: def get_serializer(method='xml', **kwargs): cmlenz@500: """Return a serializer object for the given method. cmlenz@500: cmlenz@500: :param method: the serialization method; can be either "xml", "xhtml", cmlenz@500: "html", "text", or a custom serializer class cmlenz@500: cmlenz@500: Any additional keyword arguments are passed to the serializer, and thus cmlenz@500: depend on the `method` parameter value. cmlenz@500: cmlenz@500: :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer` cmlenz@500: :since: version 0.4.1 cmlenz@500: """ cmlenz@500: if isinstance(method, basestring): cmlenz@500: method = {'xml': XMLSerializer, cmlenz@500: 'xhtml': XHTMLSerializer, cmlenz@500: 'html': HTMLSerializer, cmlenz@500: 'text': TextSerializer}[method.lower()] cmlenz@500: return method(**kwargs) cmlenz@1: cmlenz@1: cmlenz@85: class DocType(object): cmlenz@85: """Defines a number of commonly used DOCTYPE declarations as constants.""" cmlenz@85: cmlenz@500: HTML_STRICT = ( cmlenz@500: 'html', '-//W3C//DTD HTML 4.01//EN', cmlenz@500: 'http://www.w3.org/TR/html4/strict.dtd' cmlenz@500: ) cmlenz@500: HTML_TRANSITIONAL = ( cmlenz@500: 'html', '-//W3C//DTD HTML 4.01 Transitional//EN', cmlenz@500: 'http://www.w3.org/TR/html4/loose.dtd' cmlenz@500: ) cmlenz@500: HTML_FRAMESET = ( cmlenz@500: 'html', '-//W3C//DTD HTML 4.01 Frameset//EN', cmlenz@500: 'http://www.w3.org/TR/html4/frameset.dtd' cmlenz@500: ) cmlenz@85: HTML = HTML_STRICT cmlenz@85: cmlenz@500: HTML5 = ('html', None, None) cmlenz@500: cmlenz@500: XHTML_STRICT = ( cmlenz@500: 'html', '-//W3C//DTD XHTML 1.0 Strict//EN', cmlenz@500: 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' cmlenz@500: ) cmlenz@500: XHTML_TRANSITIONAL = ( cmlenz@500: 'html', '-//W3C//DTD XHTML 1.0 Transitional//EN', cmlenz@500: 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' cmlenz@500: ) cmlenz@500: XHTML_FRAMESET = ( cmlenz@500: 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN', cmlenz@500: 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd' cmlenz@500: ) cmlenz@85: XHTML = XHTML_STRICT cmlenz@85: cmlenz@820: XHTML11 = ( cmlenz@820: 'html', '-//W3C//DTD XHTML 1.1//EN', cmlenz@820: 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' cmlenz@820: ) cmlenz@820: cmlenz@820: SVG_FULL = ( cmlenz@820: 'svg', '-//W3C//DTD SVG 1.1//EN', cmlenz@820: 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd' cmlenz@820: ) cmlenz@820: SVG_BASIC = ( cmlenz@820: 'svg', '-//W3C//DTD SVG Basic 1.1//EN', cmlenz@820: 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd' cmlenz@820: ) cmlenz@820: SVG_TINY = ( cmlenz@820: 'svg', '-//W3C//DTD SVG Tiny 1.1//EN', cmlenz@820: 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd' cmlenz@820: ) cmlenz@820: SVG = SVG_FULL cmlenz@820: cmlenz@830: @classmethod cmlenz@500: def get(cls, name): cmlenz@500: """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE`` cmlenz@500: declaration for the specified name. cmlenz@500: cmlenz@500: The following names are recognized in this version: cmlenz@500: * "html" or "html-strict" for the HTML 4.01 strict DTD cmlenz@500: * "html-transitional" for the HTML 4.01 transitional DTD cmlenz@820: * "html-frameset" for the HTML 4.01 frameset DTD cmlenz@500: * "html5" for the ``DOCTYPE`` proposed for HTML5 cmlenz@500: * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD cmlenz@500: * "xhtml-transitional" for the XHTML 1.0 transitional DTD cmlenz@500: * "xhtml-frameset" for the XHTML 1.0 frameset DTD cmlenz@820: * "xhtml11" for the XHTML 1.1 DTD cmlenz@820: * "svg" or "svg-full" for the SVG 1.1 DTD cmlenz@820: * "svg-basic" for the SVG Basic 1.1 DTD cmlenz@820: * "svg-tiny" for the SVG Tiny 1.1 DTD cmlenz@500: cmlenz@500: :param name: the name of the ``DOCTYPE`` cmlenz@500: :return: the ``(name, pubid, sysid)`` tuple for the requested cmlenz@500: ``DOCTYPE``, or ``None`` if the name is not recognized cmlenz@500: :since: version 0.4.1 cmlenz@500: """ cmlenz@500: return { cmlenz@500: 'html': cls.HTML, 'html-strict': cls.HTML_STRICT, cmlenz@500: 'html-transitional': DocType.HTML_TRANSITIONAL, cmlenz@500: 'html-frameset': DocType.HTML_FRAMESET, cmlenz@500: 'html5': cls.HTML5, cmlenz@500: 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT, cmlenz@500: 'xhtml-transitional': cls.XHTML_TRANSITIONAL, cmlenz@500: 'xhtml-frameset': cls.XHTML_FRAMESET, cmlenz@820: 'xhtml11': cls.XHTML11, cmlenz@820: 'svg': cls.SVG, 'svg-full': cls.SVG_FULL, cmlenz@820: 'svg-basic': cls.SVG_BASIC, cmlenz@820: 'svg-tiny': cls.SVG_TINY cmlenz@500: }.get(name.lower()) cmlenz@500: cmlenz@85: cmlenz@123: class XMLSerializer(object): cmlenz@1: """Produces XML text from an event stream. cmlenz@1: cmlenz@230: >>> from genshi.builder import tag cmlenz@20: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@902: >>> print(''.join(XMLSerializer()(elem.generate()))) cmlenz@1:


cmlenz@1: """ cmlenz@123: cmlenz@123: _PRESERVE_SPACE = frozenset() cmlenz@123: cmlenz@500: def __init__(self, doctype=None, strip_whitespace=True, cmlenz@830: namespace_prefixes=None, cache=True): cmlenz@85: """Initialize the XML serializer. cmlenz@85: cmlenz@500: :param doctype: a ``(name, pubid, sysid)`` tuple that represents the cmlenz@500: DOCTYPE declaration that should be included at the top cmlenz@500: of the generated output, or the name of a DOCTYPE as cmlenz@500: defined in `DocType.get` cmlenz@500: :param strip_whitespace: whether extraneous whitespace should be cmlenz@500: stripped from the output cmlenz@830: :param cache: whether to cache the text output per event, which cmlenz@830: improves performance for repetitive markup cmlenz@500: :note: Changed in 0.4.2: The `doctype` parameter can now be a string. cmlenz@830: :note: Changed in 0.6: The `cache` parameter was added cmlenz@85: """ cmlenz@212: self.filters = [EmptyTagFilter()] cmlenz@123: if strip_whitespace: cmlenz@123: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) cmlenz@830: self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes, cmlenz@830: cache=cache)) cmlenz@820: if doctype: cmlenz@820: self.filters.append(DocTypeInserter(doctype)) cmlenz@830: self.cache = cache cmlenz@1: cmlenz@123: def __call__(self, stream): cmlenz@500: have_decl = have_doctype = False cmlenz@500: in_cdata = False cmlenz@500: cmlenz@830: cache = {} cmlenz@830: cache_get = cache.get cmlenz@830: if self.cache: cmlenz@830: def _emit(kind, input, output): cmlenz@830: cache[kind, input] = output cmlenz@830: return output cmlenz@830: else: cmlenz@830: def _emit(kind, input, output): cmlenz@830: return output cmlenz@830: cmlenz@500: for filter_ in self.filters: cmlenz@500: stream = filter_(stream) cmlenz@500: for kind, data, pos in stream: cmlenz@830: cached = cache_get((kind, data)) cmlenz@830: if cached is not None: cmlenz@830: yield cached cmlenz@500: cmlenz@830: elif kind is START or kind is EMPTY: cmlenz@500: tag, attrib = data cmlenz@500: buf = ['<', tag] cmlenz@500: for attr, value in attrib: cmlenz@500: buf += [' ', attr, '="', escape(value), '"'] cmlenz@500: buf.append(kind is EMPTY and '/>' or '>') cmlenz@902: yield _emit(kind, data, Markup(''.join(buf))) cmlenz@500: cmlenz@500: elif kind is END: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@500: cmlenz@500: elif kind is TEXT: cmlenz@500: if in_cdata: cmlenz@830: yield _emit(kind, data, data) cmlenz@500: else: cmlenz@830: yield _emit(kind, data, escape(data, quotes=False)) cmlenz@500: cmlenz@500: elif kind is COMMENT: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@500: cmlenz@500: elif kind is XML_DECL and not have_decl: cmlenz@500: version, encoding, standalone = data cmlenz@500: buf = ['\n') cmlenz@902: yield Markup(''.join(buf)) cmlenz@500: have_decl = True cmlenz@500: cmlenz@500: elif kind is DOCTYPE and not have_doctype: cmlenz@500: name, pubid, sysid = data cmlenz@500: buf = ['\n') cmlenz@902: yield Markup(''.join(buf)) % tuple([p for p in data if p]) cmlenz@500: have_doctype = True cmlenz@500: cmlenz@500: elif kind is START_CDATA: cmlenz@500: yield Markup('') cmlenz@500: in_cdata = False cmlenz@500: cmlenz@500: elif kind is PI: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@500: cmlenz@500: cmlenz@500: class XHTMLSerializer(XMLSerializer): cmlenz@500: """Produces XHTML text from an event stream. cmlenz@500: cmlenz@500: >>> from genshi.builder import tag cmlenz@500: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@902: >>> print(''.join(XHTMLSerializer()(elem.generate()))) cmlenz@500:


cmlenz@500: """ cmlenz@500: cmlenz@500: _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', cmlenz@500: 'hr', 'img', 'input', 'isindex', 'link', 'meta', cmlenz@500: 'param']) cmlenz@500: _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', cmlenz@500: 'defer', 'disabled', 'ismap', 'multiple', cmlenz@500: 'nohref', 'noresize', 'noshade', 'nowrap']) cmlenz@500: _PRESERVE_SPACE = frozenset([ cmlenz@500: QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), cmlenz@500: QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') cmlenz@500: ]) cmlenz@500: cmlenz@500: def __init__(self, doctype=None, strip_whitespace=True, cmlenz@830: namespace_prefixes=None, drop_xml_decl=True, cache=True): cmlenz@500: super(XHTMLSerializer, self).__init__(doctype, False) cmlenz@500: self.filters = [EmptyTagFilter()] cmlenz@500: if strip_whitespace: cmlenz@500: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) cmlenz@500: namespace_prefixes = namespace_prefixes or {} cmlenz@500: namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' cmlenz@830: self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes, cmlenz@830: cache=cache)) cmlenz@820: if doctype: cmlenz@820: self.filters.append(DocTypeInserter(doctype)) cmlenz@820: self.drop_xml_decl = drop_xml_decl cmlenz@830: self.cache = cache cmlenz@500: cmlenz@500: def __call__(self, stream): cmlenz@500: boolean_attrs = self._BOOLEAN_ATTRS cmlenz@500: empty_elems = self._EMPTY_ELEMS cmlenz@820: drop_xml_decl = self.drop_xml_decl cmlenz@820: have_decl = have_doctype = False cmlenz@143: in_cdata = False cmlenz@1: cmlenz@830: cache = {} cmlenz@830: cache_get = cache.get cmlenz@830: if self.cache: cmlenz@830: def _emit(kind, input, output): cmlenz@830: cache[kind, input] = output cmlenz@830: return output cmlenz@830: else: cmlenz@830: def _emit(kind, input, output): cmlenz@830: return output cmlenz@830: cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@1: for kind, data, pos in stream: cmlenz@830: cached = cache_get((kind, data)) cmlenz@830: if cached is not None: cmlenz@830: yield cached cmlenz@1: cmlenz@830: elif kind is START or kind is EMPTY: cmlenz@1: tag, attrib = data cmlenz@500: buf = ['<', tag] cmlenz@500: for attr, value in attrib: cmlenz@500: if attr in boolean_attrs: cmlenz@500: value = attr cmlenz@902: elif attr == 'xml:lang' and 'lang' not in attrib: cmlenz@820: buf += [' lang="', escape(value), '"'] cmlenz@902: elif attr == 'xml:space': cmlenz@820: continue cmlenz@500: buf += [' ', attr, '="', escape(value), '"'] cmlenz@500: if kind is EMPTY: cmlenz@500: if tag in empty_elems: cmlenz@500: buf.append(' />') cmlenz@123: else: cmlenz@500: buf.append('>' % tag) cmlenz@500: else: cmlenz@500: buf.append('>') cmlenz@902: yield _emit(kind, data, Markup(''.join(buf))) cmlenz@1: cmlenz@69: elif kind is END: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@1: cmlenz@69: elif kind is TEXT: cmlenz@143: if in_cdata: cmlenz@830: yield _emit(kind, data, data) cmlenz@143: else: cmlenz@830: yield _emit(kind, data, escape(data, quotes=False)) cmlenz@1: cmlenz@89: elif kind is COMMENT: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@89: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@902: yield Markup(''.join(buf)) % tuple([p for p in data if p]) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@820: elif kind is XML_DECL and not have_decl and not drop_xml_decl: cmlenz@820: version, encoding, standalone = data cmlenz@820: buf = ['\n') cmlenz@902: yield Markup(''.join(buf)) cmlenz@820: have_decl = True cmlenz@820: cmlenz@143: elif kind is START_CDATA: cmlenz@143: yield Markup('') cmlenz@143: in_cdata = False cmlenz@143: cmlenz@105: elif kind is PI: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@105: cmlenz@96: cmlenz@96: class HTMLSerializer(XHTMLSerializer): cmlenz@96: """Produces HTML text from an event stream. cmlenz@96: cmlenz@230: >>> from genshi.builder import tag cmlenz@96: >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) cmlenz@902: >>> print(''.join(HTMLSerializer()(elem.generate()))) cmlenz@96:


cmlenz@96: """ cmlenz@96: cmlenz@500: _NOESCAPE_ELEMS = frozenset([ cmlenz@500: QName('script'), QName('http://www.w3.org/1999/xhtml}script'), cmlenz@500: QName('style'), QName('http://www.w3.org/1999/xhtml}style') cmlenz@500: ]) cmlenz@141: cmlenz@830: def __init__(self, doctype=None, strip_whitespace=True, cache=True): cmlenz@141: """Initialize the HTML serializer. cmlenz@141: cmlenz@500: :param doctype: a ``(name, pubid, sysid)`` tuple that represents the cmlenz@500: DOCTYPE declaration that should be included at the top cmlenz@500: of the generated output cmlenz@500: :param strip_whitespace: whether extraneous whitespace should be cmlenz@500: stripped from the output cmlenz@830: :param cache: whether to cache the text output per event, which cmlenz@830: improves performance for repetitive markup cmlenz@830: :note: Changed in 0.6: The `cache` parameter was added cmlenz@141: """ cmlenz@141: super(HTMLSerializer, self).__init__(doctype, False) cmlenz@500: self.filters = [EmptyTagFilter()] cmlenz@141: if strip_whitespace: cmlenz@141: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, cmlenz@305: self._NOESCAPE_ELEMS)) cmlenz@820: self.filters.append(NamespaceFlattener(prefixes={ cmlenz@820: 'http://www.w3.org/1999/xhtml': '' cmlenz@830: }, cache=cache)) cmlenz@820: if doctype: cmlenz@820: self.filters.append(DocTypeInserter(doctype)) cmlenz@830: self.cache = True cmlenz@141: cmlenz@123: def __call__(self, stream): cmlenz@136: boolean_attrs = self._BOOLEAN_ATTRS cmlenz@136: empty_elems = self._EMPTY_ELEMS cmlenz@141: noescape_elems = self._NOESCAPE_ELEMS cmlenz@96: have_doctype = False cmlenz@141: noescape = False cmlenz@96: cmlenz@830: cache = {} cmlenz@830: cache_get = cache.get cmlenz@830: if self.cache: cmlenz@830: def _emit(kind, input, output): cmlenz@830: cache[kind, input] = output cmlenz@830: return output cmlenz@830: else: cmlenz@830: def _emit(kind, input, output): cmlenz@830: return output cmlenz@830: cmlenz@123: for filter_ in self.filters: cmlenz@123: stream = filter_(stream) cmlenz@830: for kind, data, _ in stream: cmlenz@830: output = cache_get((kind, data)) cmlenz@830: if output is not None: cmlenz@830: yield output cmlenz@902: if (kind is START or kind is EMPTY) \ cmlenz@902: and data[0] in noescape_elems: cmlenz@830: noescape = True cmlenz@830: elif kind is END: cmlenz@830: noescape = False cmlenz@96: cmlenz@830: elif kind is START or kind is EMPTY: cmlenz@96: tag, attrib = data cmlenz@500: buf = ['<', tag] cmlenz@500: for attr, value in attrib: cmlenz@500: if attr in boolean_attrs: cmlenz@500: if value: cmlenz@500: buf += [' ', attr] cmlenz@820: elif ':' in attr: cmlenz@902: if attr == 'xml:lang' and 'lang' not in attrib: cmlenz@820: buf += [' lang="', escape(value), '"'] cmlenz@820: elif attr != 'xmlns': cmlenz@500: buf += [' ', attr, '="', escape(value), '"'] cmlenz@500: buf.append('>') cmlenz@500: if kind is EMPTY: cmlenz@500: if tag not in empty_elems: cmlenz@500: buf.append('' % tag) cmlenz@902: yield _emit(kind, data, Markup(''.join(buf))) cmlenz@500: if tag in noescape_elems: cmlenz@500: noescape = True cmlenz@141: cmlenz@69: elif kind is END: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@141: noescape = False cmlenz@141: cmlenz@69: elif kind is TEXT: cmlenz@141: if noescape: cmlenz@830: yield _emit(kind, data, data) cmlenz@141: else: cmlenz@830: yield _emit(kind, data, escape(data, quotes=False)) cmlenz@1: cmlenz@89: elif kind is COMMENT: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@89: cmlenz@136: elif kind is DOCTYPE and not have_doctype: cmlenz@136: name, pubid, sysid = data cmlenz@136: buf = ['\n') cmlenz@902: yield Markup(''.join(buf)) % tuple([p for p in data if p]) cmlenz@136: have_doctype = True cmlenz@109: cmlenz@105: elif kind is PI: cmlenz@830: yield _emit(kind, data, Markup('' % data)) cmlenz@105: cmlenz@1: cmlenz@200: class TextSerializer(object): cmlenz@200: """Produces plain text from an event stream. cmlenz@200: cmlenz@200: Only text events are included in the output. Unlike the other serializer, cmlenz@200: special XML characters are not escaped: cmlenz@200: cmlenz@230: >>> from genshi.builder import tag cmlenz@200: >>> elem = tag.div(tag.a('', href='foo'), tag.br) cmlenz@902: >>> print(elem) cmlenz@200:
<Hello!>
cmlenz@902: >>> print(''.join(TextSerializer()(elem.generate()))) cmlenz@200: cmlenz@200: cmlenz@200: If text events contain literal markup (instances of the `Markup` class), cmlenz@820: that markup is by default passed through unchanged: cmlenz@200: cmlenz@820: >>> elem = tag.div(Markup('Hello & Bye!
')) cmlenz@902: >>> print(elem.generate().render(TextSerializer, encoding=None)) cmlenz@820: Hello & Bye!
cmlenz@820: cmlenz@820: You can use the ``strip_markup`` to change this behavior, so that tags and cmlenz@820: entities are stripped from the output (or in the case of entities, cmlenz@820: replaced with the equivalent character): cmlenz@820: cmlenz@902: >>> print(elem.generate().render(TextSerializer, strip_markup=True, cmlenz@902: ... encoding=None)) cmlenz@820: Hello & Bye! cmlenz@200: """ cmlenz@200: cmlenz@820: def __init__(self, strip_markup=False): cmlenz@820: """Create the serializer. cmlenz@820: cmlenz@820: :param strip_markup: whether markup (tags and encoded characters) found cmlenz@820: in the text should be removed cmlenz@820: """ cmlenz@820: self.strip_markup = strip_markup cmlenz@820: cmlenz@200: def __call__(self, stream): cmlenz@820: strip_markup = self.strip_markup cmlenz@500: for event in stream: cmlenz@500: if event[0] is TEXT: cmlenz@500: data = event[1] cmlenz@820: if strip_markup and type(data) is Markup: cmlenz@200: data = data.striptags().stripentities() cmlenz@201: yield unicode(data) cmlenz@200: cmlenz@200: cmlenz@212: class EmptyTagFilter(object): cmlenz@212: """Combines `START` and `STOP` events into `EMPTY` events for elements that cmlenz@212: have no contents. cmlenz@212: """ cmlenz@212: cmlenz@212: EMPTY = StreamEventKind('EMPTY') cmlenz@212: cmlenz@212: def __call__(self, stream): cmlenz@212: prev = (None, None, None) cmlenz@500: for ev in stream: cmlenz@212: if prev[0] is START: cmlenz@500: if ev[0] is END: cmlenz@212: prev = EMPTY, prev[1], prev[2] cmlenz@212: yield prev cmlenz@212: continue cmlenz@212: else: cmlenz@212: yield prev cmlenz@500: if ev[0] is not START: cmlenz@500: yield ev cmlenz@500: prev = ev cmlenz@212: cmlenz@212: cmlenz@212: EMPTY = EmptyTagFilter.EMPTY cmlenz@212: cmlenz@212: cmlenz@500: class NamespaceFlattener(object): cmlenz@500: r"""Output stream filter that removes namespace information from the stream, cmlenz@500: instead adding namespace attributes and prefixes as needed. cmlenz@500: cmlenz@500: :param prefixes: optional mapping of namespace URIs to prefixes cmlenz@500: cmlenz@500: >>> from genshi.input import XML cmlenz@500: >>> xml = XML(''' cmlenz@500: ... cmlenz@500: ... ''') cmlenz@500: >>> for kind, data, pos in NamespaceFlattener()(xml): cmlenz@902: ... print('%s %r' % (kind, data)) cmlenz@902: START (u'doc', Attrs([('xmlns', u'NS1'), (u'xmlns:two', u'NS2')])) cmlenz@500: TEXT u'\n ' cmlenz@500: START (u'two:item', Attrs()) cmlenz@500: END u'two:item' cmlenz@500: TEXT u'\n' cmlenz@500: END u'doc' cmlenz@500: """ cmlenz@500: cmlenz@830: def __init__(self, prefixes=None, cache=True): cmlenz@500: self.prefixes = {XML_NAMESPACE.uri: 'xml'} cmlenz@500: if prefixes is not None: cmlenz@500: self.prefixes.update(prefixes) cmlenz@830: self.cache = cache cmlenz@500: cmlenz@500: def __call__(self, stream): cmlenz@830: cache = {} cmlenz@830: cache_get = cache.get cmlenz@830: if self.cache: cmlenz@830: def _emit(kind, input, output, pos): cmlenz@830: cache[kind, input] = output cmlenz@830: return kind, output, pos cmlenz@830: else: cmlenz@830: def _emit(kind, input, output, pos): cmlenz@830: return output cmlenz@830: cmlenz@500: prefixes = dict([(v, [k]) for k, v in self.prefixes.items()]) cmlenz@500: namespaces = {XML_NAMESPACE.uri: ['xml']} cmlenz@500: def _push_ns(prefix, uri): cmlenz@500: namespaces.setdefault(uri, []).append(prefix) cmlenz@500: prefixes.setdefault(prefix, []).append(uri) cmlenz@830: cache.clear() cmlenz@830: def _pop_ns(prefix): cmlenz@830: uris = prefixes.get(prefix) cmlenz@830: uri = uris.pop() cmlenz@830: if not uris: cmlenz@830: del prefixes[prefix] cmlenz@830: if uri not in uris or uri != uris[-1]: cmlenz@830: uri_prefixes = namespaces[uri] cmlenz@830: uri_prefixes.pop() cmlenz@830: if not uri_prefixes: cmlenz@830: del namespaces[uri] cmlenz@830: cache.clear() cmlenz@830: return uri cmlenz@500: cmlenz@500: ns_attrs = [] cmlenz@500: _push_ns_attr = ns_attrs.append cmlenz@500: def _make_ns_attr(prefix, uri): cmlenz@902: return 'xmlns%s' % (prefix and ':%s' % prefix or ''), uri cmlenz@500: cmlenz@500: def _gen_prefix(): cmlenz@500: val = 0 cmlenz@500: while 1: cmlenz@500: val += 1 cmlenz@500: yield 'ns%d' % val cmlenz@500: _gen_prefix = _gen_prefix().next cmlenz@500: cmlenz@500: for kind, data, pos in stream: cmlenz@830: output = cache_get((kind, data)) cmlenz@830: if output is not None: cmlenz@830: yield kind, output, pos cmlenz@500: cmlenz@830: elif kind is START or kind is EMPTY: cmlenz@500: tag, attrs = data cmlenz@500: cmlenz@500: tagname = tag.localname cmlenz@500: tagns = tag.namespace cmlenz@500: if tagns: cmlenz@500: if tagns in namespaces: cmlenz@500: prefix = namespaces[tagns][-1] cmlenz@500: if prefix: cmlenz@902: tagname = '%s:%s' % (prefix, tagname) cmlenz@500: else: cmlenz@902: _push_ns_attr(('xmlns', tagns)) cmlenz@500: _push_ns('', tagns) cmlenz@500: cmlenz@500: new_attrs = [] cmlenz@500: for attr, value in attrs: cmlenz@500: attrname = attr.localname cmlenz@500: attrns = attr.namespace cmlenz@500: if attrns: cmlenz@500: if attrns not in namespaces: cmlenz@500: prefix = _gen_prefix() cmlenz@500: _push_ns(prefix, attrns) cmlenz@500: _push_ns_attr(('xmlns:%s' % prefix, attrns)) cmlenz@500: else: cmlenz@500: prefix = namespaces[attrns][-1] cmlenz@500: if prefix: cmlenz@902: attrname = '%s:%s' % (prefix, attrname) cmlenz@500: new_attrs.append((attrname, value)) cmlenz@500: cmlenz@830: yield _emit(kind, data, (tagname, Attrs(ns_attrs + new_attrs)), pos) cmlenz@500: del ns_attrs[:] cmlenz@500: cmlenz@500: elif kind is END: cmlenz@500: tagname = data.localname cmlenz@500: tagns = data.namespace cmlenz@500: if tagns: cmlenz@500: prefix = namespaces[tagns][-1] cmlenz@500: if prefix: cmlenz@902: tagname = '%s:%s' % (prefix, tagname) cmlenz@830: yield _emit(kind, data, tagname, pos) cmlenz@500: cmlenz@500: elif kind is START_NS: cmlenz@500: prefix, uri = data cmlenz@500: if uri not in namespaces: cmlenz@500: prefix = prefixes.get(uri, [prefix])[-1] cmlenz@500: _push_ns_attr(_make_ns_attr(prefix, uri)) cmlenz@500: _push_ns(prefix, uri) cmlenz@500: cmlenz@500: elif kind is END_NS: cmlenz@500: if data in prefixes: cmlenz@830: uri = _pop_ns(data) cmlenz@500: if ns_attrs: cmlenz@500: attr = _make_ns_attr(data, uri) cmlenz@500: if attr in ns_attrs: cmlenz@500: ns_attrs.remove(attr) cmlenz@500: cmlenz@500: else: cmlenz@500: yield kind, data, pos cmlenz@500: cmlenz@500: cmlenz@123: class WhitespaceFilter(object): cmlenz@123: """A filter that removes extraneous ignorable white space from the cmlenz@500: stream. cmlenz@500: """ cmlenz@123: cmlenz@305: def __init__(self, preserve=None, noescape=None): cmlenz@123: """Initialize the filter. cmlenz@123: cmlenz@500: :param preserve: a set or sequence of tag names for which white-space cmlenz@500: should be preserved cmlenz@500: :param noescape: a set or sequence of tag names for which text content cmlenz@500: should not be escaped cmlenz@141: cmlenz@347: The `noescape` set is expected to refer to elements that cannot contain cmlenz@500: further child elements (such as ``