Mercurial > genshi > genshi-test
diff genshi/output.py @ 500:0742f421caba experimental-inline
Merged revisions 487-603 via svnmerge from
http://svn.edgewall.org/repos/genshi/trunk
author | cmlenz |
---|---|
date | Fri, 01 Jun 2007 17:21:47 +0000 |
parents | 49aa525b8f83 |
children | 1837f39efd6f |
line wrap: on
line diff
--- a/genshi/output.py +++ b/genshi/output.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2006 Edgewall Software +# Copyright (C) 2006-2007 Edgewall Software # All rights reserved. # # This software is licensed as described in the file COPYING, which @@ -22,29 +22,116 @@ from sets import ImmutableSet as frozenset import re -from genshi.core import escape, Markup, Namespace, QName, StreamEventKind -from genshi.core import DOCTYPE, START, END, START_NS, TEXT, START_CDATA, \ - END_CDATA, PI, COMMENT, XML_NAMESPACE +from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind +from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \ + START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE -__all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer', - 'TextSerializer'] +__all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer', + 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] +__docformat__ = 'restructuredtext en' + +def encode(iterator, method='xml', encoding='utf-8'): + """Encode serializer output into a string. + + :param iterator: the iterator returned from serializing a stream (basically + any iterator that yields unicode objects) + :param method: the serialization method; determines how characters not + representable in the specified encoding are treated + :param encoding: how the output string should be encoded; if set to `None`, + this method returns a `unicode` object + :return: a string or unicode object (depending on the `encoding` parameter) + :since: version 0.4.1 + """ + output = u''.join(list(iterator)) + if encoding is not None: + errors = 'replace' + if method != 'text' and not isinstance(method, TextSerializer): + errors = 'xmlcharrefreplace' + return output.encode(encoding, errors) + return output + +def get_serializer(method='xml', **kwargs): + """Return a serializer object for the given method. + + :param method: the serialization method; can be either "xml", "xhtml", + "html", "text", or a custom serializer class + + Any additional keyword arguments are passed to the serializer, and thus + depend on the `method` parameter value. + + :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer` + :since: version 0.4.1 + """ + if isinstance(method, basestring): + method = {'xml': XMLSerializer, + 'xhtml': XHTMLSerializer, + 'html': HTMLSerializer, + 'text': TextSerializer}[method.lower()] + return method(**kwargs) class DocType(object): """Defines a number of commonly used DOCTYPE declarations as constants.""" - HTML_STRICT = ('html', '-//W3C//DTD HTML 4.01//EN', - 'http://www.w3.org/TR/html4/strict.dtd') - HTML_TRANSITIONAL = ('html', '-//W3C//DTD HTML 4.01 Transitional//EN', - 'http://www.w3.org/TR/html4/loose.dtd') + HTML_STRICT = ( + 'html', '-//W3C//DTD HTML 4.01//EN', + 'http://www.w3.org/TR/html4/strict.dtd' + ) + HTML_TRANSITIONAL = ( + 'html', '-//W3C//DTD HTML 4.01 Transitional//EN', + 'http://www.w3.org/TR/html4/loose.dtd' + ) + HTML_FRAMESET = ( + 'html', '-//W3C//DTD HTML 4.01 Frameset//EN', + 'http://www.w3.org/TR/html4/frameset.dtd' + ) HTML = HTML_STRICT - XHTML_STRICT = ('html', '-//W3C//DTD XHTML 1.0 Strict//EN', - 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd') - XHTML_TRANSITIONAL = ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN', - 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd') + HTML5 = ('html', None, None) + + XHTML_STRICT = ( + 'html', '-//W3C//DTD XHTML 1.0 Strict//EN', + 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd' + ) + XHTML_TRANSITIONAL = ( + 'html', '-//W3C//DTD XHTML 1.0 Transitional//EN', + 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' + ) + XHTML_FRAMESET = ( + 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN', + 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd' + ) XHTML = XHTML_STRICT + def get(cls, name): + """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE`` + declaration for the specified name. + + The following names are recognized in this version: + * "html" or "html-strict" for the HTML 4.01 strict DTD + * "html-transitional" for the HTML 4.01 transitional DTD + * "html-transitional" for the HTML 4.01 frameset DTD + * "html5" for the ``DOCTYPE`` proposed for HTML5 + * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD + * "xhtml-transitional" for the XHTML 1.0 transitional DTD + * "xhtml-frameset" for the XHTML 1.0 frameset DTD + + :param name: the name of the ``DOCTYPE`` + :return: the ``(name, pubid, sysid)`` tuple for the requested + ``DOCTYPE``, or ``None`` if the name is not recognized + :since: version 0.4.1 + """ + return { + 'html': cls.HTML, 'html-strict': cls.HTML_STRICT, + 'html-transitional': DocType.HTML_TRANSITIONAL, + 'html-frameset': DocType.HTML_FRAMESET, + 'html5': cls.HTML5, + 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT, + 'xhtml-transitional': cls.XHTML_TRANSITIONAL, + 'xhtml-frameset': cls.XHTML_FRAMESET, + }.get(name.lower()) + get = classmethod(get) + class XMLSerializer(object): """Produces XML text from an event stream. @@ -57,25 +144,127 @@ _PRESERVE_SPACE = frozenset() - def __init__(self, doctype=None, strip_whitespace=True): + def __init__(self, doctype=None, strip_whitespace=True, + namespace_prefixes=None): """Initialize the XML serializer. - @param doctype: a `(name, pubid, sysid)` tuple that represents the - DOCTYPE declaration that should be included at the top of the - generated output - @param strip_whitespace: whether extraneous whitespace should be - stripped from the output + :param doctype: a ``(name, pubid, sysid)`` tuple that represents the + DOCTYPE declaration that should be included at the top + of the generated output, or the name of a DOCTYPE as + defined in `DocType.get` + :param strip_whitespace: whether extraneous whitespace should be + stripped from the output + :note: Changed in 0.4.2: The `doctype` parameter can now be a string. """ self.preamble = [] if doctype: + if isinstance(doctype, basestring): + doctype = DocType.get(doctype) self.preamble.append((DOCTYPE, doctype, (None, -1, -1))) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) + self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) def __call__(self, stream): - ns_attrib = [] - ns_mapping = {XML_NAMESPACE.uri: 'xml'} + have_decl = have_doctype = False + in_cdata = False + + stream = chain(self.preamble, stream) + for filter_ in self.filters: + stream = filter_(stream) + for kind, data, pos in stream: + + if kind is START or kind is EMPTY: + tag, attrib = data + buf = ['<', tag] + for attr, value in attrib: + buf += [' ', attr, '="', escape(value), '"'] + buf.append(kind is EMPTY and '/>' or '>') + yield Markup(u''.join(buf)) + + elif kind is END: + yield Markup('</%s>' % data) + + elif kind is TEXT: + if in_cdata: + yield data + else: + yield escape(data, quotes=False) + + elif kind is COMMENT: + yield Markup('<!--%s-->' % data) + + elif kind is XML_DECL and not have_decl: + version, encoding, standalone = data + buf = ['<?xml version="%s"' % version] + if encoding: + buf.append(' encoding="%s"' % encoding) + if standalone != -1: + standalone = standalone and 'yes' or 'no' + buf.append(' standalone="%s"' % standalone) + buf.append('?>\n') + yield Markup(u''.join(buf)) + have_decl = True + + elif kind is DOCTYPE and not have_doctype: + name, pubid, sysid = data + buf = ['<!DOCTYPE %s'] + if pubid: + buf.append(' PUBLIC "%s"') + elif sysid: + buf.append(' SYSTEM') + if sysid: + buf.append(' "%s"') + buf.append('>\n') + yield Markup(u''.join(buf), *filter(None, data)) + have_doctype = True + + elif kind is START_CDATA: + yield Markup('<![CDATA[') + in_cdata = True + + elif kind is END_CDATA: + yield Markup(']]>') + in_cdata = False + + elif kind is PI: + yield Markup('<?%s %s?>' % data) + + +class XHTMLSerializer(XMLSerializer): + """Produces XHTML text from an event stream. + + >>> from genshi.builder import tag + >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) + >>> print ''.join(XHTMLSerializer()(elem.generate())) + <div><a href="foo"></a><br /><hr noshade="noshade" /></div> + """ + + _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', + 'hr', 'img', 'input', 'isindex', 'link', 'meta', + 'param']) + _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', + 'defer', 'disabled', 'ismap', 'multiple', + 'nohref', 'noresize', 'noshade', 'nowrap']) + _PRESERVE_SPACE = frozenset([ + QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), + QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') + ]) + + def __init__(self, doctype=None, strip_whitespace=True, + namespace_prefixes=None): + super(XHTMLSerializer, self).__init__(doctype, False) + self.filters = [EmptyTagFilter()] + if strip_whitespace: + self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) + namespace_prefixes = namespace_prefixes or {} + namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' + self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) + + def __call__(self, stream): + boolean_attrs = self._BOOLEAN_ATTRS + empty_elems = self._EMPTY_ELEMS have_doctype = False in_cdata = False @@ -86,42 +275,22 @@ if kind is START or kind is EMPTY: tag, attrib = data - - tagname = tag.localname - namespace = tag.namespace - if namespace: - if namespace in ns_mapping: - prefix = ns_mapping[namespace] - if prefix: - tagname = '%s:%s' % (prefix, tagname) + buf = ['<', tag] + for attr, value in attrib: + if attr in boolean_attrs: + value = attr + buf += [' ', attr, '="', escape(value), '"'] + if kind is EMPTY: + if tag in empty_elems: + buf.append(' />') else: - ns_attrib.append((QName('xmlns'), namespace)) - buf = ['<', tagname] - - if ns_attrib: - attrib += tuple(ns_attrib) - for attr, value in attrib: - attrname = attr.localname - attrns = attr.namespace - if attrns: - prefix = ns_mapping.get(attrns) - if prefix: - attrname = '%s:%s' % (prefix, attrname) - buf += [' ', attrname, '="', escape(value), '"'] - ns_attrib = [] - - buf.append(kind is EMPTY and '/>' or '>') - + buf.append('></%s>' % tag) + else: + buf.append('>') yield Markup(u''.join(buf)) elif kind is END: - tag = data - tagname = tag.localname - if tag.namespace: - prefix = ns_mapping.get(tag.namespace) - if prefix: - tagname = '%s:%s' % (prefix, tag.localname) - yield Markup('</%s>' % tagname) + yield Markup('</%s>' % data) elif kind is TEXT: if in_cdata: @@ -145,144 +314,6 @@ yield Markup(u''.join(buf), *filter(None, data)) have_doctype = True - elif kind is START_NS: - prefix, uri = data - if uri not in ns_mapping: - ns_mapping[uri] = prefix - if not prefix: - ns_attrib.append((QName('xmlns'), uri)) - else: - ns_attrib.append((QName('xmlns:%s' % prefix), uri)) - - elif kind is START_CDATA: - yield Markup('<![CDATA[') - in_cdata = True - - elif kind is END_CDATA: - yield Markup(']]>') - in_cdata = False - - elif kind is PI: - yield Markup('<?%s %s?>' % data) - - -class XHTMLSerializer(XMLSerializer): - """Produces XHTML text from an event stream. - - >>> from genshi.builder import tag - >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True)) - >>> print ''.join(XHTMLSerializer()(elem.generate())) - <div><a href="foo"></a><br /><hr noshade="noshade" /></div> - """ - - NAMESPACE = Namespace('http://www.w3.org/1999/xhtml') - - _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', - 'hr', 'img', 'input', 'isindex', 'link', 'meta', - 'param']) - _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare', - 'defer', 'disabled', 'ismap', 'multiple', - 'nohref', 'noresize', 'noshade', 'nowrap']) - _PRESERVE_SPACE = frozenset([ - QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'), - QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea') - ]) - - def __call__(self, stream): - namespace = self.NAMESPACE - ns_attrib = [] - ns_mapping = {XML_NAMESPACE.uri: 'xml'} - boolean_attrs = self._BOOLEAN_ATTRS - empty_elems = self._EMPTY_ELEMS - have_doctype = False - in_cdata = False - - stream = chain(self.preamble, stream) - for filter_ in self.filters: - stream = filter_(stream) - for kind, data, pos in stream: - - if kind is START or kind is EMPTY: - tag, attrib = data - - tagname = tag.localname - tagns = tag.namespace - if tagns: - if tagns in ns_mapping: - prefix = ns_mapping[tagns] - if prefix: - tagname = '%s:%s' % (prefix, tagname) - else: - ns_attrib.append((QName('xmlns'), tagns)) - buf = ['<', tagname] - - if ns_attrib: - attrib += tuple(ns_attrib) - for attr, value in attrib: - attrname = attr.localname - attrns = attr.namespace - if attrns: - prefix = ns_mapping.get(attrns) - if prefix: - attrname = '%s:%s' % (prefix, attrname) - if attrname in boolean_attrs: - if value: - buf += [' ', attrname, '="', attrname, '"'] - else: - buf += [' ', attrname, '="', escape(value), '"'] - ns_attrib = [] - - if kind is EMPTY: - if (tagns and tagns != namespace.uri) \ - or tagname in empty_elems: - buf.append(' />') - else: - buf.append('></%s>' % tagname) - else: - buf.append('>') - - yield Markup(u''.join(buf)) - - elif kind is END: - tag = data - tagname = tag.localname - if tag.namespace: - prefix = ns_mapping.get(tag.namespace) - if prefix: - tagname = '%s:%s' % (prefix, tagname) - yield Markup('</%s>' % tagname) - - elif kind is TEXT: - if in_cdata: - yield data - else: - yield escape(data, quotes=False) - - elif kind is COMMENT: - yield Markup('<!--%s-->' % data) - - elif kind is DOCTYPE and not have_doctype: - name, pubid, sysid = data - buf = ['<!DOCTYPE %s'] - if pubid: - buf.append(' PUBLIC "%s"') - elif sysid: - buf.append(' SYSTEM') - if sysid: - buf.append(' "%s"') - buf.append('>\n') - yield Markup(u''.join(buf), *filter(None, data)) - have_doctype = True - - elif kind is START_NS: - prefix, uri = data - if uri not in ns_mapping: - ns_mapping[uri] = prefix - if not prefix: - ns_attrib.append((QName('xmlns'), uri)) - else: - ns_attrib.append((QName('xmlns:%s' % prefix), uri)) - elif kind is START_CDATA: yield Markup('<![CDATA[') in_cdata = True @@ -304,28 +335,28 @@ <div><a href="foo"></a><br><hr noshade></div> """ - _NOESCAPE_ELEMS = frozenset([QName('script'), - QName('http://www.w3.org/1999/xhtml}script'), - QName('style'), - QName('http://www.w3.org/1999/xhtml}style')]) + _NOESCAPE_ELEMS = frozenset([ + QName('script'), QName('http://www.w3.org/1999/xhtml}script'), + QName('style'), QName('http://www.w3.org/1999/xhtml}style') + ]) def __init__(self, doctype=None, strip_whitespace=True): """Initialize the HTML serializer. - @param doctype: a `(name, pubid, sysid)` tuple that represents the - DOCTYPE declaration that should be included at the top of the - generated output - @param strip_whitespace: whether extraneous whitespace should be - stripped from the output + :param doctype: a ``(name, pubid, sysid)`` tuple that represents the + DOCTYPE declaration that should be included at the top + of the generated output + :param strip_whitespace: whether extraneous whitespace should be + stripped from the output """ super(HTMLSerializer, self).__init__(doctype, False) + self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, self._NOESCAPE_ELEMS)) + self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml')) def __call__(self, stream): - namespace = self.NAMESPACE - ns_mapping = {} boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS noescape_elems = self._NOESCAPE_ELEMS @@ -339,35 +370,23 @@ if kind is START or kind is EMPTY: tag, attrib = data - if not tag.namespace or tag in namespace: - tagname = tag.localname - buf = ['<', tagname] - - for attr, value in attrib: - attrname = attr.localname - if not attr.namespace or attr in namespace: - if attrname in boolean_attrs: - if value: - buf += [' ', attrname] - else: - buf += [' ', attrname, '="', escape(value), '"'] - - buf.append('>') - - if kind is EMPTY: - if tagname not in empty_elems: - buf.append('</%s>' % tagname) - - yield Markup(u''.join(buf)) - - if tagname in noescape_elems: - noescape = True + buf = ['<', tag] + for attr, value in attrib: + if attr in boolean_attrs: + if value: + buf += [' ', attr] + else: + buf += [' ', attr, '="', escape(value), '"'] + buf.append('>') + if kind is EMPTY: + if tag not in empty_elems: + buf.append('</%s>' % tag) + yield Markup(u''.join(buf)) + if tag in noescape_elems: + noescape = True elif kind is END: - tag = data - if not tag.namespace or tag in namespace: - yield Markup('</%s>' % tag.localname) - + yield Markup('</%s>' % data) noescape = False elif kind is TEXT: @@ -392,9 +411,6 @@ yield Markup(u''.join(buf), *filter(None, data)) have_doctype = True - elif kind is START_NS and data[1] not in ns_mapping: - ns_mapping[data[1]] = data[0] - elif kind is PI: yield Markup('<?%s %s?>' % data) @@ -423,8 +439,9 @@ """ def __call__(self, stream): - for kind, data, pos in stream: - if kind is TEXT: + for event in stream: + if event[0] is TEXT: + data = event[1] if type(data) is Markup: data = data.striptags().stripentities() yield unicode(data) @@ -439,36 +456,206 @@ def __call__(self, stream): prev = (None, None, None) - for kind, data, pos in stream: + for ev in stream: if prev[0] is START: - if kind is END: + if ev[0] is END: prev = EMPTY, prev[1], prev[2] yield prev continue else: yield prev - if kind is not START: - yield kind, data, pos - prev = kind, data, pos + if ev[0] is not START: + yield ev + prev = ev EMPTY = EmptyTagFilter.EMPTY +class NamespaceFlattener(object): + r"""Output stream filter that removes namespace information from the stream, + instead adding namespace attributes and prefixes as needed. + + :param prefixes: optional mapping of namespace URIs to prefixes + + >>> from genshi.input import XML + >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2"> + ... <two:item/> + ... </doc>''') + >>> for kind, data, pos in NamespaceFlattener()(xml): + ... print kind, repr(data) + START (u'doc', Attrs([(u'xmlns', u'NS1'), (u'xmlns:two', u'NS2')])) + TEXT u'\n ' + START (u'two:item', Attrs()) + END u'two:item' + TEXT u'\n' + END u'doc' + """ + + def __init__(self, prefixes=None): + self.prefixes = {XML_NAMESPACE.uri: 'xml'} + if prefixes is not None: + self.prefixes.update(prefixes) + + def __call__(self, stream): + prefixes = dict([(v, [k]) for k, v in self.prefixes.items()]) + namespaces = {XML_NAMESPACE.uri: ['xml']} + def _push_ns(prefix, uri): + namespaces.setdefault(uri, []).append(prefix) + prefixes.setdefault(prefix, []).append(uri) + + ns_attrs = [] + _push_ns_attr = ns_attrs.append + def _make_ns_attr(prefix, uri): + return u'xmlns%s' % (prefix and ':%s' % prefix or ''), uri + + def _gen_prefix(): + val = 0 + while 1: + val += 1 + yield 'ns%d' % val + _gen_prefix = _gen_prefix().next + + for kind, data, pos in stream: + + if kind is START or kind is EMPTY: + tag, attrs = data + + tagname = tag.localname + tagns = tag.namespace + if tagns: + if tagns in namespaces: + prefix = namespaces[tagns][-1] + if prefix: + tagname = u'%s:%s' % (prefix, tagname) + else: + _push_ns_attr((u'xmlns', tagns)) + _push_ns('', tagns) + + new_attrs = [] + for attr, value in attrs: + attrname = attr.localname + attrns = attr.namespace + if attrns: + if attrns not in namespaces: + prefix = _gen_prefix() + _push_ns(prefix, attrns) + _push_ns_attr(('xmlns:%s' % prefix, attrns)) + else: + prefix = namespaces[attrns][-1] + if prefix: + attrname = u'%s:%s' % (prefix, attrname) + new_attrs.append((attrname, value)) + + yield kind, (tagname, Attrs(ns_attrs + new_attrs)), pos + del ns_attrs[:] + + elif kind is END: + tagname = data.localname + tagns = data.namespace + if tagns: + prefix = namespaces[tagns][-1] + if prefix: + tagname = u'%s:%s' % (prefix, tagname) + yield kind, tagname, pos + + elif kind is START_NS: + prefix, uri = data + if uri not in namespaces: + prefix = prefixes.get(uri, [prefix])[-1] + _push_ns_attr(_make_ns_attr(prefix, uri)) + _push_ns(prefix, uri) + + elif kind is END_NS: + if data in prefixes: + uris = prefixes.get(data) + uri = uris.pop() + if not uris: + del prefixes[data] + if uri not in uris or uri != uris[-1]: + uri_prefixes = namespaces[uri] + uri_prefixes.pop() + if not uri_prefixes: + del namespaces[uri] + if ns_attrs: + attr = _make_ns_attr(data, uri) + if attr in ns_attrs: + ns_attrs.remove(attr) + + else: + yield kind, data, pos + + +class NamespaceStripper(object): + r"""Stream filter that removes all namespace information from a stream, and + optionally strips out all tags not in a given namespace. + + :param namespace: the URI of the namespace that should not be stripped. If + not set, only elements with no namespace are included in + the output. + + >>> from genshi.input import XML + >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2"> + ... <two:item/> + ... </doc>''') + >>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml): + ... print kind, repr(data) + START (u'doc', Attrs()) + TEXT u'\n ' + TEXT u'\n' + END u'doc' + """ + + def __init__(self, namespace=None): + if namespace is not None: + self.namespace = Namespace(namespace) + else: + self.namespace = {} + + def __call__(self, stream): + namespace = self.namespace + + for kind, data, pos in stream: + + if kind is START or kind is EMPTY: + tag, attrs = data + if tag.namespace and tag not in namespace: + continue + + new_attrs = [] + for attr, value in attrs: + if not attr.namespace or attr in namespace: + new_attrs.append((attr, value)) + + data = tag.localname, Attrs(new_attrs) + + elif kind is END: + if data.namespace and data not in namespace: + continue + data = data.localname + + elif kind is START_NS or kind is END_NS: + continue + + yield kind, data, pos + + class WhitespaceFilter(object): """A filter that removes extraneous ignorable white space from the - stream.""" + stream. + """ def __init__(self, preserve=None, noescape=None): """Initialize the filter. - @param preserve: a set or sequence of tag names for which white-space - should be preserved - @param noescape: a set or sequence of tag names for which text content - should not be escaped + :param preserve: a set or sequence of tag names for which white-space + should be preserved + :param noescape: a set or sequence of tag names for which text content + should not be escaped The `noescape` set is expected to refer to elements that cannot contain - further child elements (such as <style> or <script> in HTML documents). + further child elements (such as ``<style>`` or ``<script>`` in HTML + documents). """ if preserve is None: preserve = [] @@ -490,6 +677,7 @@ push_text = textbuf.append pop_text = textbuf.pop for kind, data, pos in chain(stream, [(None, None, None)]): + if kind is TEXT: if noescape: data = Markup(data)