Mercurial > genshi > genshi-test

diff genshi/output.py @ 500:0742f421caba experimental-inline
Merged revisions 487-603 via svnmerge from http://svn.edgewall.org/repos/genshi/trunk
author: cmlenz
date: Fri, 01 Jun 2007 17:21:47 +0000
parents: 49aa525b8f83
children: 1837f39efd6f
--- a/genshi/output.py
+++ b/genshi/output.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2006 Edgewall Software
+# Copyright (C) 2006-2007 Edgewall Software
 # All rights reserved.
 #
 # This software is licensed as described in the file COPYING, which
@@ -22,29 +22,116 @@
     from sets import ImmutableSet as frozenset
 import re
 
-from genshi.core import escape, Markup, Namespace, QName, StreamEventKind
-from genshi.core import DOCTYPE, START, END, START_NS, TEXT, START_CDATA, \
-                        END_CDATA, PI, COMMENT, XML_NAMESPACE
+from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind
+from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \
+                        START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE
 
-__all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer',
-           'TextSerializer']
+__all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer',
+           'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer']
+__docformat__ = 'restructuredtext en'
+
+def encode(iterator, method='xml', encoding='utf-8'):
+    """Encode serializer output into a string.
+    
+    :param iterator: the iterator returned from serializing a stream (basically
+                     any iterator that yields unicode objects)
+    :param method: the serialization method; determines how characters not
+                   representable in the specified encoding are treated
+    :param encoding: how the output string should be encoded; if set to `None`,
+                     this method returns a `unicode` object
+    :return: a string or unicode object (depending on the `encoding` parameter)
+    :since: version 0.4.1
+    """
+    output = u''.join(list(iterator))
+    if encoding is not None:
+        errors = 'replace'
+        if method != 'text' and not isinstance(method, TextSerializer):
+            errors = 'xmlcharrefreplace'
+        return output.encode(encoding, errors)
+    return output
+
+def get_serializer(method='xml', **kwargs):
+    """Return a serializer object for the given method.
+    
+    :param method: the serialization method; can be either "xml", "xhtml",
+                   "html", "text", or a custom serializer class
+
+    Any additional keyword arguments are passed to the serializer, and thus
+    depend on the `method` parameter value.
+    
+    :see: `XMLSerializer`, `XHTMLSerializer`, `HTMLSerializer`, `TextSerializer`
+    :since: version 0.4.1
+    """
+    if isinstance(method, basestring):
+        method = {'xml':   XMLSerializer,
+                  'xhtml': XHTMLSerializer,
+                  'html':  HTMLSerializer,
+                  'text':  TextSerializer}[method.lower()]
+    return method(**kwargs)
 
 
 class DocType(object):
     """Defines a number of commonly used DOCTYPE declarations as constants."""
 
-    HTML_STRICT = ('html', '-//W3C//DTD HTML 4.01//EN',
-                   'http://www.w3.org/TR/html4/strict.dtd')
-    HTML_TRANSITIONAL = ('html', '-//W3C//DTD HTML 4.01 Transitional//EN',
-                         'http://www.w3.org/TR/html4/loose.dtd')
+    HTML_STRICT = (
+        'html', '-//W3C//DTD HTML 4.01//EN',
+        'http://www.w3.org/TR/html4/strict.dtd'
+    )
+    HTML_TRANSITIONAL = (
+        'html', '-//W3C//DTD HTML 4.01 Transitional//EN',
+        'http://www.w3.org/TR/html4/loose.dtd'
+    )
+    HTML_FRAMESET = (
+        'html', '-//W3C//DTD HTML 4.01 Frameset//EN',
+        'http://www.w3.org/TR/html4/frameset.dtd'
+    )
     HTML = HTML_STRICT
 
-    XHTML_STRICT = ('html', '-//W3C//DTD XHTML 1.0 Strict//EN',
-                    'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')
-    XHTML_TRANSITIONAL = ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
-                          'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')
+    HTML5 = ('html', None, None)
+
+    XHTML_STRICT = (
+        'html', '-//W3C//DTD XHTML 1.0 Strict//EN',
+        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
+    )
+    XHTML_TRANSITIONAL = (
+        'html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
+        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+    )
+    XHTML_FRAMESET = (
+        'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
+        'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
+    )
     XHTML = XHTML_STRICT
 
+    def get(cls, name):
+        """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
+        declaration for the specified name.
+        
+        The following names are recognized in this version:
+         * "html" or "html-strict" for the HTML 4.01 strict DTD
+         * "html-transitional" for the HTML 4.01 transitional DTD
+         * "html-transitional" for the HTML 4.01 frameset DTD
+         * "html5" for the ``DOCTYPE`` proposed for HTML5
+         * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
+         * "xhtml-transitional" for the XHTML 1.0 transitional DTD
+         * "xhtml-frameset" for the XHTML 1.0 frameset DTD
+        
+        :param name: the name of the ``DOCTYPE``
+        :return: the ``(name, pubid, sysid)`` tuple for the requested
+                 ``DOCTYPE``, or ``None`` if the name is not recognized
+        :since: version 0.4.1
+        """
+        return {
+            'html': cls.HTML, 'html-strict': cls.HTML_STRICT,
+            'html-transitional': DocType.HTML_TRANSITIONAL,
+            'html-frameset': DocType.HTML_FRAMESET,
+            'html5': cls.HTML5,
+            'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
+            'xhtml-transitional': cls.XHTML_TRANSITIONAL,
+            'xhtml-frameset': cls.XHTML_FRAMESET,
+        }.get(name.lower())
+    get = classmethod(get)
+
 
 class XMLSerializer(object):
     """Produces XML text from an event stream.
@@ -57,25 +144,127 @@
 
     _PRESERVE_SPACE = frozenset()
 
-    def __init__(self, doctype=None, strip_whitespace=True):
+    def __init__(self, doctype=None, strip_whitespace=True,
+                 namespace_prefixes=None):
         """Initialize the XML serializer.
         
-        @param doctype: a `(name, pubid, sysid)` tuple that represents the
-            DOCTYPE declaration that should be included at the top of the
-            generated output
-        @param strip_whitespace: whether extraneous whitespace should be
-            stripped from the output
+        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
+                        DOCTYPE declaration that should be included at the top
+                        of the generated output, or the name of a DOCTYPE as
+                        defined in `DocType.get`
+        :param strip_whitespace: whether extraneous whitespace should be
+                                 stripped from the output
+        :note: Changed in 0.4.2: The  `doctype` parameter can now be a string.
         """
         self.preamble = []
         if doctype:
+            if isinstance(doctype, basestring):
+                doctype = DocType.get(doctype)
             self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
         self.filters = [EmptyTagFilter()]
         if strip_whitespace:
             self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
+        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
 
     def __call__(self, stream):
-        ns_attrib = []
-        ns_mapping = {XML_NAMESPACE.uri: 'xml'}
+        have_decl = have_doctype = False
+        in_cdata = False
+
+        stream = chain(self.preamble, stream)
+        for filter_ in self.filters:
+            stream = filter_(stream)
+        for kind, data, pos in stream:
+
+            if kind is START or kind is EMPTY:
+                tag, attrib = data
+                buf = ['<', tag]
+                for attr, value in attrib:
+                    buf += [' ', attr, '="', escape(value), '"']
+                buf.append(kind is EMPTY and '/>' or '>')
+                yield Markup(u''.join(buf))
+
+            elif kind is END:
+                yield Markup('</%s>' % data)
+
+            elif kind is TEXT:
+                if in_cdata:
+                    yield data
+                else:
+                    yield escape(data, quotes=False)
+
+            elif kind is COMMENT:
+                yield Markup('<!--%s-->' % data)
+
+            elif kind is XML_DECL and not have_decl:
+                version, encoding, standalone = data
+                buf = ['<?xml version="%s"' % version]
+                if encoding:
+                    buf.append(' encoding="%s"' % encoding)
+                if standalone != -1:
+                    standalone = standalone and 'yes' or 'no'
+                    buf.append(' standalone="%s"' % standalone)
+                buf.append('?>\n')
+                yield Markup(u''.join(buf))
+                have_decl = True
+
+            elif kind is DOCTYPE and not have_doctype:
+                name, pubid, sysid = data
+                buf = ['<!DOCTYPE %s']
+                if pubid:
+                    buf.append(' PUBLIC "%s"')
+                elif sysid:
+                    buf.append(' SYSTEM')
+                if sysid:
+                    buf.append(' "%s"')
+                buf.append('>\n')
+                yield Markup(u''.join(buf), *filter(None, data))
+                have_doctype = True
+
+            elif kind is START_CDATA:
+                yield Markup('<![CDATA[')
+                in_cdata = True
+
+            elif kind is END_CDATA:
+                yield Markup(']]>')
+                in_cdata = False
+
+            elif kind is PI:
+                yield Markup('<?%s %s?>' % data)
+
+
+class XHTMLSerializer(XMLSerializer):
+    """Produces XHTML text from an event stream.
+    
+    >>> from genshi.builder import tag
+    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
+    >>> print ''.join(XHTMLSerializer()(elem.generate()))
+    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
+    """
+
+    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
+                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
+                              'param'])
+    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
+                                'defer', 'disabled', 'ismap', 'multiple',
+                                'nohref', 'noresize', 'noshade', 'nowrap'])
+    _PRESERVE_SPACE = frozenset([
+        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
+        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
+    ])
+
+    def __init__(self, doctype=None, strip_whitespace=True,
+                 namespace_prefixes=None):
+        super(XHTMLSerializer, self).__init__(doctype, False)
+        self.filters = [EmptyTagFilter()]
+        if strip_whitespace:
+            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
+        namespace_prefixes = namespace_prefixes or {}
+        namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''
+        self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
+
+    def __call__(self, stream):
+        boolean_attrs = self._BOOLEAN_ATTRS
+        empty_elems = self._EMPTY_ELEMS
         have_doctype = False
         in_cdata = False
 
@@ -86,42 +275,22 @@
 
             if kind is START or kind is EMPTY:
                 tag, attrib = data
-
-                tagname = tag.localname
-                namespace = tag.namespace
-                if namespace:
-                    if namespace in ns_mapping:
-                        prefix = ns_mapping[namespace]
-                        if prefix:
-                            tagname = '%s:%s' % (prefix, tagname)
+                buf = ['<', tag]
+                for attr, value in attrib:
+                    if attr in boolean_attrs:
+                        value = attr
+                    buf += [' ', attr, '="', escape(value), '"']
+                if kind is EMPTY:
+                    if tag in empty_elems:
+                        buf.append(' />')
                     else:
-                        ns_attrib.append((QName('xmlns'), namespace))
-                buf = ['<', tagname]
-
-                if ns_attrib:
-                    attrib += tuple(ns_attrib)
-                for attr, value in attrib:
-                    attrname = attr.localname
-                    attrns = attr.namespace
-                    if attrns:
-                        prefix = ns_mapping.get(attrns)
-                        if prefix:
-                            attrname = '%s:%s' % (prefix, attrname)
-                    buf += [' ', attrname, '="', escape(value), '"']
-                ns_attrib = []
-
-                buf.append(kind is EMPTY and '/>' or '>')
-
+                        buf.append('></%s>' % tag)
+                else:
+                    buf.append('>')
                 yield Markup(u''.join(buf))
 
             elif kind is END:
-                tag = data
-                tagname = tag.localname
-                if tag.namespace:
-                    prefix = ns_mapping.get(tag.namespace)
-                    if prefix:
-                        tagname = '%s:%s' % (prefix, tag.localname)
-                yield Markup('</%s>' % tagname)
+                yield Markup('</%s>' % data)
 
             elif kind is TEXT:
                 if in_cdata:
@@ -145,144 +314,6 @@
                 yield Markup(u''.join(buf), *filter(None, data))
                 have_doctype = True
 
-            elif kind is START_NS:
-                prefix, uri = data
-                if uri not in ns_mapping:
-                    ns_mapping[uri] = prefix
-                    if not prefix:
-                        ns_attrib.append((QName('xmlns'), uri))
-                    else:
-                        ns_attrib.append((QName('xmlns:%s' % prefix), uri))
-
-            elif kind is START_CDATA:
-                yield Markup('<![CDATA[')
-                in_cdata = True
-
-            elif kind is END_CDATA:
-                yield Markup(']]>')
-                in_cdata = False
-
-            elif kind is PI:
-                yield Markup('<?%s %s?>' % data)
-
-
-class XHTMLSerializer(XMLSerializer):
-    """Produces XHTML text from an event stream.
-    
-    >>> from genshi.builder import tag
-    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
-    >>> print ''.join(XHTMLSerializer()(elem.generate()))
-    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
-    """
-
-    NAMESPACE = Namespace('http://www.w3.org/1999/xhtml')
-
-    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
-                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
-                              'param'])
-    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
-                                'defer', 'disabled', 'ismap', 'multiple',
-                                'nohref', 'noresize', 'noshade', 'nowrap'])
-    _PRESERVE_SPACE = frozenset([
-        QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
-        QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
-    ])
-
-    def __call__(self, stream):
-        namespace = self.NAMESPACE
-        ns_attrib = []
-        ns_mapping = {XML_NAMESPACE.uri: 'xml'}
-        boolean_attrs = self._BOOLEAN_ATTRS
-        empty_elems = self._EMPTY_ELEMS
-        have_doctype = False
-        in_cdata = False
-
-        stream = chain(self.preamble, stream)
-        for filter_ in self.filters:
-            stream = filter_(stream)
-        for kind, data, pos in stream:
-
-            if kind is START or kind is EMPTY:
-                tag, attrib = data
-
-                tagname = tag.localname
-                tagns = tag.namespace
-                if tagns:
-                    if tagns in ns_mapping:
-                        prefix = ns_mapping[tagns]
-                        if prefix:
-                            tagname = '%s:%s' % (prefix, tagname)
-                    else:
-                        ns_attrib.append((QName('xmlns'), tagns))
-                buf = ['<', tagname]
-
-                if ns_attrib:
-                    attrib += tuple(ns_attrib)
-                for attr, value in attrib:
-                    attrname = attr.localname
-                    attrns = attr.namespace
-                    if attrns:
-                        prefix = ns_mapping.get(attrns)
-                        if prefix:
-                            attrname = '%s:%s' % (prefix, attrname)
-                    if attrname in boolean_attrs:
-                        if value:
-                            buf += [' ', attrname, '="', attrname, '"']
-                    else:
-                        buf += [' ', attrname, '="', escape(value), '"']
-                ns_attrib = []
-
-                if kind is EMPTY:
-                    if (tagns and tagns != namespace.uri) \
-                            or tagname in empty_elems:
-                        buf.append(' />')
-                    else:
-                        buf.append('></%s>' % tagname)
-                else:
-                    buf.append('>')
-
-                yield Markup(u''.join(buf))
-
-            elif kind is END:
-                tag = data
-                tagname = tag.localname
-                if tag.namespace:
-                    prefix = ns_mapping.get(tag.namespace)
-                    if prefix:
-                        tagname = '%s:%s' % (prefix, tagname)
-                yield Markup('</%s>' % tagname)
-
-            elif kind is TEXT:
-                if in_cdata:
-                    yield data
-                else:
-                    yield escape(data, quotes=False)
-
-            elif kind is COMMENT:
-                yield Markup('<!--%s-->' % data)
-
-            elif kind is DOCTYPE and not have_doctype:
-                name, pubid, sysid = data
-                buf = ['<!DOCTYPE %s']
-                if pubid:
-                    buf.append(' PUBLIC "%s"')
-                elif sysid:
-                    buf.append(' SYSTEM')
-                if sysid:
-                    buf.append(' "%s"')
-                buf.append('>\n')
-                yield Markup(u''.join(buf), *filter(None, data))
-                have_doctype = True
-
-            elif kind is START_NS:
-                prefix, uri = data
-                if uri not in ns_mapping:
-                    ns_mapping[uri] = prefix
-                    if not prefix:
-                        ns_attrib.append((QName('xmlns'), uri))
-                    else:
-                        ns_attrib.append((QName('xmlns:%s' % prefix), uri))
-
             elif kind is START_CDATA:
                 yield Markup('<![CDATA[')
                 in_cdata = True
@@ -304,28 +335,28 @@
     <div><a href="foo"></a><br><hr noshade></div>
     """
 
-    _NOESCAPE_ELEMS = frozenset([QName('script'),
-                                 QName('http://www.w3.org/1999/xhtml}script'),
-                                 QName('style'),
-                                 QName('http://www.w3.org/1999/xhtml}style')])
+    _NOESCAPE_ELEMS = frozenset([
+        QName('script'), QName('http://www.w3.org/1999/xhtml}script'),
+        QName('style'), QName('http://www.w3.org/1999/xhtml}style')
+    ])
 
     def __init__(self, doctype=None, strip_whitespace=True):
         """Initialize the HTML serializer.
         
-        @param doctype: a `(name, pubid, sysid)` tuple that represents the
-            DOCTYPE declaration that should be included at the top of the
-            generated output
-        @param strip_whitespace: whether extraneous whitespace should be
-            stripped from the output
+        :param doctype: a ``(name, pubid, sysid)`` tuple that represents the
+                        DOCTYPE declaration that should be included at the top
+                        of the generated output
+        :param strip_whitespace: whether extraneous whitespace should be
+                                 stripped from the output
         """
         super(HTMLSerializer, self).__init__(doctype, False)
+        self.filters = [EmptyTagFilter()]
         if strip_whitespace:
             self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
                                                  self._NOESCAPE_ELEMS))
+        self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml'))
 
     def __call__(self, stream):
-        namespace = self.NAMESPACE
-        ns_mapping = {}
         boolean_attrs = self._BOOLEAN_ATTRS
         empty_elems = self._EMPTY_ELEMS
         noescape_elems = self._NOESCAPE_ELEMS
@@ -339,35 +370,23 @@
 
             if kind is START or kind is EMPTY:
                 tag, attrib = data
-                if not tag.namespace or tag in namespace:
-                    tagname = tag.localname
-                    buf = ['<', tagname]
-
-                    for attr, value in attrib:
-                        attrname = attr.localname
-                        if not attr.namespace or attr in namespace:
-                            if attrname in boolean_attrs:
-                                if value:
-                                    buf += [' ', attrname]
-                            else:
-                                buf += [' ', attrname, '="', escape(value), '"']
-
-                    buf.append('>')
-
-                    if kind is EMPTY:
-                        if tagname not in empty_elems:
-                            buf.append('</%s>' % tagname)
-
-                    yield Markup(u''.join(buf))
-
-                    if tagname in noescape_elems:
-                        noescape = True
+                buf = ['<', tag]
+                for attr, value in attrib:
+                    if attr in boolean_attrs:
+                        if value:
+                            buf += [' ', attr]
+                    else:
+                        buf += [' ', attr, '="', escape(value), '"']
+                buf.append('>')
+                if kind is EMPTY:
+                    if tag not in empty_elems:
+                        buf.append('</%s>' % tag)
+                yield Markup(u''.join(buf))
+                if tag in noescape_elems:
+                    noescape = True
 
             elif kind is END:
-                tag = data
-                if not tag.namespace or tag in namespace:
-                    yield Markup('</%s>' % tag.localname)
-
+                yield Markup('</%s>' % data)
                 noescape = False
 
             elif kind is TEXT:
@@ -392,9 +411,6 @@
                 yield Markup(u''.join(buf), *filter(None, data))
                 have_doctype = True
 
-            elif kind is START_NS and data[1] not in ns_mapping:
-                ns_mapping[data[1]] = data[0]
-
             elif kind is PI:
                 yield Markup('<?%s %s?>' % data)
 
@@ -423,8 +439,9 @@
     """
 
     def __call__(self, stream):
-        for kind, data, pos in stream:
-            if kind is TEXT:
+        for event in stream:
+            if event[0] is TEXT:
+                data = event[1]
                 if type(data) is Markup:
                     data = data.striptags().stripentities()
                 yield unicode(data)
@@ -439,36 +456,206 @@
 
     def __call__(self, stream):
         prev = (None, None, None)
-        for kind, data, pos in stream:
+        for ev in stream:
             if prev[0] is START:
-                if kind is END:
+                if ev[0] is END:
                     prev = EMPTY, prev[1], prev[2]
                     yield prev
                     continue
                 else:
                     yield prev
-            if kind is not START:
-                yield kind, data, pos
-            prev = kind, data, pos
+            if ev[0] is not START:
+                yield ev
+            prev = ev
 
 
 EMPTY = EmptyTagFilter.EMPTY
 
 
+class NamespaceFlattener(object):
+    r"""Output stream filter that removes namespace information from the stream,
+    instead adding namespace attributes and prefixes as needed.
+    
+    :param prefixes: optional mapping of namespace URIs to prefixes
+    
+    >>> from genshi.input import XML
+    >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
+    ...   <two:item/>
+    ... </doc>''')
+    >>> for kind, data, pos in NamespaceFlattener()(xml):
+    ...     print kind, repr(data)
+    START (u'doc', Attrs([(u'xmlns', u'NS1'), (u'xmlns:two', u'NS2')]))
+    TEXT u'\n  '
+    START (u'two:item', Attrs())
+    END u'two:item'
+    TEXT u'\n'
+    END u'doc'
+    """
+
+    def __init__(self, prefixes=None):
+        self.prefixes = {XML_NAMESPACE.uri: 'xml'}
+        if prefixes is not None:
+            self.prefixes.update(prefixes)
+
+    def __call__(self, stream):
+        prefixes = dict([(v, [k]) for k, v in self.prefixes.items()])
+        namespaces = {XML_NAMESPACE.uri: ['xml']}
+        def _push_ns(prefix, uri):
+            namespaces.setdefault(uri, []).append(prefix)
+            prefixes.setdefault(prefix, []).append(uri)
+
+        ns_attrs = []
+        _push_ns_attr = ns_attrs.append
+        def _make_ns_attr(prefix, uri):
+            return u'xmlns%s' % (prefix and ':%s' % prefix or ''), uri
+
+        def _gen_prefix():
+            val = 0
+            while 1:
+                val += 1
+                yield 'ns%d' % val
+        _gen_prefix = _gen_prefix().next
+
+        for kind, data, pos in stream:
+
+            if kind is START or kind is EMPTY:
+                tag, attrs = data
+
+                tagname = tag.localname
+                tagns = tag.namespace
+                if tagns:
+                    if tagns in namespaces:
+                        prefix = namespaces[tagns][-1]
+                        if prefix:
+                            tagname = u'%s:%s' % (prefix, tagname)
+                    else:
+                        _push_ns_attr((u'xmlns', tagns))
+                        _push_ns('', tagns)
+
+                new_attrs = []
+                for attr, value in attrs:
+                    attrname = attr.localname
+                    attrns = attr.namespace
+                    if attrns:
+                        if attrns not in namespaces:
+                            prefix = _gen_prefix()
+                            _push_ns(prefix, attrns)
+                            _push_ns_attr(('xmlns:%s' % prefix, attrns))
+                        else:
+                            prefix = namespaces[attrns][-1]
+                        if prefix:
+                            attrname = u'%s:%s' % (prefix, attrname)
+                    new_attrs.append((attrname, value))
+
+                yield kind, (tagname, Attrs(ns_attrs + new_attrs)), pos
+                del ns_attrs[:]
+
+            elif kind is END:
+                tagname = data.localname
+                tagns = data.namespace
+                if tagns:
+                    prefix = namespaces[tagns][-1]
+                    if prefix:
+                        tagname = u'%s:%s' % (prefix, tagname)
+                yield kind, tagname, pos
+
+            elif kind is START_NS:
+                prefix, uri = data
+                if uri not in namespaces:
+                    prefix = prefixes.get(uri, [prefix])[-1]
+                    _push_ns_attr(_make_ns_attr(prefix, uri))
+                _push_ns(prefix, uri)
+
+            elif kind is END_NS:
+                if data in prefixes:
+                    uris = prefixes.get(data)
+                    uri = uris.pop()
+                    if not uris:
+                        del prefixes[data]
+                    if uri not in uris or uri != uris[-1]:
+                        uri_prefixes = namespaces[uri]
+                        uri_prefixes.pop()
+                        if not uri_prefixes:
+                            del namespaces[uri]
+                    if ns_attrs:
+                        attr = _make_ns_attr(data, uri)
+                        if attr in ns_attrs:
+                            ns_attrs.remove(attr)
+
+            else:
+                yield kind, data, pos
+
+
+class NamespaceStripper(object):
+    r"""Stream filter that removes all namespace information from a stream, and
+    optionally strips out all tags not in a given namespace.
+    
+    :param namespace: the URI of the namespace that should not be stripped. If
+                      not set, only elements with no namespace are included in
+                      the output.
+    
+    >>> from genshi.input import XML
+    >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
+    ...   <two:item/>
+    ... </doc>''')
+    >>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml):
+    ...     print kind, repr(data)
+    START (u'doc', Attrs())
+    TEXT u'\n  '
+    TEXT u'\n'
+    END u'doc'
+    """
+
+    def __init__(self, namespace=None):
+        if namespace is not None:
+            self.namespace = Namespace(namespace)
+        else:
+            self.namespace = {}
+
+    def __call__(self, stream):
+        namespace = self.namespace
+
+        for kind, data, pos in stream:
+
+            if kind is START or kind is EMPTY:
+                tag, attrs = data
+                if tag.namespace and tag not in namespace:
+                    continue
+
+                new_attrs = []
+                for attr, value in attrs:
+                    if not attr.namespace or attr in namespace:
+                        new_attrs.append((attr, value))
+
+                data = tag.localname, Attrs(new_attrs)
+
+            elif kind is END:
+                if data.namespace and data not in namespace:
+                    continue
+                data = data.localname
+
+            elif kind is START_NS or kind is END_NS:
+                continue
+
+            yield kind, data, pos
+
+
 class WhitespaceFilter(object):
     """A filter that removes extraneous ignorable white space from the
-    stream."""
+    stream.
+    """
 
     def __init__(self, preserve=None, noescape=None):
         """Initialize the filter.
         
-        @param preserve: a set or sequence of tag names for which white-space
-            should be preserved
-        @param noescape: a set or sequence of tag names for which text content
-            should not be escaped
+        :param preserve: a set or sequence of tag names for which white-space
+                         should be preserved
+        :param noescape: a set or sequence of tag names for which text content
+                         should not be escaped
         
         The `noescape` set is expected to refer to elements that cannot contain
-        further child elements (such as <style> or <script> in HTML documents).
+        further child elements (such as ``<style>`` or ``<script>`` in HTML
+        documents).
         """
         if preserve is None:
             preserve = []
@@ -490,6 +677,7 @@
         push_text = textbuf.append
         pop_text = textbuf.pop
         for kind, data, pos in chain(stream, [(None, None, None)]):
+
             if kind is TEXT:
                 if noescape:
                     data = Markup(data)
author	cmlenz
date	Fri, 01 Jun 2007 17:21:47 +0000
parents	49aa525b8f83
children	1837f39efd6f