genshi/genshi-test: genshi/output.py comparison

comparison genshi/output.py @ 820:1837f39efd6f experimental-inline

Sync (old) experimental inline branch with trunk@1027.

author	cmlenz
date	Wed, 11 Mar 2009 17:51:06 +0000
parents	0742f421caba
children	de82830f8816

comparison

equal deleted inserted replaced

-:0742f421caba
+:1837f39efd6f
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2006-2007 Edgewall Software
+# Copyright (C) 2006-2008 Edgewall Software
 # All rights reserved.
 #
 # This software is licensed as described in the file COPYING, which
 # you should have received as part of this distribution. The terms
 # are also available at http://genshi.edgewall.org/wiki/License.
 """This module provides different kinds of serialization methods for XML event
 streams.
 """
 from itertools import chain
-try:
-frozenset
-except NameError:
-from sets import ImmutableSet as frozenset
 import re
 from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind
 from genshi.core import START, END, TEXT, XML_DECL, DOCTYPE, START_NS, END_NS, \
 START_CDATA, END_CDATA, PI, COMMENT, XML_NAMESPACE
 __all__ = ['encode', 'get_serializer', 'DocType', 'XMLSerializer',
 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer']
 __docformat__ = 'restructuredtext en'
-def encode(iterator, method='xml', encoding='utf-8'):
+def encode(iterator, method='xml', encoding='utf-8', out=None):
 """Encode serializer output into a string.
 :param iterator: the iterator returned from serializing a stream (basically
 any iterator that yields unicode objects)
 :param method: the serialization method; determines how characters not
 representable in the specified encoding are treated
 :param encoding: how the output string should be encoded; if set to `None`,
 this method returns a `unicode` object
-:return: a string or unicode object (depending on the `encoding` parameter)
+:param out: a file-like object that the output should be written to
+instead of being returned as one big string; note that if
+this is a file or socket (or similar), the `encoding` must
+not be `None` (that is, the output must be encoded)
+:return: a `str` or `unicode` object (depending on the `encoding`
+parameter), or `None` if the `out` parameter is provided
 :since: version 0.4.1
-"""
+:note: Changed in 0.5: added the `out` parameter
-output = u''.join(list(iterator))
+"""
 if encoding is not None:
 errors = 'replace'
 if method != 'text' and not isinstance(method, TextSerializer):
 errors = 'xmlcharrefreplace'
-return output.encode(encoding, errors)
+_encode = lambda string: string.encode(encoding, errors)
-return output
+else:
+_encode = lambda string: string
+if out is None:
+return _encode(u''.join(list(iterator)))
+for chunk in iterator:
+out.write(_encode(chunk))
 def get_serializer(method='xml', **kwargs):
 """Return a serializer object for the given method.
 :param method: the serialization method; can be either "xml", "xhtml",
 XHTML_FRAMESET = (
 'html', '-//W3C//DTD XHTML 1.0 Frameset//EN',
 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd'
 )
 XHTML = XHTML_STRICT
+XHTML11 = (
+'html', '-//W3C//DTD XHTML 1.1//EN',
+'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
+)
+SVG_FULL = (
+'svg', '-//W3C//DTD SVG 1.1//EN',
+'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd'
+)
+SVG_BASIC = (
+'svg', '-//W3C//DTD SVG Basic 1.1//EN',
+'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd'
+)
+SVG_TINY = (
+'svg', '-//W3C//DTD SVG Tiny 1.1//EN',
+'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd'
+)
+SVG = SVG_FULL
 def get(cls, name):
 """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE``
 declaration for the specified name.
 The following names are recognized in this version:
 * "html" or "html-strict" for the HTML 4.01 strict DTD
 * "html-transitional" for the HTML 4.01 transitional DTD
-* "html-transitional" for the HTML 4.01 frameset DTD
+* "html-frameset" for the HTML 4.01 frameset DTD
 * "html5" for the ``DOCTYPE`` proposed for HTML5
 * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD
 * "xhtml-transitional" for the XHTML 1.0 transitional DTD
 * "xhtml-frameset" for the XHTML 1.0 frameset DTD
+* "xhtml11" for the XHTML 1.1 DTD
+* "svg" or "svg-full" for the SVG 1.1 DTD
+* "svg-basic" for the SVG Basic 1.1 DTD
+* "svg-tiny" for the SVG Tiny 1.1 DTD
 :param name: the name of the ``DOCTYPE``
 :return: the ``(name, pubid, sysid)`` tuple for the requested
 ``DOCTYPE``, or ``None`` if the name is not recognized
 :since: version 0.4.1
 'html-frameset': DocType.HTML_FRAMESET,
 'html5': cls.HTML5,
 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT,
 'xhtml-transitional': cls.XHTML_TRANSITIONAL,
 'xhtml-frameset': cls.XHTML_FRAMESET,
+'xhtml11': cls.XHTML11,
+'svg': cls.SVG, 'svg-full': cls.SVG_FULL,
+'svg-basic': cls.SVG_BASIC,
+'svg-tiny': cls.SVG_TINY
 }.get(name.lower())
 get = classmethod(get)
 class XMLSerializer(object):
 defined in `DocType.get`
 :param strip_whitespace: whether extraneous whitespace should be
 stripped from the output
 :note: Changed in 0.4.2: The  `doctype` parameter can now be a string.
 """
-self.preamble = []
-if doctype:
-if isinstance(doctype, basestring):
-doctype = DocType.get(doctype)
-self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
 self.filters = [EmptyTagFilter()]
 if strip_whitespace:
 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
 self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
+if doctype:
+self.filters.append(DocTypeInserter(doctype))
 def __call__(self, stream):
 have_decl = have_doctype = False
 in_cdata = False
-stream = chain(self.preamble, stream)
 for filter_ in self.filters:
 stream = filter_(stream)
 for kind, data, pos in stream:
 if kind is START or kind is EMPTY:
 elif sysid:
 buf.append(' SYSTEM')
 if sysid:
 buf.append(' "%s"')
 buf.append('>\n')
-yield Markup(u''.join(buf), *filter(None, data))
+yield Markup(u''.join(buf)) % filter(None, data)
 have_doctype = True
 elif kind is START_CDATA:
 yield Markup('<![CDATA[')
 in_cdata = True
 QName('pre'), QName('http://www.w3.org/1999/xhtml}pre'),
 QName('textarea'), QName('http://www.w3.org/1999/xhtml}textarea')
 ])
 def __init__(self, doctype=None, strip_whitespace=True,
-namespace_prefixes=None):
+namespace_prefixes=None, drop_xml_decl=True):
 super(XHTMLSerializer, self).__init__(doctype, False)
 self.filters = [EmptyTagFilter()]
 if strip_whitespace:
 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
 namespace_prefixes = namespace_prefixes or {}
 namespace_prefixes['http://www.w3.org/1999/xhtml'] = ''
 self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes))
+if doctype:
+self.filters.append(DocTypeInserter(doctype))
+self.drop_xml_decl = drop_xml_decl
 def __call__(self, stream):
 boolean_attrs = self._BOOLEAN_ATTRS
 empty_elems = self._EMPTY_ELEMS
-have_doctype = False
+drop_xml_decl = self.drop_xml_decl
+have_decl = have_doctype = False
 in_cdata = False
-stream = chain(self.preamble, stream)
 for filter_ in self.filters:
 stream = filter_(stream)
 for kind, data, pos in stream:
 if kind is START or kind is EMPTY:
 tag, attrib = data
 buf = ['<', tag]
 for attr, value in attrib:
 if attr in boolean_attrs:
 value = attr
+elif attr == u'xml:lang' and u'lang' not in attrib:
+buf += [' lang="', escape(value), '"']
+elif attr == u'xml:space':
+continue
 buf += [' ', attr, '="', escape(value), '"']
 if kind is EMPTY:
 if tag in empty_elems:
 buf.append(' />')
 else:
 elif sysid:
 buf.append(' SYSTEM')
 if sysid:
 buf.append(' "%s"')
 buf.append('>\n')
-yield Markup(u''.join(buf), *filter(None, data))
+yield Markup(u''.join(buf)) % filter(None, data)
 have_doctype = True
+elif kind is XML_DECL and not have_decl and not drop_xml_decl:
+version, encoding, standalone = data
+buf = ['<?xml version="%s"' % version]
+if encoding:
+buf.append(' encoding="%s"' % encoding)
+if standalone != -1:
+standalone = standalone and 'yes' or 'no'
+buf.append(' standalone="%s"' % standalone)
+buf.append('?>\n')
+yield Markup(u''.join(buf))
+have_decl = True
 elif kind is START_CDATA:
 yield Markup('<![CDATA[')
 in_cdata = True
 super(HTMLSerializer, self).__init__(doctype, False)
 self.filters = [EmptyTagFilter()]
 if strip_whitespace:
 self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
 self._NOESCAPE_ELEMS))
-self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml'))
+self.filters.append(NamespaceFlattener(prefixes={
+'http://www.w3.org/1999/xhtml': ''
+}))
+if doctype:
+self.filters.append(DocTypeInserter(doctype))
 def __call__(self, stream):
 boolean_attrs = self._BOOLEAN_ATTRS
 empty_elems = self._EMPTY_ELEMS
 noescape_elems = self._NOESCAPE_ELEMS
 have_doctype = False
 noescape = False
-stream = chain(self.preamble, stream)
 for filter_ in self.filters:
 stream = filter_(stream)
 for kind, data, pos in stream:
 if kind is START or kind is EMPTY:
 buf = ['<', tag]
 for attr, value in attrib:
 if attr in boolean_attrs:
 if value:
 buf += [' ', attr]
-else:
+elif ':' in attr:
+if attr == 'xml:lang' and u'lang' not in attrib:
+buf += [' lang="', escape(value), '"']
+elif attr != 'xmlns':
 buf += [' ', attr, '="', escape(value), '"']
 buf.append('>')
 if kind is EMPTY:
 if tag not in empty_elems:
 buf.append('</%s>' % tag)
 elif sysid:
 buf.append(' SYSTEM')
 if sysid:
 buf.append(' "%s"')
 buf.append('>\n')
-yield Markup(u''.join(buf), *filter(None, data))
+yield Markup(u''.join(buf)) % filter(None, data)
 have_doctype = True
 elif kind is PI:
 yield Markup('<?%s %s?>' % data)
 <div><a href="foo">&lt;Hello!&gt;</a><br/></div>
 >>> print ''.join(TextSerializer()(elem.generate()))
 <Hello!>
 If text events contain literal markup (instances of the `Markup` class),
-tags or entities are stripped from the output:
+that markup is by default passed through unchanged:
->>> elem = tag.div(Markup('<a href="foo">Hello!</a><br/>'))
+>>> elem = tag.div(Markup('<a href="foo">Hello &amp; Bye!</a><br/>'))
->>> print elem
+>>> print elem.generate().render(TextSerializer)
-<div><a href="foo">Hello!</a><br/></div>
+<a href="foo">Hello &amp; Bye!</a><br/>
->>> print ''.join(TextSerializer()(elem.generate()))
-Hello!
+You can use the ``strip_markup`` to change this behavior, so that tags and
-"""
+entities are stripped from the output (or in the case of entities,
+replaced with the equivalent character):
+>>> print elem.generate().render(TextSerializer, strip_markup=True)
+Hello & Bye!
+"""
+def __init__(self, strip_markup=False):
+"""Create the serializer.
+:param strip_markup: whether markup (tags and encoded characters) found
+in the text should be removed
+"""
+self.strip_markup = strip_markup
 def __call__(self, stream):
+strip_markup = self.strip_markup
 for event in stream:
 if event[0] is TEXT:
 data = event[1]
-if type(data) is Markup:
+if strip_markup and type(data) is Markup:
 data = data.striptags().stripentities()
 yield unicode(data)
 class EmptyTagFilter(object):
 if attr in ns_attrs:
 ns_attrs.remove(attr)
 else:
 yield kind, data, pos
-class NamespaceStripper(object):
-r"""Stream filter that removes all namespace information from a stream, and
-optionally strips out all tags not in a given namespace.
-:param namespace: the URI of the namespace that should not be stripped. If
-not set, only elements with no namespace are included in
-the output.
->>> from genshi.input import XML
->>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2">
-...   <two:item/>
-... </doc>''')
->>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml):
-...     print kind, repr(data)
-START (u'doc', Attrs())
-TEXT u'\n  '
-TEXT u'\n'
-END u'doc'
-"""
-def __init__(self, namespace=None):
-if namespace is not None:
-self.namespace = Namespace(namespace)
-else:
-self.namespace = {}
-def __call__(self, stream):
-namespace = self.namespace
-for kind, data, pos in stream:
-if kind is START or kind is EMPTY:
-tag, attrs = data
-if tag.namespace and tag not in namespace:
-continue
-new_attrs = []
-for attr, value in attrs:
-if not attr.namespace or attr in namespace:
-new_attrs.append((attr, value))
-data = tag.localname, Attrs(new_attrs)
-elif kind is END:
-if data.namespace and data not in namespace:
-continue
-data = data.localname
-elif kind is START_NS or kind is END_NS:
-continue
-yield kind, data, pos
 class WhitespaceFilter(object):
 """A filter that removes extraneous ignorable white space from the
 stream.
 elif kind is END_CDATA:
 noescape = False
 if kind:
 yield kind, data, pos
+class DocTypeInserter(object):
+"""A filter that inserts the DOCTYPE declaration in the correct location,
+after the XML declaration.
+"""
+def __init__(self, doctype):
+"""Initialize the filter.
+:param doctype: DOCTYPE as a string or DocType object.
+"""
+if isinstance(doctype, basestring):
+doctype = DocType.get(doctype)
+self.doctype_event = (DOCTYPE, doctype, (None, -1, -1))
+def __call__(self, stream):
+doctype_inserted = False
+for kind, data, pos in stream:
+if not doctype_inserted:
+doctype_inserted = True
+if kind is XML_DECL:
+yield (kind, data, pos)
+yield self.doctype_event
+continue
+yield self.doctype_event
+yield (kind, data, pos)
+if not doctype_inserted:
+yield self.doctype_event

Mercurial > genshi > genshi-test

comparison genshi/output.py @ 820:1837f39efd6f experimental-inline