Mercurial > genshi > genshi-test
diff genshi/output.py @ 820:1837f39efd6f experimental-inline
Sync (old) experimental inline branch with trunk@1027.
author | cmlenz |
---|---|
date | Wed, 11 Mar 2009 17:51:06 +0000 |
parents | 0742f421caba |
children | de82830f8816 |
line wrap: on
line diff
--- a/genshi/output.py +++ b/genshi/output.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2006-2007 Edgewall Software +# Copyright (C) 2006-2008 Edgewall Software # All rights reserved. # # This software is licensed as described in the file COPYING, which @@ -16,10 +16,6 @@ """ from itertools import chain -try: - frozenset -except NameError: - from sets import ImmutableSet as frozenset import re from genshi.core import escape, Attrs, Markup, Namespace, QName, StreamEventKind @@ -30,7 +26,7 @@ 'XHTMLSerializer', 'HTMLSerializer', 'TextSerializer'] __docformat__ = 'restructuredtext en' -def encode(iterator, method='xml', encoding='utf-8'): +def encode(iterator, method='xml', encoding='utf-8', out=None): """Encode serializer output into a string. :param iterator: the iterator returned from serializing a stream (basically @@ -39,16 +35,27 @@ representable in the specified encoding are treated :param encoding: how the output string should be encoded; if set to `None`, this method returns a `unicode` object - :return: a string or unicode object (depending on the `encoding` parameter) + :param out: a file-like object that the output should be written to + instead of being returned as one big string; note that if + this is a file or socket (or similar), the `encoding` must + not be `None` (that is, the output must be encoded) + :return: a `str` or `unicode` object (depending on the `encoding` + parameter), or `None` if the `out` parameter is provided + :since: version 0.4.1 + :note: Changed in 0.5: added the `out` parameter """ - output = u''.join(list(iterator)) if encoding is not None: errors = 'replace' if method != 'text' and not isinstance(method, TextSerializer): errors = 'xmlcharrefreplace' - return output.encode(encoding, errors) - return output + _encode = lambda string: string.encode(encoding, errors) + else: + _encode = lambda string: string + if out is None: + return _encode(u''.join(list(iterator))) + for chunk in iterator: + out.write(_encode(chunk)) def get_serializer(method='xml', **kwargs): """Return a serializer object for the given method. @@ -103,6 +110,25 @@ ) XHTML = XHTML_STRICT + XHTML11 = ( + 'html', '-//W3C//DTD XHTML 1.1//EN', + 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd' + ) + + SVG_FULL = ( + 'svg', '-//W3C//DTD SVG 1.1//EN', + 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd' + ) + SVG_BASIC = ( + 'svg', '-//W3C//DTD SVG Basic 1.1//EN', + 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-basic.dtd' + ) + SVG_TINY = ( + 'svg', '-//W3C//DTD SVG Tiny 1.1//EN', + 'http://www.w3.org/Graphics/SVG/1.1/DTD/svg11-tiny.dtd' + ) + SVG = SVG_FULL + def get(cls, name): """Return the ``(name, pubid, sysid)`` tuple of the ``DOCTYPE`` declaration for the specified name. @@ -110,11 +136,15 @@ The following names are recognized in this version: * "html" or "html-strict" for the HTML 4.01 strict DTD * "html-transitional" for the HTML 4.01 transitional DTD - * "html-transitional" for the HTML 4.01 frameset DTD + * "html-frameset" for the HTML 4.01 frameset DTD * "html5" for the ``DOCTYPE`` proposed for HTML5 * "xhtml" or "xhtml-strict" for the XHTML 1.0 strict DTD * "xhtml-transitional" for the XHTML 1.0 transitional DTD * "xhtml-frameset" for the XHTML 1.0 frameset DTD + * "xhtml11" for the XHTML 1.1 DTD + * "svg" or "svg-full" for the SVG 1.1 DTD + * "svg-basic" for the SVG Basic 1.1 DTD + * "svg-tiny" for the SVG Tiny 1.1 DTD :param name: the name of the ``DOCTYPE`` :return: the ``(name, pubid, sysid)`` tuple for the requested @@ -129,6 +159,10 @@ 'xhtml': cls.XHTML, 'xhtml-strict': cls.XHTML_STRICT, 'xhtml-transitional': cls.XHTML_TRANSITIONAL, 'xhtml-frameset': cls.XHTML_FRAMESET, + 'xhtml11': cls.XHTML11, + 'svg': cls.SVG, 'svg-full': cls.SVG_FULL, + 'svg-basic': cls.SVG_BASIC, + 'svg-tiny': cls.SVG_TINY }.get(name.lower()) get = classmethod(get) @@ -156,21 +190,17 @@ stripped from the output :note: Changed in 0.4.2: The `doctype` parameter can now be a string. """ - self.preamble = [] - if doctype: - if isinstance(doctype, basestring): - doctype = DocType.get(doctype) - self.preamble.append((DOCTYPE, doctype, (None, -1, -1))) self.filters = [EmptyTagFilter()] if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE)) self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) + if doctype: + self.filters.append(DocTypeInserter(doctype)) def __call__(self, stream): have_decl = have_doctype = False in_cdata = False - stream = chain(self.preamble, stream) for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: @@ -217,7 +247,7 @@ if sysid: buf.append(' "%s"') buf.append('>\n') - yield Markup(u''.join(buf), *filter(None, data)) + yield Markup(u''.join(buf)) % filter(None, data) have_doctype = True elif kind is START_CDATA: @@ -253,7 +283,7 @@ ]) def __init__(self, doctype=None, strip_whitespace=True, - namespace_prefixes=None): + namespace_prefixes=None, drop_xml_decl=True): super(XHTMLSerializer, self).__init__(doctype, False) self.filters = [EmptyTagFilter()] if strip_whitespace: @@ -261,14 +291,17 @@ namespace_prefixes = namespace_prefixes or {} namespace_prefixes['http://www.w3.org/1999/xhtml'] = '' self.filters.append(NamespaceFlattener(prefixes=namespace_prefixes)) + if doctype: + self.filters.append(DocTypeInserter(doctype)) + self.drop_xml_decl = drop_xml_decl def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS empty_elems = self._EMPTY_ELEMS - have_doctype = False + drop_xml_decl = self.drop_xml_decl + have_decl = have_doctype = False in_cdata = False - stream = chain(self.preamble, stream) for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: @@ -279,6 +312,10 @@ for attr, value in attrib: if attr in boolean_attrs: value = attr + elif attr == u'xml:lang' and u'lang' not in attrib: + buf += [' lang="', escape(value), '"'] + elif attr == u'xml:space': + continue buf += [' ', attr, '="', escape(value), '"'] if kind is EMPTY: if tag in empty_elems: @@ -311,9 +348,21 @@ if sysid: buf.append(' "%s"') buf.append('>\n') - yield Markup(u''.join(buf), *filter(None, data)) + yield Markup(u''.join(buf)) % filter(None, data) have_doctype = True + elif kind is XML_DECL and not have_decl and not drop_xml_decl: + version, encoding, standalone = data + buf = ['<?xml version="%s"' % version] + if encoding: + buf.append(' encoding="%s"' % encoding) + if standalone != -1: + standalone = standalone and 'yes' or 'no' + buf.append(' standalone="%s"' % standalone) + buf.append('?>\n') + yield Markup(u''.join(buf)) + have_decl = True + elif kind is START_CDATA: yield Markup('<![CDATA[') in_cdata = True @@ -354,7 +403,11 @@ if strip_whitespace: self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE, self._NOESCAPE_ELEMS)) - self.filters.append(NamespaceStripper('http://www.w3.org/1999/xhtml')) + self.filters.append(NamespaceFlattener(prefixes={ + 'http://www.w3.org/1999/xhtml': '' + })) + if doctype: + self.filters.append(DocTypeInserter(doctype)) def __call__(self, stream): boolean_attrs = self._BOOLEAN_ATTRS @@ -363,7 +416,6 @@ have_doctype = False noescape = False - stream = chain(self.preamble, stream) for filter_ in self.filters: stream = filter_(stream) for kind, data, pos in stream: @@ -375,7 +427,10 @@ if attr in boolean_attrs: if value: buf += [' ', attr] - else: + elif ':' in attr: + if attr == 'xml:lang' and u'lang' not in attrib: + buf += [' lang="', escape(value), '"'] + elif attr != 'xmlns': buf += [' ', attr, '="', escape(value), '"'] buf.append('>') if kind is EMPTY: @@ -408,7 +463,7 @@ if sysid: buf.append(' "%s"') buf.append('>\n') - yield Markup(u''.join(buf), *filter(None, data)) + yield Markup(u''.join(buf)) % filter(None, data) have_doctype = True elif kind is PI: @@ -429,20 +484,34 @@ <Hello!> If text events contain literal markup (instances of the `Markup` class), - tags or entities are stripped from the output: + that markup is by default passed through unchanged: - >>> elem = tag.div(Markup('<a href="foo">Hello!</a><br/>')) - >>> print elem - <div><a href="foo">Hello!</a><br/></div> - >>> print ''.join(TextSerializer()(elem.generate())) - Hello! + >>> elem = tag.div(Markup('<a href="foo">Hello & Bye!</a><br/>')) + >>> print elem.generate().render(TextSerializer) + <a href="foo">Hello & Bye!</a><br/> + + You can use the ``strip_markup`` to change this behavior, so that tags and + entities are stripped from the output (or in the case of entities, + replaced with the equivalent character): + + >>> print elem.generate().render(TextSerializer, strip_markup=True) + Hello & Bye! """ + def __init__(self, strip_markup=False): + """Create the serializer. + + :param strip_markup: whether markup (tags and encoded characters) found + in the text should be removed + """ + self.strip_markup = strip_markup + def __call__(self, stream): + strip_markup = self.strip_markup for event in stream: if event[0] is TEXT: data = event[1] - if type(data) is Markup: + if strip_markup and type(data) is Markup: data = data.striptags().stripentities() yield unicode(data) @@ -586,60 +655,6 @@ yield kind, data, pos -class NamespaceStripper(object): - r"""Stream filter that removes all namespace information from a stream, and - optionally strips out all tags not in a given namespace. - - :param namespace: the URI of the namespace that should not be stripped. If - not set, only elements with no namespace are included in - the output. - - >>> from genshi.input import XML - >>> xml = XML('''<doc xmlns="NS1" xmlns:two="NS2"> - ... <two:item/> - ... </doc>''') - >>> for kind, data, pos in NamespaceStripper(Namespace('NS1'))(xml): - ... print kind, repr(data) - START (u'doc', Attrs()) - TEXT u'\n ' - TEXT u'\n' - END u'doc' - """ - - def __init__(self, namespace=None): - if namespace is not None: - self.namespace = Namespace(namespace) - else: - self.namespace = {} - - def __call__(self, stream): - namespace = self.namespace - - for kind, data, pos in stream: - - if kind is START or kind is EMPTY: - tag, attrs = data - if tag.namespace and tag not in namespace: - continue - - new_attrs = [] - for attr, value in attrs: - if not attr.namespace or attr in namespace: - new_attrs.append((attr, value)) - - data = tag.localname, Attrs(new_attrs) - - elif kind is END: - if data.namespace and data not in namespace: - continue - data = data.localname - - elif kind is START_NS or kind is END_NS: - continue - - yield kind, data, pos - - class WhitespaceFilter(object): """A filter that removes extraneous ignorable white space from the stream. @@ -714,3 +729,33 @@ if kind: yield kind, data, pos + + +class DocTypeInserter(object): + """A filter that inserts the DOCTYPE declaration in the correct location, + after the XML declaration. + """ + def __init__(self, doctype): + """Initialize the filter. + + :param doctype: DOCTYPE as a string or DocType object. + """ + if isinstance(doctype, basestring): + doctype = DocType.get(doctype) + self.doctype_event = (DOCTYPE, doctype, (None, -1, -1)) + + def __call__(self, stream): + doctype_inserted = False + for kind, data, pos in stream: + if not doctype_inserted: + doctype_inserted = True + if kind is XML_DECL: + yield (kind, data, pos) + yield self.doctype_event + continue + yield self.doctype_event + + yield (kind, data, pos) + + if not doctype_inserted: + yield self.doctype_event