# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://markup.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://markup.edgewall.org/log/.

"""This module provides different kinds of serialization methods for XML event
streams.
"""

from itertools import chain
try:
    frozenset
except NameError:
    from sets import ImmutableSet as frozenset
import re

from markup.core import escape, Markup, Namespace, QName
from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, COMMENT, PI

# Only names that are actually defined in this module may be listed here,
# otherwise `from markup.output import *` fails with an AttributeError.
__all__ = ['DocType', 'XMLSerializer', 'XHTMLSerializer', 'HTMLSerializer']


class DocType(object):
    """Defines a number of commonly used DOCTYPE declarations as constants.

    Every constant is a `(name, pubid, sysid)` tuple, which is the form the
    serializers in this module accept for their `doctype` parameter.
    """

    HTML_STRICT = ('html', '-//W3C//DTD HTML 4.01//EN',
                   'http://www.w3.org/TR/html4/strict.dtd')
    HTML_TRANSITIONAL = ('html', '-//W3C//DTD HTML 4.01 Transitional//EN',
                         'http://www.w3.org/TR/html4/loose.dtd')
    HTML = HTML_STRICT

    XHTML_STRICT = ('html', '-//W3C//DTD XHTML 1.0 Strict//EN',
                    'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd')
    XHTML_TRANSITIONAL = ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN',
                          'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')
    XHTML = XHTML_STRICT


class XMLSerializer(object):
    """Produces XML text from an event stream.

    >>> from markup.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(XMLSerializer()(elem.generate()))
    <div><a href="foo"/><br/><hr noshade="True"/></div>
    """

    # Names of elements whose text content must not be touched by the
    # whitespace filter; plain XML has none by default.
    _PRESERVE_SPACE = frozenset()

    def __init__(self, doctype=None, strip_whitespace=True):
        """Initialize the XML serializer.

        @param doctype: a `(name, pubid, sysid)` tuple that represents the
            DOCTYPE declaration that should be included at the top of the
            generated output
        @param strip_whitespace: whether extraneous whitespace should be
            stripped from the output
        """
        # Events that are put in front of every serialized stream.
        self.preamble = []
        if doctype:
            self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
        # Stream filters applied, in order, before serialization.
        self.filters = []
        if strip_whitespace:
            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))

    def __call__(self, stream):
        """Serialize the event stream, yielding the output piece by piece.

        @param stream: an iterable of `(kind, data, pos)` event tuples
        @return: a generator of text fragments (`Markup` instances, or the
            result of `escape()` for text events)
        """
        have_doctype = False
        # Pending `xmlns` declarations to attach to the next start tag.
        ns_attrib = []
        # Maps namespace URIs to the prefix they were first declared with.
        ns_mapping = {}

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        stream = _PushbackIterator(stream)
        for kind, data, pos in stream:

            if kind is START:
                tag, attrib = data

                tagname = tag.localname
                namespace = tag.namespace
                if namespace:
                    if namespace in ns_mapping:
                        prefix = ns_mapping[namespace]
                        if prefix:
                            tagname = '%s:%s' % (prefix, tagname)
                    else:
                        # No START_NS event announced this namespace, so
                        # declare it as default namespace on this element.
                        ns_attrib.append((QName('xmlns'), namespace))
                buf = ['<%s' % tagname]

                for attr, value in attrib + ns_attrib:
                    attrname = attr.localname
                    if attr.namespace:
                        prefix = ns_mapping.get(attr.namespace)
                        if prefix:
                            attrname = '%s:%s' % (prefix, attrname)
                    buf.append(' %s="%s"' % (attrname, escape(value)))
                ns_attrib = []

                # Peek at the next event: an element that is closed right
                # away is written in the short `<tag/>` form, and its END
                # event is consumed here.
                kind, data, pos = stream.next()
                if kind is END:
                    buf.append('/>')
                else:
                    buf.append('>')
                    stream.pushback((kind, data, pos))

                yield Markup(''.join(buf))

            elif kind is END:
                tag = data
                tagname = tag.localname
                if tag.namespace:
                    prefix = ns_mapping.get(tag.namespace)
                    if prefix:
                        tagname = '%s:%s' % (prefix, tag.localname)
                yield Markup('</%s>' % tagname)

            elif kind is TEXT:
                yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE:
                # Only the first DOCTYPE event is written out.
                if not have_doctype:
                    name, pubid, sysid = data
                    buf = ['<!DOCTYPE %s']
                    if pubid:
                        buf.append(' PUBLIC "%s"')
                    elif sysid:
                        buf.append(' SYSTEM')
                    if sysid:
                        buf.append(' "%s"')
                    buf.append('>\n')
                    # The template has exactly one placeholder per non-empty
                    # item of `data`, hence the filtering.
                    yield Markup(''.join(buf), *filter(None, data))
                    have_doctype = True

            elif kind is START_NS:
                prefix, uri = data
                if uri not in ns_mapping:
                    ns_mapping[uri] = prefix
                    if not prefix:
                        ns_attrib.append((QName('xmlns'), uri))
                    else:
                        ns_attrib.append((QName('xmlns:%s' % prefix), uri))

            elif kind is PI:
                # `data` is expected to be a 2-tuple filling both
                # placeholders (presumably target and content).
                yield Markup('<?%s %s?>' % data)


class XHTMLSerializer(XMLSerializer):
    """Produces XHTML text from an event stream.

    >>> from markup.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(XHTMLSerializer()(elem.generate()))
    <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
    """

    NAMESPACE = Namespace('http://www.w3.org/1999/xhtml')

    # Elements that may be written in the short, self-closing form.
    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])
    # Attributes that are written as `name="name"` when their value is true
    # and left out entirely otherwise.
    _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                'defer', 'disabled', 'ismap', 'multiple',
                                'nohref', 'noresize', 'noshade', 'nowrap'])
    _PRESERVE_SPACE = frozenset([QName('pre'), QName('textarea')])

    def __call__(self, stream):
        """Serialize the event stream as XHTML, yielding the output piece by
        piece.

        Elements and attributes that have a namespace other than `NAMESPACE`
        are not emitted.

        @param stream: an iterable of `(kind, data, pos)` event tuples
        @return: a generator of text fragments
        """
        have_doctype = False
        ns_mapping = {}

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        stream = _PushbackIterator(stream)
        for kind, data, pos in stream:

            if kind is START:
                tag, attrib = data
                if tag.namespace and tag not in self.NAMESPACE:
                    continue # not in the HTML namespace, so don't emit
                buf = ['<', tag.localname]

                for attr, value in attrib:
                    if attr.namespace and attr not in self.NAMESPACE:
                        continue # not in the HTML namespace, so don't emit
                    if attr.localname in self._BOOLEAN_ATTRS:
                        if value:
                            buf.append(' %s="%s"' % (attr.localname,
                                                     attr.localname))
                    else:
                        buf.append(' %s="%s"' % (attr.localname,
                                                 escape(value)))

                if tag.localname in self._EMPTY_ELEMS:
                    # Only known-empty elements get the ` />` short form, and
                    # only if they are in fact closed right away.
                    kind, data, pos = stream.next()
                    if kind is END:
                        buf.append(' />')
                    else:
                        buf.append('>')
                        stream.pushback((kind, data, pos))
                else:
                    buf.append('>')

                yield Markup(''.join(buf))

            elif kind is END:
                tag = data
                if tag.namespace and tag not in self.NAMESPACE:
                    continue # not in the HTML namespace, so don't emit
                yield Markup('</%s>' % tag.localname)

            elif kind is TEXT:
                yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE:
                # Only the first DOCTYPE event is written out.
                if not have_doctype:
                    name, pubid, sysid = data
                    buf = ['<!DOCTYPE %s']
                    if pubid:
                        buf.append(' PUBLIC "%s"')
                    elif sysid:
                        buf.append(' SYSTEM')
                    if sysid:
                        buf.append(' "%s"')
                    buf.append('>\n')
                    yield Markup(''.join(buf), *filter(None, data))
                    have_doctype = True

            elif kind is START_NS:
                prefix, uri = data
                if uri not in ns_mapping:
                    ns_mapping[uri] = prefix

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)


class HTMLSerializer(XHTMLSerializer):
    """Produces HTML text from an event stream.

    >>> from markup.builder import tag
    >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
    >>> print ''.join(HTMLSerializer()(elem.generate()))
    <div><a href="foo"></a><br><hr noshade></div>
    """

    def __call__(self, stream):
        """Serialize the event stream as HTML, yielding the output piece by
        piece.

        In contrast to the XHTML serialization, boolean attributes are
        minimized (`noshade` instead of `noshade="noshade"`), empty elements
        get no closing slash or end tag, and attributes whose name starts
        with `xml:` are dropped.

        @param stream: an iterable of `(kind, data, pos)` event tuples
        @return: a generator of text fragments
        """
        have_doctype = False
        ns_mapping = {}

        stream = chain(self.preamble, stream)
        for filter_ in self.filters:
            stream = filter_(stream)
        stream = _PushbackIterator(stream)
        for kind, data, pos in stream:

            if kind is START:
                tag, attrib = data
                if tag.namespace and tag not in self.NAMESPACE:
                    continue # not in the HTML namespace, so don't emit
                buf = ['<', tag.localname]

                for attr, value in attrib:
                    if (attr.namespace and attr not in self.NAMESPACE) \
                            or attr.localname.startswith('xml:'):
                        continue # not in the HTML namespace, so don't emit
                    if attr.localname in self._BOOLEAN_ATTRS:
                        if value:
                            buf.append(' %s' % attr.localname)
                    else:
                        buf.append(' %s="%s"' % (attr.localname,
                                                 escape(value)))

                if tag.localname in self._EMPTY_ELEMS:
                    # Swallow the END event of an empty element, so that no
                    # end tag is written for it.
                    kind, data, pos = stream.next()
                    if kind is not END:
                        stream.pushback((kind, data, pos))

                yield Markup(''.join(buf + ['>']))

            elif kind is END:
                tag = data
                if tag.namespace and tag not in self.NAMESPACE:
                    continue # not in the HTML namespace, so don't emit
                yield Markup('</%s>' % tag.localname)

            elif kind is TEXT:
                yield escape(data, quotes=False)

            elif kind is COMMENT:
                yield Markup('<!--%s-->' % data)

            elif kind is DOCTYPE:
                # Only the first DOCTYPE event is written out.
                if not have_doctype:
                    name, pubid, sysid = data
                    buf = ['<!DOCTYPE %s']
                    if pubid:
                        buf.append(' PUBLIC "%s"')
                    elif sysid:
                        buf.append(' SYSTEM')
                    if sysid:
                        buf.append(' "%s"')
                    buf.append('>\n')
                    yield Markup(''.join(buf), *filter(None, data))
                    have_doctype = True

            elif kind is START_NS:
                prefix, uri = data
                if uri not in ns_mapping:
                    ns_mapping[uri] = prefix

            elif kind is PI:
                yield Markup('<?%s %s?>' % data)


class WhitespaceFilter(object):
    """A filter that removes extraneous ignorable white space from the
    stream."""

    # Spaces and tabs directly in front of a line break.
    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
    # Runs of two or more consecutive line breaks.
    _LINE_COLLAPSE = re.compile('\n{2,}')

    def __init__(self, preserve=None):
        """Initialize the filter.

        @param preserve: a sequence of tag names for which white-space should
            be preserved, i.e. left untouched by the filter.
        """
        if preserve is None:
            preserve = []
        self.preserve = frozenset(preserve)

    def __call__(self, stream, ctxt=None):
        """Apply the filter to the stream.

        Adjacent text events are merged into a single, escaped TEXT event.
        Unless the text is located in an element for which white-space is
        preserved, trailing spaces and tabs are removed from its lines and
        runs of line breaks are collapsed into one line break.

        @param stream: an iterable of `(kind, data, pos)` event tuples
        @param ctxt: not used by this filter
        @return: a generator of `(kind, data, pos)` event tuples
        """
        trim_trailing_space = self._TRAILING_SPACE.sub
        collapse_lines = self._LINE_COLLAPSE.sub
        mjoin = Markup('').join
        # One flag per open element, telling whether white-space is preserved
        # inside of it; the first entry stands for the top level.
        preserve = [False]

        textbuf = []
        # The trailing sentinel event flushes text at the very end of the
        # stream.
        for kind, data, pos in chain(stream, [(None, None, None)]):
            if kind is TEXT:
                textbuf.append(data)
            else:
                # Flush the buffered text first: it belongs to the element
                # that is currently open, not to the one about to be started.
                if textbuf:
                    if len(textbuf) > 1:
                        text = mjoin(textbuf, escape_quotes=False)
                        del textbuf[:]
                    else:
                        text = escape(textbuf.pop(), quotes=False)
                    if not preserve[-1]:
                        text = collapse_lines('\n',
                                              trim_trailing_space('', text))
                    yield TEXT, Markup(text), pos
                if kind is START:
                    # Preservation extends to nested elements, so that e.g.
                    # inline markup inside a <pre> element is left alone, too.
                    preserve.append(preserve[-1] or
                                    data[0] in self.preserve or
                                    data[1].get('xml:space') == 'preserve')
                elif kind is END:
                    # Never remove the top-level entry, even if the stream
                    # contains an unbalanced END event.
                    if len(preserve) > 1:
                        preserve.pop()
                if kind is not None:
                    yield kind, data, pos


class _PushbackIterator(object):
    """A simple wrapper for iterators that allows pushing items back on the
    queue via the `pushback()` method.

    That can effectively be used to peek at the next item."""
    __slots__ = ['iterable', 'buf']

    def __init__(self, iterable):
        """Wrap the given iterable.

        @param iterable: the iterable to pull items from
        """
        self.iterable = iter(iterable)
        # Items that have been pushed back, oldest first.
        self.buf = []

    def __iter__(self):
        return self

    def next(self):
        """Return the next item, serving pushed back items before pulling new
        ones from the wrapped iterator."""
        if self.buf:
            return self.buf.pop(0)
        return self.iterable.next()

    def pushback(self, item):
        """Queue the item so that a later call to `next()` returns it."""
        self.buf.append(item)