diff markup/output.py @ 123:10279d2eeec9 trunk

Fix for #18: whitespace in space-sensitive elements such as `<pre>` and `<textarea>` is now preserved.
author cmlenz
date Thu, 03 Aug 2006 14:49:22 +0000
parents 230ee6a2c6b2
children b86f496f6035
line wrap: on
line diff
--- a/markup/output.py
+++ b/markup/output.py
@@ -15,11 +15,12 @@
 streams.
 """
 
+from itertools import chain
 try:
     frozenset
 except NameError:
     from sets import ImmutableSet as frozenset
-from itertools import chain
+import re
 
 from markup.core import escape, Markup, Namespace, QName
 from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, COMMENT, PI
@@ -27,19 +28,6 @@
 __all__ = ['Serializer', 'XMLSerializer', 'HTMLSerializer']
 
 
-class Serializer(object):
-    """Base class for serializers."""
-
-    def serialize(self, stream):
-        """Must be implemented by concrete subclasses to serialize the given
-        stream.
-        
-        This method must be implemented as a generator, producing the
-        serialized output incrementally as unicode strings.
-        """
-        raise NotImplementedError
-
-
 class DocType(object):
     """Defines a number of commonly used DOCTYPE declarations as constants."""
 
@@ -56,56 +44,66 @@
     XHTML = XHTML_STRICT
 
 
-class XMLSerializer(Serializer):
+class XMLSerializer(object):
     """Produces XML text from an event stream.
     
     >>> from markup.builder import tag
     >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
-    >>> print ''.join(XMLSerializer().serialize(elem.generate()))
+    >>> print ''.join(XMLSerializer()(elem.generate()))
     <div><a href="foo"/><br/><hr noshade="True"/></div>
     """
-    def __init__(self, doctype=None):
+
+    _PRESERVE_SPACE = frozenset()
+
+    def __init__(self, doctype=None, strip_whitespace=True):
         """Initialize the XML serializer.
         
         @param doctype: a `(name, pubid, sysid)` tuple that represents the
             DOCTYPE declaration that should be included at the top of the
             generated output
+        @param strip_whitespace: whether extraneous whitespace should be
+            stripped from the output
         """
         self.preamble = []
         if doctype:
             self.preamble.append((DOCTYPE, doctype, (None, -1, -1)))
+        self.filters = [] # applied to the stream, in order, by __call__
+        if strip_whitespace: # tags in _PRESERVE_SPACE are left untouched
+            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE))
 
-    def serialize(self, stream):
+    def __call__(self, stream):
         have_doctype = False
         ns_attrib = []
         ns_mapping = {}
 
-        stream = _PushbackIterator(chain(self.preamble, stream))
+        stream = chain(self.preamble, stream)
+        for filter_ in self.filters:
+            stream = filter_(stream)
+        stream = _PushbackIterator(stream)
         for kind, data, pos in stream:
 
             if kind is START:
                 tag, attrib = data
 
                 tagname = tag.localname
-                if tag.namespace:
-                    try:
-                        prefix = ns_mapping[tag.namespace]
+                namespace = tag.namespace
+                if namespace:
+                    if namespace in ns_mapping:
+                        prefix = ns_mapping[namespace]
                         if prefix:
-                            tagname = '%s:%s' % (prefix, tag.localname)
-                    except KeyError:
-                        ns_attrib.append((QName('xmlns'), tag.namespace))
+                            tagname = '%s:%s' % (prefix, tagname)
+                    else:
+                        ns_attrib.append((QName('xmlns'), namespace))
                 buf = ['<%s' % tagname]
 
-                if ns_attrib:
-                    attrib.extend(ns_attrib)
-                    ns_attrib = []
-                for attr, value in attrib:
+                for attr, value in attrib + ns_attrib:
                     attrname = attr.localname
                     if attr.namespace:
                         prefix = ns_mapping.get(attr.namespace)
                         if prefix:
                             attrname = '%s:%s' % (prefix, attrname)
                     buf.append(' %s="%s"' % (attrname, escape(value)))
+                ns_attrib = []
 
                 kind, data, pos = stream.next()
                 if kind is END:
@@ -163,7 +161,7 @@
     
     >>> from markup.builder import tag
     >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
-    >>> print ''.join(XHTMLSerializer().serialize(elem.generate()))
+    >>> print ''.join(XHTMLSerializer()(elem.generate()))
     <div><a href="foo"></a><br /><hr noshade="noshade" /></div>
     """
 
@@ -175,12 +173,16 @@
     _BOOLEAN_ATTRS = frozenset(['selected', 'checked', 'compact', 'declare',
                                 'defer', 'disabled', 'ismap', 'multiple',
                                 'nohref', 'noresize', 'noshade', 'nowrap'])
+    _PRESERVE_SPACE = frozenset([QName('pre'), QName('textarea')])
 
-    def serialize(self, stream):
+    def __call__(self, stream):
         have_doctype = False
         ns_mapping = {}
 
-        stream = _PushbackIterator(chain(self.preamble, stream))
+        stream = chain(self.preamble, stream)
+        for filter_ in self.filters:
+            stream = filter_(stream)
+        stream = _PushbackIterator(stream)
         for kind, data, pos in stream:
 
             if kind is START:
@@ -250,15 +252,18 @@
     
     >>> from markup.builder import tag
     >>> elem = tag.div(tag.a(href='foo'), tag.br, tag.hr(noshade=True))
-    >>> print ''.join(HTMLSerializer().serialize(elem.generate()))
+    >>> print ''.join(HTMLSerializer()(elem.generate()))
     <div><a href="foo"></a><br><hr noshade></div>
     """
 
-    def serialize(self, stream):
+    def __call__(self, stream):
         have_doctype = False
         ns_mapping = {}
 
-        stream = _PushbackIterator(chain(self.preamble, stream))
+        stream = chain(self.preamble, stream)
+        for filter_ in self.filters:
+            stream = filter_(stream)
+        stream = _PushbackIterator(stream)
         for kind, data, pos in stream:
 
             if kind is START:
@@ -268,7 +273,8 @@
                 buf = ['<', tag.localname]
 
                 for attr, value in attrib:
-                    if attr.namespace and attr not in self.NAMESPACE:
+                    if attr.namespace and attr not in self.NAMESPACE \
+                            or attr.localname.startswith('xml:'):
                         continue # not in the HTML namespace, so don't emit
                     if attr.localname in self._BOOLEAN_ATTRS:
                         if value:
@@ -318,6 +324,52 @@
                 yield Markup('<?%s %s?>' % data)
 
 
+class WhitespaceFilter(object):
+    """A filter that removes extraneous ignorable white space from the
+    stream, except inside elements whose white space is significant."""
+
+    _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
+    _LINE_COLLAPSE = re.compile('\n{2,}')
+
+    def __init__(self, preserve=None):
+        """Initialize the filter.
+        
+        @param preserve: a sequence of tag names for which white-space should
+            be preserved rather than stripped.
+        """
+        self.preserve = frozenset(preserve or ())
+
+    def __call__(self, stream, ctxt=None):
+        """Merge adjacent text events and trim ignorable white space."""
+        trim_trailing_space = self._TRAILING_SPACE.sub
+        collapse_lines = self._LINE_COLLAPSE.sub
+        mjoin = Markup('').join
+        preserve = [False] # one flag per open element, innermost last
+
+        textbuf = []
+        # The trailing all-None event flushes text left at the stream end.
+        for kind, data, pos in chain(stream, [(None, None, None)]):
+            if kind is TEXT:
+                textbuf.append(data)
+            else:
+                if textbuf: # flush first: this text precedes the event
+                    if len(textbuf) > 1:
+                        text = mjoin(textbuf, escape_quotes=False)
+                        del textbuf[:]
+                    else:
+                        text = escape(textbuf.pop(), quotes=False)
+                    if not preserve[-1]:
+                        text = collapse_lines('\n', trim_trailing_space('', text))
+                    yield TEXT, Markup(text), pos
+                if kind is START: # descendants inherit the flag
+                    preserve.append(preserve[-1] or data[0] in self.preserve
+                                    or data[1].get('xml:space') == 'preserve')
+                elif kind is END:
+                    preserve.pop()
+                if kind is not None:
+                    yield kind, data, pos
+
+
 class _PushbackIterator(object):
     """A simple wrapper for iterators that allows pushing items back on the
     queue via the `pushback()` method.
Copyright (C) 2012-2017 Edgewall Software