changeset 141:520a5b7dd6d2 trunk

* No escaping of `<script>` or `<style>` tags in HTML output (see #24)
* Fix parsing of `xml:space` attribute.
author cmlenz
date Thu, 10 Aug 2006 15:21:55 +0000
parents c1f4390d50f8
children 349b3ff5367d
files markup/core.py markup/output.py markup/tests/input.py markup/tests/output.py
diffstat 4 files changed, 147 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/markup/core.py
+++ b/markup/core.py
@@ -429,6 +429,9 @@
         return self.uri
 
 
+XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
+
+
 class QName(unicode):
     """A qualified element or attribute name.
     
--- a/markup/output.py
+++ b/markup/output.py
@@ -22,7 +22,7 @@
     from sets import ImmutableSet as frozenset
 import re
 
-from markup.core import escape, Markup, Namespace, QName
+from markup.core import escape, Markup, Namespace, QName, XML_NAMESPACE
 from markup.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, COMMENT, PI
 
 __all__ = ['Serializer', 'XMLSerializer', 'HTMLSerializer']
@@ -74,7 +74,7 @@
     def __call__(self, stream):
         have_doctype = False
         ns_attrib = []
-        ns_mapping = {}
+        ns_mapping = {XML_NAMESPACE.uri: 'xml'}
 
         stream = chain(self.preamble, stream)
         for filter_ in self.filters:
@@ -177,7 +177,8 @@
 
     def __call__(self, stream):
         namespace = self.NAMESPACE
-        ns_mapping = {}
+        ns_attrib = []
+        ns_mapping = {XML_NAMESPACE.uri: 'xml'}
         boolean_attrs = self._BOOLEAN_ATTRS
         empty_elems = self._EMPTY_ELEMS
         have_doctype = False
@@ -191,35 +192,52 @@
 
             if kind is START:
                 tag, attrib = data
-                if not tag.namespace or tag in namespace:
-                    tagname = tag.localname
-                    buf = ['<', tagname]
 
-                    for attr, value in attrib:
-                        if not attr.namespace or attr in namespace:
-                            attrname = attr.localname
-                            if attrname in boolean_attrs:
-                                if value:
-                                    buf += [' ', attrname, '="', attrname, '"']
-                            else:
-                                buf += [' ', attrname, '="', escape(value), '"']
+                tagname = tag.localname
+                namespace = tag.namespace
+                if namespace:
+                    if namespace in ns_mapping:
+                        prefix = ns_mapping[namespace]
+                        if prefix:
+                            tagname = '%s:%s' % (prefix, tagname)
+                    else:
+                        ns_attrib.append((QName('xmlns'), namespace))
+                buf = ['<', tagname]
 
-                    if tagname in empty_elems:
-                        kind, data, pos = stream.next()
-                        if kind is END:
-                            buf += [' />']
-                        else:
-                            buf += ['>']
-                            pushback((kind, data, pos))
+                for attr, value in attrib + ns_attrib:
+                    attrname = attr.localname
+                    if attr.namespace:
+                        prefix = ns_mapping.get(attr.namespace)
+                        if prefix:
+                            attrname = '%s:%s' % (prefix, attrname)
+                    if attrname in boolean_attrs:
+                        if value:
+                            buf += [' ', attrname, '="', attrname, '"']
+                    else:
+                        buf += [' ', attrname, '="', escape(value), '"']
+                ns_attrib = []
+
+                if (not tag.namespace or tag in namespace) and \
+                        tagname in empty_elems:
+                    kind, data, pos = stream.next()
+                    if kind is END:
+                        buf += [' />']
                     else:
                         buf += ['>']
+                        pushback((kind, data, pos))
+                else:
+                    buf += ['>']
 
-                    yield Markup(''.join(buf))
+                yield Markup(''.join(buf))
 
             elif kind is END:
                 tag = data
-                if not tag.namespace or tag in namespace:
-                    yield Markup('</%s>' % tag.localname)
+                tagname = tag.localname
+                if tag.namespace:
+                    prefix = ns_mapping.get(tag.namespace)
+                    if prefix:
+                        tagname = '%s:%s' % (prefix, tag.localname)
+                yield Markup('</%s>' % tagname)
 
             elif kind is TEXT:
                 yield escape(data, quotes=False)
@@ -240,8 +258,14 @@
                 yield Markup(''.join(buf), *filter(None, data))
                 have_doctype = True
 
-            elif kind is START_NS and data[1] not in ns_mapping:
-                ns_mapping[data[1]] = data[0]
+            elif kind is START_NS:
+                prefix, uri = data
+                if uri not in ns_mapping:
+                    ns_mapping[uri] = prefix
+                    if not prefix:
+                        ns_attrib.append((QName('xmlns'), uri))
+                    else:
+                        ns_attrib.append((QName('xmlns:%s' % prefix), uri))
 
             elif kind is PI:
                 yield Markup('<?%s %s?>' % data)
@@ -256,17 +280,36 @@
     <div><a href="foo"></a><br><hr noshade></div>
     """
 
+    _NOESCAPE_ELEMS = frozenset([QName('script'), QName('style')])
+
+    def __init__(self, doctype=None, strip_whitespace=True):
+        """Initialize the HTML serializer.
+        
+        @param doctype: a `(name, pubid, sysid)` tuple that represents the
+            DOCTYPE declaration that should be included at the top of the
+            generated output
+        @param strip_whitespace: whether extraneous whitespace should be
+            stripped from the output
+        """
+        super(HTMLSerializer, self).__init__(doctype, False)
+        if strip_whitespace:
+            self.filters.append(WhitespaceFilter(self._PRESERVE_SPACE,
+                                                 self._NOESCAPE_ELEMS))
+
     def __call__(self, stream):
         namespace = self.NAMESPACE
         ns_mapping = {}
         boolean_attrs = self._BOOLEAN_ATTRS
         empty_elems = self._EMPTY_ELEMS
+        noescape_elems = self._NOESCAPE_ELEMS
         have_doctype = False
+        noescape = False
 
         stream = chain(self.preamble, stream)
         for filter_ in self.filters:
             stream = filter_(stream)
         stream = _PushbackIterator(stream)
+        pushback = stream.pushback
         for kind, data, pos in stream:
 
             if kind is START:
@@ -277,9 +320,7 @@
 
                     for attr, value in attrib:
                         attrname = attr.localname
-                        if not attr.namespace and not \
-                                attrname.startswith('xml:') or \
-                                attr in namespace:
+                        if not attr.namespace or attr in namespace:
                             if attrname in boolean_attrs:
                                 if value:
                                     buf += [' ', attrname]
@@ -289,18 +330,26 @@
                     if tagname in empty_elems:
                         kind, data, pos = stream.next()
                         if kind is not END:
-                            stream.pushback((kind, data, pos))
+                            pushback((kind, data, pos))
 
                     buf += ['>']
                     yield Markup(''.join(buf))
 
+                    if tagname in noescape_elems:
+                        noescape = True
+
             elif kind is END:
                 tag = data
                 if not tag.namespace or tag in namespace:
                     yield Markup('</%s>' % tag.localname)
 
+                noescape = False
+
             elif kind is TEXT:
-                yield escape(data, quotes=False)
+                if noescape:
+                    yield data
+                else:
+                    yield escape(data, quotes=False)
 
             elif kind is COMMENT:
                 yield Markup('<!--%s-->' % data)
@@ -331,46 +380,67 @@
 
     _TRAILING_SPACE = re.compile('[ \t]+(?=\n)')
     _LINE_COLLAPSE = re.compile('\n{2,}')
+    _XML_SPACE = XML_NAMESPACE['space']
 
-    def __init__(self, preserve=None):
+    def __init__(self, preserve=None, noescape=None):
         """Initialize the filter.
         
-        @param preserve: a sequence of tag names for which white-space should
-            be ignored.
+        @param preserve: a set or sequence of tag names for which white-space
+            should be ignored.
+        @param noescape: a set or sequence of tag names for which text content
+            should not be escaped
+        
+        Both the `preserve` and `noescape` sets are expected to refer to
+        elements that cannot contain further child elements.
         """
         if preserve is None:
             preserve = []
         self.preserve = frozenset(preserve)
+        if noescape is None:
+            noescape = []
+        self.noescape = frozenset(noescape)
 
     def __call__(self, stream, ctxt=None):
         trim_trailing_space = self._TRAILING_SPACE.sub
         collapse_lines = self._LINE_COLLAPSE.sub
+        xml_space = self._XML_SPACE
         mjoin = Markup('').join
-        preserve = [False]
-        append_preserve = preserve.append
-        pop_preserve = preserve.pop
+        preserve_elems = self.preserve
+        preserve = False
+        noescape_elems = self.noescape
+        noescape = False
 
         textbuf = []
-        append_text = textbuf.append
+        push_text = textbuf.append
         pop_text = textbuf.pop
         for kind, data, pos in chain(stream, [(None, None, None)]):
             if kind is TEXT:
-                append_text(data)
+                if noescape:
+                    data = Markup(data)
+                push_text(data)
             else:
-                if kind is START:
-                    append_preserve(data[0] in self.preserve or 
-                                    data[1].get('xml:space') == 'preserve')
                 if textbuf:
                     if len(textbuf) > 1:
                         text = mjoin(textbuf, escape_quotes=False)
                         del textbuf[:]
                     else:
                         text = escape(pop_text(), quotes=False)
-                    if not preserve[-1]:
+                    if not preserve:
                         text = collapse_lines('\n', trim_trailing_space('', text))
                     yield TEXT, Markup(text), pos
-                if kind is END:
-                    pop_preserve()
+
+                if kind is START:
+                    tag, attrib = data
+                    if tag.localname in preserve_elems or \
+                            data[1].get(xml_space) == 'preserve':
+                        preserve = True
+
+                    if tag.localname in noescape_elems:
+                        noescape = True
+
+                elif kind is END:
+                    preserve = noescape = False
+
                 if kind:
                     yield kind, data, pos
 
--- a/markup/tests/input.py
+++ b/markup/tests/input.py
@@ -66,7 +66,7 @@
 
 def suite():
     suite = unittest.TestSuite()
-    #suite.addTest(doctest.DocTestSuite(XMLParser.__module__))
+    suite.addTest(doctest.DocTestSuite(XMLParser.__module__))
     suite.addTest(unittest.makeSuite(XMLParserTestCase, 'test'))
     suite.addTest(unittest.makeSuite(HTMLParserTestCase, 'test'))
     return suite
--- a/markup/tests/output.py
+++ b/markup/tests/output.py
@@ -16,7 +16,7 @@
 import sys
 
 from markup.core import Stream
-from markup.input import HTML
+from markup.input import HTML, XML
 from markup.output import DocType, XMLSerializer, XHTMLSerializer, \
                           HTMLSerializer
 
@@ -85,23 +85,47 @@
 
     def test_textarea_whitespace(self):
         content = '\nHey there.  \n\n    I am indented.\n'
-        stream = HTML('<textarea name="foo">%s</textarea>' % content)
+        stream = XML('<textarea name="foo">%s</textarea>' % content)
         output = stream.render(XHTMLSerializer)
         self.assertEqual('<textarea name="foo">%s</textarea>' % content, output)
 
     def test_xml_space(self):
         text = '<foo xml:space="preserve"> Do not mess  \n\n with me </foo>'
-        output = HTML(text).render(XHTMLSerializer)
+        output = XML(text).render(XHTMLSerializer)
         self.assertEqual(text, output)
 
+    def test_script_escaping(self):
+        text = '<script><![CDATA[if (1 < 2) { alert("Doh"); }]]></script>'
+        output = XML(text).render(XHTMLSerializer)
+        self.assertEqual('<script>if (1 &lt; 2) { alert("Doh"); }</script>',
+                         output)
+
+    def test_style_escaping(self):
+        text = '<style><![CDATA[html > body { display: none; }]]></style>'
+        output = XML(text).render(XHTMLSerializer)
+        self.assertEqual('<style>html &gt; body { display: none; }</style>',
+                         output)
+
 
 class HTMLSerializerTestCase(unittest.TestCase):
 
     def test_xml_space(self):
         text = '<foo xml:space="preserve"> Do not mess  \n\n with me </foo>'
-        output = HTML(text).render(HTMLSerializer)
+        output = XML(text).render(HTMLSerializer)
         self.assertEqual('<foo> Do not mess  \n\n with me </foo>', output)
 
+    def test_script_escaping(self):
+        text = '<script>if (1 &lt; 2) { alert("Doh"); }</script>'
+        output = XML(text).render(HTMLSerializer)
+        self.assertEqual('<script>if (1 < 2) { alert("Doh"); }</script>',
+                         output)
+
+    def test_style_escaping(self):
+        text = '<style>html &gt; body { display: none; }</style>'
+        output = XML(text).render(HTMLSerializer)
+        self.assertEqual('<style>html > body { display: none; }</style>',
+                         output)
+
 
 def suite():
     suite = unittest.TestSuite()
Copyright (C) 2012-2017 Edgewall Software