# HG changeset patch
# User cmlenz
# Date 1154386806 0
# Node ID e815c2c0757277f38d6761ca92eafcbc3072d6d8
# Parent  a834a66696813ca08102ba5c42a119c4bc708de9
Removed the `sanitize()` method from the `Markup` class, and migrated the existing unit tests to `markup.tests.filters`. Provided a `Stream.filter()` method instead, which can be used to conveniently apply a filter to a stream.

diff --git a/markup/core.py b/markup/core.py
--- a/markup/core.py
+++ b/markup/core.py
@@ -64,6 +64,14 @@
     def __iter__(self):
         return iter(self.events)
 
+    def filter(self, filter):
+        """Apply a filter to the stream.
+        
+        This method returns a new stream with the given filter applied. The
+        filter must be a callable that accepts the stream object as parameter.
+        """
+        return Stream(filter(self))
+
     def render(self, method='xml', encoding='utf-8', filters=None, **kwargs):
         """Return a string representation of the stream.
         
@@ -238,6 +246,39 @@
         return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
 
 
+def stripentities(text, keepxmlentities=False):
+    """Return a copy of the given text with any character or numeric entities
+    replaced by the equivalent Unicode characters.
+    
+    If the `keepxmlentities` parameter is provided and evaluates to `True`,
+    the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
+    stripped.
+    """
+    def _replace_entity(match):
+        if match.group(1): # numeric entity
+            ref = match.group(1)
+            if ref[0] in ('x', 'X'): # the pattern accepts either case
+                ref = int(ref[1:], 16)
+            else:
+                ref = int(ref, 10)
+            return unichr(ref)
+        else: # character entity
+            ref = match.group(2)
+            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
+                                           'quot'):
+                return '&%s;' % ref
+            try:
+                codepoint = htmlentitydefs.name2codepoint[ref]
+                return unichr(codepoint)
+            except KeyError:
+                if keepxmlentities:
+                    return '&amp;%s;' % ref
+                else:
+                    return ref
+    return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
+                  _replace_entity, text)
+
+
 class Markup(unicode):
     """Marks a string as being safe for inclusion in HTML/XML output without
     needing to be escaped.
@@ -276,29 +317,7 @@
         the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
         stripped.
         """
-        def _replace_entity(match):
-            if match.group(1): # numeric entity
-                ref = match.group(1)
-                if ref.startswith('x'):
-                    ref = int(ref[1:], 16)
-                else:
-                    ref = int(ref, 10)
-                return unichr(ref)
-            else: # character entity
-                ref = match.group(2)
-                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
-                                               'quot'):
-                    return '&%s;' % ref
-                try:
-                    codepoint = htmlentitydefs.name2codepoint[ref]
-                    return unichr(codepoint)
-                except KeyError:
-                    if keepxmlentities:
-                        return '&amp;%s;' % ref
-                    else:
-                        return ref
-        return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
-                             _replace_entity, self))
+        return Markup(stripentities(self, keepxmlentities=keepxmlentities))
 
     def striptags(self):
         """Return a copy of the text with all XML/HTML tags removed."""
@@ -342,12 +361,6 @@
             text = text.replace(u'\n', u' ')
         return text
 
-    def sanitize(self):
-        from markup.filters import HTMLSanitizer
-        from markup.input import HTMLParser
-        text = StringIO(self.stripentities(keepxmlentities=True))
-        return Markup(Stream(HTMLSanitizer()(HTMLParser(text))))
-
 
 escape = Markup.escape
 
diff --git a/markup/filters.py b/markup/filters.py
--- a/markup/filters.py
+++ b/markup/filters.py
@@ -20,7 +20,7 @@
     from sets import ImmutableSet as frozenset
 import re
 
-from markup.core import Attributes, Markup, Namespace, escape
+from markup.core import Attributes, Markup, Namespace, escape, stripentities
 from markup.core import END, END_NS, START, START_NS, TEXT
 from markup.path import Path
 
@@ -130,9 +130,9 @@
                         del textbuf[:]
                         yield TEXT, output, pos
                     else:
-                        output = escape(collapse_lines('\n',
+                        output = Markup(collapse_lines('\n',
                             trim_trailing_space('',
-                                textbuf.pop())), quotes=False)
+                                escape(textbuf.pop(), quotes=False))))
                         yield TEXT, output, pos
                 if kind is not None:
                     yield kind, data, pos
@@ -182,6 +182,7 @@
 
                 new_attrib = []
                 for attr, value in attrib:
+                    value = stripentities(value)
                     if attr not in self._SAFE_ATTRS:
                         continue
                     elif attr in self._URI_ATTRS:
diff --git a/markup/tests/core.py b/markup/tests/core.py
--- a/markup/tests/core.py
+++ b/markup/tests/core.py
@@ -91,94 +91,6 @@
         assert isinstance(markup, Markup)
         self.assertEquals('foo', markup)
 
-    def test_sanitize_unchanged(self):
-        markup = Markup('<a href="#">fo<br />o</a>')
-        self.assertEquals('<a href="#">fo<br/>o</a>', str(markup.sanitize()))
-
-    def test_sanitize_escape_text(self):
-        markup = Markup('<a href="#">fo&amp;</a>')
-        self.assertEquals('<a href="#">fo&amp;</a>', str(markup.sanitize()))
-        markup = Markup('<a href="#">&lt;foo&gt;</a>')
-        self.assertEquals('<a href="#">&lt;foo&gt;</a>', str(markup.sanitize()))
-
-    def test_sanitize_entityref_text(self):
-        markup = Markup('<a href="#">fo&ouml;</a>')
-        self.assertEquals(u'<a href="#">foö</a>', unicode(markup.sanitize()))
-
-    def test_sanitize_escape_attr(self):
-        markup = Markup('<div title="&lt;foo&gt;"></div>')
-        self.assertEquals('<div title="&lt;foo&gt;"/>', str(markup.sanitize()))
-
-    def test_sanitize_close_empty_tag(self):
-        markup = Markup('<a href="#">fo<br>o</a>')
-        self.assertEquals('<a href="#">fo<br/>o</a>', str(markup.sanitize()))
-
-    def test_sanitize_invalid_entity(self):
-        markup = Markup('&junk;')
-        self.assertEquals('&amp;junk;', str(markup.sanitize()))
-
-    def test_sanitize_remove_script_elem(self):
-        markup = Markup('<script>alert("Foo")</script>')
-        self.assertEquals('', str(markup.sanitize()))
-        markup = Markup('<SCRIPT SRC="http://example.com/"></SCRIPT>')
-        self.assertEquals('', str(markup.sanitize()))
-        markup = Markup('<SCR\0IPT>alert("foo")</SCR\0IPT>')
-        self.assertRaises(ParseError, markup.sanitize)
-        markup = Markup('<SCRIPT&XYZ SRC="http://example.com/"></SCRIPT>')
-        self.assertRaises(ParseError, markup.sanitize)
-
-    def test_sanitize_remove_onclick_attr(self):
-        markup = Markup('<div onclick=\'alert("foo")\' />')
-        self.assertEquals('<div/>', str(markup.sanitize()))
-
-    def test_sanitize_remove_style_scripts(self):
-        # Inline style with url() using javascript: scheme
-        markup = Markup('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>')
-        self.assertEquals('<div/>', str(markup.sanitize()))
-        # Inline style with url() using javascript: scheme, using control char
-        markup = Markup('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>')
-        self.assertEquals('<div/>', str(markup.sanitize()))
-        # Inline style with url() using javascript: scheme, in quotes
-        markup = Markup('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>')
-        self.assertEquals('<div/>', str(markup.sanitize()))
-        # IE expressions in CSS not allowed
-        markup = Markup('<DIV STYLE=\'width: expression(alert("foo"));\'>')
-        self.assertEquals('<div/>', str(markup.sanitize()))
-        markup = Markup('<DIV STYLE=\'background: url(javascript:alert("foo"));'
-                                     'color: #fff\'>')
-        self.assertEquals('<div style="color: #fff"/>', str(markup.sanitize()))
-
-    def test_sanitize_remove_src_javascript(self):
-        markup = Markup('<img src=\'javascript:alert("foo")\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-        # Case-insensitive protocol matching
-        markup = Markup('<IMG SRC=\'JaVaScRiPt:alert("foo")\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-        # Grave accents (not parsed)
-        markup = Markup('<IMG SRC=`javascript:alert("RSnake says, \'foo\'")`>')
-        self.assertRaises(ParseError, markup.sanitize)
-        # Protocol encoded using UTF-8 numeric entities
-        markup = Markup('<IMG SRC=\'&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;'
-                        '&#112;&#116;&#58;alert("foo")\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-        # Protocol encoded using UTF-8 numeric entities without a semicolon
-        # (which is allowed because the max number of digits is used)
-        markup = Markup('<IMG SRC=\'&#0000106&#0000097&#0000118&#0000097'
-                        '&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116'
-                        '&#0000058alert("foo")\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-        # Protocol encoded using UTF-8 numeric hex entities without a semicolon
-        # (which is allowed because the max number of digits is used)
-        markup = Markup('<IMG SRC=\'&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69'
-                        '&#x70&#x74&#x3A;alert("foo")\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-        # Embedded tab character in protocol
-        markup = Markup('<IMG SRC=\'jav\tascript:alert("foo");\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-        # Embedded tab character in protocol, but encoded this time
-        markup = Markup('<IMG SRC=\'jav&#x09;ascript:alert("foo");\'>')
-        self.assertEquals('<img/>', str(markup.sanitize()))
-
 
 def suite():
     suite = unittest.TestSuite()
diff --git a/markup/tests/filters.py b/markup/tests/filters.py
new file mode 100644
--- /dev/null
+++ b/markup/tests/filters.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2006 Edgewall Software
+# All rights reserved.
+#
+# This software is licensed as described in the file COPYING, which
+# you should have received as part of this distribution. The terms
+# are also available at http://markup.edgewall.org/wiki/License.
+#
+# This software consists of voluntary contributions made by many
+# individuals. For the exact contribution history, see the revision
+# history and logs, available at http://markup.edgewall.org/log/.
+
+import doctest
+import unittest
+
+from markup.core import Stream
+from markup.input import HTML, ParseError
+from markup.filters import HTMLSanitizer
+
+
+class HTMLSanitizerTestCase(unittest.TestCase):
+
+    def test_sanitize_unchanged(self):
+        html = HTML('<a href="#">fo<br />o</a>')
+        self.assertEquals('<a href="#">fo<br/>o</a>',
+                          str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_escape_text(self):
+        html = HTML('<a href="#">fo&amp;</a>')
+        self.assertEquals('<a href="#">fo&amp;</a>',
+                          str(html.filter(HTMLSanitizer())))
+        html = HTML('<a href="#">&lt;foo&gt;</a>')
+        self.assertEquals('<a href="#">&lt;foo&gt;</a>',
+                          str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_entityref_text(self):
+        html = HTML('<a href="#">fo&ouml;</a>')
+        self.assertEquals(u'<a href="#">fo&ouml;</a>',
+                          str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_escape_attr(self):
+        html = HTML('<div title="&lt;foo&gt;"></div>')
+        self.assertEquals('<div title="&lt;foo&gt;"/>',
+                          str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_close_empty_tag(self):
+        html = HTML('<a href="#">fo<br>o</a>')
+        self.assertEquals('<a href="#">fo<br/>o</a>',
+                          str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_invalid_entity(self):
+        html = HTML('&junk;')
+        self.assertEquals('&junk;', str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_remove_script_elem(self):
+        html = HTML('<script>alert("Foo")</script>')
+        self.assertEquals('', str(html.filter(HTMLSanitizer())))
+        html = HTML('<SCRIPT SRC="http://example.com/"></SCRIPT>')
+        self.assertEquals('', str(html.filter(HTMLSanitizer())))
+        self.assertRaises(ParseError, HTML, '<SCR\0IPT>alert("foo")</SCR\0IPT>')
+        self.assertRaises(ParseError, HTML,
+                          '<SCRIPT&XYZ SRC="http://example.com/"></SCRIPT>')
+
+    def test_sanitize_remove_onclick_attr(self):
+        html = HTML('<div onclick=\'alert("foo")\' />')
+        self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_remove_style_scripts(self):
+        # Inline style with url() using javascript: scheme
+        html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>')
+        self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())))
+        # Inline style with url() using javascript: scheme, using control char
+        html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>')
+        self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())))
+        # Inline style with url() using javascript: scheme, in quotes
+        html = HTML('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>')
+        self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())))
+        # IE expressions in CSS not allowed
+        html = HTML('<DIV STYLE=\'width: expression(alert("foo"));\'>')
+        self.assertEquals('<div/>', str(html.filter(HTMLSanitizer())))
+        html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"));'
+                                 'color: #fff\'>')
+        self.assertEquals('<div style="color: #fff"/>',
+                          str(html.filter(HTMLSanitizer())))
+
+    def test_sanitize_remove_src_javascript(self):
+        html = HTML('<img src=\'javascript:alert("foo")\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+        # Case-insensitive protocol matching
+        html = HTML('<IMG SRC=\'JaVaScRiPt:alert("foo")\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+        # Grave accents (not parsed)
+        self.assertRaises(ParseError, HTML,
+                          '<IMG SRC=`javascript:alert("RSnake says, \'foo\'")`>')
+        # Protocol encoded using UTF-8 numeric entities
+        html = HTML('<IMG SRC=\'&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;'
+                    '&#112;&#116;&#58;alert("foo")\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+        # Protocol encoded using UTF-8 numeric entities without a semicolon
+        # (which is allowed because the max number of digits is used)
+        html = HTML('<IMG SRC=\'&#0000106&#0000097&#0000118&#0000097'
+                    '&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116'
+                    '&#0000058alert("foo")\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+        # Protocol encoded using UTF-8 numeric hex entities without a semicolon
+        # (which is allowed because the max number of digits is used)
+        html = HTML('<IMG SRC=\'&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69'
+                    '&#x70&#x74&#x3A;alert("foo")\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+        # Embedded tab character in protocol
+        html = HTML('<IMG SRC=\'jav\tascript:alert("foo");\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+        # Embedded tab character in protocol, but encoded this time
+        html = HTML('<IMG SRC=\'jav&#x09;ascript:alert("foo");\'>')
+        self.assertEquals('<img/>', str(html.filter(HTMLSanitizer())))
+
+
+def suite():
+    """Return a suite holding every `test*` method of the test case."""
+    return unittest.TestSuite([unittest.makeSuite(HTMLSanitizerTestCase,
+                                                  'test')])
+
+if __name__ == '__main__':
+    unittest.main(defaultTest='suite')