Mercurial > genshi > mirror

--- a/markup/core.py
+++ b/markup/core.py
@@ -14,6 +14,7 @@
 """Core classes for markup processing."""

 import htmlentitydefs
+import operator
 import re

 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']
@@ -65,17 +66,64 @@
     def __iter__(self):
         return iter(self.events)

+    def __or__(self, function):
+        """Override the "bitwise or" operator to apply filters or serializers
+        to the stream, providing a syntax similar to pipes on Unix shells.
+
+        Assume the following stream produced by the `HTML` function:
+
+        >>> from markup.input import HTML
+        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
+        >>> print html
+        <p onclick="alert('Whoa')">Hello, world!</p>
+
+        A filter such as the HTML sanitizer can be applied to that stream using
+        the pipe notation as follows:
+
+        >>> from markup.filters import HTMLSanitizer
+        >>> sanitizer = HTMLSanitizer()
+        >>> print html | sanitizer
+        <p>Hello, world!</p>
+
+        Filters can be any function that accepts and produces a stream (where
+        a stream is anything that iterates over events):
+
+        >>> def uppercase(stream):
+        ...     for kind, data, pos in stream:
+        ...         if kind is TEXT:
+        ...             data = data.upper()
+        ...         yield kind, data, pos
+        >>> print html | sanitizer | uppercase
+        <p>HELLO, WORLD!</p>
+
+        Serializers can also be used with this notation:
+
+        >>> from markup.output import TextSerializer
+        >>> output = TextSerializer()
+        >>> print html | sanitizer | uppercase | output
+        HELLO, WORLD!
+
+        Commonly, serializers should be used at the end of the "pipeline";
+        using them somewhere in the middle may produce unexpected results.
+        """
+        return Stream(_ensure(function(self)))
+
     def filter(self, *filters):
         """Apply filters to the stream.

         This method returns a new stream with the given filters applied. The
         filters must be callables that accept the stream object as parameter,
         and return the filtered stream.
+
+        The call:
+
+            stream.filter(filter1, filter2)
+
+        is equivalent to:
+
+            stream | filter1 | filter2
         """
-        stream = self
-        for filter_ in filters:
-            stream = filter_(iter(stream))
-        return Stream(stream)
+        return reduce(operator.or_, (self,) + filters)

     def render(self, method='xml', encoding='utf-8', **kwargs):
         """Return a string representation of the stream.
@@ -129,8 +177,7 @@
                    'xhtml': output.XHTMLSerializer,
                    'html':  output.HTMLSerializer,
                    'text':  output.TextSerializer}[method]
-        serialize = cls(**kwargs)
-        return serialize(_ensure(self))
+        return cls(**kwargs)(_ensure(self))

     def __str__(self):
         return self.render()
@@ -335,7 +382,10 @@
         return unicode.__new__(cls, text)

     def __add__(self, other):
-        return Markup(unicode(self) + escape(other))
+        return Markup(unicode(self) + unicode(escape(other)))
+
+    def __radd__(self, other):
+        return Markup(unicode(escape(other)) + unicode(self))

     def __mod__(self, args):
         if not isinstance(args, (list, tuple)):
@@ -345,6 +395,9 @@
     def __mul__(self, num):
         return Markup(unicode(self) * num)

+    def __rmul__(self, num):
+        return Markup(num * unicode(self))
+
     def __repr__(self):
         return '<%s "%s">' % (self.__class__.__name__, self)
--- a/markup/tests/core.py
+++ b/markup/tests/core.py
@@ -66,9 +66,9 @@
         self.assertEquals('<b>foo</b><br/>', markup)

     def test_add_reverse(self):
-        markup = 'foo' + Markup('<b>bar</b>')
-        assert isinstance(markup, unicode)
-        self.assertEquals('foo<b>bar</b>', markup)
+        markup = '<br/>' + Markup('<b>bar</b>')
+        assert isinstance(markup, Markup)
+        self.assertEquals('&lt;br/&gt;<b>bar</b>', markup)

     def test_mod(self):
         markup = Markup('<b>%s</b>') % '&'
@@ -85,6 +85,11 @@
         assert isinstance(markup, Markup)
         self.assertEquals('<b>foo</b><b>foo</b>', markup)

+    def test_mul_reverse(self):
+        markup = 2 * Markup('<b>foo</b>')
+        assert isinstance(markup, Markup)
+        self.assertEquals('<b>foo</b><b>foo</b>', markup)
+
     def test_join(self):
         markup = Markup('<br />').join(['foo', '<bar />', Markup('<baz />')])
         assert isinstance(markup, Markup)
--- a/markup/tests/filters.py
+++ b/markup/tests/filters.py
@@ -24,96 +24,96 @@
     def test_sanitize_unchanged(self):
         html = HTML('<a href="#">fo<br />o</a>')
         self.assertEquals(u'<a href="#">fo<br/>o</a>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))

     def test_sanitize_escape_text(self):
         html = HTML('<a href="#">fo&amp;</a>')
         self.assertEquals(u'<a href="#">fo&amp;</a>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))
         html = HTML('<a href="#">&lt;foo&gt;</a>')
         self.assertEquals(u'<a href="#">&lt;foo&gt;</a>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))

     def test_sanitize_entityref_text(self):
         html = HTML('<a href="#">fo&ouml;</a>')
         self.assertEquals(u'<a href="#">foö</a>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))

     def test_sanitize_escape_attr(self):
         html = HTML('<div title="&lt;foo&gt;"></div>')
         self.assertEquals(u'<div title="&lt;foo&gt;"/>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))

     def test_sanitize_close_empty_tag(self):
         html = HTML('<a href="#">fo<br>o</a>')
         self.assertEquals(u'<a href="#">fo<br/>o</a>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))

     def test_sanitize_invalid_entity(self):
         html = HTML('&junk;')
-        self.assertEquals('&amp;junk;', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals('&amp;junk;', unicode(html | HTMLSanitizer()))

     def test_sanitize_remove_script_elem(self):
         html = HTML('<script>alert("Foo")</script>')
-        self.assertEquals(u'', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'', unicode(html | HTMLSanitizer()))
         html = HTML('<SCRIPT SRC="http://example.com/"></SCRIPT>')
-        self.assertEquals(u'', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'', unicode(html | HTMLSanitizer()))
         self.assertRaises(ParseError, HTML, '<SCR\0IPT>alert("foo")</SCR\0IPT>')
         self.assertRaises(ParseError, HTML,
                           '<SCRIPT&XYZ SRC="http://example.com/"></SCRIPT>')

     def test_sanitize_remove_onclick_attr(self):
         html = HTML('<div onclick=\'alert("foo")\' />')
-        self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))

     def test_sanitize_remove_style_scripts(self):
         # Inline style with url() using javascript: scheme
         html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>')
-        self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
         # Inline style with url() using javascript: scheme, using control char
         html = HTML('<DIV STYLE=\'background: url(&#1;javascript:alert("foo"))\'>')
-        self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
         # Inline style with url() using javascript: scheme, in quotes
         html = HTML('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>')
-        self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
         # IE expressions in CSS not allowed
         html = HTML('<DIV STYLE=\'width: expression(alert("foo"));\'>')
-        self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer()))
         html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"));'
                                  'color: #fff\'>')
         self.assertEquals(u'<div style="color: #fff"/>',
-                          unicode(html.filter(HTMLSanitizer())))
+                          unicode(html | HTMLSanitizer()))

     def test_sanitize_remove_src_javascript(self):
         html = HTML('<img src=\'javascript:alert("foo")\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
         # Case-insensitive protocol matching
         html = HTML('<IMG SRC=\'JaVaScRiPt:alert("foo")\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
         # Grave accents (not parsed)
         self.assertRaises(ParseError, HTML,
                           '<IMG SRC=`javascript:alert("RSnake says, \'foo\'")`>')
         # Protocol encoded using UTF-8 numeric entities
         html = HTML('<IMG SRC=\'&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;'
                     '&#112;&#116;&#58;alert("foo")\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
         # Protocol encoded using UTF-8 numeric entities without a semicolon
         # (which is allowed because the max number of digits is used)
         html = HTML('<IMG SRC=\'&#0000106&#0000097&#0000118&#0000097'
                     '&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116'
                     '&#0000058alert("foo")\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
         # Protocol encoded using UTF-8 numeric hex entities without a semicolon
         # (which is allowed because the max number of digits is used)
         html = HTML('<IMG SRC=\'&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69'
                     '&#x70&#x74&#x3A;alert("foo")\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
         # Embedded tab character in protocol
         html = HTML('<IMG SRC=\'jav\tascript:alert("foo");\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))
         # Embedded tab character in protocol, but encoded this time
         html = HTML('<IMG SRC=\'jav&#x09;ascript:alert("foo");\'>')
-        self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer())))
+        self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer()))


 def suite():