Mercurial > genshi > mirror
changeset 204:51d4101f49ca trunk
* Implement reverse add/mul operators for `Markup` class, so that the result is also a `Markup` instance.
* Override the bitwise or (`|`) operator on the `Stream` class, which allows a syntax similar to Unix shell pipes for chaining stream filters.
author | cmlenz |
---|---|
date | Fri, 25 Aug 2006 23:58:36 +0000 |
parents | 48fab34e5e4d |
children | b700e5326421 |
files | markup/core.py markup/tests/core.py markup/tests/filters.py |
diffstat | 3 files changed, 90 insertions(+), 32 deletions(-) [+] |
line wrap: on
line diff
--- a/markup/core.py +++ b/markup/core.py @@ -14,6 +14,7 @@ """Core classes for markup processing.""" import htmlentitydefs +import operator import re __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName'] @@ -65,17 +66,64 @@ def __iter__(self): return iter(self.events) + def __or__(self, function): + """Override the "bitwise or" operator to apply filters or serializers + to the stream, providing a syntax similar to pipes on Unix shells. + + Assume the following stream produced by the `HTML` function: + + >>> from markup.input import HTML + >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''') + >>> print html + <p onclick="alert('Whoa')">Hello, world!</p> + + A filter such as the HTML sanitizer can be applied to that stream using + the pipe notation as follows: + + >>> from markup.filters import HTMLSanitizer + >>> sanitizer = HTMLSanitizer() + >>> print html | sanitizer + <p>Hello, world!</p> + + Filters can be any function that accepts and produces a stream (where + a stream is anything that iterates over events): + + >>> def uppercase(stream): + ... for kind, data, pos in stream: + ... if kind is TEXT: + ... data = data.upper() + ... yield kind, data, pos + >>> print html | sanitizer | uppercase + <p>HELLO, WORLD!</p> + + Serializers can also be used with this notation: + + >>> from markup.output import TextSerializer + >>> output = TextSerializer() + >>> print html | sanitizer | uppercase | output + HELLO, WORLD! + + Commonly, serializers should be used at the end of the "pipeline"; + using them somewhere in the middle may produce unexpected results. + """ + return Stream(_ensure(function(self))) + def filter(self, *filters): """Apply filters to the stream. This method returns a new stream with the given filters applied. The filters must be callables that accept the stream object as parameter, and return the filtered stream. 
+ + The call: + + stream.filter(filter1, filter2) + + is equivalent to: + + stream | filter1 | filter2 """ - stream = self - for filter_ in filters: - stream = filter_(iter(stream)) - return Stream(stream) + return reduce(operator.or_, (self,) + filters) def render(self, method='xml', encoding='utf-8', **kwargs): """Return a string representation of the stream. @@ -129,8 +177,7 @@ 'xhtml': output.XHTMLSerializer, 'html': output.HTMLSerializer, 'text': output.TextSerializer}[method] - serialize = cls(**kwargs) - return serialize(_ensure(self)) + return cls(**kwargs)(_ensure(self)) def __str__(self): return self.render() @@ -335,7 +382,10 @@ return unicode.__new__(cls, text) def __add__(self, other): - return Markup(unicode(self) + escape(other)) + return Markup(unicode(self) + unicode(escape(other))) + + def __radd__(self, other): + return Markup(unicode(escape(other)) + unicode(self)) def __mod__(self, args): if not isinstance(args, (list, tuple)): @@ -345,6 +395,9 @@ def __mul__(self, num): return Markup(unicode(self) * num) + def __rmul__(self, num): + return Markup(num * unicode(self)) + def __repr__(self): return '<%s "%s">' % (self.__class__.__name__, self)
--- a/markup/tests/core.py +++ b/markup/tests/core.py @@ -66,9 +66,9 @@ self.assertEquals('<b>foo</b><br/>', markup) def test_add_reverse(self): - markup = 'foo' + Markup('<b>bar</b>') - assert isinstance(markup, unicode) - self.assertEquals('foo<b>bar</b>', markup) + markup = '<br/>' + Markup('<b>bar</b>') + assert isinstance(markup, Markup) + self.assertEquals('<br/><b>bar</b>', markup) def test_mod(self): markup = Markup('<b>%s</b>') % '&' @@ -85,6 +85,11 @@ assert isinstance(markup, Markup) self.assertEquals('<b>foo</b><b>foo</b>', markup) + def test_mul_reverse(self): + markup = 2 * Markup('<b>foo</b>') + assert isinstance(markup, Markup) + self.assertEquals('<b>foo</b><b>foo</b>', markup) + def test_join(self): markup = Markup('<br />').join(['foo', '<bar />', Markup('<baz />')]) assert isinstance(markup, Markup)
--- a/markup/tests/filters.py +++ b/markup/tests/filters.py @@ -24,96 +24,96 @@ def test_sanitize_unchanged(self): html = HTML('<a href="#">fo<br />o</a>') self.assertEquals(u'<a href="#">fo<br/>o</a>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) def test_sanitize_escape_text(self): html = HTML('<a href="#">fo&</a>') self.assertEquals(u'<a href="#">fo&</a>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) html = HTML('<a href="#"><foo></a>') self.assertEquals(u'<a href="#"><foo></a>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) def test_sanitize_entityref_text(self): html = HTML('<a href="#">foö</a>') self.assertEquals(u'<a href="#">foƶ</a>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) def test_sanitize_escape_attr(self): html = HTML('<div title="<foo>"></div>') self.assertEquals(u'<div title="<foo>"/>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) def test_sanitize_close_empty_tag(self): html = HTML('<a href="#">fo<br>o</a>') self.assertEquals(u'<a href="#">fo<br/>o</a>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) def test_sanitize_invalid_entity(self): html = HTML('&junk;') - self.assertEquals('&junk;', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals('&junk;', unicode(html | HTMLSanitizer())) def test_sanitize_remove_script_elem(self): html = HTML('<script>alert("Foo")</script>') - self.assertEquals(u'', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'', unicode(html | HTMLSanitizer())) html = HTML('<SCRIPT SRC="http://example.com/"></SCRIPT>') - self.assertEquals(u'', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'', unicode(html | HTMLSanitizer())) self.assertRaises(ParseError, HTML, '<SCR\0IPT>alert("foo")</SCR\0IPT>') self.assertRaises(ParseError, HTML, '<SCRIPT&XYZ SRC="http://example.com/"></SCRIPT>') def 
test_sanitize_remove_onclick_attr(self): html = HTML('<div onclick=\'alert("foo")\' />') - self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) def test_sanitize_remove_style_scripts(self): # Inline style with url() using javascript: scheme html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') - self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) # Inline style with url() using javascript: scheme, using control char html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"))\'>') - self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) # Inline style with url() using javascript: scheme, in quotes html = HTML('<DIV STYLE=\'background: url("javascript:alert(foo)")\'>') - self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) # IE expressions in CSS not allowed html = HTML('<DIV STYLE=\'width: expression(alert("foo"));\'>') - self.assertEquals(u'<div/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<div/>', unicode(html | HTMLSanitizer())) html = HTML('<DIV STYLE=\'background: url(javascript:alert("foo"));' 'color: #fff\'>') self.assertEquals(u'<div style="color: #fff"/>', - unicode(html.filter(HTMLSanitizer()))) + unicode(html | HTMLSanitizer())) def test_sanitize_remove_src_javascript(self): html = HTML('<img src=\'javascript:alert("foo")\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) # Case-insensitive protocol matching html = HTML('<IMG SRC=\'JaVaScRiPt:alert("foo")\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) # Grave accents (not 
parsed) self.assertRaises(ParseError, HTML, '<IMG SRC=`javascript:alert("RSnake says, \'foo\'")`>') # Protocol encoded using UTF-8 numeric entities html = HTML('<IMG SRC=\'javascri' 'pt:alert("foo")\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) # Protocol encoded using UTF-8 numeric entities without a semicolon # (which is allowed because the max number of digits is used) html = HTML('<IMG SRC=\'java' 'script' ':alert("foo")\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) # Protocol encoded using UTF-8 numeric hex entities without a semicolon # (which is allowed because the max number of digits is used) html = HTML('<IMG SRC=\'javascri' 'pt:alert("foo")\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) # Embedded tab character in protocol html = HTML('<IMG SRC=\'jav\tascript:alert("foo");\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) # Embedded tab character in protocol, but encoded this time html = HTML('<IMG SRC=\'jav	ascript:alert("foo");\'>') - self.assertEquals(u'<img/>', unicode(html.filter(HTMLSanitizer()))) + self.assertEquals(u'<img/>', unicode(html | HTMLSanitizer())) def suite():