# HG changeset patch
# User cmlenz
# Date 1154386806 0
# Node ID e815c2c0757277f38d6761ca92eafcbc3072d6d8
# Parent a834a66696813ca08102ba5c42a119c4bc708de9
Remove the `sanitize()` method from the `Markup` class, and migrate the existing unit tests to `markup.tests.filters`. Provide a `Stream.filter()` method instead, which can be used to conveniently apply a filter to a stream.
diff --git a/markup/core.py b/markup/core.py
--- a/markup/core.py
+++ b/markup/core.py
@@ -64,6 +64,14 @@
def __iter__(self):
return iter(self.events)
+ def filter(self, filter):
+ """Apply a filter to the stream.
+
+ This method returns a new stream with the given filter applied. The
+ filter must be a callable that accepts the stream object as parameter.
+ """
+ return Stream(filter(self))
+
def render(self, method='xml', encoding='utf-8', filters=None, **kwargs):
"""Return a string representation of the stream.
@@ -238,6 +246,39 @@
return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
+def stripentities(text, keepxmlentities=False):
+ """Return a copy of the given text with any character or numeric entities
+ replaced by the equivalent Unicode characters.
+
+ If the `keepxmlentities` parameter is provided and evaluates to `True`,
+ the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
+ stripped.
+ """
+ def _replace_entity(match):
+ if match.group(1): # numeric entity
+ ref = match.group(1)
+ if ref[0] in 'xX': # the pattern also admits an upper-case X
+ ref = int(ref[1:], 16)
+ else:
+ ref = int(ref, 10)
+ return unichr(ref)
+ else: # character entity
+ ref = match.group(2)
+ if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
+ 'quot'):
+ return '&%s;' % ref
+ try:
+ codepoint = htmlentitydefs.name2codepoint[ref]
+ return unichr(codepoint)
+ except KeyError:
+ if keepxmlentities:
+ return '&%s;' % ref
+ else:
+ return ref
+ return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
+ _replace_entity, text)
+
+
class Markup(unicode):
"""Marks a string as being safe for inclusion in HTML/XML output without
needing to be escaped.
@@ -276,29 +317,7 @@
the core XML entities (&, ', >, < and ") are not
stripped.
"""
- def _replace_entity(match):
- if match.group(1): # numeric entity
- ref = match.group(1)
- if ref.startswith('x'):
- ref = int(ref[1:], 16)
- else:
- ref = int(ref, 10)
- return unichr(ref)
- else: # character entity
- ref = match.group(2)
- if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
- 'quot'):
- return '&%s;' % ref
- try:
- codepoint = htmlentitydefs.name2codepoint[ref]
- return unichr(codepoint)
- except KeyError:
- if keepxmlentities:
- return '&%s;' % ref
- else:
- return ref
- return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
- _replace_entity, self))
+ return Markup(stripentities(self, keepxmlentities=keepxmlentities))
def striptags(self):
"""Return a copy of the text with all XML/HTML tags removed."""
@@ -342,12 +361,6 @@
text = text.replace(u'\n', u' ')
return text
- def sanitize(self):
- from markup.filters import HTMLSanitizer
- from markup.input import HTMLParser
- text = StringIO(self.stripentities(keepxmlentities=True))
- return Markup(Stream(HTMLSanitizer()(HTMLParser(text))))
-
escape = Markup.escape
diff --git a/markup/filters.py b/markup/filters.py
--- a/markup/filters.py
+++ b/markup/filters.py
@@ -20,7 +20,7 @@
from sets import ImmutableSet as frozenset
import re
-from markup.core import Attributes, Markup, Namespace, escape
+from markup.core import Attributes, Markup, Namespace, escape, stripentities
from markup.core import END, END_NS, START, START_NS, TEXT
from markup.path import Path
@@ -130,9 +130,9 @@
del textbuf[:]
yield TEXT, output, pos
else:
- output = escape(collapse_lines('\n',
+ output = Markup(collapse_lines('\n',
trim_trailing_space('',
- textbuf.pop())), quotes=False)
+ escape(textbuf.pop(), quotes=False))))
yield TEXT, output, pos
if kind is not None:
yield kind, data, pos
@@ -182,6 +182,7 @@
new_attrib = []
for attr, value in attrib:
+ value = stripentities(value)
if attr not in self._SAFE_ATTRS:
continue
elif attr in self._URI_ATTRS:
diff --git a/markup/tests/core.py b/markup/tests/core.py
--- a/markup/tests/core.py
+++ b/markup/tests/core.py
@@ -91,94 +91,6 @@
assert isinstance(markup, Markup)
self.assertEquals('foo', markup)
- def test_sanitize_unchanged(self):
- markup = Markup('fo
o')
- self.assertEquals('fo
o', str(markup.sanitize()))
-
- def test_sanitize_escape_text(self):
- markup = Markup('fo&')
- self.assertEquals('fo&', str(markup.sanitize()))
- markup = Markup('<foo>')
- self.assertEquals('<foo>', str(markup.sanitize()))
-
- def test_sanitize_entityref_text(self):
- markup = Markup('foö')
- self.assertEquals(u'foƶ', unicode(markup.sanitize()))
-
- def test_sanitize_escape_attr(self):
- markup = Markup('