# HG changeset patch
# User cmlenz
# Date 1167760086 0
# Node ID d6e9170c5ccc910172065a46924054c9fad5fcd8
# Parent 22a581cfa537b084cf76e27c17172a8333e436c3
* Moved some utility functions from `genshi.core` to `genshi.util`
  (backwards compatibility preserved via imports)
* Minor performance tweaks for the serializers

diff --git a/UPGRADE.txt b/UPGRADE.txt
--- a/UPGRADE.txt
+++ b/UPGRADE.txt
@@ -16,7 +16,7 @@
 
 Instances of `genshi.core.Attrs` are now immutable. Filters manipulating the
 attributes in a stream may need to be updated. See
-the docstring of the `Attrs` for more information.
+the documentation of the `Attrs` class for more information.
 
 
 Upgrading from Markup
diff --git a/genshi/core.py b/genshi/core.py
--- a/genshi/core.py
+++ b/genshi/core.py
@@ -13,16 +13,16 @@
 
 """Core classes for markup processing."""
 
-import htmlentitydefs
 import operator
-import re
+
+from genshi.util import plaintext, stripentities, striptags
 
 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
            'QName']
 
 
 class StreamEventKind(str):
-    """A kind of event on an XML stream."""
+    """A kind of event on a markup stream."""
     __slots__ = []
     _instances = {}
 
@@ -35,19 +35,19 @@
 
     This class is basically an iterator over the events.
 
+    Stream events are tuples of the form:
+
+        (kind, data, position)
+
+    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
+    depends on the kind of event, and `position` is a `(filename, line, offset)`
+    tuple that contains the location of the original element or text in the
+    input. If the original location is unknown, `position` is `(None, -1, -1)`.
+
     Also provided are ways to serialize the stream to text. The `serialize()`
     method will return an iterator over generated strings, while `render()`
     returns the complete generated text at once. Both accept various parameters
     that impact the way the stream is serialized.
-
-    Stream events are tuples of the form:
-
-        (kind, data, position)
-
-    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
-    depends on the kind of event, and `position` is a `(filename, line, offset)`
-    tuple that contains the location of the original element or text in the
-    input. If the original location is unknown, `position` is `(None, -1, -1)`.
     """
     __slots__ = ['events']
 
@@ -92,7 +92,7 @@
         <p>Hello, world!</p>
 
         Filters can be any function that accepts and produces a stream (where
-        a stream is anything that iterators over events):
+        a stream is anything that iterates over events):
 
         >>> def uppercase(stream):
         ...     for kind, data, pos in stream:
@@ -326,51 +326,6 @@
         return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)
 
 
-def plaintext(text, keeplinebreaks=True):
-    """Returns the text as a `unicode` string with all entities and tags
-    removed.
-    """
-    text = stripentities(striptags(text))
-    if not keeplinebreaks:
-        text = text.replace(u'\n', u' ')
-    return text
-
-def stripentities(text, keepxmlentities=False):
-    """Return a copy of the given text with any character or numeric entities
-    replaced by the equivalent UTF-8 characters.
-
-    If the `keepxmlentities` parameter is provided and evaluates to `True`,
-    the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
-    stripped.
-    """
-    def _replace_entity(match):
-        if match.group(1): # numeric entity
-            ref = match.group(1)
-            if ref.startswith('x'):
-                ref = int(ref[1:], 16)
-            else:
-                ref = int(ref, 10)
-            return unichr(ref)
-        else: # character entity
-            ref = match.group(2)
-            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
-                return '&%s;' % ref
-            try:
-                codepoint = htmlentitydefs.name2codepoint[ref]
-                return unichr(codepoint)
-            except KeyError:
-                if keepxmlentities:
-                    return '&%s;' % ref
-                else:
-                    return ref
-    return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
-                  _replace_entity, text)
-
-def striptags(text):
-    """Return a copy of the text with all XML/HTML tags removed."""
-    return re.sub(r'<[^>]*?>', '', text)
-
-
 class Markup(unicode):
     """Marks a string as being safe for inclusion in HTML/XML output without
     needing to be escaped.
diff --git a/genshi/output.py b/genshi/output.py
--- a/genshi/output.py
+++ b/genshi/output.py
@@ -98,21 +98,21 @@
                         ns_attrib.append((QName('xmlns'), namespace))
                 buf = ['<', tagname]
 
-                for attr, value in attrib + tuple(ns_attrib):
+                if ns_attrib:
+                    attrib += tuple(ns_attrib)
+                for attr, value in attrib:
                     attrname = attr.localname
-                    if attr.namespace:
-                        prefix = ns_mapping.get(attr.namespace)
+                    attrns = attr.namespace
+                    if attrns:
+                        prefix = ns_mapping.get(attrns)
                         if prefix:
                             attrname = '%s:%s' % (prefix, attrname)
                     buf += [' ', attrname, '="', escape(value), '"']
                 ns_attrib = []
 
-                if kind is EMPTY:
-                    buf += ['/>']
-                else:
-                    buf += ['>']
+                buf.append(kind is EMPTY and '/>' or '>')
 
-                yield Markup(''.join(buf))
+                yield Markup(u''.join(buf))
 
             elif kind is END:
                 tag = data
@@ -136,13 +136,13 @@
                     name, pubid, sysid = data
                    buf = ['<!DOCTYPE %s']
                    if pubid:
-                        buf += [' PUBLIC "%s"']
+                        buf.append(' PUBLIC "%s"')
                     elif sysid:
-                        buf += [' SYSTEM']
+                        buf.append(' SYSTEM')
                     if sysid:
-                        buf += [' "%s"']
-                    buf += ['>\n']
-                    yield Markup(''.join(buf), *filter(None, data))
+                        buf.append(' "%s"')
+                    buf.append('>\n')
+                    yield Markup(u''.join(buf), *filter(None, data))
                     have_doctype = True
 
             elif kind is START_NS:
@@ -216,10 +216,13 @@
                         ns_attrib.append((QName('xmlns'), tagns))
                 buf = ['<', tagname]
 
-                for attr, value in chain(attrib, ns_attrib):
+                if ns_attrib:
+                    attrib += tuple(ns_attrib)
+                for attr, value in attrib:
                     attrname = attr.localname
-                    if attr.namespace:
-                        prefix = ns_mapping.get(attr.namespace)
+                    attrns = attr.namespace
+                    if attrns:
+                        prefix = ns_mapping.get(attrns)
                         if prefix:
                             attrname = '%s:%s' % (prefix, attrname)
                     if attrname in boolean_attrs:
@@ -231,14 +234,14 @@
 
                 if kind is EMPTY:
                     if (tagns and tagns != namespace.uri) \
-                            or tag.localname in empty_elems:
-                        buf += [' />']
+                            or tagname in empty_elems:
+                        buf.append(' />')
                     else:
-                        buf += ['></%s>' % tagname]
+                        buf.append('></%s>' % tagname)
                 else:
-                    buf += ['>']
+                    buf.append('>')
 
-                yield Markup(''.join(buf))
+                yield Markup(u''.join(buf))
 
             elif kind is END:
                 tag = data
@@ -262,13 +265,13 @@
                     name, pubid, sysid = data
                    buf = ['<!DOCTYPE %s']
                    if pubid:
-                        buf += [' PUBLIC "%s"']
+                        buf.append(' PUBLIC "%s"')
                     elif sysid:
-                        buf += [' SYSTEM']
+                        buf.append(' SYSTEM')
                     if sysid:
-                        buf += [' "%s"']
-                    buf += ['>\n']
-                    yield Markup(''.join(buf), *filter(None, data))
+                        buf.append(' "%s"')
+                    buf.append('>\n')
+                    yield Markup(u''.join(buf), *filter(None, data))
                     have_doctype = True
 
             elif kind is START_NS:
@@ -349,13 +352,13 @@
                     else:
                         buf += [' ', attrname, '="', escape(value), '"']
-                buf += ['>']
+                buf.append('>')
 
                 if kind is EMPTY:
                     if tagname not in empty_elems:
-                        buf += ['</%s>' % tagname]
+                        buf.append('</%s>' % tagname)
 
-                yield Markup(''.join(buf))
+                yield Markup(u''.join(buf))
 
                 if tagname in noescape_elems:
                     noescape = True
 
@@ -380,13 +383,13 @@
                     name, pubid, sysid = data
                    buf = ['<!DOCTYPE %s']
                    if pubid:
-                        buf += [' PUBLIC "%s"']
+                        buf.append(' PUBLIC "%s"')
                     elif sysid:
-                        buf += [' SYSTEM']
+                        buf.append(' SYSTEM')
                     if sysid:
-                        buf += [' "%s"']
-                    buf += ['>\n']
-                    yield Markup(''.join(buf), *filter(None, data))
+                        buf.append(' "%s"')
+                    buf.append('>\n')
+                    yield Markup(u''.join(buf), *filter(None, data))
                     have_doctype = True
 
             elif kind is START_NS and data[1] not in ns_mapping:
@@ -460,7 +463,7 @@
         """Initialize the filter.
 
         @param preserve: a set or sequence of tag names for which white-space
-            should be ignored.
+            should be preserved
        @param noescape: a set or sequence of tag names for which text content
            should not be escaped
 
diff --git a/genshi/util.py b/genshi/util.py
--- a/genshi/util.py
+++ b/genshi/util.py
@@ -13,6 +13,9 @@
 
 """Various utility classes and functions."""
 
+import htmlentitydefs
+import re
+
 
 class LRUCache(dict):
     """A dictionary-like object that stores only a certain number of items, and
@@ -150,3 +153,66 @@
         else:
             retval.append(item)
     return retval
+
+def plaintext(text, keeplinebreaks=True):
+    """Returns the text as a `unicode` string with all entities and tags
+    removed.
+    """
+    text = stripentities(striptags(text))
+    if not keeplinebreaks:
+        text = text.replace(u'\n', u' ')
+    return text
+
+_STRIPENTITIES_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')
+def stripentities(text, keepxmlentities=False):
+    """Return a copy of the given text with any character or numeric entities
+    replaced by the equivalent UTF-8 characters.
+
+    >>> stripentities('1 &lt; 2')
+    u'1 < 2'
+    >>> stripentities('more &hellip;')
+    u'more \u2026'
+    >>> stripentities('&#8230;')
+    u'\u2026'
+    >>> stripentities('&#x2026;')
+    u'\u2026'
+
+    If the `keepxmlentities` parameter is provided and is a truth value, the
+    core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact.
+
+    >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True)
+    u'1 &lt; 2 \u2026'
+    """
+    def _replace_entity(match):
+        if match.group(1): # numeric entity
+            ref = match.group(1)
+            if ref.startswith('x'):
+                ref = int(ref[1:], 16)
+            else:
+                ref = int(ref, 10)
+            return unichr(ref)
+        else: # character entity
+            ref = match.group(2)
+            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
+                return u'&%s;' % ref
+            try:
+                return unichr(htmlentitydefs.name2codepoint[ref])
+            except KeyError:
+                if keepxmlentities:
+                    return u'&%s;' % ref
+                else:
+                    return ref
+    return _STRIPENTITIES_RE.sub(_replace_entity, text)
+
+_STRIPTAGS_RE = re.compile(r'<[^>]*?>')
+def striptags(text):
+    """Return a copy of the text with all XML/HTML tags removed.
+
+    >>> striptags('<span>Foo</span> bar')
+    'Foo bar'
+    >>> striptags('<span class="bar">Foo</span>')
+    'Foo'
+    >>> striptags('Foo<br />')
+    'Foo'
+    """
+    return _STRIPTAGS_RE.sub('', text)
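As a quick illustration of the first bullet in the commit message (not part of the patch itself): because `genshi/core.py` now re-imports the helpers from `genshi.util`, the old import path keeps working. A minimal sketch of an interactive session, assuming a Python 2 interpreter and a Genshi checkout with this changeset applied:

    >>> from genshi.util import plaintext, stripentities, striptags
    >>> from genshi.core import striptags as striptags_compat   # old location still works
    >>> striptags is striptags_compat
    True
    >>> striptags('<b>1 &lt; 2</b>')
    '1 &lt; 2'
    >>> plaintext('<b>1 &lt; 2</b>')
    u'1 < 2'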
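The second bullet covers micro-optimizations in `genshi/output.py`: single items are pushed with `buf.append(...)` instead of `buf += [...]` (which allocates a throwaway one-element list), `attr.namespace` is read into a local before being tested, and the output is joined as `unicode` up front via `u''.join(buf)`. The following rough timing sketch of the first of these idioms is illustrative only and not part of the changeset (Python 2 syntax, to match the code above); absolute numbers will vary by interpreter and machine:

    # Compare appending one item via list concatenation vs. list.append.
    from timeit import Timer

    stmt_concat = "buf += ['/>']"       # builds a temporary one-element list
    stmt_append = "buf.append('/>')"    # pushes the item directly

    for label, stmt in [('buf += [...]   ', stmt_concat),
                        ('buf.append(...)', stmt_append)]:
        best = min(Timer(stmt, setup='buf = []').repeat(3, 100000))
        print '%s %.4f s per 100000 ops' % (label, best)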