Mercurial > genshi > mirror
view genshi/core.py @ 713:5420fe9d99a9 trunk
The `Markup` class now supports mappings for right hand of the `%` (modulo) operator in the same way the Python string classes do, except that the substituted values are escape. Also, the special constructor which took positional arguments that would be substituted was removed. Thus the `Markup` class now supports the same arguments as that of its `unicode` base class. Closes #211. Many thanks to Christian Boos for the patch!
author | cmlenz |
---|---|
date | Tue, 08 Apr 2008 18:18:18 +0000 |
parents | d8571da25bc5 |
children | 4bc6741b2811 |
line wrap: on
line source
# -*- coding: utf-8 -*- # # Copyright (C) 2006-2007 Edgewall Software # All rights reserved. # # This software is licensed as described in the file COPYING, which # you should have received as part of this distribution. The terms # are also available at http://genshi.edgewall.org/wiki/License. # # This software consists of voluntary contributions made by many # individuals. For the exact contribution history, see the revision # history and logs, available at http://genshi.edgewall.org/log/. """Core classes for markup processing.""" from itertools import chain import operator from genshi.util import plaintext, stripentities, striptags __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', 'QName'] __docformat__ = 'restructuredtext en' class StreamEventKind(str): """A kind of event on a markup stream.""" __slots__ = [] _instances = {} def __new__(cls, val): return cls._instances.setdefault(val, str.__new__(cls, val)) class Stream(object): """Represents a stream of markup events. This class is basically an iterator over the events. Stream events are tuples of the form:: (kind, data, position) where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc), ``data`` depends on the kind of event, and ``position`` is a ``(filename, line, offset)`` tuple that contains the location of the original element or text in the input. If the original location is unknown, ``position`` is ``(None, -1, -1)``. Also provided are ways to serialize the stream to text. The `serialize()` method will return an iterator over generated strings, while `render()` returns the complete generated text at once. Both accept various parameters that impact the way the stream is serialized. """ __slots__ = ['events', 'serializer'] START = StreamEventKind('START') #: a start tag END = StreamEventKind('END') #: an end tag TEXT = StreamEventKind('TEXT') #: literal text XML_DECL = StreamEventKind('XML_DECL') #: XML declaration DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration START_NS = StreamEventKind('START_NS') #: start namespace mapping END_NS = StreamEventKind('END_NS') #: end namespace mapping START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section PI = StreamEventKind('PI') #: processing instruction COMMENT = StreamEventKind('COMMENT') #: comment def __init__(self, events, serializer=None): """Initialize the stream with a sequence of markup events. :param events: a sequence or iterable providing the events :param serializer: the default serialization method to use for this stream :note: Changed in 0.5: added the `serializer` argument """ self.events = events #: The underlying iterable producing the events self.serializer = serializer #: The default serializion method def __iter__(self): return iter(self.events) def __or__(self, function): """Override the "bitwise or" operator to apply filters or serializers to the stream, providing a syntax similar to pipes on Unix shells. Assume the following stream produced by the `HTML` function: >>> from genshi.input import HTML >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''') >>> print html <p onclick="alert('Whoa')">Hello, world!</p> A filter such as the HTML sanitizer can be applied to that stream using the pipe notation as follows: >>> from genshi.filters import HTMLSanitizer >>> sanitizer = HTMLSanitizer() >>> print html | sanitizer <p>Hello, world!</p> Filters can be any function that accepts and produces a stream (where a stream is anything that iterates over events): >>> def uppercase(stream): ... for kind, data, pos in stream: ... if kind is TEXT: ... data = data.upper() ... yield kind, data, pos >>> print html | sanitizer | uppercase <p>HELLO, WORLD!</p> Serializers can also be used with this notation: >>> from genshi.output import TextSerializer >>> output = TextSerializer() >>> print html | sanitizer | uppercase | output HELLO, WORLD! Commonly, serializers should be used at the end of the "pipeline"; using them somewhere in the middle may produce unexpected results. :param function: the callable object that should be applied as a filter :return: the filtered stream :rtype: `Stream` """ return Stream(_ensure(function(self)), serializer=self.serializer) def filter(self, *filters): """Apply filters to the stream. This method returns a new stream with the given filters applied. The filters must be callables that accept the stream object as parameter, and return the filtered stream. The call:: stream.filter(filter1, filter2) is equivalent to:: stream | filter1 | filter2 :param filters: one or more callable objects that should be applied as filters :return: the filtered stream :rtype: `Stream` """ return reduce(operator.or_, (self,) + filters) def render(self, method=None, encoding='utf-8', out=None, **kwargs): """Return a string representation of the stream. Any additional keyword arguments are passed to the serializer, and thus depend on the `method` parameter value. :param method: determines how the stream is serialized; can be either "xml", "xhtml", "html", "text", or a custom serializer class; if `None`, the default serialization method of the stream is used :param encoding: how the output string should be encoded; if set to `None`, this method returns a `unicode` object :param out: a file-like object that the output should be written to instead of being returned as one big string; note that if this is a file or socket (or similar), the `encoding` must not be `None` (that is, the output must be encoded) :return: a `str` or `unicode` object (depending on the `encoding` parameter), or `None` if the `out` parameter is provided :rtype: `basestring` :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer :note: Changed in 0.5: added the `out` parameter """ from genshi.output import encode if method is None: method = self.serializer or 'xml' generator = self.serialize(method=method, **kwargs) return encode(generator, method=method, encoding=encoding, out=out) def select(self, path, namespaces=None, variables=None): """Return a new stream that contains the events matching the given XPath expression. >>> from genshi import HTML >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>') >>> print stream.select('elem') <elem>foo</elem><elem>bar</elem> >>> print stream.select('elem/text()') foobar Note that the outermost element of the stream becomes the *context node* for the XPath test. That means that the expression "doc" would not match anything in the example above, because it only tests against child elements of the outermost element: >>> print stream.select('doc') <BLANKLINE> You can use the "." expression to match the context node itself (although that usually makes little sense): >>> print stream.select('.') <doc><elem>foo</elem><elem>bar</elem></doc> :param path: a string containing the XPath expression :param namespaces: mapping of namespace prefixes used in the path :param variables: mapping of variable names to values :return: the selected substream :rtype: `Stream` :raises PathSyntaxError: if the given path expression is invalid or not supported """ from genshi.path import Path return Path(path).select(self, namespaces, variables) def serialize(self, method='xml', **kwargs): """Generate strings corresponding to a specific serialization of the stream. Unlike the `render()` method, this method is a generator that returns the serialized output incrementally, as opposed to returning a single string. Any additional keyword arguments are passed to the serializer, and thus depend on the `method` parameter value. :param method: determines how the stream is serialized; can be either "xml", "xhtml", "html", "text", or a custom serializer class; if `None`, the default serialization method of the stream is used :return: an iterator over the serialization results (`Markup` or `unicode` objects, depending on the serialization method) :rtype: ``iterator`` :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer """ from genshi.output import get_serializer if method is None: method = self.serializer or 'xml' return get_serializer(method, **kwargs)(_ensure(self)) def __str__(self): return self.render() def __unicode__(self): return self.render(encoding=None) START = Stream.START END = Stream.END TEXT = Stream.TEXT XML_DECL = Stream.XML_DECL DOCTYPE = Stream.DOCTYPE START_NS = Stream.START_NS END_NS = Stream.END_NS START_CDATA = Stream.START_CDATA END_CDATA = Stream.END_CDATA PI = Stream.PI COMMENT = Stream.COMMENT def _ensure(stream): """Ensure that every item on the stream is actually a markup event.""" stream = iter(stream) event = stream.next() # Check whether the iterable is a real markup event stream by examining the # first item it yields; if it's not we'll need to do some conversion if type(event) is not tuple or len(event) != 3: for event in chain([event], stream): if hasattr(event, 'totuple'): event = event.totuple() else: event = TEXT, unicode(event), (None, -1, -1) yield event return # This looks like a markup event stream, so we'll just pass it through # unchanged yield event for event in stream: yield event class Attrs(tuple): """Immutable sequence type that stores the attributes of an element. Ordering of the attributes is preserved, while access by name is also supported. >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) >>> attrs Attrs([('href', '#'), ('title', 'Foo')]) >>> 'href' in attrs True >>> 'tabindex' in attrs False >>> attrs.get('title') 'Foo' Instances may not be manipulated directly. Instead, the operators ``|`` and ``-`` can be used to produce new instances that have specific attributes added, replaced or removed. To remove an attribute, use the ``-`` operator. The right hand side can be either a string or a set/sequence of strings, identifying the name(s) of the attribute(s) to remove: >>> attrs - 'title' Attrs([('href', '#')]) >>> attrs - ('title', 'href') Attrs() The original instance is not modified, but the operator can of course be used with an assignment: >>> attrs Attrs([('href', '#'), ('title', 'Foo')]) >>> attrs -= 'title' >>> attrs Attrs([('href', '#')]) To add a new attribute, use the ``|`` operator, where the right hand value is a sequence of ``(name, value)`` tuples (which includes `Attrs` instances): >>> attrs | [('title', 'Bar')] Attrs([('href', '#'), ('title', 'Bar')]) If the attributes already contain an attribute with a given name, the value of that attribute is replaced: >>> attrs | [('href', 'http://example.org/')] Attrs([('href', 'http://example.org/')]) """ __slots__ = [] def __contains__(self, name): """Return whether the list includes an attribute with the specified name. :return: `True` if the list includes the attribute :rtype: `bool` """ for attr, _ in self: if attr == name: return True def __getslice__(self, i, j): """Return a slice of the attributes list. >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) >>> attrs[1:] Attrs([('title', 'Foo')]) """ return Attrs(tuple.__getslice__(self, i, j)) def __or__(self, attrs): """Return a new instance that contains the attributes in `attrs` in addition to any already existing attributes. :return: a new instance with the merged attributes :rtype: `Attrs` """ repl = dict([(an, av) for an, av in attrs if an in self]) return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] + [(an, av) for an, av in attrs if an not in self]) def __repr__(self): if not self: return 'Attrs()' return 'Attrs([%s])' % ', '.join([repr(item) for item in self]) def __sub__(self, names): """Return a new instance with all attributes with a name in `names` are removed. :param names: the names of the attributes to remove :return: a new instance with the attribute removed :rtype: `Attrs` """ if isinstance(names, basestring): names = (names,) return Attrs([(name, val) for name, val in self if name not in names]) def get(self, name, default=None): """Return the value of the attribute with the specified name, or the value of the `default` parameter if no such attribute is found. :param name: the name of the attribute :param default: the value to return when the attribute does not exist :return: the attribute value, or the `default` value if that attribute does not exist :rtype: `object` """ for attr, value in self: if attr == name: return value return default def totuple(self): """Return the attributes as a markup event. The returned event is a `TEXT` event, the data is the value of all attributes joined together. >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple() ('TEXT', u'#Foo', (None, -1, -1)) :return: a `TEXT` event :rtype: `tuple` """ return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) class Markup(unicode): """Marks a string as being safe for inclusion in HTML/XML output without needing to be escaped. """ __slots__ = [] def __add__(self, other): return Markup(unicode(self) + unicode(escape(other))) def __radd__(self, other): return Markup(unicode(escape(other)) + unicode(self)) def __mod__(self, args): if isinstance(args, dict): args = dict(zip(args.keys(), map(escape, args.values()))) elif isinstance(args, (list, tuple)): args = tuple(map(escape, args)) else: args = escape(args) return Markup(unicode.__mod__(self, args)) def __mul__(self, num): return Markup(unicode(self) * num) def __rmul__(self, num): return Markup(num * unicode(self)) def __repr__(self): return '<%s %r>' % (self.__class__.__name__, unicode(self)) def join(self, seq, escape_quotes=True): """Return a `Markup` object which is the concatenation of the strings in the given sequence, where this `Markup` object is the separator between the joined elements. Any element in the sequence that is not a `Markup` instance is automatically escaped. :param seq: the sequence of strings to join :param escape_quotes: whether double quote characters in the elements should be escaped :return: the joined `Markup` object :rtype: `Markup` :see: `escape` """ return Markup(unicode(self).join([escape(item, quotes=escape_quotes) for item in seq])) def escape(cls, text, quotes=True): """Create a Markup instance from a string and escape special characters it may contain (<, >, & and \"). >>> escape('"1 < 2"') <Markup u'"1 < 2"'> If the `quotes` parameter is set to `False`, the \" character is left as is. Escaping quotes is generally only required for strings that are to be used in attribute values. >>> escape('"1 < 2"', quotes=False) <Markup u'"1 < 2"'> :param text: the text to escape :param quotes: if ``True``, double quote characters are escaped in addition to the other special characters :return: the escaped `Markup` string :rtype: `Markup` """ if not text: return cls() if type(text) is cls: return text text = unicode(text).replace('&', '&') \ .replace('<', '<') \ .replace('>', '>') if quotes: text = text.replace('"', '"') return cls(text) escape = classmethod(escape) def unescape(self): """Reverse-escapes &, <, >, and \" and returns a `unicode` object. >>> Markup('1 < 2').unescape() u'1 < 2' :return: the unescaped string :rtype: `unicode` :see: `genshi.core.unescape` """ if not self: return u'' return unicode(self).replace('"', '"') \ .replace('>', '>') \ .replace('<', '<') \ .replace('&', '&') def stripentities(self, keepxmlentities=False): """Return a copy of the text with any character or numeric entities replaced by the equivalent UTF-8 characters. If the `keepxmlentities` parameter is provided and evaluates to `True`, the core XML entities (``&``, ``'``, ``>``, ``<`` and ``"``) are not stripped. :return: a `Markup` instance with entities removed :rtype: `Markup` :see: `genshi.util.stripentities` """ return Markup(stripentities(self, keepxmlentities=keepxmlentities)) def striptags(self): """Return a copy of the text with all XML/HTML tags removed. :return: a `Markup` instance with all tags removed :rtype: `Markup` :see: `genshi.util.striptags` """ return Markup(striptags(self)) try: from genshi._speedups import Markup except ImportError: pass # just use the Python implementation escape = Markup.escape def unescape(text): """Reverse-escapes &, <, >, and \" and returns a `unicode` object. >>> unescape(Markup('1 < 2')) u'1 < 2' If the provided `text` object is not a `Markup` instance, it is returned unchanged. >>> unescape('1 < 2') '1 < 2' :param text: the text to unescape :return: the unescsaped string :rtype: `unicode` """ if not isinstance(text, Markup): return text return text.unescape() class Namespace(object): """Utility class creating and testing elements with a namespace. Internally, namespace URIs are encoded in the `QName` of any element or attribute, the namespace URI being enclosed in curly braces. This class helps create and test these strings. A `Namespace` object is instantiated with the namespace URI. >>> html = Namespace('http://www.w3.org/1999/xhtml') >>> html <Namespace "http://www.w3.org/1999/xhtml"> >>> html.uri u'http://www.w3.org/1999/xhtml' The `Namespace` object can than be used to generate `QName` objects with that namespace: >>> html.body QName(u'http://www.w3.org/1999/xhtml}body') >>> html.body.localname u'body' >>> html.body.namespace u'http://www.w3.org/1999/xhtml' The same works using item access notation, which is useful for element or attribute names that are not valid Python identifiers: >>> html['body'] QName(u'http://www.w3.org/1999/xhtml}body') A `Namespace` object can also be used to test whether a specific `QName` belongs to that namespace using the ``in`` operator: >>> qname = html.body >>> qname in html True >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2') False """ def __new__(cls, uri): if type(uri) is cls: return uri return object.__new__(cls) def __getnewargs__(self): return (self.uri,) def __getstate__(self): return self.uri def __setstate__(self, uri): self.uri = uri def __init__(self, uri): self.uri = unicode(uri) def __contains__(self, qname): return qname.namespace == self.uri def __ne__(self, other): return not self == other def __eq__(self, other): if isinstance(other, Namespace): return self.uri == other.uri return self.uri == other def __getitem__(self, name): return QName(self.uri + u'}' + name) __getattr__ = __getitem__ def __repr__(self): return '<Namespace "%s">' % self.uri def __str__(self): return self.uri.encode('utf-8') def __unicode__(self): return self.uri # The namespace used by attributes such as xml:lang and xml:space XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace') class QName(unicode): """A qualified element or attribute name. The unicode value of instances of this class contains the qualified name of the element or attribute, in the form ``{namespace-uri}local-name``. The namespace URI can be obtained through the additional `namespace` attribute, while the local name can be accessed through the `localname` attribute. >>> qname = QName('foo') >>> qname QName(u'foo') >>> qname.localname u'foo' >>> qname.namespace >>> qname = QName('http://www.w3.org/1999/xhtml}body') >>> qname QName(u'http://www.w3.org/1999/xhtml}body') >>> qname.localname u'body' >>> qname.namespace u'http://www.w3.org/1999/xhtml' """ __slots__ = ['namespace', 'localname'] def __new__(cls, qname): """Create the `QName` instance. :param qname: the qualified name as a string of the form ``{namespace-uri}local-name``, where the leading curly brace is optional """ if type(qname) is cls: return qname parts = qname.lstrip(u'{').split(u'}', 1) if len(parts) > 1: self = unicode.__new__(cls, u'{%s' % qname) self.namespace, self.localname = map(unicode, parts) else: self = unicode.__new__(cls, qname) self.namespace, self.localname = None, unicode(qname) return self def __getnewargs__(self): return (self.lstrip('{'),) def __repr__(self): return 'QName(%s)' % unicode.__repr__(self.lstrip('{'))