# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://markup.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://markup.edgewall.org/log/.

"""Core classes for markup processing."""

try:
    import htmlentitydefs
except ImportError:  # Python 3 renamed the module
    from html import entities as htmlentitydefs
import re
try:
    from StringIO import StringIO
except ImportError:  # Python 3 moved StringIO into the io module
    from io import StringIO

try:
    unicode
except NameError:  # Python 3: `str` is the text type
    unicode = str
    basestring = str
    unichr = chr

# True when the built-in `str` is the text type, in which case `__str__`
# methods must return text instead of an encoded byte string.
_STR_IS_TEXT = str is unicode

__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']


class StreamEventKind(str):
    """A kind of event on an XML stream."""


class Stream(object):
    """Represents a stream of markup events.

    This class is basically an iterator over the events.

    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.

    Stream events are tuples of the form:

      (kind, data, position)

    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
    depends on the kind of event, and `position` is a `(line, offset)` tuple
    that contains the location of the original element or text in the input.
    """
    __slots__ = ['events']

    START = StreamEventKind('START') # a start tag
    END = StreamEventKind('END') # an end tag
    TEXT = StreamEventKind('TEXT') # literal text
    PROLOG = StreamEventKind('PROLOG') # XML prolog
    DOCTYPE = StreamEventKind('DOCTYPE') # doctype declaration
    START_NS = StreamEventKind('START-NS') # start namespace mapping
    END_NS = StreamEventKind('END-NS') # end namespace mapping
    PI = StreamEventKind('PI') # processing instruction
    COMMENT = StreamEventKind('COMMENT') # comment

    def __init__(self, events):
        """Initialize the stream with a sequence of markup events.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        return iter(self.events)

    def filter(self, func):
        """Apply a filter to the stream.

        This method returns a new stream with the given filter applied. The
        filter must be a callable that accepts the stream object as parameter,
        and returns the filtered stream.
        """
        return Stream(func(self))

    def render(self, method='xml', encoding='utf-8', filters=None, **kwargs):
        """Return a string representation of the stream.

        @param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", or "html", or a custom `Serializer`
                       subclass
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        @param filters: passed through unchanged to `serialize()`

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        generator = self.serialize(method=method, filters=filters, **kwargs)
        # join() consumes the generator directly; no intermediate list needed
        output = u''.join(generator)
        if encoding is not None:
            return output.encode(encoding)
        return output

    def select(self, path):
        """Return a new stream that contains the events matching the given
        XPath expression.

        @param path: a string containing the XPath expression
        """
        from markup.path import Path
        return Path(path).select(self)

    def serialize(self, method='xml', filters=None, **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        Unlike the `render()` method, this method is a generator that returns
        the serialized output incrementally, as opposed to returning a single
        string.

        @param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", or "html", or a custom `Serializer`
                       subclass
        @param filters: list of filters to apply to the stream before
                        serialization. The default is to apply whitespace
                        reduction using `markup.filters.WhitespaceFilter`.
        """
        from markup.filters import WhitespaceFilter
        from markup import output
        cls = method
        if isinstance(method, basestring):
            # An unknown method name surfaces as a KeyError from this lookup
            cls = {'xml': output.XMLSerializer,
                   'xhtml': output.XHTMLSerializer,
                   'html': output.HTMLSerializer}[method]
        else:
            assert issubclass(cls, output.Serializer)
        serializer = cls(**kwargs)

        stream = _ensure(self)
        if filters is None:
            filters = [WhitespaceFilter()]
        for filter_ in filters:
            stream = filter_(iter(stream))

        return serializer.serialize(stream)

    def __str__(self):
        if _STR_IS_TEXT:
            # str() must return text here; returning encoded bytes would
            # raise a TypeError
            return self.render(encoding=None)
        return self.render()

    def __unicode__(self):
        return self.render(encoding=None)


START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
PROLOG = Stream.PROLOG
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
PI = Stream.PI
COMMENT = Stream.COMMENT

def _ensure(stream):
    """Ensure that every item on the stream is actually a markup event.

    Items that are not tuples but provide a `totuple()` method (such as
    `Attributes`) are converted through that method. Everything else is
    unpacked as a `(kind, data, pos)` triple.
    """
    for event in stream:
        # Ask for the conversion first: a non-tuple item that happens to have
        # exactly three elements would otherwise be unpacked as if it were an
        # event.
        if not isinstance(event, tuple) and hasattr(event, 'totuple'):
            event = event.totuple()
        kind, data, pos = event
        yield kind, data, pos


class Attributes(list):
    """Sequence type that stores the attributes of an element.

    The order of the attributes is preserved, while accessing and manipulating
    attributes by name is also supported.

    >>> attrs = Attributes([('href', '#'), ('title', 'Foo')])
    >>> attrs
    [(u'href', '#'), (u'title', 'Foo')]

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False

    >>> attrs.get(u'title')
    'Foo'
    >>> attrs.set(u'title', 'Bar')
    >>> attrs
    [(u'href', '#'), (u'title', 'Bar')]
    >>> attrs.remove(u'title')
    >>> attrs
    [(u'href', '#')]

    New attributes added using the `set()` method are appended to the end of
    the list:

    >>> attrs.set(u'accesskey', 'k')
    >>> attrs
    [(u'href', '#'), (u'accesskey', 'k')]
    """
    __slots__ = []

    def __init__(self, attrib=None):
        """Create the `Attributes` instance.

        If the `attrib` parameter is provided, it is expected to be a sequence
        of `(name, value)` tuples.
        """
        if attrib is None:
            attrib = []
        list.__init__(self, [(QName(name), value) for name, value in attrib])

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.
        """
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def remove(self, name):
        """Removes the attribute with the specified name.

        If no such attribute is found, this method does nothing.
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                del self[idx]
                break

    def set(self, name, value):
        """Sets the specified attribute to the given value.

        If an attribute with the specified name is already in the list, the
        value of the existing entry is updated. Otherwise, a new attribute is
        appended to the end of the list.
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (attr, value)
                break
        else:
            self.append((QName(name), value))

    def totuple(self):
        """Return a `TEXT` event whose data is the concatenation of all
        attribute values, using `(None, -1, -1)` as position.
        """
        return TEXT, u''.join([x[1] for x in self]), (None, -1, -1)


def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent Unicode characters.

    If the `keepxmlentities` parameter is provided and evaluates to `True`,
    the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
    stripped.

    Named entities that are not known are reduced to their bare name, unless
    `keepxmlentities` is set, in which case they are left as they are.
    """
    def _replace_entity(match):
        if match.group(1): # numeric entity
            ref = match.group(1)
            # The pattern accepts both `&#x..;` and `&#X..;`
            if ref[0] in 'xX':
                ref = int(ref[1:], 16)
            else:
                ref = int(ref, 10)
            return unichr(ref)
        else: # character entity
            ref = match.group(2)
            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
                                           'quot'):
                return '&%s;' % ref
            try:
                codepoint = htmlentitydefs.name2codepoint[ref]
                return unichr(codepoint)
            except KeyError:
                if keepxmlentities:
                    return '&%s;' % ref
                else:
                    return ref
    return re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
                  _replace_entity, text)


class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.
    """
    __slots__ = []

    def __new__(cls, text='', *args):
        # Positional arguments are escaped before being interpolated
        if args:
            text %= tuple([escape(arg) for arg in args])
        return unicode.__new__(cls, text)

    def __add__(self, other):
        return Markup(unicode(self) + escape(other))

    def __mod__(self, args):
        if not isinstance(args, (list, tuple)):
            args = [args]
        return Markup(unicode.__mod__(self,
                                      tuple([escape(arg) for arg in args])))

    def __mul__(self, num):
        return Markup(unicode(self) * num)

    def __repr__(self):
        return '<%s "%s">' % (self.__class__.__name__, self)

    def join(self, seq, escape_quotes=True):
        """Join the items of `seq` using this markup as separator.

        Every item is escaped first; `escape_quotes` is passed on as the
        `quotes` parameter of `escape()`.
        """
        return Markup(unicode(self).join([escape(item, quotes=escape_quotes)
                                          for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent Unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are not
        stripped.
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(re.sub(r'<[^>]*?>', '', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        Values that evaluate to false yield an empty `Markup` instance, and
        instances of exactly this class are returned unchanged.
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        # The ampersand must be replaced first so that the entities produced
        # by the later replacements are not escaped a second time
        text = unicode(text).replace('&', '&amp;') \
                            .replace('<', '&lt;') \
                            .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
        if not self:
            return u''
        # The ampersand is restored last so that text such as `&amp;lt;` ends
        # up as `&lt;` instead of being unescaped twice
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Returns the text as a `unicode` string with all entities and tags
        removed.

        If `keeplinebreaks` is false, newlines are replaced by spaces.
        """
        text = unicode(self.striptags().stripentities())
        if not keeplinebreaks:
            text = text.replace(u'\n', u' ')
        return text


escape = Markup.escape

def unescape(text):
    """Reverse-escapes &, <, > and \" and returns a `unicode` object.

    Anything that is not a `Markup` instance is returned unchanged.
    """
    if not isinstance(text, Markup):
        return text
    return text.unescape()


class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    <Namespace "http://www.w3.org/1999/xhtml">
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can than be used to generate `QName` objects with
    that namespace:

    >>> html.body
    u'{http://www.w3.org/1999/xhtml}body'
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    u'{http://www.w3.org/1999/xhtml}body'

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the `in` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __init__(self, uri):
        self.uri = unicode(uri)

    def __contains__(self, qname):
        return qname.namespace == self.uri

    def __eq__(self, other):
        if isinstance(other, Namespace):
            return self.uri == other.uri
        return self.uri == other

    def __ne__(self, other):
        # Python 2 does not derive `!=` from `__eq__`
        return not self == other

    def __hash__(self):
        # Keep hashing consistent with `__eq__`: a namespace compares equal to
        # its URI and to any other namespace with the same URI
        return hash(self.uri)

    def __getitem__(self, name):
        return QName(self.uri + u'}' + name)
    # Note: this makes every attribute that is not otherwise found resolve to
    # a `QName` in this namespace.
    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        if _STR_IS_TEXT:
            # str() must return text here, not an encoded byte string
            return self.uri
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri


class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    u'foo'
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    u'{http://www.w3.org/1999/xhtml}body'
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    The fully braced form is accepted as well:

    >>> QName('{http://www.w3.org/1999/xhtml}body').namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        if type(qname) is cls:
            return qname

        # Accept `{namespace}localname` too, which is the form the string
        # value of a qualified name takes; the brace is added back below
        qname = qname.lstrip(u'{')
        parts = qname.split(u'}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, u'{' + qname)
            self.namespace = unicode(parts[0])
            self.localname = unicode(parts[1])
        else:
            self = unicode.__new__(cls, qname)
            self.namespace = None
            self.localname = unicode(qname)
        return self