# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Christopher Lenz
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://trac.edgewall.com/license.html.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://projects.edgewall.com/trac/.

"""Core classes for markup processing."""

import re

try:
    import htmlentitydefs
    from StringIO import StringIO
except ImportError:
    # Python 3 renamed both modules; bind them to the names used below.
    from html import entities as htmlentitydefs
    from io import StringIO

try:
    unicode
except NameError:
    # Python 3 dropped these builtins. Aliasing them keeps the rest of the
    # module unchanged while leaving Python 2 behaviour untouched.
    unicode = str
    basestring = str
    unichr = chr

__all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']

# Numeric (`&#65;`, `&#x41;`, `&#X41;`) and named (`&amp;`) entity references.
# Group 1 holds the numeric part, group 2 the entity name.
_ENTITY_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')

# Anything that looks like an XML/HTML tag.
_TAG_RE = re.compile(r'<[^>]*?>')


class StreamEventKind(object):
    """A kind of event on an XML stream."""

    __slots__ = ['name']

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name


class Stream(object):
    """Represents a stream of markup events.

    This class is basically an iterator over the events.

    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.

    Stream events are tuples of the form:

      (kind, data, position)

    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
    depends on the kind of event, and `position` is a `(line, offset)` tuple
    that contains the location of the original element or text in the input.
    """
    __slots__ = ['events']

    START = StreamEventKind('start') # a start tag
    END = StreamEventKind('end') # an end tag
    TEXT = StreamEventKind('text') # literal text
    EXPR = StreamEventKind('expr') # an expression
    SUB = StreamEventKind('sub') # a "subprogram"
    PROLOG = StreamEventKind('prolog') # XML prolog
    DOCTYPE = StreamEventKind('doctype') # doctype declaration
    START_NS = StreamEventKind('start-ns') # start namespace mapping
    END_NS = StreamEventKind('end-ns') # end namespace mapping
    PI = StreamEventKind('pi') # processing instruction
    COMMENT = StreamEventKind('comment') # comment

    def __init__(self, events):
        """Initialize the stream with a sequence of markup events.

        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        return iter(self.events)

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Return a string representation of the stream.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        retval = u''.join(self.serialize(method=method, **kwargs))
        if encoding is not None:
            # Honour the requested encoding instead of always using UTF-8.
            return retval.encode(encoding)
        return retval

    def select(self, path):
        """Return a new stream that contains the events matching the given
        XPath expression.

        @param path: a string containing the XPath expression
        """
        # Imported lazily, as in the rest of this class, so that loading this
        # module does not require `markup.path`.
        from markup.path import Path
        return Path(path).select(self)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        Unlike the `render()` method, this method returns the serialized
        output incrementally (whatever the serializer's `serialize()` call
        produces), as opposed to returning a single string.

        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass

        Any additional keyword arguments are passed to the serializer
        constructor. An unknown method name raises `KeyError`.
        """
        from markup import output
        cls = method
        if isinstance(method, basestring):
            cls = {'xml': output.XMLSerializer,
                   'html': output.HTMLSerializer}[method]
        else:
            # NOTE(review): the base class is assumed to live in
            # `markup.output` next to the concrete serializers; the previous
            # check referenced an undefined `serializers` name -- confirm.
            assert issubclass(cls, output.Serializer)
        serializer = cls(**kwargs)
        return serializer.serialize(self)

    def __str__(self):
        return self.render()

    def __unicode__(self):
        return self.render(encoding=None)


class Attributes(list):
    """A list of `(QName, value)` pairs with a few dict-like helpers."""

    def __init__(self, attrib=None):
        """Initialize from an optional iterable of `(name, value)` pairs.

        Every name is converted to a `QName`.
        """
        list.__init__(self, [(QName(k), v) for k, v in attrib or []])

    def __contains__(self, name):
        """Return whether an attribute with the given name is present."""
        return name in [attr for attr, _ in self]

    def get(self, name, default=None):
        """Return the value of the first attribute with the given name, or
        `default` if there is none.
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def set(self, name, value):
        """Replace the value of the first attribute with the given name, or
        append a new attribute if the name is not present yet.
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (attr, value)
                break
        else:
            self.append((QName(name), value))


class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.
    """

    def __new__(cls, text='', *args):
        """Create the markup string.

        If extra positional arguments are given, `text` is used as a format
        string and each argument is escaped before being interpolated.
        """
        if args:
            text %= tuple([escape(arg) for arg in args])
        return unicode.__new__(cls, text)

    def __add__(self, other):
        # The right-hand operand is escaped unless it is markup already.
        return Markup(unicode(self) + Markup.escape(other))

    def __mod__(self, args):
        # A single non-sequence argument is treated as a one-item sequence;
        # every argument is escaped before interpolation.
        if not isinstance(args, (list, tuple)):
            args = [args]
        return Markup(unicode.__mod__(self,
                                      tuple([escape(arg) for arg in args])))

    def __mul__(self, num):
        return Markup(unicode(self) * num)

    def join(self, seq):
        """Join the items of `seq` using this markup as separator, escaping
        every item that is not markup itself.
        """
        return Markup(unicode(self).join([Markup.escape(item) for item in seq]))

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent unicode characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left
        intact. Unknown named entities are kept as they are in that mode;
        otherwise they are reduced to the bare entity name.
        """
        def _replace_entity(match):
            if match.group(1): # numeric entity
                ref = match.group(1)
                # The pattern accepts both `x` and `X` as the hex marker.
                if ref[:1] in ('x', 'X'):
                    ref = int(ref[1:], 16)
                else:
                    ref = int(ref, 10)
                return unichr(ref)
            else: # character entity
                ref = match.group(2)
                if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt',
                                               'quot'):
                    return '&%s;' % ref
                try:
                    codepoint = htmlentitydefs.name2codepoint[ref]
                    return unichr(codepoint)
                except KeyError:
                    if keepxmlentities:
                        return '&%s;' % ref
                    else:
                        return ref
        return Markup(_ENTITY_RE.sub(_replace_entity, self))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed."""
        return Markup(_TAG_RE.sub('', self))

    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        Text that already is an instance of this class is returned unchanged.
        """
        if isinstance(text, cls):
            return text
        text = unicode(text)
        if not text:
            return cls()
        # The ampersand has to go first so that the entities produced by the
        # following replacements are not escaped a second time.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&quot;')
        return cls(text)
    escape = classmethod(escape)

    def unescape(self):
        """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
        if not self:
            return u''
        # The ampersand goes last so that `&amp;lt;` ends up as `&lt;` rather
        # than being unescaped twice.
        return unicode(self).replace('&quot;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def plaintext(self, keeplinebreaks=True):
        """Returns the text as a `unicode` with all entities and tags removed.

        If `keeplinebreaks` is false, newlines are replaced by spaces.
        """
        text = unicode(self.striptags().stripentities())
        if not keeplinebreaks:
            text = text.replace('\n', ' ')
        return text

    def sanitize(self):
        """Parse the text with `HTMLParser` and return a `Stream` of the
        events after they have been passed through `HTMLSanitizer`.

        Entities other than the core XML ones are resolved before parsing.
        """
        from markup.filters import HTMLSanitizer
        from markup.input import HTMLParser
        sanitize = HTMLSanitizer()
        text = self.stripentities(keepxmlentities=True)
        return Stream(sanitize(HTMLParser(StringIO(text)), None))


escape = Markup.escape

def unescape(text):
    """Reverse-escapes &, <, > and \" and returns a `unicode` object.

    Anything that is not a `Markup` instance is returned unchanged.
    """
    if not isinstance(text, Markup):
        return text
    return text.unescape()


class Namespace(object):
    """An XML namespace, identified by its URI.

    Item and attribute access both produce the `QName` of the given local
    name inside this namespace.
    """

    def __init__(self, uri):
        self.uri = uri

    def __getitem__(self, name):
        # `QName` expects the `uri}localname` form and adds the opening brace.
        return QName(self.uri + '}' + name)

    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        return self.uri

    def __unicode__(self):
        return unicode(self.uri)


class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form `{namespace}localname`. The namespace
    URI can be obtained through the additional `namespace` attribute, while the
    local name can be accessed through the `localname` attribute.

    The constructor takes the name in the form `namespace}localname` (without
    the opening brace), or a plain local name for names without a namespace.
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        if isinstance(qname, QName):
            return qname

        parts = qname.split('}', 1)
        if qname.find('}') > 0:
            self = unicode.__new__(cls, '{' + qname)
            self.namespace = parts[0]
            self.localname = parts[1]
        else:
            self = unicode.__new__(cls, qname)
            self.namespace = None
            self.localname = qname
        return self