cmlenz@1: # -*- coding: utf-8 -*-
cmlenz@1: #
cmlenz@1: # Copyright (C) 2006 Christopher Lenz
cmlenz@1: # All rights reserved.
cmlenz@1: #
cmlenz@1: # This software is licensed as described in the file COPYING, which
cmlenz@1: # you should have received as part of this distribution. The terms
cmlenz@1: # are also available at http://trac.edgewall.com/license.html.
cmlenz@1: #
cmlenz@1: # This software consists of voluntary contributions made by many
cmlenz@1: # individuals. For the exact contribution history, see the revision
cmlenz@1: # history and logs, available at http://projects.edgewall.com/trac/.
cmlenz@1: 
cmlenz@1: """Core classes for markup processing."""
cmlenz@1: 
cmlenz@1: import htmlentitydefs
cmlenz@1: import re
cmlenz@1: from StringIO import StringIO
cmlenz@1: 
cmlenz@1: __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Namespace', 'QName']
cmlenz@1: 
cmlenz@1: 
class StreamEventKind(object):
    """Marker object that identifies one kind of event in a markup stream.

    An instance carries nothing but a `name`, which doubles as its `repr()`.
    """

    # Only `name` is stored; instances get no per-instance `__dict__`.
    __slots__ = ['name']

    def __init__(self, name):
        """Create the event kind.

        @param name: the label stored on the instance and returned by `repr()`
        """
        self.name = name

    def __repr__(self):
        """Return the name this kind was created with."""
        label = self.name
        return label
cmlenz@1: 
cmlenz@1: 
class Stream(object):
    """Represents a stream of markup events.
    
    This class is basically an iterator over the events.
    
    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.
    
    Stream events are tuples of the form:

      (kind, data, position)

    where `kind` is the event kind (such as `START`, `END`, `TEXT`, etc), `data`
    depends on the kind of event, and `position` is a `(line, offset)` tuple
    that contains the location of the original element or text in the input.
    """
    __slots__ = ['events']

    START = StreamEventKind('start') # a start tag
    END = StreamEventKind('end') # an end tag
    TEXT = StreamEventKind('text') # literal text
    EXPR = StreamEventKind('expr') # an expression
    SUB = StreamEventKind('sub') # a "subprogram"
    PROLOG = StreamEventKind('prolog') # XML prolog
    DOCTYPE = StreamEventKind('doctype') # doctype declaration
    START_NS = StreamEventKind('start-ns') # start namespace mapping
    END_NS = StreamEventKind('end-ns') # end namespace mapping
    PI = StreamEventKind('pi') # processing instruction
    COMMENT = StreamEventKind('comment') # comment

    def __init__(self, events):
        """Initialize the stream with a sequence of markup events.
        
        @param events: a sequence or iterable providing the events
        """
        self.events = events

    def __iter__(self):
        """Return an iterator over the underlying events."""
        return iter(self.events)

    def render(self, method='xml', encoding='utf-8', **kwargs):
        """Return a string representation of the stream.
        
        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass
        @param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.
        """
        retval = u''.join(self.serialize(method=method, **kwargs))
        if encoding is not None:
            # Use the encoding the caller asked for; this used to be
            # hard-coded to 'utf-8', silently ignoring the parameter.
            return retval.encode(encoding)
        return retval

    def select(self, path):
        """Return a new stream that contains the events matching the given
        XPath expression.
        
        @param path: a string containing the XPath expression
        """
        # Imported here rather than at module level, as in the original code.
        from markup.path import Path
        return Path(path).select(self)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.
        
        Unlike the `render()` method, this method returns the serialized
        output incrementally, as opposed to returning a single string.
        
        @param method: determines how the stream is serialized; can be either
                       'xml' or 'html', or a custom `Serializer` subclass

        Any additional keyword arguments are passed to the serializer's
        constructor. An unknown method name raises `KeyError`.
        """
        from markup import output
        cls = method
        if isinstance(method, basestring):
            cls = {'xml': output.XMLSerializer,
                   'html': output.HTMLSerializer}[method]
        else:
            # The check used to reference `serializers.Serializer`, a name
            # that is never defined in this module, so passing a custom class
            # always failed with NameError.
            # NOTE(review): assumes `markup.output` exposes the `Serializer`
            # base class -- confirm.
            assert issubclass(cls, output.Serializer)
        serializer = cls(**kwargs)
        return serializer.serialize(self)

    def __str__(self):
        """Return the stream rendered with the default method and encoding."""
        return self.render()

    def __unicode__(self):
        """Return the stream rendered as an unencoded `unicode` object."""
        return self.render(encoding=None)
cmlenz@1: 
cmlenz@1: 
class Attributes(list):
    """Ordered list of `(name, value)` attribute pairs with a few dict-like
    helpers.

    Names given to the constructor, and names newly added through `set()`,
    are converted to `QName` instances.
    """

    def __init__(self, attrib=None):
        """Initialize the list from existing attribute pairs.

        @param attrib: an iterable of `(name, value)` pairs, or `None` for an
                       empty attribute list
        """
        # A list comprehension replaces the tuple-parameter lambda, which is
        # harder to read and is not valid syntax in later Python versions.
        list.__init__(self, [(QName(name), value)
                             for name, value in attrib or []])

    def __contains__(self, name):
        """Return whether an attribute with the given name is present.

        @param name: the attribute name to look for
        """
        # Stop at the first hit instead of building a list of all names.
        for attr, _ in self:
            if attr == name:
                return True
        return False

    def get(self, name, default=None):
        """Return the value of the first attribute with the given name.

        @param name: the attribute name to look for
        @param default: the value to return if no such attribute exists
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def set(self, name, value):
        """Set the value of the named attribute.

        The first attribute with that name is replaced in place, keeping its
        position; if there is none, a new pair is appended to the end.

        @param name: the attribute name
        @param value: the new attribute value
        """
        for idx, (attr, _) in enumerate(self):
            if attr == name:
                self[idx] = (attr, value)
                break
        else:
            self.append((QName(name), value))
cmlenz@1: 
cmlenz@1: 
cmlenz@1: class Markup(unicode):
cmlenz@1:     """Marks a string as being safe for inclusion in HTML/XML output without
cmlenz@1:     needing to be escaped.
cmlenz@1:     """
cmlenz@1:     def __new__(self, text='', *args):
cmlenz@1:         if args:
cmlenz@1:             text %= tuple([escape(arg) for arg in args])
cmlenz@1:         return unicode.__new__(self, text)
cmlenz@1: 
cmlenz@1:     def __add__(self, other):
cmlenz@1:         return Markup(unicode(self) + Markup.escape(other))
cmlenz@1: 
cmlenz@1:     def __mod__(self, args):
cmlenz@1:         if not isinstance(args, (list, tuple)):
cmlenz@1:             args = [args]
cmlenz@1:         return Markup(unicode.__mod__(self,
cmlenz@1:                                       tuple([escape(arg) for arg in args])))
cmlenz@1: 
cmlenz@1:     def __mul__(self, num):
cmlenz@1:         return Markup(unicode(self) * num)
cmlenz@1: 
cmlenz@1:     def join(self, seq):
cmlenz@1:         return Markup(unicode(self).join([Markup.escape(item) for item in seq]))
cmlenz@1: 
cmlenz@1:     def stripentities(self, keepxmlentities=False):
cmlenz@1:         """Return a copy of the text with any character or numeric entities
cmlenz@1:         replaced by the equivalent UTF-8 characters.
cmlenz@1:         
cmlenz@1:         If the `keepxmlentities` parameter is provided and evaluates to `True`,
cmlenz@1:         the core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;).
cmlenz@1:         """
cmlenz@1:         def _replace_entity(match):
cmlenz@1:             if match.group(1): # numeric entity
cmlenz@1:                 ref = match.group(1)
cmlenz@1:                 if ref.startswith('x'):
cmlenz@1:                     ref = int(ref[1:], 16)
cmlenz@1:                 else:
cmlenz@1:                     ref = int(ref, 10)
cmlenz@1:                 return unichr(ref)
cmlenz@1:             else: # character entity
cmlenz@1:                 ref = match.group(2)
cmlenz@1:                 if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
cmlenz@1:                     return '&%s;' % ref
cmlenz@1:                 try:
cmlenz@1:                     codepoint = htmlentitydefs.name2codepoint[ref]
cmlenz@1:                     return unichr(codepoint)
cmlenz@1:                 except KeyError:
cmlenz@1:                     if keepxmlentities:
cmlenz@1:                         return '&amp;%s;' % ref
cmlenz@1:                     else:
cmlenz@1:                         return ref
cmlenz@1:         return Markup(re.sub(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)',
cmlenz@1:                              _replace_entity, self))
cmlenz@1: 
cmlenz@1:     def striptags(self):
cmlenz@1:         """Return a copy of the text with all XML/HTML tags removed."""
cmlenz@1:         return Markup(re.sub(r'<[^>]*?>', '', self))
cmlenz@1: 
cmlenz@1:     def escape(cls, text, quotes=True):
cmlenz@1:         """Create a Markup instance from a string and escape special characters
cmlenz@1:         it may contain (<, >, & and \").
cmlenz@1:         
cmlenz@1:         If the `quotes` parameter is set to `False`, the \" character is left
cmlenz@1:         as is. Escaping quotes is generally only required for strings that are
cmlenz@1:         to be used in attribute values.
cmlenz@1:         """
cmlenz@1:         if isinstance(text, cls):
cmlenz@1:             return text
cmlenz@1:         text = unicode(text)
cmlenz@1:         if not text:
cmlenz@1:             return cls()
cmlenz@1:         text = text.replace('&', '&amp;') \
cmlenz@1:                    .replace('<', '&lt;') \
cmlenz@1:                    .replace('>', '&gt;')
cmlenz@1:         if quotes:
cmlenz@1:             text = text.replace('"', '&#34;')
cmlenz@1:         return cls(text)
cmlenz@1:     escape = classmethod(escape)
cmlenz@1: 
cmlenz@1:     def unescape(self):
cmlenz@1:         """Reverse-escapes &, <, > and \" and returns a `unicode` object."""
cmlenz@1:         if not self:
cmlenz@1:             return ''
cmlenz@1:         return unicode(self).replace('&#34;', '"') \
cmlenz@1:                             .replace('&gt;', '>') \
cmlenz@1:                             .replace('&lt;', '<') \
cmlenz@1:                             .replace('&amp;', '&')
cmlenz@1: 
cmlenz@1:     def plaintext(self, keeplinebreaks=True):
cmlenz@1:         """Returns the text as a `unicode`with all entities and tags removed."""
cmlenz@1:         text = unicode(self.striptags().stripentities())
cmlenz@1:         if not keeplinebreaks:
cmlenz@1:             text = text.replace('\n', ' ')
cmlenz@1:         return text
cmlenz@1: 
cmlenz@1:     def sanitize(self):
cmlenz@1:         from markup.filters import HTMLSanitizer
cmlenz@1:         from markup.input import HTMLParser
cmlenz@1:         sanitize = HTMLSanitizer()
cmlenz@1:         text = self.stripentities(keepxmlentities=True)
cmlenz@1:         return Stream(sanitize(HTMLParser(StringIO(text)), None))
cmlenz@1: 
cmlenz@1: 
escape = Markup.escape


def unescape(text):
    """Undo the escaping of &, <, > and \" on a `Markup` value.

    Anything that is not a `Markup` instance is handed back untouched;
    `Markup` instances are converted through their own `unescape()` method.
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
cmlenz@1: 
cmlenz@1: 
class Namespace(object):
    """Factory for `QName` objects that share one namespace URI.

    Both item access (`ns['name']`) and attribute access (`ns.name`) produce
    the qualified name for `name` within this namespace.
    """

    def __init__(self, uri):
        """Remember the namespace URI.

        @param uri: the namespace URI used for every name that is built
        """
        self.uri = uri

    def __getitem__(self, name):
        """Return the `QName` for `name` in this namespace."""
        qualified = self.uri + '}' + name
        return QName(qualified)

    # Attribute lookups that fail the normal way are answered like item
    # access.
    __getattr__ = __getitem__

    def __repr__(self):
        return '<Namespace "%s">' % self.uri

    def __str__(self):
        """Return the namespace URI as stored."""
        return self.uri

    def __unicode__(self):
        """Return the namespace URI converted to `unicode`."""
        return unicode(self.uri)
cmlenz@1: 
cmlenz@1: 
cmlenz@1: class QName(unicode):
cmlenz@1:     """A qualified element or attribute name.
cmlenz@1:     
cmlenz@1:     The unicode value of instances of this class contains the qualified name of
cmlenz@1:     the element or attribute, in the form `{namespace}localname`. The namespace
cmlenz@1:     URI can be obtained through the additional `namespace` attribute, while the
cmlenz@1:     local name can be accessed through the `localname` attribute.
cmlenz@1:     """
cmlenz@1:     __slots__ = ['namespace', 'localname']
cmlenz@1: 
cmlenz@1:     def __new__(cls, qname):
cmlenz@1:         if isinstance(qname, QName):
cmlenz@1:             return qname
cmlenz@1: 
cmlenz@1:         parts = qname.split('}', 1)
cmlenz@1:         if qname.find('}') > 0:
cmlenz@1:             self = unicode.__new__(cls, '{' + qname)
cmlenz@1:             self.namespace = parts[0]
cmlenz@1:             self.localname = parts[1]
cmlenz@1:         else:
cmlenz@1:             self = unicode.__new__(cls, qname)
cmlenz@1:             self.namespace = None
cmlenz@1:             self.localname = qname
cmlenz@1:         return self