view genshi/filters/transform.py @ 501:3073ac688651

Added new markup transformation filter contributed by Alec Thomas (#122). This provides gorgeous jQuery-inspired stream transformation capabilities based on XPath expressions.
author cmlenz
date Mon, 04 Jun 2007 17:13:10 +0000
parents
children 53b478e3f3e2
line wrap: on
line source
# -*- coding: utf-8 -*-
#
# Copyright (C) 2006-2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.

"""A filter for generalised functional-style transformations of markup streams,
inspired by JQuery.
"""

import sys

from genshi.path import Path
from genshi.builder import Element
from genshi.core import Stream, Attrs, QName, TEXT, START, END

__all__ = ['Transformer', 'Injector', 'BEGIN', 'FINISH', 'INSIDE', 'OUTSIDE']


class TransformMark(str):
    """A mark on a transformation stream."""
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        return cls._instances.setdefault(val, str.__new__(cls, val))


BEGIN = TransformMark('BEGIN')
INSIDE = TransformMark('INSIDE')
OUTSIDE = TransformMark('OUTSIDE')
FINISH = TransformMark('FINISH')


class Transformer(object):
    """Stream filter that can apply a variety of different transformations to
    a stream.

    This is achieved by selecting the events to be transformed using XPath,
    then applying the transformations to the events matched by the path
    expression. Each marked event is in the form (mark, (kind, data, pos)),
    where mark can be any of `BEGIN`, `FINISH`, `INSIDE`, `OUTSIDE` or None.

    The first three marks match `START` and `END` events, and any events
    contained `INSIDE` any selected XML/HTML element. A non-element match
    outside a `START`/`END` container (e.g. ``text()``) will yield an `OUTSIDE`
    mark.

    >>> stream = HTML('<html><head><title>Some Title</title></head>'
    ...               '<body>Some <em>body</em> text.</body></html>')
    >>> short_stream = HTML('<body>Some <em>test</em> text</body>')

    Transformations act on selected stream events matching an XPath. Here's an
    example of removing some markup (title) selected by an expression:

    >>> print stream | Transformer('.//title').remove()
    <html><head/><body>Some <em>body</em> text.</body></html>

    Inserted content can be passed in the form of a string, or a Genshi event
    Stream, which includes ``genshi.builder.tag``:

    >>> from genshi.builder import tag
    >>> print stream | Transformer('.//body').prepend(tag.h1('Document Title'))
    <html><head><title>Some Title</title></head><body><h1>Document
    Title</h1>Some <em>body</em> text.</body></html>

    Each XPath expression determines the set of tags that will be acted upon by
    subsequent transformations. In this example we select the <title> text, copy
    it into a buffer, then select the <body> element and paste the copied text
    into the body as <h1> enclosed text:

    >>> buffer = []
    >>> print stream | Transformer('.//title/text()').copy(buffer) \
            .select('.//body').prepend(tag.h1(buffer))
    <html><head><title>Some Title</title></head><body><h1>Some Title</h1>Some
    <em>body</em> text.</body></html>

    Transformations can also be assigned and reused, although care must be
    taken when using buffers, to ensure that buffers are cleared between
    transforms:

    >>> emphasis = Transformer('.//em').setattr('class', 'emphasis')
    >>> print stream | emphasis
    <html><head><title>Some Title</title></head><body>Some <em
    class="emphasis">body</em> text.</body></html>
    >>> print HTML('<html><body><em>Emphasis</em></body></html>') | emphasis
    <html><body><em class="emphasis">Emphasis</em></body></html>
    """

    __slots__ = ('transforms',)

    def __init__(self, path=None):
        """Construct a new transformation filter.

        :param path: the XPath expression
        """
        self.transforms = []
        if path:
            self.transforms.append(Select(path))

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        :return: the transformed stream
        :rtype: `Stream`
        """
        transforms = self._mark(stream)
        for link in self.transforms:
            transforms = link(transforms)
        return Stream(self._unmark(transforms))

    def __or__(self, function):
        """Combine transformations.

        Transformations can be chained, similar to stream filters. Any callable
        accepting a marked stream can be used as a transform.

        As an example, here is a simple `TEXT` event upper-casing transform:

        >>> def upper(stream):
        ...     for mark, (kind, data, pos) in stream:
        ...         if mark and kind is TEXT:
        ...             yield mark, (kind, data.upper(), pos)
        ...         else:
        ...             yield mark, (kind, data, pos)
        >>> short_stream = HTML('<body>Some <em>test</em> text</body>')
        >>> print short_stream | (Transformer('.//em/text()') | upper)
        <body>Some <em>TEST</em> text</body>
        """
        transform = Transformer()
        transform.transforms = self.transforms[:]
        if isinstance(function, Transformer):
            transform.transforms.extend(function.transforms)
        else:
            transform.transforms.append(function)
        return transform

    #{ Selection operations

    def select(self, path):
        """Mark events matching the given XPath expression.

        >>> html = HTML('<body>Some <em>test</em> text</body>')
        >>> print html | Transformer().select('.//em').trace()
        (None, ('START', (QName(u'body'), Attrs()), (None, 1, 0)))
        (None, ('TEXT', u'Some ', (None, 1, 6)))
        ('BEGIN', ('START', (QName(u'em'), Attrs()), (None, 1, 11)))
        ('INSIDE', ('TEXT', u'test', (None, 1, 15)))
        ('FINISH', ('END', QName(u'em'), (None, 1, 19)))
        (None, ('TEXT', u' text', (None, 1, 24)))
        (None, ('END', QName(u'body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :return: the stream augmented by transformation marks
        :rtype: `Transformer`
        """
        return self | Select(path)

    def invert(self):
        """Invert selection so that marked events become unmarked, and vice
        versa.

        Specificaly, all marks are converted to null marks, and all null marks
        are converted to OUTSIDE marks.

        >>> html = HTML('<body>Some <em>test</em> text</body>')
        >>> print html | Transformer('//em').invert().trace()
        ('OUTSIDE', ('START', (QName(u'body'), Attrs()), (None, 1, 0)))
        ('OUTSIDE', ('TEXT', u'Some ', (None, 1, 6)))
        (None, ('START', (QName(u'em'), Attrs()), (None, 1, 11)))
        (None, ('TEXT', u'test', (None, 1, 15)))
        (None, ('END', QName(u'em'), (None, 1, 19)))
        ('OUTSIDE', ('TEXT', u' text', (None, 1, 24)))
        ('OUTSIDE', ('END', QName(u'body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :rtype: `Transformer`
        """
        return self | invert

    #{ Deletion operations

    def empty(self):
        """Empty selected elements of all content.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').empty()
        <html><head><title>Some Title</title></head><body>Some <em/>
        text.</body></html>

        :rtype: `Transformer`
        """
        return self | empty

    def remove(self):
        """Remove selection from the stream.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').remove()
        <html><head><title>Some Title</title></head><body>Some
        text.</body></html>

        :rtype: `Transformer`
        """
        return self | remove

    #{ Direct element operations

    def unwrap(self):
        """Remove outtermost enclosing elements from selection.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').unwrap()
        <html><head><title>Some Title</title></head><body>Some body
        text.</body></html>

        :rtype: `Transformer`
        """
        return self | unwrap


    def wrap(self, element):
        """Wrap selection in an element.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').wrap('strong')
        <html><head><title>Some Title</title></head><body>Some
        <strong><em>body</em></strong> text.</body></html>

        :param element: Either a string tag name or a Genshi builder element.
        :rtype: `Transformer`
        """
        return self | Wrap(element)

    #{ Content insertion operations

    def replace(self, content):
        """Replace selection with content.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//title/text()').replace('New Title')
        <html><head><title>New Title</title></head><body>Some <em>body</em>
        text.</body></html>

        :param content: Either an iterable of events or a string to insert.
        :rtype: `Transformer`
        """
        return self | Replace(content)

    def before(self, content):
        """Insert content before selection.

        In this example we insert the word 'emphasised' before the <em> opening
        tag:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').before('emphasised ')
        <html><head><title>Some Title</title></head><body>Some emphasised
        <em>body</em> text.</body></html>

        :param content: Either an iterable of events or a string to insert.
        :rtype: `Transformer`
        """
        return self | Before(content)

    def after(self, content):
        """Insert content after selection.

        Here, we insert some text after the </em> closing tag:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').after(' rock')
        <html><head><title>Some Title</title></head><body>Some <em>body</em>
        rock text.</body></html>

        :param content: Either an iterable of events or a string to insert.
        :rtype: `Transformer`
        """
        return self | After(content)

    def prepend(self, content):
        """Insert content after the BEGIN event of the selection.

        Inserting some new text at the start of the <body>:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//body').prepend('Some new body text. ')
        <html><head><title>Some Title</title></head><body>Some new body text.
        Some <em>body</em> text.</body></html>

        :param content: Either an iterable of events or a string to insert.
        :rtype: `Transformer`
        """
        return self | Prepend(content)

    def append(self, content):
        """Insert content before the END event of the selection.

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//body').append(' Some new body text.')
        <html><head><title>Some Title</title></head><body>Some <em>body</em>
        text. Some new body text.</body></html>

        :param content: Either an iterable of events or a string to insert.
        :rtype: `Transformer`
        """
        return self | Append(content)

    #{ Attribute manipulation

    def setattr(self, key, value):
        """Add or replace an attribute to selected elements.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em').setattr('class', 'emphasis')
        <html><head><title>Some Title</title></head><body>Some <em
        class="emphasis">body</em> text.</body></html>

        :rtype: `Transformer`
        """
        return self | SetAttr(key, value)

    def delattr(self, key):
        """Delete an attribute from selected elements.

        Example:

        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em class="emphasis">body</em> '
        ...             'text.</body></html>')
        >>> print html | Transformer('.//*[@class="emphasis"]').delattr('class')
        <html><head><title>Some Title</title></head><body>Some <em>body</em>
        text.</body></html>

        :rtype: `Transformer`
        """
        return self | DelAttr(key)

    #{ Buffer operations

    def copy(self, buffer):
        """Copy selection into buffer.

        >>> from genshi.builder import tag
        >>> buffer = []
        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//title/text()').copy(buffer) \\
        ...     .select('.//body').prepend(tag.h1(buffer))
        <html><head><title>Some Title</title></head><body><h1>Some Title</h1>Some
        <em>body</em> text.</body></html>

        :param buffer: a list-like object (must support .append() and be
                       iterable) where the selection will be buffered.
        :rtype: `Transformer`
        :note: this transformation will buffer the entire input stream
        """
        return self | Copy(buffer)

    def cut(self, buffer):
        """Copy selection into buffer and remove the selection from the stream.

        >>> from genshi.builder import tag
        >>> buffer = []
        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...             '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('.//em/text()').cut(buffer) \\
        ...     .select('.//em').after(tag.h1(buffer))
        <html><head><title>Some Title</title></head><body>Some
        <em/><h1>body</h1> text.</body></html>

        :param buffer: a list-like object (must support .append() and be
                       iterable) where the selection will be buffered.
        :rtype: `Transformer`
        :note: this transformation will buffer the entire input stream
        """
        return self | Cut(buffer)

    #{ Miscellaneous operations

    def apply(self, function, kind):
        """Apply a function to the ``data`` element of events of ``kind`` in
        the selection.

        >>> import string
        >>> html = HTML('<html><head><title>Some Title</title></head>'
        ...               '<body>Some <em>body</em> text.</body></html>')
        >>> print html | Transformer('head/title').apply(string.upper, TEXT)
        <html><head><title>SOME TITLE</title></head><body>Some <em>body</em>
        text.</body></html>

        :param function: the function to apply
        :param kind: the kind of event the function should be applied to
        :rtype: `Transformer`
        """
        return self | Apply(function, kind)

    def trace(self, prefix='', fileobj=None):
        """Print events as they pass through the transform.

        >>> html = HTML('<body>Some <em>test</em> text</body>')
        >>> print html | Transformer('em').trace()
        (None, ('START', (QName(u'body'), Attrs()), (None, 1, 0)))
        (None, ('TEXT', u'Some ', (None, 1, 6)))
        ('BEGIN', ('START', (QName(u'em'), Attrs()), (None, 1, 11)))
        ('INSIDE', ('TEXT', u'test', (None, 1, 15)))
        ('FINISH', ('END', QName(u'em'), (None, 1, 19)))
        (None, ('TEXT', u' text', (None, 1, 24)))
        (None, ('END', QName(u'body'), (None, 1, 29)))
        <body>Some <em>test</em> text</body>

        :param prefix: a string to prefix each event with in the output
        :param fileobj: the writable file-like object to write to; defaults to
                        the standard output stream
        :rtype: `Transformer`
        """
        return self | Trace(prefix, fileobj=fileobj)

    # Internal methods

    def _mark(self, stream):
        for event in stream:
            yield None, event

    def _unmark(self, stream):
        for mark, event in stream:
            yield event


class Select(object):
    """Select and mark events that match an XPath expression."""
    def __init__(self, path):
        """Create selection.

        :param path: XPath expression.
        """
        self.path = Path(path)

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        namespaces = {}
        variables = {}
        test = self.path.test()
        stream = iter(stream)
        for mark, event in stream:
            result = test(event, {}, {})
            if result is True:
                if event[0] is START:
                    yield BEGIN, event
                    depth = 1
                    while depth > 0:
                        mark, subevent = stream.next()
                        if subevent[0] is START:
                            depth += 1
                        elif subevent[0] is END:
                            depth -= 1
                        if depth == 0:
                            yield FINISH, subevent
                        else:
                            yield INSIDE, subevent
                        test(subevent, {}, {}, updateonly=True)
                else:
                    yield OUTSIDE, event
            elif result:
                yield BEGIN, result
            else:
                yield None, event


def invert(stream):
    """Invert selection so that marked events become unmarked, and vice versa.

    Specificaly, all input marks are converted to null marks, and all input
    null marks are converted to OUTSIDE marks.

    :param stream: The marked event stream to filter
    """
    for mark, event in stream:
        if mark:
            yield None, event
        else:
            yield OUTSIDE, event

def empty(stream):
    """Empty selected elements of all content.

    :param stream: The marked event stream to filter
    """
    for mark, event in stream:
        if mark not in (INSIDE, OUTSIDE):
            yield mark, event

def remove(stream):
    """Remove selection from the stream.

    :param stream: The marked event stream to filter
    """
    for mark, event in stream:
        if mark is None:
            yield mark, event

def unwrap(stream):
    """Remove outtermost enclosing elements from selection.

    :param stream: The marked event stream to filter
    """
    for mark, event in stream:
        if mark not in (BEGIN, FINISH):
            yield mark, event


class Wrap(object):
    """Wrap selection in an element."""

    def __init__(self, element):
        if isinstance(element, Element):
            self.element = element
        else:
            self.element = Element(element)

    def __call__(self, stream):
        for mark, event in stream:
            if mark:
                element = list(self.element.generate())
                for prefix in element[:-1]:
                    yield None, prefix
                yield mark, event
                while True:
                    mark, event = stream.next()
                    if not mark:
                        break
                    yield mark, event
                yield None, element[-1]
                yield mark, event
            else:
                yield mark, event


class Trace(object):
    """Print events as they pass through the transform."""

    def __init__(self, prefix='', fileobj=None):
        """Trace constructor.

        :param prefix: text to prefix each traced line with.
        :param fileobj: the writable file-like object to write to
        """
        self.prefix = prefix
        self.fileobj = fileobj or sys.stdout

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for event in stream:
            print>>self.fileobj, self.prefix + str(event)
            yield event


class Apply(object):
    """Apply a function to the `data` element of events of ``kind`` in the
    selection.
    """

    def __init__(self, function, kind):
        """Create the transform.

        :param function: The function to apply. The function must take one
                         argument, the `data` element of each selected event.
        :param kind: The Genshi event `kind` to apply ``function`` to.
        """
        self.function = function
        self.kind = kind

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, (kind, data, pos) in stream:
            if mark and kind == self.kind:
                yield mark, (kind, self.function(data), pos)
            else:
                yield mark, (kind, data, pos)


class Injector(object):
    """Abstract base class for transformations that inject content into a
    stream.

    >>> class Top(Injector):
    ...     def __call__(self, stream):
    ...         for event in self._inject():
    ...             yield event
    ...         for event in stream:
    ...             yield event
    >>> html = HTML('<body>Some <em>test</em> text</body>')
    >>> print html | (Transformer('.//em') | Top('Prefix '))
    Prefix <body>Some <em>test</em> text</body>
    """
    def __init__(self, content):
        """Create a new injector.

        :param content: An iterable of Genshi stream events, or a string to be
                        injected.
        """
        self.content = content

    def _inject(self):
        if isinstance(self.content, basestring):
            yield None, (TEXT, self.content, (None, -1, -1))
        else:
            for event in self.content:
                yield None, event


class Replace(Injector):
    """Replace selection with content."""
    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, event in stream:
            if mark is not None:
                for subevent in self._inject():
                    yield subevent
                while True:
                    mark, event = stream.next()
                    if mark is None:
                        yield mark, event
                        break
            else:
                yield mark, event


class Before(Injector):
    """Insert content before selection."""
    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, event in stream:
            if mark in (BEGIN, OUTSIDE):
                for subevent in self._inject():
                    yield subevent
            yield mark, event


class After(Injector):
    """Insert content after selection."""
    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, event in stream:
            yield mark, event
            if mark:
                while True:
                    mark, event = stream.next()
                    if not mark:
                        break
                    yield mark, event
                for subevent in self._inject():
                    yield subevent
                yield mark, event


class Prepend(Injector):
    """Prepend content to the inside of selected elements."""
    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, event in stream:
            yield mark, event
            if mark in (BEGIN, OUTSIDE):
                for subevent in self._inject():
                    yield subevent


class Append(Injector):
    """Append content after the content of selected elements."""
    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, event in stream:
            yield mark, event
            if mark is BEGIN:
                while True:
                    mark, event = stream.next()
                    if mark is FINISH:
                        break
                    yield mark, event
                for subevent in self._inject():
                    yield subevent
                yield mark, event


class SetAttr(object):
    """Set an attribute on selected elements."""
    def __init__(self, key, value):
        """Construct transform.

        :param key: Attribute to set.
        :param value: Value of attribute.
        """
        self.key = key
        self.value = value

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, (kind, data, pos) in stream:
            if mark is BEGIN:
                data = (data[0], data[1] | [(QName(self.key), self.value)])
            yield mark, (kind, data, pos)


class DelAttr(object):
    """Delete an attribute of selected elements."""
    def __init__(self, key):
        """Construct transform.

        :param key: The attribute to remove."""
        self.key = key

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        for mark, (kind, data, pos) in stream:
            if mark is BEGIN:
                data = (data[0], data[1] - self.key)
            yield mark, (kind, data, pos)


class Copy(object):
    """Copy selected events into a buffer for later insertion."""
    def __init__(self, buffer):
        """Create a Copy transform filter.

        :param buffer: A list-like object (must support .append() and be
                       iterable) where the buffered events will be stored.
        """
        self.buffer = buffer

    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        stream = list(stream)
        for mark, event in stream:
            if mark:
                self.buffer.append(event)
        return stream


class Cut(Copy):
    """Cut selected events into a buffer for later insertion and remove the
    selection."""
    def __call__(self, stream):
        """Apply the transform filter to the marked stream.

        :param stream: The marked event stream to filter
        """
        stream = Copy.__call__(self, stream)
        return remove(stream)


if __name__ == '__main__':
    import doctest
    from genshi.input import HTML
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE,
                    extraglobs={'HTML': HTML})
Copyright (C) 2012-2017 Edgewall Software