diff genshi/filters/transform.py @ 784:67d324a62cc0 experimental-match-fastpaths

Update to the 0.5.x branch, up through r907. I don't know how this fits in with the SoC work, but I wanted to do due diligence and keep this branch working in case it someday gets considered for trunk.
author aflett
date Mon, 21 Jul 2008 23:17:52 +0000
parents b57681255af9
children
line wrap: on
line diff
--- a/genshi/filters/transform.py
+++ b/genshi/filters/transform.py
@@ -55,7 +55,7 @@
 from genshi.path import Path
 
 __all__ = ['Transformer', 'StreamBuffer', 'InjectorTransformation', 'ENTER',
-           'EXIT', 'INSIDE', 'OUTSIDE']
+           'EXIT', 'INSIDE', 'OUTSIDE', 'BREAK']
 
 
 class TransformMark(str):
@@ -86,6 +86,40 @@
 """Stream augmentation mark indicating that a selected element is being
 exited."""
 
+BREAK = TransformMark('BREAK')
+"""Stream augmentation mark indicating a break between two otherwise contiguous
+blocks of marked events.
+
+This is used primarily by the cut() transform to provide later transforms with
+an opportunity to operate on the cut buffer.
+"""
+
+
+class PushBackStream(object):
+    """Allows a single event to be pushed back onto the stream and re-consumed.
+    """
+    def __init__(self, stream):
+        self.stream = iter(stream)
+        self.peek = None
+
+    def push(self, event):
+        assert self.peek is None
+        self.peek = event
+
+    def __iter__(self):
+        while True:
+            if self.peek is not None:
+                peek = self.peek
+                self.peek = None
+                yield peek
+            else:
+                try:
+                    event = self.stream.next()
+                    yield event
+                except StopIteration:
+                    if self.peek is None:
+                        raise
+
 
 class Transformer(object):
     """Stream filter that can apply a variety of different transformations to
@@ -150,17 +184,21 @@
         """
         self.transforms = [SelectTransformation(path)]
 
-    def __call__(self, stream):
+    def __call__(self, stream, keep_marks=False):
         """Apply the transform filter to the marked stream.
 
         :param stream: the marked event stream to filter
+        :param keep_marks: Do not strip transformer selection marks from the
+                           stream. Useful for testing.
         :return: the transformed stream
         :rtype: `Stream`
         """
         transforms = self._mark(stream)
         for link in self.transforms:
             transforms = link(transforms)
-        return Stream(self._unmark(transforms),
+        if not keep_marks:
+            transforms = self._unmark(transforms)
+        return Stream(transforms,
                       serializer=getattr(stream, 'serializer', None))
 
     def apply(self, function):
@@ -329,7 +367,8 @@
         <html><head><title>New Title</title></head><body>Some <em>body</em>
         text.</body></html>
 
-        :param content: Either an iterable of events or a string to insert.
+        :param content: Either a callable, an iterable of events, or a string
+                        to insert.
         :rtype: `Transformer`
         """
         return self.apply(ReplaceTransformation(content))
@@ -346,7 +385,8 @@
         <html><head><title>Some Title</title></head><body>Some emphasised
         <em>body</em> text.</body></html>
 
-        :param content: Either an iterable of events or a string to insert.
+        :param content: Either a callable, an iterable of events, or a string
+                        to insert.
         :rtype: `Transformer`
         """
         return self.apply(BeforeTransformation(content))
@@ -362,7 +402,8 @@
         <html><head><title>Some Title</title></head><body>Some <em>body</em>
         rock text.</body></html>
 
-        :param content: Either an iterable of events or a string to insert.
+        :param content: Either a callable, an iterable of events, or a string
+                        to insert.
         :rtype: `Transformer`
         """
         return self.apply(AfterTransformation(content))
@@ -378,7 +419,8 @@
         <html><head><title>Some Title</title></head><body>Some new body text.
         Some <em>body</em> text.</body></html>
 
-        :param content: Either an iterable of events or a string to insert.
+        :param content: Either a callable, an iterable of events, or a string
+                        to insert.
         :rtype: `Transformer`
         """
         return self.apply(PrependTransformation(content))
@@ -392,7 +434,8 @@
         <html><head><title>Some Title</title></head><body>Some <em>body</em>
         text. Some new body text.</body></html>
 
-        :param content: Either an iterable of events or a string to insert.
+        :param content: Either a callable, an iterable of events, or a string
+                        to insert.
         :rtype: `Transformer`
         """
         return self.apply(AppendTransformation(content))
@@ -440,9 +483,13 @@
 
     #{ Buffer operations
 
-    def copy(self, buffer):
+    def copy(self, buffer, accumulate=False):
         """Copy selection into buffer.
 
+        The buffer is replaced by each *contiguous* selection before being passed
+        to the next transformation. If accumulate=True, further selections will
+        be appended to the buffer rather than replacing it.
+
         >>> from genshi.builder import tag
         >>> buffer = StreamBuffer()
         >>> html = HTML('<html><head><title>Some Title</title></head>'
@@ -452,17 +499,14 @@
         <html><head><title>Some Title</title></head><body><h1>Some
         Title</h1>Some <em>body</em> text.</body></html>
 
-        To ensure that a transformation can be reused deterministically, the
-        contents of ``buffer`` is replaced by the ``copy()`` operation:
+        This example illustrates that only a single contiguous selection will
+        be buffered:
 
-        >>> print buffer
-        Some Title
         >>> print html | Transformer('head/title/text()').copy(buffer) \\
         ...     .end().select('body/em').copy(buffer).end().select('body') \\
         ...     .prepend(tag.h1(buffer))
-        <html><head><title>Some
-        Title</title></head><body><h1><em>body</em></h1>Some <em>body</em>
-        text.</body></html>
+        <html><head><title>Some Title</title></head><body><h1>Some
+        Title</h1>Some <em>body</em> text.</body></html>
         >>> print buffer
         <em>body</em>
 
@@ -475,7 +519,8 @@
         >>> def apply_attr(name, entry):
         ...     return list(buffer)[0][1][1].get('class')
         >>> print html | Transformer('body/em[@class]/@class').copy(buffer) \\
-        ...     .end().select('body/em[not(@class)]').attr('class', apply_attr)
+        ...     .end().buffer().select('body/em[not(@class)]') \\
+        ...     .attr('class', apply_attr)
         <html><head><title>Some Title</title></head><body><em
         class="before">Some</em> <em class="before">body</em><em
         class="before">text</em>.</body></html>
@@ -484,11 +529,18 @@
         :param buffer: the `StreamBuffer` in which the selection should be
                        stored
         :rtype: `Transformer`
-        :note: this transformation will buffer the entire input stream
+        :note: Copy (and cut) copy each individual selected object into the
+               buffer before passing to the next transform. For example, the
+               XPath ``*|text()`` will select all elements and text, each
+               instance of which will be copied to the buffer individually
+               before passing to the next transform. This has implications for
+               how ``StreamBuffer`` objects can be used, so some
+               experimentation may be required.
+
         """
-        return self.apply(CopyTransformation(buffer))
+        return self.apply(CopyTransformation(buffer, accumulate))
 
-    def cut(self, buffer):
+    def cut(self, buffer, accumulate=False):
         """Copy selection into buffer and remove the selection from the stream.
 
         >>> from genshi.builder import tag
@@ -500,12 +552,40 @@
         <html><head><title>Some Title</title></head><body>Some
         <em/><h1>body</h1> text.</body></html>
 
+        Specifying accumulate=True appends all selected intervals onto the
+        buffer. Combining this with the .buffer() operation allows us to
+        operate on all copied events rather than per-segment. See the
+        documentation on buffer() for more information.
+
         :param buffer: the `StreamBuffer` in which the selection should be
                        stored
         :rtype: `Transformer`
         :note: this transformation will buffer the entire input stream
         """
-        return self.apply(CutTransformation(buffer))
+        return self.apply(CutTransformation(buffer, accumulate))
+
+    def buffer(self):
+        """Buffer the entire stream (can consume a considerable amount of
+        memory).
+
+        Useful in conjunction with copy(accumulate=True) and
+        cut(accumulate=True) to ensure that all marked events in the entire
+        stream are copied to the buffer before further transformations are
+        applied.
+
+        For example, to move all <note> elements inside a <notes> tag at the
+        top of the document:
+
+        >>> doc = HTML('<doc><notes></notes><body>Some <note>one</note> '
+        ...            'text <note>two</note>.</body></doc>')
+        >>> buffer = StreamBuffer()
+        >>> print doc | Transformer('body/note').cut(buffer, accumulate=True) \\
+        ...     .end().buffer().select('notes').prepend(buffer)
+        <doc><notes><note>one</note><note>two</note></notes><body>Some  text
+        .</body></doc>
+
+        """
+        return self.apply(list)
 
     #{ Miscellaneous operations
 
@@ -546,13 +626,17 @@
         Refer to the documentation for ``re.sub()`` for details.
 
         >>> html = HTML('<html><body>Some text, some more text and '
-        ...             '<b>some bold text</b></body></html>')
-        >>> print html | Transformer('body').substitute('(?i)some', 'SOME')
-        <html><body>SOME text, some more text and <b>SOME bold text</b></body></html>
-        >>> tags = tag.html(tag.body('Some text, some more text and ',
+        ...             '<b>some bold text</b>\\n'
+        ...             '<i>some italicised text</i></body></html>')
+        >>> print html | Transformer('body/b').substitute('(?i)some', 'SOME')
+        <html><body>Some text, some more text and <b>SOME bold text</b>
+        <i>some italicised text</i></body></html>
+        >>> tags = tag.html(tag.body('Some text, some more text and\\n',
         ...      Markup('<b>some bold text</b>')))
-        >>> print tags.generate() | Transformer('body').substitute('(?i)some', 'SOME')
-        <html><body>SOME text, some more text and <b>SOME bold text</b></body></html>
+        >>> print tags.generate() | Transformer('body').substitute(
+        ...     '(?i)some', 'SOME')
+        <html><body>SOME text, some more text and
+        <b>SOME bold text</b></body></html>
 
         :param pattern: A regular expression object or string.
         :param replace: Replacement pattern.
@@ -600,7 +684,8 @@
 
     def _unmark(self, stream):
         for mark, event in stream:
-            if event[0] is not None:
+            kind = event[0]
+            if not (kind is None or kind is ATTR or kind is BREAK):
                 yield event
 
 
@@ -652,9 +737,12 @@
             elif isinstance(result, Attrs):
                 # XXX  Selected *attributes* are given a "kind" of None to
                 # indicate they are not really part of the stream.
-                yield ATTR, (None, (QName(event[1][0] + '@*'), result), event[2])
+                yield ATTR, (ATTR, (QName(event[1][0] + '@*'), result), event[2])
                 yield None, event
+            elif isinstance(result, tuple):
+                yield OUTSIDE, result
             elif result:
+                # XXX Assume everything else is "text"?
                 yield None, (TEXT, unicode(result), (None, -1, -1))
             else:
                 yield None, event
@@ -700,8 +788,12 @@
         :param stream: the marked event stream to filter
         """
         for mark, event in stream:
-            if mark not in (INSIDE, OUTSIDE):
-                yield mark, event
+            yield mark, event
+            if mark is ENTER:
+                for mark, event in stream:
+                    if mark is EXIT:
+                        yield mark, event
+                        break
 
 
 class RemoveTransformation(object):
@@ -746,16 +838,21 @@
                 for prefix in element[:-1]:
                     yield None, prefix
                 yield mark, event
-                while True:
-                    try:
-                        mark, event = stream.next()
-                    except StopIteration:
-                        yield None, element[-1]
+                start = mark
+                stopped = False
+                for mark, event in stream:
+                    if start is ENTER and mark is EXIT:
+                        yield mark, event
+                        stopped = True
+                        break
                     if not mark:
                         break
                     yield mark, event
+                else:
+                    stopped = True
                 yield None, element[-1]
-                yield mark, event
+                if not stopped:
+                    yield mark, event
             else:
                 yield mark, event
 
@@ -784,7 +881,7 @@
 
 class FilterTransformation(object):
     """Apply a normal stream filter to the selection. The filter is called once
-    for each contiguous block of marked events."""
+    for each selection."""
 
     def __init__(self, filter):
         """Create the transform.
@@ -806,14 +903,31 @@
 
         queue = []
         for mark, event in stream:
-            if mark:
+            if mark is ENTER:
                 queue.append(event)
-            else:
+                for mark, event in stream:
+                    queue.append(event)
+                    if mark is EXIT:
+                        break
                 for queue_event in flush(queue):
                     yield queue_event
-                yield None, event
-        for event in flush(queue):
-            yield event
+            elif mark is OUTSIDE:
+                stopped = True
+                queue.append(event)
+                for mark, event in stream:
+                    if mark is not OUTSIDE:
+                        break
+                    queue.append(event)
+                else:
+                    stopped = True
+                for queue_event in flush(queue):
+                    yield queue_event
+                if not stopped:
+                    yield None, event
+            else:
+                yield mark, event
+        for queue_event in flush(queue):
+            yield queue_event
 
 
 class MapTransformation(object):
@@ -848,7 +962,7 @@
 
     Refer to the documentation for ``re.sub()`` for details.
     """
-    def __init__(self, pattern, replace, count=1):
+    def __init__(self, pattern, replace, count=0):
         """Create the transform.
 
         :param pattern: A regular expression object, or string.
@@ -868,7 +982,7 @@
         :param stream: The marked event stream to filter
         """
         for mark, (kind, data, pos) in stream:
-            if kind is TEXT:
+            if mark is not None and kind is TEXT:
                 new_data = self.pattern.sub(self.replace, data, self.count)
                 if isinstance(data, Markup):
                     data = Markup(new_data)
@@ -922,7 +1036,10 @@
         self.content = content
 
     def _inject(self):
-        for event in _ensure(self.content):
+        content = self.content
+        if callable(content):
+            content = content()
+        for event in _ensure(content):
             yield None, event
 
 
@@ -934,14 +1051,18 @@
 
         :param stream: The marked event stream to filter
         """
+        stream = PushBackStream(stream)
         for mark, event in stream:
             if mark is not None:
+                start = mark
                 for subevent in self._inject():
                     yield subevent
-                while True:
-                    mark, event = stream.next()
-                    if mark is None:
-                        yield mark, event
+                for mark, event in stream:
+                    if start is ENTER:
+                        if mark is EXIT:
+                            break
+                    elif mark != start:
+                        stream.push((mark, event))
                         break
             else:
                 yield mark, event
@@ -955,17 +1076,22 @@
 
         :param stream: The marked event stream to filter
         """
+        stream = PushBackStream(stream)
         for mark, event in stream:
             if mark is not None:
+                start = mark
                 for subevent in self._inject():
                     yield subevent
                 yield mark, event
-                while True:
-                    mark, event = stream.next()
-                    if not mark:
+                for mark, event in stream:
+                    if mark != start and start is not ENTER:
+                        stream.push((mark, event))
                         break
                     yield mark, event
-            yield mark, event
+                    if start is ENTER and mark is EXIT:
+                        break
+            else:
+                yield mark, event
 
 
 class AfterTransformation(InjectorTransformation):
@@ -976,20 +1102,20 @@
 
         :param stream: The marked event stream to filter
         """
+        stream = PushBackStream(stream)
         for mark, event in stream:
             yield mark, event
             if mark:
-                while True:
-                    try:
-                        mark, event = stream.next()
-                    except StopIteration:
-                        break
-                    if not mark:
+                start = mark
+                for mark, event in stream:
+                    if start is not ENTER and mark != start:
+                        stream.push((mark, event))
                         break
                     yield mark, event
+                    if start is ENTER and mark is EXIT:
+                        break
                 for subevent in self._inject():
                     yield subevent
-                yield mark, event
 
 
 class PrependTransformation(InjectorTransformation):
@@ -1002,7 +1128,7 @@
         """
         for mark, event in stream:
             yield mark, event
-            if mark in (ENTER, OUTSIDE):
+            if mark is ENTER:
                 for subevent in self._inject():
                     yield subevent
 
@@ -1018,8 +1144,7 @@
         for mark, event in stream:
             yield mark, event
             if mark is ENTER:
-                while True:
-                    mark, event = stream.next()
+                for mark, event in stream:
                     if mark is EXIT:
                         break
                     yield mark, event
@@ -1076,32 +1201,50 @@
         self.events.append(event)
 
     def reset(self):
-        """Reset the buffer so that it's empty."""
+        """Empty the buffer of events."""
         del self.events[:]
 
 
 class CopyTransformation(object):
     """Copy selected events into a buffer for later insertion."""
 
-    def __init__(self, buffer):
+    def __init__(self, buffer, accumulate=False):
         """Create the copy transformation.
 
         :param buffer: the `StreamBuffer` in which the selection should be
                        stored
         """
+        if not accumulate:
+            buffer.reset()
         self.buffer = buffer
+        self.accumulate = accumulate
 
     def __call__(self, stream):
         """Apply the transformation to the marked stream.
 
         :param stream: the marked event stream to filter
         """
-        self.buffer.reset()
-        stream = list(stream)
+        stream = PushBackStream(stream)
+
         for mark, event in stream:
             if mark:
+                if not self.accumulate:
+                    self.buffer.reset()
+                events = [(mark, event)]
                 self.buffer.append(event)
-        return stream
+                start = mark
+                for mark, event in stream:
+                    if start is not ENTER and mark != start:
+                        stream.push((mark, event))
+                        break
+                    events.append((mark, event))
+                    self.buffer.append(event)
+                    if start is ENTER and mark is EXIT:
+                        break
+                for i in events:
+                    yield i
+            else:
+                yield mark, event
 
 
 class CutTransformation(object):
@@ -1109,36 +1252,58 @@
     selection.
     """
 
-    def __init__(self, buffer):
+    def __init__(self, buffer, accumulate=False):
         """Create the cut transformation.
 
         :param buffer: the `StreamBuffer` in which the selection should be
                        stored
         """
         self.buffer = buffer
+        self.accumulate = accumulate
+
 
     def __call__(self, stream):
         """Apply the transform filter to the marked stream.
 
         :param stream: the marked event stream to filter
         """
-        out_stream = []
-        attributes = None
-        for mark, (kind, data, pos) in stream:
-            if attributes:
-                assert kind is START
-                data = (data[0], data[1] - attributes)
-                attributes = None
+        attributes = []
+        stream = PushBackStream(stream)
+        broken = False
+        if not self.accumulate:
+            self.buffer.reset()
+        for mark, event in stream:
             if mark:
-                # There is some magic here. ATTR marked events are pushed into
-                # the stream *before* the START event they originated from.
-                # This allows cut() to strip out the attributes from START
-                # event as would be expected.
+                # Emit BREAK if no event was sent since the previous selection
+                if not self.accumulate:
+                    if not broken and self.buffer:
+                        yield BREAK, (BREAK, None, None)
+                    self.buffer.reset()
+                self.buffer.append(event)
+                start = mark
                 if mark is ATTR:
-                    self.buffer.append((kind, data, pos))
-                    attributes = [name for name, _ in data[1]]
-                else:
-                    self.buffer.append((kind, data, pos))
+                    attributes.extend([name for name, _ in event[1][1]])
+                for mark, event in stream:
+                    if start is mark is ATTR:
+                        attributes.extend([name for name, _ in event[1][1]])
+                    # Handle non-element contiguous selection
+                    if start is not ENTER and mark != start:
+                        # Operating on the attributes of a START event
+                        if start is ATTR:
+                            kind, data, pos = event
+                            assert kind is START
+                            data = (data[0], data[1] - attributes)
+                            attributes = None
+                            stream.push((mark, (kind, data, pos)))
+                        else:
+                            stream.push((mark, event))
+                        break
+                    self.buffer.append(event)
+                    if start is ENTER and mark is EXIT:
+                        break
+                broken = False
             else:
-                out_stream.append((mark, (kind, data, pos)))
-        return out_stream
+                broken = True
+                yield mark, event
+        if not broken and self.buffer:
+            yield BREAK, (BREAK, None, None)
Copyright (C) 2012-2017 Edgewall Software