Mercurial > genshi > genshi-test
diff genshi/filters/transform.py @ 784:67d324a62cc0 experimental-match-fastpaths
update to 0.5.x branch, up through r907
don't know how this fits in with SoC work, but I wanted to do due diligence and keep this branch working in case it someday gets considered for trunk
author | aflett |
---|---|
date | Mon, 21 Jul 2008 23:17:52 +0000 |
parents | b57681255af9 |
children |
line wrap: on
line diff
--- a/genshi/filters/transform.py +++ b/genshi/filters/transform.py @@ -55,7 +55,7 @@ from genshi.path import Path __all__ = ['Transformer', 'StreamBuffer', 'InjectorTransformation', 'ENTER', - 'EXIT', 'INSIDE', 'OUTSIDE'] + 'EXIT', 'INSIDE', 'OUTSIDE', 'BREAK'] class TransformMark(str): @@ -86,6 +86,40 @@ """Stream augmentation mark indicating that a selected element is being exited.""" +BREAK = TransformMark('BREAK') +"""Stream augmentation mark indicating a break between two otherwise contiguous +blocks of marked events. + +This is used primarily by the cut() transform to provide later transforms with +an opportunity to operate on the cut buffer. +""" + + +class PushBackStream(object): + """Allows a single event to be pushed back onto the stream and re-consumed. + """ + def __init__(self, stream): + self.stream = iter(stream) + self.peek = None + + def push(self, event): + assert self.peek is None + self.peek = event + + def __iter__(self): + while True: + if self.peek is not None: + peek = self.peek + self.peek = None + yield peek + else: + try: + event = self.stream.next() + yield event + except StopIteration: + if self.peek is None: + raise + class Transformer(object): """Stream filter that can apply a variety of different transformations to @@ -150,17 +184,21 @@ """ self.transforms = [SelectTransformation(path)] - def __call__(self, stream): + def __call__(self, stream, keep_marks=False): """Apply the transform filter to the marked stream. :param stream: the marked event stream to filter + :param keep_marks: Do not strip transformer selection marks from the + stream. Useful for testing. :return: the transformed stream :rtype: `Stream` """ transforms = self._mark(stream) for link in self.transforms: transforms = link(transforms) - return Stream(self._unmark(transforms), + if not keep_marks: + transforms = self._unmark(transforms) + return Stream(transforms, serializer=getattr(stream, 'serializer', None)) def apply(self, function): @@ -329,7 +367,8 @@ <html><head><title>New Title</title></head><body>Some <em>body</em> text.</body></html> - :param content: Either an iterable of events or a string to insert. + :param content: Either a callable, an iterable of events, or a string + to insert. :rtype: `Transformer` """ return self.apply(ReplaceTransformation(content)) @@ -346,7 +385,8 @@ <html><head><title>Some Title</title></head><body>Some emphasised <em>body</em> text.</body></html> - :param content: Either an iterable of events or a string to insert. + :param content: Either a callable, an iterable of events, or a string + to insert. :rtype: `Transformer` """ return self.apply(BeforeTransformation(content)) @@ -362,7 +402,8 @@ <html><head><title>Some Title</title></head><body>Some <em>body</em> rock text.</body></html> - :param content: Either an iterable of events or a string to insert. + :param content: Either a callable, an iterable of events, or a string + to insert. :rtype: `Transformer` """ return self.apply(AfterTransformation(content)) @@ -378,7 +419,8 @@ <html><head><title>Some Title</title></head><body>Some new body text. Some <em>body</em> text.</body></html> - :param content: Either an iterable of events or a string to insert. + :param content: Either a callable, an iterable of events, or a string + to insert. :rtype: `Transformer` """ return self.apply(PrependTransformation(content)) @@ -392,7 +434,8 @@ <html><head><title>Some Title</title></head><body>Some <em>body</em> text. Some new body text.</body></html> - :param content: Either an iterable of events or a string to insert. + :param content: Either a callable, an iterable of events, or a string + to insert. :rtype: `Transformer` """ return self.apply(AppendTransformation(content)) @@ -440,9 +483,13 @@ #{ Buffer operations - def copy(self, buffer): + def copy(self, buffer, accumulate=False): """Copy selection into buffer. + The buffer is replaced by each *contiguous* selection before being passed + to the next transformation. If accumulate=True, further selections will + be appended to the buffer rather than replacing it. + >>> from genshi.builder import tag >>> buffer = StreamBuffer() >>> html = HTML('<html><head><title>Some Title</title></head>' @@ -452,17 +499,14 @@ <html><head><title>Some Title</title></head><body><h1>Some Title</h1>Some <em>body</em> text.</body></html> - To ensure that a transformation can be reused deterministically, the - contents of ``buffer`` is replaced by the ``copy()`` operation: + This example illustrates that only a single contiguous selection will + be buffered: - >>> print buffer - Some Title >>> print html | Transformer('head/title/text()').copy(buffer) \\ ... .end().select('body/em').copy(buffer).end().select('body') \\ ... .prepend(tag.h1(buffer)) - <html><head><title>Some - Title</title></head><body><h1><em>body</em></h1>Some <em>body</em> - text.</body></html> + <html><head><title>Some Title</title></head><body><h1>Some + Title</h1>Some <em>body</em> text.</body></html> >>> print buffer <em>body</em> @@ -475,7 +519,8 @@ >>> def apply_attr(name, entry): ... return list(buffer)[0][1][1].get('class') >>> print html | Transformer('body/em[@class]/@class').copy(buffer) \\ - ... .end().select('body/em[not(@class)]').attr('class', apply_attr) + ... .end().buffer().select('body/em[not(@class)]') \\ + ... .attr('class', apply_attr) <html><head><title>Some Title</title></head><body><em class="before">Some</em> <em class="before">body</em><em class="before">text</em>.</body></html> @@ -484,11 +529,18 @@ :param buffer: the `StreamBuffer` in which the selection should be stored :rtype: `Transformer` - :note: this transformation will buffer the entire input stream + :note: Copy (and cut) copy each individual selected object into the + buffer before passing to the next transform. For example, the + XPath ``*|text()`` will select all elements and text, each + instance of which will be copied to the buffer individually + before passing to the next transform. This has implications for + how ``StreamBuffer`` objects can be used, so some + experimentation may be required. + """ - return self.apply(CopyTransformation(buffer)) + return self.apply(CopyTransformation(buffer, accumulate)) - def cut(self, buffer): + def cut(self, buffer, accumulate=False): """Copy selection into buffer and remove the selection from the stream. >>> from genshi.builder import tag @@ -500,12 +552,40 @@ <html><head><title>Some Title</title></head><body>Some <em/><h1>body</h1> text.</body></html> + Specifying accumulate=True, appends all selected intervals onto the + buffer. Combining this with the .buffer() operation allows us operate + on all copied events rather than per-segment. See the documentation on + buffer() for more information. + :param buffer: the `StreamBuffer` in which the selection should be stored :rtype: `Transformer` :note: this transformation will buffer the entire input stream """ - return self.apply(CutTransformation(buffer)) + return self.apply(CutTransformation(buffer, accumulate)) + + def buffer(self): + """Buffer the entire stream (can consume a considerable amount of + memory). + + Useful in conjunction with copy(accumulate=True) and + cut(accumulate=True) to ensure that all marked events in the entire + stream are copied to the buffer before further transformations are + applied. + + For example, to move all <note> elements inside a <notes> tag at the + top of the document: + + >>> doc = HTML('<doc><notes></notes><body>Some <note>one</note> ' + ... 'text <note>two</note>.</body></doc>') + >>> buffer = StreamBuffer() + >>> print doc | Transformer('body/note').cut(buffer, accumulate=True) \\ + ... .end().buffer().select('notes').prepend(buffer) + <doc><notes><note>one</note><note>two</note></notes><body>Some text + .</body></doc> + + """ + return self.apply(list) #{ Miscellaneous operations @@ -546,13 +626,17 @@ Refer to the documentation for ``re.sub()`` for details. >>> html = HTML('<html><body>Some text, some more text and ' - ... '<b>some bold text</b></body></html>') - >>> print html | Transformer('body').substitute('(?i)some', 'SOME') - <html><body>SOME text, some more text and <b>SOME bold text</b></body></html> - >>> tags = tag.html(tag.body('Some text, some more text and ', + ... '<b>some bold text</b>\\n' + ... '<i>some italicised text</i></body></html>') + >>> print html | Transformer('body/b').substitute('(?i)some', 'SOME') + <html><body>Some text, some more text and <b>SOME bold text</b> + <i>some italicised text</i></body></html> + >>> tags = tag.html(tag.body('Some text, some more text and\\n', ... Markup('<b>some bold text</b>'))) - >>> print tags.generate() | Transformer('body').substitute('(?i)some', 'SOME') - <html><body>SOME text, some more text and <b>SOME bold text</b></body></html> + >>> print tags.generate() | Transformer('body').substitute( + ... '(?i)some', 'SOME') + <html><body>SOME text, some more text and + <b>SOME bold text</b></body></html> :param pattern: A regular expression object or string. :param replace: Replacement pattern. @@ -600,7 +684,8 @@ def _unmark(self, stream): for mark, event in stream: - if event[0] is not None: + kind = event[0] + if not (kind is None or kind is ATTR or kind is BREAK): yield event @@ -652,9 +737,12 @@ elif isinstance(result, Attrs): # XXX Selected *attributes* are given a "kind" of None to # indicate they are not really part of the stream. - yield ATTR, (None, (QName(event[1][0] + '@*'), result), event[2]) + yield ATTR, (ATTR, (QName(event[1][0] + '@*'), result), event[2]) yield None, event + elif isinstance(result, tuple): + yield OUTSIDE, result elif result: + # XXX Assume everything else is "text"? yield None, (TEXT, unicode(result), (None, -1, -1)) else: yield None, event @@ -700,8 +788,12 @@ :param stream: the marked event stream to filter """ for mark, event in stream: - if mark not in (INSIDE, OUTSIDE): - yield mark, event + yield mark, event + if mark is ENTER: + for mark, event in stream: + if mark is EXIT: + yield mark, event + break class RemoveTransformation(object): @@ -746,16 +838,21 @@ for prefix in element[:-1]: yield None, prefix yield mark, event - while True: - try: - mark, event = stream.next() - except StopIteration: - yield None, element[-1] + start = mark + stopped = False + for mark, event in stream: + if start is ENTER and mark is EXIT: + yield mark, event + stopped = True + break if not mark: break yield mark, event + else: + stopped = True yield None, element[-1] - yield mark, event + if not stopped: + yield mark, event else: yield mark, event @@ -784,7 +881,7 @@ class FilterTransformation(object): """Apply a normal stream filter to the selection. The filter is called once - for each contiguous block of marked events.""" + for each selection.""" def __init__(self, filter): """Create the transform. @@ -806,14 +903,31 @@ queue = [] for mark, event in stream: - if mark: + if mark is ENTER: queue.append(event) - else: + for mark, event in stream: + queue.append(event) + if mark is EXIT: + break for queue_event in flush(queue): yield queue_event - yield None, event - for event in flush(queue): - yield event + elif mark is OUTSIDE: + stopped = True + queue.append(event) + for mark, event in stream: + if mark is not OUTSIDE: + break + queue.append(event) + else: + stopped = True + for queue_event in flush(queue): + yield queue_event + if not stopped: + yield None, event + else: + yield mark, event + for queue_event in flush(queue): + yield queue_event class MapTransformation(object): @@ -848,7 +962,7 @@ Refer to the documentation for ``re.sub()`` for details. """ - def __init__(self, pattern, replace, count=1): + def __init__(self, pattern, replace, count=0): """Create the transform. :param pattern: A regular expression object, or string. @@ -868,7 +982,7 @@ :param stream: The marked event stream to filter """ for mark, (kind, data, pos) in stream: - if kind is TEXT: + if mark is not None and kind is TEXT: new_data = self.pattern.sub(self.replace, data, self.count) if isinstance(data, Markup): data = Markup(new_data) @@ -922,7 +1036,10 @@ self.content = content def _inject(self): - for event in _ensure(self.content): + content = self.content + if callable(content): + content = content() + for event in _ensure(content): yield None, event @@ -934,14 +1051,18 @@ :param stream: The marked event stream to filter """ + stream = PushBackStream(stream) for mark, event in stream: if mark is not None: + start = mark for subevent in self._inject(): yield subevent - while True: - mark, event = stream.next() - if mark is None: - yield mark, event + for mark, event in stream: + if start is ENTER: + if mark is EXIT: + break + elif mark != start: + stream.push((mark, event)) break else: yield mark, event @@ -955,17 +1076,22 @@ :param stream: The marked event stream to filter """ + stream = PushBackStream(stream) for mark, event in stream: if mark is not None: + start = mark for subevent in self._inject(): yield subevent yield mark, event - while True: - mark, event = stream.next() - if not mark: + for mark, event in stream: + if mark != start and start is not ENTER: + stream.push((mark, event)) break yield mark, event - yield mark, event + if start is ENTER and mark is EXIT: + break + else: + yield mark, event class AfterTransformation(InjectorTransformation): @@ -976,20 +1102,20 @@ :param stream: The marked event stream to filter """ + stream = PushBackStream(stream) for mark, event in stream: yield mark, event if mark: - while True: - try: - mark, event = stream.next() - except StopIteration: - break - if not mark: + start = mark + for mark, event in stream: + if start is not ENTER and mark != start: + stream.push((mark, event)) break yield mark, event + if start is ENTER and mark is EXIT: + break for subevent in self._inject(): yield subevent - yield mark, event class PrependTransformation(InjectorTransformation): @@ -1002,7 +1128,7 @@ """ for mark, event in stream: yield mark, event - if mark in (ENTER, OUTSIDE): + if mark is ENTER: for subevent in self._inject(): yield subevent @@ -1018,8 +1144,7 @@ for mark, event in stream: yield mark, event if mark is ENTER: - while True: - mark, event = stream.next() + for mark, event in stream: if mark is EXIT: break yield mark, event @@ -1076,32 +1201,50 @@ self.events.append(event) def reset(self): - """Reset the buffer so that it's empty.""" + """Empty the buffer of events.""" del self.events[:] class CopyTransformation(object): """Copy selected events into a buffer for later insertion.""" - def __init__(self, buffer): + def __init__(self, buffer, accumulate=False): """Create the copy transformation. :param buffer: the `StreamBuffer` in which the selection should be stored """ + if not accumulate: + buffer.reset() self.buffer = buffer + self.accumulate = accumulate def __call__(self, stream): """Apply the transformation to the marked stream. :param stream: the marked event stream to filter """ - self.buffer.reset() - stream = list(stream) + stream = PushBackStream(stream) + for mark, event in stream: if mark: + if not self.accumulate: + self.buffer.reset() + events = [(mark, event)] self.buffer.append(event) - return stream + start = mark + for mark, event in stream: + if start is not ENTER and mark != start: + stream.push((mark, event)) + break + events.append((mark, event)) + self.buffer.append(event) + if start is ENTER and mark is EXIT: + break + for i in events: + yield i + else: + yield mark, event class CutTransformation(object): @@ -1109,36 +1252,58 @@ selection. """ - def __init__(self, buffer): + def __init__(self, buffer, accumulate=False): """Create the cut transformation. :param buffer: the `StreamBuffer` in which the selection should be stored """ self.buffer = buffer + self.accumulate = accumulate + def __call__(self, stream): """Apply the transform filter to the marked stream. :param stream: the marked event stream to filter """ - out_stream = [] - attributes = None - for mark, (kind, data, pos) in stream: - if attributes: - assert kind is START - data = (data[0], data[1] - attributes) - attributes = None + attributes = [] + stream = PushBackStream(stream) + broken = False + if not self.accumulate: + self.buffer.reset() + for mark, event in stream: if mark: - # There is some magic here. ATTR marked events are pushed into - # the stream *before* the START event they originated from. - # This allows cut() to strip out the attributes from START - # event as would be expected. + # Send a BREAK event if there was no other event sent between + if not self.accumulate: + if not broken and self.buffer: + yield BREAK, (BREAK, None, None) + self.buffer.reset() + self.buffer.append(event) + start = mark if mark is ATTR: - self.buffer.append((kind, data, pos)) - attributes = [name for name, _ in data[1]] - else: - self.buffer.append((kind, data, pos)) + attributes.extend([name for name, _ in event[1][1]]) + for mark, event in stream: + if start is mark is ATTR: + attributes.extend([name for name, _ in event[1][1]]) + # Handle non-element contiguous selection + if start is not ENTER and mark != start: + # Operating on the attributes of a START event + if start is ATTR: + kind, data, pos = event + assert kind is START + data = (data[0], data[1] - attributes) + attributes = None + stream.push((mark, (kind, data, pos))) + else: + stream.push((mark, event)) + break + self.buffer.append(event) + if start is ENTER and mark is EXIT: + break + broken = False else: - out_stream.append((mark, (kind, data, pos))) - return out_stream + broken = True + yield mark, event + if not broken and self.buffer: + yield BREAK, (BREAK, None, None)