# HG changeset patch # User cmlenz # Date 1151484904 0 # Node ID 3c1a022be04c5d2c4e2aeeed6c41cf7f775e98b9 # Parent e3be27f5bcf5f1ec07c5e634b13e3c234c23a921 * Split out the XPath tests into a separate `unittest`-based file. * Added many more docstrings. * Cleaned up the implementation of the XML/HTML parsers a bit. * The HTML parser now correctly handles minimized attributes. * Added `COPYING` and `README` files. diff --git a/COPYING b/COPYING new file mode 100644 --- /dev/null +++ b/COPYING @@ -0,0 +1,28 @@ +Copyright (C) 2006 Christopher Lenz +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. The name of the author may not be used to endorse or promote + products derived from this software without specific prior + written permission. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER +IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.txt b/README.txt new file mode 100644 --- /dev/null +++ b/README.txt @@ -0,0 +1,11 @@ +About Markup +============ + +Markup is a Python library that provides an integrated set of components +for parsing, generating, and processing HTML or XML content in a uniform +manner. The major feature is a template language, which is heavily +inspired by Kid. + +For more information visit the Markup web site: + + diff --git a/markup/input.py b/markup/input.py --- a/markup/input.py +++ b/markup/input.py @@ -18,7 +18,6 @@ from sets import ImmutableSet as frozenset import HTMLParser as html import htmlentitydefs -import re from StringIO import StringIO from markup.core import Attributes, Markup, QName, Stream @@ -37,9 +36,26 @@ class XMLParser(object): """Generator-based XML parser based on roughly equivalent code in - Kid/ElementTree.""" + Kid/ElementTree. + + The parsing is initiated by iterating over the parser object: + + >>> parser = XMLParser(StringIO('Foo')) + >>> for kind, data, pos in parser: + ... print kind, data + START (u'root', [(u'id', u'2')]) + START (u'child', []) + TEXT Foo + END child + END root + """ def __init__(self, source, filename=None): + """Initialize the parser for the given XML text. 
+ + @param source: the XML text as a file-like object + @param filename: the name of the file, if appropriate + """ self.source = source self.filename = filename @@ -90,6 +106,11 @@ msg += ', in ' + self.filename raise ParseError(msg, self.filename, e.lineno, e.offset) + def _enqueue(self, kind, data, pos=None): + if pos is None: + pos = self._getpos() + self._queue.append((kind, data, pos)) + def _getpos_unknown(self): return (self.filename or '', -1, -1) @@ -98,40 +119,38 @@ self.expat.CurrentColumnNumber) def _handle_start(self, tag, attrib): - self._queue.append((Stream.START, (QName(tag), Attributes(attrib.items())), - self._getpos())) + self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items()))) def _handle_end(self, tag): - self._queue.append((Stream.END, QName(tag), self._getpos())) + self._enqueue(Stream.END, QName(tag)) def _handle_data(self, text): - self._queue.append((Stream.TEXT, text, self._getpos())) + self._enqueue(Stream.TEXT, text) def _handle_prolog(self, version, encoding, standalone): - self._queue.append((Stream.PROLOG, (version, encoding, standalone), - self._getpos())) + self._enqueue(Stream.PROLOG, (version, encoding, standalone)) def _handle_doctype(self, name, sysid, pubid, has_internal_subset): - self._queue.append((Stream.DOCTYPE, (name, pubid, sysid), self._getpos())) + self._enqueue(Stream.DOCTYPE, (name, pubid, sysid)) def _handle_start_ns(self, prefix, uri): - self._queue.append((Stream.START_NS, (prefix or '', uri), self._getpos())) + self._enqueue(Stream.START_NS, (prefix or '', uri)) def _handle_end_ns(self, prefix): - self._queue.append((Stream.END_NS, prefix or '', self._getpos())) + self._enqueue(Stream.END_NS, prefix or '') def _handle_pi(self, target, data): - self._queue.append((Stream.PI, (target, data), self._getpos())) + self._enqueue(Stream.PI, (target, data)) def _handle_comment(self, text): - self._queue.append((Stream.COMMENT, text, self._getpos())) + self._enqueue(Stream.COMMENT, text) def 
_handle_other(self, text): if text.startswith('&'): # deal with undefined entities try: text = unichr(htmlentitydefs.name2codepoint[text[1:-1]]) - self._queue.append((Stream.TEXT, text, self._getpos())) + self._enqueue(Stream.TEXT, text) except KeyError: lineno, offset = self._getpos() raise expat.error("undefined entity %s: line %d, column %d" % @@ -147,6 +166,17 @@ This class provides the same interface for generating stream events as `XMLParser`, and attempts to automatically balance tags. + + The parsing is initiated by iterating over the parser object: + + >>> parser = HTMLParser(StringIO('')) + >>> for kind, data, pos in parser: + ... print kind, data + START (u'ul', [(u'compact', u'compact')]) + START (u'li', []) + TEXT Foo + END li + END ul """ _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame', @@ -187,45 +217,53 @@ msg += ', in %s' % self.filename raise ParseError(msg, self.filename, e.lineno, e.offset) + def _enqueue(self, kind, data, pos=None): + if pos is None: + pos = self._getpos() + self._queue.append((kind, data, pos)) + def _getpos(self): lineno, column = self.getpos() return (self.filename, lineno, column) def handle_starttag(self, tag, attrib): - pos = self._getpos() - self._queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos)) + fixed_attrib = [] + for name, value in attrib: # Fixup minimized attributes + if value is None: + value = name + fixed_attrib.append((name, unicode(value))) + + self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib))) if tag in self._EMPTY_ELEMS: - self._queue.append((Stream.END, QName(tag), pos)) + self._enqueue(Stream.END, QName(tag)) else: self._open_tags.append(tag) def handle_endtag(self, tag): if tag not in self._EMPTY_ELEMS: - pos = self._getpos() while self._open_tags: open_tag = self._open_tags.pop() if open_tag.lower() == tag.lower(): break - self._queue.append((Stream.END, QName(open_tag), pos)) - self._queue.append((Stream.END, QName(tag), pos)) + 
self._enqueue(Stream.END, QName(open_tag)) + self._enqueue(Stream.END, QName(tag)) def handle_data(self, text): - self._queue.append((Stream.TEXT, text, self._getpos())) + self._enqueue(Stream.TEXT, text) def handle_charref(self, name): - self._queue.append((Stream.TEXT, Markup('&#%s;' % name), self._getpos())) + self._enqueue(Stream.TEXT, Markup('&#%s;' % name)) def handle_entityref(self, name): - self._queue.append((Stream.TEXT, Markup('&%s;' % name), self._getpos())) + self._enqueue(Stream.TEXT, Markup('&%s;' % name)) def handle_pi(self, data): target, data = data.split(maxsplit=1) data = data.rstrip('?') - self._queue.append((Stream.PI, (target.strip(), data.strip()), - self._getpos())) + self._enqueue(Stream.PI, (target.strip(), data.strip())) def handle_comment(self, text): - self._queue.append((Stream.COMMENT, text, self._getpos())) + self._enqueue(Stream.COMMENT, text) def HTML(text): diff --git a/markup/output.py b/markup/output.py --- a/markup/output.py +++ b/markup/output.py @@ -21,7 +21,6 @@ from sets import ImmutableSet as frozenset from markup.core import Markup, Namespace, QName, Stream -from markup.filters import WhitespaceFilter __all__ = ['Serializer', 'XMLSerializer', 'HTMLSerializer'] @@ -30,6 +29,12 @@ """Base class for serializers.""" def serialize(self, stream): + """Must be implemented by concrete subclasses to serialize the given + stream. + + This method must be implemented as a generator, producing the + serialized output incrementally as unicode strings. + """ raise NotImplementedError @@ -46,7 +51,7 @@ ns_attrib = [] ns_mapping = {} - stream = PushbackIterator(stream) + stream = _PushbackIterator(stream) for kind, data, pos in stream: if kind is Stream.DOCTYPE: @@ -81,11 +86,7 @@ for attr, value in attrib: attrname = attr.localname if attr.namespace: - try: - prefix = ns_mapping[attr.namespace] - except KeyError: - # FIXME: synthesize a prefix for the attribute? 
- prefix = '' + prefix = ns_mapping.get(attr.namespace) if prefix: attrname = prefix + ':' + attrname buf.append(' %s="%s"' % (attrname, Markup.escape(value))) @@ -103,12 +104,9 @@ tag = data tagname = tag.localname if tag.namespace: - try: - prefix = ns_mapping[tag.namespace] - if prefix: - tagname = prefix + ':' + tag.localname - except KeyError: - pass + prefix = ns_mapping.get(tag.namespace) + if prefix: + tagname = prefix + ':' + tag.localname yield Markup('' % tagname) elif kind is Stream.TEXT: @@ -136,7 +134,7 @@ def serialize(self, stream): ns_mapping = {} - stream = PushbackIterator(stream) + stream = _PushbackIterator(stream) for kind, data, pos in stream: if kind is Stream.DOCTYPE: @@ -179,7 +177,7 @@ yield Markup.escape(data, quotes=False) -class PushbackIterator(object): +class _PushbackIterator(object): """A simple wrapper for iterators that allows pushing items back on the queue via the `pushback()` method. diff --git a/markup/path.py b/markup/path.py --- a/markup/path.py +++ b/markup/path.py @@ -19,98 +19,24 @@ __all__ = ['Path'] -_QUOTES = (("'", "'"), ('"', '"')) class Path(object): - """Basic XPath support on markup event streams. - - >>> from markup.input import XML - - Selecting specific tags: - - >>> Path('root').select(XML('')).render() - '' - >>> Path('//root').select(XML('')).render() - '' - - Using wildcards for tag names: - - >>> Path('*').select(XML('')).render() - '' - >>> Path('//*').select(XML('')).render() - '' - - Selecting attribute values: - - >>> Path('@foo').select(XML('')).render() - '' - >>> Path('@foo').select(XML('')).render() - 'bar' - - Selecting descendants: - - >>> Path("root/*").select(XML('')).render() - '' - >>> Path("root/bar").select(XML('')).render() - '' - >>> Path("root/baz").select(XML('')).render() - '' - >>> Path("root/foo/*").select( - ... XML('')).render() - '' + """Implements basic XPath support on streams. - Selecting text nodes: - >>> Path("item/text()").select( - ... 
XML('Foo')).render() - 'Foo' - >>> Path("item/text()").select( - ... XML('FooBar')).render() - 'FooBar' - - Skipping ancestors: - - >>> Path("foo/bar").select( - ... XML('')).render() - '' - >>> Path("foo/*").select( - ... XML('')).render() - '' - >>> Path("root/bar").select( - ... XML('')).render() - '' - >>> Path("root/bar").select( - ... XML('')).render() - '' - >>> Path("root/*/bar").select( - ... XML('')).render() - '' - >>> Path("root//bar").select( - ... XML('')).render() - '' - >>> Path("root//bar").select( - ... XML('')).render() - '' - - Using simple attribute predicates: - >>> Path("root/item[@important]").select( - ... XML('')).render() - '' - >>> Path('root/item[@important="very"]').select( - ... XML('')).render() - '' - >>> Path("root/item[@important='very']").select( - ... XML('')).render() - '' - >>> Path("root/item[@important!='very']").select( - ... XML('')).render() - '' + Instances of this class represent a "compiled" XPath expression, and provide + methods for testing the path against a stream, as well as extracting a + substream matching that path. """ - _TOKEN_RE = re.compile('(::|\.\.|\(\)|[/.:\[\]\(\)@=!])|' '([^/:\[\]\(\)@=!\s]+)|' '\s+') + _QUOTES = (("'", "'"), ('"', '"')) def __init__(self, text): + """Create the path object from a string. 
+ + @param text: the path expression + """ self.source = text steps = [] @@ -125,7 +51,7 @@ in_predicate = False elif op.startswith('('): if cur_tag == 'text': - steps[-1] = (False, self.fn_text(), []) + steps[-1] = (False, self._FunctionText(), []) else: raise NotImplementedError('XPath function "%s" not ' 'supported' % cur_tag) @@ -136,23 +62,25 @@ closure = cur_op in ('', '//') if cur_op == '@': if tag == '*': - node_test = self.any_attribute() + node_test = self._AnyAttribute() else: - node_test = self.attribute_by_name(tag) + node_test = self._AttributeByName(tag) else: if tag == '*': - node_test = self.any_element() + node_test = self._AnyElement() elif in_predicate: - if len(tag) > 1 and (tag[0], tag[-1]) in _QUOTES: - node_test = self.literal_string(tag[1:-1]) + if len(tag) > 1 and (tag[0], tag[-1]) in self._QUOTES: + node_test = self._LiteralString(tag[1:-1]) if cur_op == '=': - node_test = self.op_eq(steps[-1][2][-1], node_test) + node_test = self._OperatorEq(steps[-1][2][-1], + node_test) steps[-1][2].pop() elif cur_op == '!=': - node_test = self.op_neq(steps[-1][2][-1], node_test) + node_test = self._OperatorNeq(steps[-1][2][-1], + node_test) steps[-1][2].pop() else: - node_test = self.element_by_name(tag) + node_test = self._ElementByName(tag) if in_predicate: steps[-1][2].append(node_test) else: @@ -165,8 +93,15 @@ return '<%s "%s">' % (self.__class__.__name__, self.source) def select(self, stream): + """Returns a substream of the given stream that matches the path. + + If there are no matches, this method returns an empty stream. 
+ + @param stream: the stream to select from + @return: the substream matching the path, or an empty stream + """ stream = iter(stream) - def _generate(tests): + def _generate(): test = self.test() for kind, data, pos in stream: result = test(kind, data, pos) @@ -183,9 +118,17 @@ test(*ev) elif result: yield result - return Stream(_generate(self.steps)) + return Stream(_generate()) def test(self): + """Returns a function that can be used to track whether the path matches + a specific stream event. + + The function returned expects the positional arguments `kind`, `data`, + and `pos`, i.e. basically an unpacked stream event. If the path matches + the event, the function returns the match (for example, a `START` or + `TEXT` event.) Otherwise, it returns `None` or `False`. + """ stack = [0] # stack of cursors into the location path def _test(kind, data, pos): @@ -234,28 +177,31 @@ return _test - class any_element(object): - def __call__(self, kind, data, pos): + class _AnyElement(object): + """Node test that matches any element.""" + def __call__(self, kind, *_): if kind is Stream.START: return True return None def __repr__(self): return '<%s>' % self.__class__.__name__ - class element_by_name(object): + class _ElementByName(object): + """Node test that matches an element with a specific tag name.""" def __init__(self, name): self.name = QName(name) - def __call__(self, kind, data, pos): + def __call__(self, kind, data, _): if kind is Stream.START: return data[0].localname == self.name return None def __repr__(self): return '<%s "%s">' % (self.__class__.__name__, self.name) - class any_attribute(object): + class _AnyAttribute(object): + """Node test that matches any attribute.""" def __call__(self, kind, data, pos): if kind is Stream.START: - text = ''.join([val for name, val in data[1]]) + text = ''.join([val for _, val in data[1]]) if text: return Stream.TEXT, text, pos return None @@ -263,7 +209,8 @@ def __repr__(self): return '<%s>' % (self.__class__.__name__) - 
class attribute_by_name(object): + class _AttributeByName(object): + """Node test that matches an attribute with a specific name.""" def __init__(self, name): self.name = QName(name) def __call__(self, kind, data, pos): @@ -275,7 +222,8 @@ def __repr__(self): return '<%s "%s">' % (self.__class__.__name__, self.name) - class fn_text(object): + class _FunctionText(object): + """Function that returns text content.""" def __call__(self, kind, data, pos): if kind is Stream.TEXT: return kind, data, pos @@ -283,15 +231,17 @@ def __repr__(self): return '<%s>' % (self.__class__.__name__) - class literal_string(object): + class _LiteralString(object): + """Always returns a literal string.""" def __init__(self, value): self.value = value - def __call__(self, kind, data, pos): + def __call__(self, *_): return Stream.TEXT, self.value, (-1, -1) def __repr__(self): return '<%s>' % (self.__class__.__name__) - class op_eq(object): + class _OperatorEq(object): + """Equality comparison operator.""" def __init__(self, lval, rval): self.lval = lval self.rval = rval @@ -303,7 +253,8 @@ return '<%s %r = %r>' % (self.__class__.__name__, self.lval, self.rval) - class op_neq(object): + class _OperatorNeq(object): + """Inequality comparison operator.""" def __init__(self, lval, rval): self.lval = lval self.rval = rval diff --git a/markup/tests/input.py b/markup/tests/input.py --- a/markup/tests/input.py +++ b/markup/tests/input.py @@ -11,20 +11,16 @@ # individuals. For the exact contribution history, see the revision # history and logs, available at http://projects.edgewall.com/trac/. 
+import doctest import unittest from markup.core import Stream from markup.input import XMLParser -class XMLParserTestCase(unittest.TestCase): - pass - - - def suite(): suite = unittest.TestSuite() - suite.addTest(unittest.makeSuite(XMLParserTestCase, 'test')) + suite.addTest(doctest.DocTestSuite(XMLParser.__module__)) return suite if __name__ == '__main__': diff --git a/markup/tests/path.py b/markup/tests/path.py --- a/markup/tests/path.py +++ b/markup/tests/path.py @@ -14,12 +14,82 @@ import doctest import unittest -from markup import path +from markup.input import XML +from markup.path import Path + + +class PathTestCase(unittest.TestCase): + + def test_1step(self): + xml = XML('') + self.assertEqual('', Path('root').select(xml).render()) + self.assertEqual('', Path('//root').select(xml).render()) + + def test_1step_wildcard(self): + xml = XML('') + self.assertEqual('', Path('*').select(xml).render()) + self.assertEqual('', Path('//*').select(xml).render()) + + def test_1step_attribute(self): + path = Path('@foo') + self.assertEqual('', path.select(XML('')).render()) + self.assertEqual('bar', path.select(XML('')).render()) + + def test_1step_attribute(self): + path = Path('@foo') + self.assertEqual('', path.select(XML('')).render()) + self.assertEqual('bar', path.select(XML('')).render()) + + def test_2step(self): + xml = XML('') + self.assertEqual('', Path('root/*').select(xml).render()) + self.assertEqual('', Path('root/bar').select(xml).render()) + self.assertEqual('', Path('root/baz').select(xml).render()) + + def test_2step_complex(self): + xml = XML('') + self.assertEqual('', Path('foo/bar').select(xml).render()) + self.assertEqual('', Path('foo/*').select(xml).render()) + self.assertEqual('', Path('root/bar').select(xml).render()) + + xml = XML('') + self.assertEqual('', Path('root/bar').select(xml).render()) + + def test_2step_text(self): + xml = XML('Foo') + self.assertEqual('Foo', Path('item/text()').select(xml).render()) + xml = XML('FooBar') + 
self.assertEqual('FooBar', Path('item/text()').select(xml).render()) + + def test_3step(self): + xml = XML('') + self.assertEqual('', Path('root/foo/*').select(xml).render()) + + def test_3step_complex(self): + xml = XML('') + self.assertEqual('', Path('root/*/bar').select(xml).render()) + xml = XML('') + self.assertEqual('', + Path('root//bar').select(xml).render()) + + def test_predicate_attr(self): + xml = XML('') + self.assertEqual('', + Path('root/item[@important]').select(xml).render()) + self.assertEqual('', + Path('root/item[@important="very"]').select(xml).render()) + + xml = XML('') + self.assertEqual('', + Path('root/item[@important="very"]').select(xml).render()) + self.assertEqual('', + Path('root/item[@important!="very"]').select(xml).render()) def suite(): suite = unittest.TestSuite() - suite.addTest(doctest.DocTestSuite(path)) + suite.addTest(doctest.DocTestSuite(Path.__module__)) + suite.addTest(unittest.makeSuite(PathTestCase, 'test')) return suite if __name__ == '__main__':