# HG changeset patch
# User cmlenz
# Date 1151484904 0
# Node ID 3c1a022be04c5d2c4e2aeeed6c41cf7f775e98b9
# Parent e3be27f5bcf5f1ec07c5e634b13e3c234c23a921
* Split out the XPath tests into a separate `unittest`-based file.
* Added many more docstrings.
* Cleaned up the implementation of the XML/HTML parsers a bit.
* The HTML parser now correctly handles minimized attributes.
* Added `COPYING` and `README` files.
diff --git a/COPYING b/COPYING
new file mode 100644
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,28 @@
+Copyright (C) 2006 Christopher Lenz
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ 3. The name of the author may not be used to endorse or promote
+ products derived from this software without specific prior
+ written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
+OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.txt b/README.txt
new file mode 100644
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,11 @@
+About Markup
+============
+
+Markup is a Python library that provides an integrated set of components
+for parsing, generating, and processing HTML or XML content in a uniform
+manner. The major feature is a template language, which is heavily
+inspired by Kid.
+
+For more information visit the Markup web site:
+
+
diff --git a/markup/input.py b/markup/input.py
--- a/markup/input.py
+++ b/markup/input.py
@@ -18,7 +18,6 @@
from sets import ImmutableSet as frozenset
import HTMLParser as html
import htmlentitydefs
-import re
from StringIO import StringIO
from markup.core import Attributes, Markup, QName, Stream
@@ -37,9 +36,26 @@
class XMLParser(object):
"""Generator-based XML parser based on roughly equivalent code in
- Kid/ElementTree."""
+ Kid/ElementTree.
+
+ The parsing is initiated by iterating over the parser object:
+
+ >>> parser = XMLParser(StringIO('Foo'))
+ >>> for kind, data, pos in parser:
+ ... print kind, data
+ START (u'root', [(u'id', u'2')])
+ START (u'child', [])
+ TEXT Foo
+ END child
+ END root
+ """
def __init__(self, source, filename=None):
+ """Initialize the parser for the given XML text.
+
+ @param source: the XML text as a file-like object
+ @param filename: the name of the file, if appropriate
+ """
self.source = source
self.filename = filename
@@ -90,6 +106,11 @@
msg += ', in ' + self.filename
raise ParseError(msg, self.filename, e.lineno, e.offset)
+ def _enqueue(self, kind, data, pos=None):
+ if pos is None:
+ pos = self._getpos()
+ self._queue.append((kind, data, pos))
+
def _getpos_unknown(self):
return (self.filename or '', -1, -1)
@@ -98,40 +119,38 @@
self.expat.CurrentColumnNumber)
def _handle_start(self, tag, attrib):
- self._queue.append((Stream.START, (QName(tag), Attributes(attrib.items())),
- self._getpos()))
+ self._enqueue(Stream.START, (QName(tag), Attributes(attrib.items())))
def _handle_end(self, tag):
- self._queue.append((Stream.END, QName(tag), self._getpos()))
+ self._enqueue(Stream.END, QName(tag))
def _handle_data(self, text):
- self._queue.append((Stream.TEXT, text, self._getpos()))
+ self._enqueue(Stream.TEXT, text)
def _handle_prolog(self, version, encoding, standalone):
- self._queue.append((Stream.PROLOG, (version, encoding, standalone),
- self._getpos()))
+ self._enqueue(Stream.PROLOG, (version, encoding, standalone))
def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
- self._queue.append((Stream.DOCTYPE, (name, pubid, sysid), self._getpos()))
+ self._enqueue(Stream.DOCTYPE, (name, pubid, sysid))
def _handle_start_ns(self, prefix, uri):
- self._queue.append((Stream.START_NS, (prefix or '', uri), self._getpos()))
+ self._enqueue(Stream.START_NS, (prefix or '', uri))
def _handle_end_ns(self, prefix):
- self._queue.append((Stream.END_NS, prefix or '', self._getpos()))
+ self._enqueue(Stream.END_NS, prefix or '')
def _handle_pi(self, target, data):
- self._queue.append((Stream.PI, (target, data), self._getpos()))
+ self._enqueue(Stream.PI, (target, data))
def _handle_comment(self, text):
- self._queue.append((Stream.COMMENT, text, self._getpos()))
+ self._enqueue(Stream.COMMENT, text)
def _handle_other(self, text):
if text.startswith('&'):
# deal with undefined entities
try:
text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
- self._queue.append((Stream.TEXT, text, self._getpos()))
+ self._enqueue(Stream.TEXT, text)
except KeyError:
lineno, offset = self._getpos()
raise expat.error("undefined entity %s: line %d, column %d" %
@@ -147,6 +166,17 @@
This class provides the same interface for generating stream events as
`XMLParser`, and attempts to automatically balance tags.
+
+ The parsing is initiated by iterating over the parser object:
+
+ >>> parser = HTMLParser(StringIO(''))
+ >>> for kind, data, pos in parser:
+ ... print kind, data
+ START (u'ul', [(u'compact', u'compact')])
+ START (u'li', [])
+ TEXT Foo
+ END li
+ END ul
"""
_EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
@@ -187,45 +217,53 @@
msg += ', in %s' % self.filename
raise ParseError(msg, self.filename, e.lineno, e.offset)
+ def _enqueue(self, kind, data, pos=None):
+ if pos is None:
+ pos = self._getpos()
+ self._queue.append((kind, data, pos))
+
def _getpos(self):
lineno, column = self.getpos()
return (self.filename, lineno, column)
def handle_starttag(self, tag, attrib):
- pos = self._getpos()
- self._queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
+ fixed_attrib = []
+ for name, value in attrib: # Fixup minimized attributes
+ if value is None:
+ value = name
+ fixed_attrib.append((name, unicode(value)))
+
+ self._enqueue(Stream.START, (QName(tag), Attributes(fixed_attrib)))
if tag in self._EMPTY_ELEMS:
- self._queue.append((Stream.END, QName(tag), pos))
+ self._enqueue(Stream.END, QName(tag))
else:
self._open_tags.append(tag)
def handle_endtag(self, tag):
if tag not in self._EMPTY_ELEMS:
- pos = self._getpos()
while self._open_tags:
open_tag = self._open_tags.pop()
if open_tag.lower() == tag.lower():
break
- self._queue.append((Stream.END, QName(open_tag), pos))
- self._queue.append((Stream.END, QName(tag), pos))
+ self._enqueue(Stream.END, QName(open_tag))
+ self._enqueue(Stream.END, QName(tag))
def handle_data(self, text):
- self._queue.append((Stream.TEXT, text, self._getpos()))
+ self._enqueue(Stream.TEXT, text)
def handle_charref(self, name):
- self._queue.append((Stream.TEXT, Markup('%s;' % name), self._getpos()))
+ self._enqueue(Stream.TEXT, Markup('%s;' % name))
def handle_entityref(self, name):
- self._queue.append((Stream.TEXT, Markup('&%s;' % name), self._getpos()))
+ self._enqueue(Stream.TEXT, Markup('&%s;' % name))
def handle_pi(self, data):
target, data = data.split(maxsplit=1)
data = data.rstrip('?')
- self._queue.append((Stream.PI, (target.strip(), data.strip()),
- self._getpos()))
+ self._enqueue(Stream.PI, (target.strip(), data.strip()))
def handle_comment(self, text):
- self._queue.append((Stream.COMMENT, text, self._getpos()))
+ self._enqueue(Stream.COMMENT, text)
def HTML(text):
diff --git a/markup/output.py b/markup/output.py
--- a/markup/output.py
+++ b/markup/output.py
@@ -21,7 +21,6 @@
from sets import ImmutableSet as frozenset
from markup.core import Markup, Namespace, QName, Stream
-from markup.filters import WhitespaceFilter
__all__ = ['Serializer', 'XMLSerializer', 'HTMLSerializer']
@@ -30,6 +29,12 @@
"""Base class for serializers."""
def serialize(self, stream):
+ """Must be implemented by concrete subclasses to serialize the given
+ stream.
+
+ This method must be implemented as a generator, producing the
+ serialized output incrementally as unicode strings.
+ """
raise NotImplementedError
@@ -46,7 +51,7 @@
ns_attrib = []
ns_mapping = {}
- stream = PushbackIterator(stream)
+ stream = _PushbackIterator(stream)
for kind, data, pos in stream:
if kind is Stream.DOCTYPE:
@@ -81,11 +86,7 @@
for attr, value in attrib:
attrname = attr.localname
if attr.namespace:
- try:
- prefix = ns_mapping[attr.namespace]
- except KeyError:
- # FIXME: synthesize a prefix for the attribute?
- prefix = ''
+ prefix = ns_mapping.get(attr.namespace)
if prefix:
attrname = prefix + ':' + attrname
buf.append(' %s="%s"' % (attrname, Markup.escape(value)))
@@ -103,12 +104,9 @@
tag = data
tagname = tag.localname
if tag.namespace:
- try:
- prefix = ns_mapping[tag.namespace]
- if prefix:
- tagname = prefix + ':' + tag.localname
- except KeyError:
- pass
+ prefix = ns_mapping.get(tag.namespace)
+ if prefix:
+ tagname = prefix + ':' + tag.localname
yield Markup('%s>' % tagname)
elif kind is Stream.TEXT:
@@ -136,7 +134,7 @@
def serialize(self, stream):
ns_mapping = {}
- stream = PushbackIterator(stream)
+ stream = _PushbackIterator(stream)
for kind, data, pos in stream:
if kind is Stream.DOCTYPE:
@@ -179,7 +177,7 @@
yield Markup.escape(data, quotes=False)
-class PushbackIterator(object):
+class _PushbackIterator(object):
"""A simple wrapper for iterators that allows pushing items back on the
queue via the `pushback()` method.
diff --git a/markup/path.py b/markup/path.py
--- a/markup/path.py
+++ b/markup/path.py
@@ -19,98 +19,24 @@
__all__ = ['Path']
-_QUOTES = (("'", "'"), ('"', '"'))
class Path(object):
- """Basic XPath support on markup event streams.
-
- >>> from markup.input import XML
-
- Selecting specific tags:
-
- >>> Path('root').select(XML('')).render()
- ''
- >>> Path('//root').select(XML('')).render()
- ''
-
- Using wildcards for tag names:
-
- >>> Path('*').select(XML('')).render()
- ''
- >>> Path('//*').select(XML('')).render()
- ''
-
- Selecting attribute values:
-
- >>> Path('@foo').select(XML('')).render()
- ''
- >>> Path('@foo').select(XML('')).render()
- 'bar'
-
- Selecting descendants:
-
- >>> Path("root/*").select(XML('')).render()
- ''
- >>> Path("root/bar").select(XML('')).render()
- ''
- >>> Path("root/baz").select(XML('')).render()
- ''
- >>> Path("root/foo/*").select(
- ... XML('')).render()
- ''
+ """Implements basic XPath support on streams.
- Selecting text nodes:
- >>> Path("item/text()").select(
- ... XML('- Foo
')).render()
- 'Foo'
- >>> Path("item/text()").select(
- ... XML('- Foo
- Bar
')).render()
- 'FooBar'
-
- Skipping ancestors:
-
- >>> Path("foo/bar").select(
- ... XML('')).render()
- ''
- >>> Path("foo/*").select(
- ... XML('')).render()
- ''
- >>> Path("root/bar").select(
- ... XML('')).render()
- ''
- >>> Path("root/bar").select(
- ... XML('')).render()
- ''
- >>> Path("root/*/bar").select(
- ... XML('')).render()
- ''
- >>> Path("root//bar").select(
- ... XML('')).render()
- ''
- >>> Path("root//bar").select(
- ... XML('')).render()
- ''
-
- Using simple attribute predicates:
- >>> Path("root/item[@important]").select(
- ... XML(' ')).render()
- ' '
- >>> Path('root/item[@important="very"]').select(
- ... XML(' ')).render()
- ' '
- >>> Path("root/item[@important='very']").select(
- ... XML(' ')).render()
- ''
- >>> Path("root/item[@important!='very']").select(
- ... XML(' ')).render()
- ' '
+ Instances of this class represent a "compiled" XPath expression, and provide
+ methods for testing the path against a stream, as well as extracting a
+ substream matching that path.
"""
-
_TOKEN_RE = re.compile('(::|\.\.|\(\)|[/.:\[\]\(\)@=!])|'
'([^/:\[\]\(\)@=!\s]+)|'
'\s+')
+ _QUOTES = (("'", "'"), ('"', '"'))
def __init__(self, text):
+ """Create the path object from a string.
+
+ @param text: the path expression
+ """
self.source = text
steps = []
@@ -125,7 +51,7 @@
in_predicate = False
elif op.startswith('('):
if cur_tag == 'text':
- steps[-1] = (False, self.fn_text(), [])
+ steps[-1] = (False, self._FunctionText(), [])
else:
raise NotImplementedError('XPath function "%s" not '
'supported' % cur_tag)
@@ -136,23 +62,25 @@
closure = cur_op in ('', '//')
if cur_op == '@':
if tag == '*':
- node_test = self.any_attribute()
+ node_test = self._AnyAttribute()
else:
- node_test = self.attribute_by_name(tag)
+ node_test = self._AttributeByName(tag)
else:
if tag == '*':
- node_test = self.any_element()
+ node_test = self._AnyElement()
elif in_predicate:
- if len(tag) > 1 and (tag[0], tag[-1]) in _QUOTES:
- node_test = self.literal_string(tag[1:-1])
+ if len(tag) > 1 and (tag[0], tag[-1]) in self._QUOTES:
+ node_test = self._LiteralString(tag[1:-1])
if cur_op == '=':
- node_test = self.op_eq(steps[-1][2][-1], node_test)
+ node_test = self._OperatorEq(steps[-1][2][-1],
+ node_test)
steps[-1][2].pop()
elif cur_op == '!=':
- node_test = self.op_neq(steps[-1][2][-1], node_test)
+ node_test = self._OperatorNeq(steps[-1][2][-1],
+ node_test)
steps[-1][2].pop()
else:
- node_test = self.element_by_name(tag)
+ node_test = self._ElementByName(tag)
if in_predicate:
steps[-1][2].append(node_test)
else:
@@ -165,8 +93,15 @@
return '<%s "%s">' % (self.__class__.__name__, self.source)
def select(self, stream):
+ """Returns a substream of the given stream that matches the path.
+
+ If there are no matches, this method returns an empty stream.
+
+ @param stream: the stream to select from
+ @return: the substream matching the path, or an empty stream
+ """
stream = iter(stream)
- def _generate(tests):
+ def _generate():
test = self.test()
for kind, data, pos in stream:
result = test(kind, data, pos)
@@ -183,9 +118,17 @@
test(*ev)
elif result:
yield result
- return Stream(_generate(self.steps))
+ return Stream(_generate())
def test(self):
+ """Returns a function that can be used to track whether the path matches
+ a specific stream event.
+
+ The function returned expects the positional arguments `kind`, `data`,
+ and `pos`, i.e. basically an unpacked stream event. If the path matches
+ the event, the function returns the match (for example, a `START` or
+ `TEXT` event.) Otherwise, it returns `None` or `False`.
+ """
stack = [0] # stack of cursors into the location path
def _test(kind, data, pos):
@@ -234,28 +177,31 @@
return _test
- class any_element(object):
- def __call__(self, kind, data, pos):
+ class _AnyElement(object):
+ """Node test that matches any element."""
+ def __call__(self, kind, *_):
if kind is Stream.START:
return True
return None
def __repr__(self):
return '<%s>' % self.__class__.__name__
- class element_by_name(object):
+ class _ElementByName(object):
+ """Node test that matches an element with a specific tag name."""
def __init__(self, name):
self.name = QName(name)
- def __call__(self, kind, data, pos):
+ def __call__(self, kind, data, _):
if kind is Stream.START:
return data[0].localname == self.name
return None
def __repr__(self):
return '<%s "%s">' % (self.__class__.__name__, self.name)
- class any_attribute(object):
+ class _AnyAttribute(object):
+ """Node test that matches any attribute."""
def __call__(self, kind, data, pos):
if kind is Stream.START:
- text = ''.join([val for name, val in data[1]])
+ text = ''.join([val for _, val in data[1]])
if text:
return Stream.TEXT, text, pos
return None
@@ -263,7 +209,8 @@
def __repr__(self):
return '<%s>' % (self.__class__.__name__)
- class attribute_by_name(object):
+ class _AttributeByName(object):
+ """Node test that matches an attribute with a specific name."""
def __init__(self, name):
self.name = QName(name)
def __call__(self, kind, data, pos):
@@ -275,7 +222,8 @@
def __repr__(self):
return '<%s "%s">' % (self.__class__.__name__, self.name)
- class fn_text(object):
+ class _FunctionText(object):
+ """Function that returns text content."""
def __call__(self, kind, data, pos):
if kind is Stream.TEXT:
return kind, data, pos
@@ -283,15 +231,17 @@
def __repr__(self):
return '<%s>' % (self.__class__.__name__)
- class literal_string(object):
+ class _LiteralString(object):
+ """Always returns a literal string."""
def __init__(self, value):
self.value = value
- def __call__(self, kind, data, pos):
+ def __call__(self, *_):
return Stream.TEXT, self.value, (-1, -1)
def __repr__(self):
return '<%s>' % (self.__class__.__name__)
- class op_eq(object):
+ class _OperatorEq(object):
+ """Equality comparison operator."""
def __init__(self, lval, rval):
self.lval = lval
self.rval = rval
@@ -303,7 +253,8 @@
return '<%s %r = %r>' % (self.__class__.__name__, self.lval,
self.rval)
- class op_neq(object):
+ class _OperatorNeq(object):
+ """Inequality comparison operator."""
def __init__(self, lval, rval):
self.lval = lval
self.rval = rval
diff --git a/markup/tests/input.py b/markup/tests/input.py
--- a/markup/tests/input.py
+++ b/markup/tests/input.py
@@ -11,20 +11,16 @@
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://projects.edgewall.com/trac/.
+import doctest
import unittest
from markup.core import Stream
from markup.input import XMLParser
-class XMLParserTestCase(unittest.TestCase):
- pass
-
-
-
def suite():
suite = unittest.TestSuite()
- suite.addTest(unittest.makeSuite(XMLParserTestCase, 'test'))
+ suite.addTest(doctest.DocTestSuite(XMLParser.__module__))
return suite
if __name__ == '__main__':
diff --git a/markup/tests/path.py b/markup/tests/path.py
--- a/markup/tests/path.py
+++ b/markup/tests/path.py
@@ -14,12 +14,82 @@
import doctest
import unittest
-from markup import path
+from markup.input import XML
+from markup.path import Path
+
+
+class PathTestCase(unittest.TestCase):
+
+ def test_1step(self):
+ xml = XML('')
+ self.assertEqual('', Path('root').select(xml).render())
+ self.assertEqual('', Path('//root').select(xml).render())
+
+ def test_1step_wildcard(self):
+ xml = XML('')
+ self.assertEqual('', Path('*').select(xml).render())
+ self.assertEqual('', Path('//*').select(xml).render())
+
+ def test_1step_attribute(self):
+ path = Path('@foo')
+ self.assertEqual('', path.select(XML('')).render())
+ self.assertEqual('bar', path.select(XML('')).render())
+
+ def test_1step_attribute(self):
+ path = Path('@foo')
+ self.assertEqual('', path.select(XML('')).render())
+ self.assertEqual('bar', path.select(XML('')).render())
+
+ def test_2step(self):
+ xml = XML('')
+ self.assertEqual('', Path('root/*').select(xml).render())
+ self.assertEqual('', Path('root/bar').select(xml).render())
+ self.assertEqual('', Path('root/baz').select(xml).render())
+
+ def test_2step_complex(self):
+ xml = XML('')
+ self.assertEqual('', Path('foo/bar').select(xml).render())
+ self.assertEqual('', Path('foo/*').select(xml).render())
+ self.assertEqual('', Path('root/bar').select(xml).render())
+
+ xml = XML('')
+ self.assertEqual('', Path('root/bar').select(xml).render())
+
+ def test_2step_text(self):
+ xml = XML('- Foo
')
+ self.assertEqual('Foo', Path('item/text()').select(xml).render())
+ xml = XML('- Foo
- Bar
')
+ self.assertEqual('FooBar', Path('item/text()').select(xml).render())
+
+ def test_3step(self):
+ xml = XML('')
+ self.assertEqual('', Path('root/foo/*').select(xml).render())
+
+ def test_3step_complex(self):
+ xml = XML('')
+ self.assertEqual('', Path('root/*/bar').select(xml).render())
+ xml = XML('')
+ self.assertEqual('',
+ Path('root//bar').select(xml).render())
+
+ def test_predicate_attr(self):
+ xml = XML(' ')
+ self.assertEqual(' ',
+ Path('root/item[@important]').select(xml).render())
+ self.assertEqual(' ',
+ Path('root/item[@important="very"]').select(xml).render())
+
+ xml = XML(' ')
+ self.assertEqual('',
+ Path('root/item[@important="very"]').select(xml).render())
+ self.assertEqual(' ',
+ Path('root/item[@important!="very"]').select(xml).render())
def suite():
suite = unittest.TestSuite()
- suite.addTest(doctest.DocTestSuite(path))
+ suite.addTest(doctest.DocTestSuite(Path.__module__))
+ suite.addTest(unittest.makeSuite(PathTestCase, 'test'))
return suite
if __name__ == '__main__':