Mercurial > genshi > genshi-test
annotate markup/path.py @ 26:039fc5b87405
* Split out the XPath tests into a separate `unittest`-based file.
* Added many more docstrings.
* Cleaned up the implementation of the XML/HTML parsers a bit.
* The HTML parser now correctly handles minimized attributes.
* Added `COPYING` and `README` files.
author | cmlenz |
---|---|
date | Wed, 28 Jun 2006 08:55:04 +0000 |
parents | c4201b794ab0 |
children | b8456279c444 |
rev | line source |
---|---|
1 | 1 # -*- coding: utf-8 -*- |
2 # | |
3 # Copyright (C) 2006 Edgewall Software | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://trac.edgewall.com/license.html. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://projects.edgewall.com/trac/. | |
13 | |
14 """Basic support for evaluating XPath expressions against streams.""" | |
15 | |
16 import re | |
17 | |
18 from markup.core import QName, Stream | |
19 | |
20 __all__ = ['Path'] | |
21 | |
22 | |
class Path(object):
    """Implements basic XPath support on streams.

    Instances of this class represent a "compiled" XPath expression, and
    provide methods for testing the path against a stream, as well as
    extracting a substream matching that path.

    The compiled form is stored in `steps`: a sequence of
    `(closure, node_test, predicates)` entries, where `node_test` and every
    item of `predicates` are callables taking `(kind, data, pos)`.
    """

    # Group 1 captures operators and punctuation, group 2 captures names and
    # quoted literals. Whitespace matches the last, group-less alternative, so
    # `findall` reports it as a pair of empty strings.
    _TOKEN_RE = re.compile(r'(::|\.\.|\(\)|[/.:\[\]\(\)@=!])|'
                           r'([^/:\[\]\(\)@=!\s]+)|'
                           r'\s+')
    _QUOTES = (("'", "'"), ('"', '"'))

    def __init__(self, text):
        """Create the path object from a string.

        @param text: the path expression
        @raise NotImplementedError: if the expression calls a function other
            than `text()`, or uses an unquoted operand inside a predicate
        """
        self.source = text

        steps = []
        cur_op = ''
        cur_tag = ''
        in_predicate = False
        for op, tag in self._TOKEN_RE.findall(text):
            if not op and not tag:
                # Whitespace between tokens carries no meaning; without this
                # guard it would be parsed as an element with an empty name.
                continue

            if op:
                if op == '[':
                    in_predicate = True
                elif op == ']':
                    in_predicate = False
                elif op.startswith('('):
                    if cur_tag == 'text':
                        # The name preceding the parentheses was already added
                        # as an element step; replace it with the function.
                        steps[-1] = (False, self._FunctionText(), [])
                    else:
                        raise NotImplementedError('XPath function "%s" not '
                                                  'supported' % cur_tag)
                else:
                    # Multi-character operators such as `//` and `!=` arrive
                    # one character at a time and are accumulated here.
                    cur_op += op
                cur_tag = ''
                continue

            closure = cur_op in ('', '//')
            if cur_op == '@':
                if tag == '*':
                    node_test = self._AnyAttribute()
                else:
                    node_test = self._AttributeByName(tag)
            elif tag == '*':
                node_test = self._AnyElement()
            elif in_predicate:
                if len(tag) > 1 and (tag[0], tag[-1]) in self._QUOTES:
                    node_test = self._LiteralString(tag[1:-1])
                else:
                    # Only quoted literals are understood here. Failing loudly
                    # avoids silently reusing the test built for the previous
                    # token.
                    raise NotImplementedError('XPath predicate operand "%s" '
                                              'not supported' % tag)
                if cur_op == '=':
                    # The left operand is the predicate added last; it is
                    # replaced by the comparison wrapping both operands.
                    node_test = self._OperatorEq(steps[-1][2][-1],
                                                 node_test)
                    steps[-1][2].pop()
                elif cur_op == '!=':
                    node_test = self._OperatorNeq(steps[-1][2][-1],
                                                  node_test)
                    steps[-1][2].pop()
            else:
                node_test = self._ElementByName(tag)

            if in_predicate:
                steps[-1][2].append(node_test)
            else:
                steps.append([closure, node_test, []])
            cur_op = ''
            cur_tag = tag
        self.steps = steps

    def __repr__(self):
        return '<%s "%s">' % (self.__class__.__name__, self.source)

    def select(self, stream):
        """Returns a substream of the given stream that matches the path.

        If there are no matches, this method returns an empty stream.

        @param stream: the stream to select from
        @return: the substream matching the path, or an empty stream
        """
        stream = iter(stream)

        def _generate():
            test = self.test()
            for kind, data, pos in stream:
                result = test(kind, data, pos)
                if result is True:
                    # An element matched: emit it and everything up to and
                    # including its matching END event.
                    yield kind, data, pos
                    depth = 1
                    # Iterating the same iterator continues where the outer
                    # loop stopped, and simply ends if the stream is cut short.
                    for ev in stream:
                        if ev[0] is Stream.START:
                            depth += 1
                        elif ev[0] is Stream.END:
                            depth -= 1
                        yield ev
                        # Keep the tester's cursor stack in step with the
                        # events consumed here.
                        test(*ev)
                        if depth <= 0:
                            break
                elif result:
                    # Anything truthy other than `True` is itself an event
                    # (for example the TEXT event built for an attribute).
                    yield result

        return Stream(_generate())

    def test(self):
        """Returns a function that can be used to track whether the path
        matches a specific stream event.

        The function returned expects the positional arguments `kind`, `data`,
        and `pos`, i.e. basically an unpacked stream event. If the path matches
        the event, the function returns the match (for example, a `START` or
        `TEXT` event.) Otherwise, it returns `None` or `False`.
        """
        stack = [0] # stack of cursors into the location path

        def _test(kind, data, pos):
            if not stack:
                return False

            if kind is Stream.END:
                stack.pop()
                return None

            if kind is Stream.START:
                # A nested element starts out at its parent's cursor.
                stack.append(stack[-1])

            closure, node_test, predicates = self.steps[stack[-1]]

            matched = node_test(kind, data, pos)
            if matched and predicates:
                for predicate in predicates:
                    if not predicate(kind, data, pos):
                        matched = None
                        break

            if matched:
                if stack[-1] == len(self.steps) - 1:
                    # The last step matched, so the whole path matched.
                    return matched

                stack[-1] += 1

            elif kind is Stream.START and not closure:
                # If this step is not a closure, it cannot be matched until the
                # current element is closed... so we need to move the cursor
                # back to the last closure and retest that against the current
                # element
                closures = [step for step in self.steps[:stack[-1]] if step[0]]
                closures.reverse()
                for closure, node_test, predicates in closures:
                    stack[-1] -= 1
                    if closure:
                        matched = node_test(kind, data, pos)
                        if matched:
                            stack[-1] += 1
                            break

            return None

        return _test

    class _AnyElement(object):
        """Node test that matches any element."""
        def __call__(self, kind, *_):
            if kind is Stream.START:
                return True
            return None
        def __repr__(self):
            return '<%s>' % self.__class__.__name__

    class _ElementByName(object):
        """Node test that matches an element with a specific tag name."""
        def __init__(self, name):
            self.name = QName(name)
        def __call__(self, kind, data, _):
            if kind is Stream.START:
                return data[0].localname == self.name
            return None
        def __repr__(self):
            return '<%s "%s">' % (self.__class__.__name__, self.name)

    class _AnyAttribute(object):
        """Node test that matches any attribute.

        For a `START` event, the values of all attributes are concatenated and
        returned as a `TEXT` event; `None` is returned if that text is empty.
        """
        def __call__(self, kind, data, pos):
            if kind is Stream.START:
                text = ''.join([val for _, val in data[1]])
                if text:
                    return Stream.TEXT, text, pos
                return None
            return None
        def __repr__(self):
            return '<%s>' % (self.__class__.__name__)

    class _AttributeByName(object):
        """Node test that matches an attribute with a specific name.

        For a `START` event carrying the attribute, its value is returned as a
        `TEXT` event.
        """
        def __init__(self, name):
            self.name = QName(name)
        def __call__(self, kind, data, pos):
            if kind is Stream.START:
                if self.name in data[1]:
                    return Stream.TEXT, data[1].get(self.name), pos
                return None
            return None
        def __repr__(self):
            return '<%s "%s">' % (self.__class__.__name__, self.name)

    class _FunctionText(object):
        """Function that returns text content."""
        def __call__(self, kind, data, pos):
            if kind is Stream.TEXT:
                return kind, data, pos
            return None
        def __repr__(self):
            return '<%s>' % (self.__class__.__name__)

    class _LiteralString(object):
        """Always returns a literal string."""
        def __init__(self, value):
            self.value = value
        def __call__(self, *_):
            return Stream.TEXT, self.value, (-1, -1)
        def __repr__(self):
            return '<%s>' % (self.__class__.__name__)

    class _OperatorEq(object):
        """Equality comparison operator."""
        def __init__(self, lval, rval):
            self.lval = lval
            self.rval = rval
        def __call__(self, kind, data, pos):
            lval = self.lval(kind, data, pos)
            rval = self.rval(kind, data, pos)
            # Compare the data of the two events; an operand that produced no
            # event is compared as its own falsy value.
            return (lval and lval[1]) == (rval and rval[1])
        def __repr__(self):
            return '<%s %r = %r>' % (self.__class__.__name__, self.lval,
                                     self.rval)

    class _OperatorNeq(object):
        """Inequality comparison operator."""
        def __init__(self, lval, rval):
            self.lval = lval
            self.rval = rval
        def __call__(self, kind, data, pos):
            lval = self.lval(kind, data, pos)
            rval = self.rval(kind, data, pos)
            # Same operand handling as the equality operator, negated.
            return (lval and lval[1]) != (rval and rval[1])
        def __repr__(self):
            return '<%s %r != %r>' % (self.__class__.__name__, self.lval,
                                      self.rval)