Mercurial > genshi > genshi-test
comparison genshi/core.py @ 902:09cc3627654c experimental-inline
Sync `experimental/inline` branch with [source:trunk@1126].
author | cmlenz |
---|---|
date | Fri, 23 Apr 2010 21:08:26 +0000 |
parents | de82830f8816 |
children | bb813ef5fe25 |
comparison
equal
deleted
inserted
replaced
830:de82830f8816 | 902:09cc3627654c |
---|---|
1 # -*- coding: utf-8 -*- | 1 # -*- coding: utf-8 -*- |
2 # | 2 # |
3 # Copyright (C) 2006-2008 Edgewall Software | 3 # Copyright (C) 2006-2009 Edgewall Software |
4 # All rights reserved. | 4 # All rights reserved. |
5 # | 5 # |
6 # This software is licensed as described in the file COPYING, which | 6 # This software is licensed as described in the file COPYING, which |
7 # you should have received as part of this distribution. The terms | 7 # you should have received as part of this distribution. The terms |
8 # are also available at http://genshi.edgewall.org/wiki/License. | 8 # are also available at http://genshi.edgewall.org/wiki/License. |
12 # history and logs, available at http://genshi.edgewall.org/log/. | 12 # history and logs, available at http://genshi.edgewall.org/log/. |
13 | 13 |
14 """Core classes for markup processing.""" | 14 """Core classes for markup processing.""" |
15 | 15 |
16 try: | 16 try: |
17 reduce # builtin in Python < 3 | |
18 except NameError: | |
17 from functools import reduce | 19 from functools import reduce |
18 except ImportError: | |
19 pass # builtin in Python <= 2.5 | |
20 from itertools import chain | 20 from itertools import chain |
21 import operator | 21 import operator |
22 | 22 |
23 from genshi.util import plaintext, stripentities, striptags | 23 from genshi.util import plaintext, stripentities, striptags, stringrepr |
24 | 24 |
25 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', | 25 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', |
26 'QName'] | 26 'QName'] |
27 __docformat__ = 'restructuredtext en' | 27 __docformat__ = 'restructuredtext en' |
28 | 28 |
91 | 91 |
92 Assume the following stream produced by the `HTML` function: | 92 Assume the following stream produced by the `HTML` function: |
93 | 93 |
94 >>> from genshi.input import HTML | 94 >>> from genshi.input import HTML |
95 >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''') | 95 >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''') |
96 >>> print html | 96 >>> print(html) |
97 <p onclick="alert('Whoa')">Hello, world!</p> | 97 <p onclick="alert('Whoa')">Hello, world!</p> |
98 | 98 |
99 A filter such as the HTML sanitizer can be applied to that stream using | 99 A filter such as the HTML sanitizer can be applied to that stream using |
100 the pipe notation as follows: | 100 the pipe notation as follows: |
101 | 101 |
102 >>> from genshi.filters import HTMLSanitizer | 102 >>> from genshi.filters import HTMLSanitizer |
103 >>> sanitizer = HTMLSanitizer() | 103 >>> sanitizer = HTMLSanitizer() |
104 >>> print html | sanitizer | 104 >>> print(html | sanitizer) |
105 <p>Hello, world!</p> | 105 <p>Hello, world!</p> |
106 | 106 |
107 Filters can be any function that accepts and produces a stream (where | 107 Filters can be any function that accepts and produces a stream (where |
108 a stream is anything that iterates over events): | 108 a stream is anything that iterates over events): |
109 | 109 |
110 >>> def uppercase(stream): | 110 >>> def uppercase(stream): |
111 ... for kind, data, pos in stream: | 111 ... for kind, data, pos in stream: |
112 ... if kind is TEXT: | 112 ... if kind is TEXT: |
113 ... data = data.upper() | 113 ... data = data.upper() |
114 ... yield kind, data, pos | 114 ... yield kind, data, pos |
115 >>> print html | sanitizer | uppercase | 115 >>> print(html | sanitizer | uppercase) |
116 <p>HELLO, WORLD!</p> | 116 <p>HELLO, WORLD!</p> |
117 | 117 |
118 Serializers can also be used with this notation: | 118 Serializers can also be used with this notation: |
119 | 119 |
120 >>> from genshi.output import TextSerializer | 120 >>> from genshi.output import TextSerializer |
121 >>> output = TextSerializer() | 121 >>> output = TextSerializer() |
122 >>> print html | sanitizer | uppercase | output | 122 >>> print(html | sanitizer | uppercase | output) |
123 HELLO, WORLD! | 123 HELLO, WORLD! |
124 | 124 |
125 Commonly, serializers should be used at the end of the "pipeline"; | 125 Commonly, serializers should be used at the end of the "pipeline"; |
126 using them somewhere in the middle may produce unexpected results. | 126 using them somewhere in the middle may produce unexpected results. |
127 | 127 |
186 """Return a new stream that contains the events matching the given | 186 """Return a new stream that contains the events matching the given |
187 XPath expression. | 187 XPath expression. |
188 | 188 |
189 >>> from genshi import HTML | 189 >>> from genshi import HTML |
190 >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>') | 190 >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>') |
191 >>> print stream.select('elem') | 191 >>> print(stream.select('elem')) |
192 <elem>foo</elem><elem>bar</elem> | 192 <elem>foo</elem><elem>bar</elem> |
193 >>> print stream.select('elem/text()') | 193 >>> print(stream.select('elem/text()')) |
194 foobar | 194 foobar |
195 | 195 |
196 Note that the outermost element of the stream becomes the *context | 196 Note that the outermost element of the stream becomes the *context |
197 node* for the XPath test. That means that the expression "doc" would | 197 node* for the XPath test. That means that the expression "doc" would |
198 not match anything in the example above, because it only tests against | 198 not match anything in the example above, because it only tests against |
199 child elements of the outermost element: | 199 child elements of the outermost element: |
200 | 200 |
201 >>> print stream.select('doc') | 201 >>> print(stream.select('doc')) |
202 <BLANKLINE> | 202 <BLANKLINE> |
203 | 203 |
204 You can use the "." expression to match the context node itself | 204 You can use the "." expression to match the context node itself |
205 (although that usually makes little sense): | 205 (although that usually makes little sense): |
206 | 206 |
207 >>> print stream.select('.') | 207 >>> print(stream.select('.')) |
208 <doc><elem>foo</elem><elem>bar</elem></doc> | 208 <doc><elem>foo</elem><elem>bar</elem></doc> |
209 | 209 |
210 :param path: a string containing the XPath expression | 210 :param path: a string containing the XPath expression |
211 :param namespaces: mapping of namespace prefixes used in the path | 211 :param namespaces: mapping of namespace prefixes used in the path |
212 :param variables: mapping of variable names to values | 212 :param variables: mapping of variable names to values |
263 START_CDATA = Stream.START_CDATA | 263 START_CDATA = Stream.START_CDATA |
264 END_CDATA = Stream.END_CDATA | 264 END_CDATA = Stream.END_CDATA |
265 PI = Stream.PI | 265 PI = Stream.PI |
266 COMMENT = Stream.COMMENT | 266 COMMENT = Stream.COMMENT |
267 | 267 |
268 | |
268 def _ensure(stream): | 269 def _ensure(stream): |
269 """Ensure that every item on the stream is actually a markup event.""" | 270 """Ensure that every item on the stream is actually a markup event.""" |
270 stream = iter(stream) | 271 stream = iter(stream) |
271 event = stream.next() | 272 event = stream.next() |
272 | 273 |
351 """ | 352 """ |
352 for attr, _ in self: | 353 for attr, _ in self: |
353 if attr == name: | 354 if attr == name: |
354 return True | 355 return True |
355 | 356 |
357 def __getitem__(self, i): | |
358 """Return an item or slice of the attributes list. | |
359 | |
360 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
361 >>> attrs[1] | |
362 ('title', 'Foo') | |
363 >>> attrs[1:] | |
364 Attrs([('title', 'Foo')]) | |
365 """ | |
366 items = tuple.__getitem__(self, i) | |
367 if type(i) is slice: | |
368 return Attrs(items) | |
369 return items | |
370 | |
356 def __getslice__(self, i, j): | 371 def __getslice__(self, i, j): |
357 """Return a slice of the attributes list. | 372 """Return a slice of the attributes list. |
358 | 373 |
359 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | 374 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) |
360 >>> attrs[1:] | 375 >>> attrs[1:] |
410 | 425 |
411 The returned event is a `TEXT` event, the data is the value of all | 426 The returned event is a `TEXT` event, the data is the value of all |
412 attributes joined together. | 427 attributes joined together. |
413 | 428 |
414 >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple() | 429 >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple() |
415 ('TEXT', u'#Foo', (None, -1, -1)) | 430 ('TEXT', '#Foo', (None, -1, -1)) |
416 | 431 |
417 :return: a `TEXT` event | 432 :return: a `TEXT` event |
418 :rtype: `tuple` | 433 :rtype: `tuple` |
419 """ | 434 """ |
420 return TEXT, u''.join([x[1] for x in self]), (None, -1, -1) | 435 return TEXT, ''.join([x[1] for x in self]), (None, -1, -1) |
421 | 436 |
422 | 437 |
423 class Markup(unicode): | 438 class Markup(unicode): |
424 """Marks a string as being safe for inclusion in HTML/XML output without | 439 """Marks a string as being safe for inclusion in HTML/XML output without |
425 needing to be escaped. | 440 needing to be escaped. |
426 """ | 441 """ |
427 __slots__ = [] | 442 __slots__ = [] |
428 | 443 |
429 def __add__(self, other): | 444 def __add__(self, other): |
430 return Markup(unicode(self) + unicode(escape(other))) | 445 return Markup(unicode.__add__(self, escape(other))) |
431 | 446 |
432 def __radd__(self, other): | 447 def __radd__(self, other): |
433 return Markup(unicode(escape(other)) + unicode(self)) | 448 return Markup(unicode.__add__(escape(other), self)) |
434 | 449 |
435 def __mod__(self, args): | 450 def __mod__(self, args): |
436 if isinstance(args, dict): | 451 if isinstance(args, dict): |
437 args = dict(zip(args.keys(), map(escape, args.values()))) | 452 args = dict(zip(args.keys(), map(escape, args.values()))) |
438 elif isinstance(args, (list, tuple)): | 453 elif isinstance(args, (list, tuple)): |
440 else: | 455 else: |
441 args = escape(args) | 456 args = escape(args) |
442 return Markup(unicode.__mod__(self, args)) | 457 return Markup(unicode.__mod__(self, args)) |
443 | 458 |
444 def __mul__(self, num): | 459 def __mul__(self, num): |
445 return Markup(unicode(self) * num) | 460 return Markup(unicode.__mul__(self, num)) |
446 | 461 __rmul__ = __mul__ |
447 def __rmul__(self, num): | |
448 return Markup(num * unicode(self)) | |
449 | 462 |
450 def __repr__(self): | 463 def __repr__(self): |
451 return '<%s %r>' % (self.__class__.__name__, unicode(self)) | 464 return "<%s %s>" % (type(self).__name__, unicode.__repr__(self)) |
452 | 465 |
453 def join(self, seq, escape_quotes=True): | 466 def join(self, seq, escape_quotes=True): |
454 """Return a `Markup` object which is the concatenation of the strings | 467 """Return a `Markup` object which is the concatenation of the strings |
455 in the given sequence, where this `Markup` object is the separator | 468 in the given sequence, where this `Markup` object is the separator |
456 between the joined elements. | 469 between the joined elements. |
463 should be escaped | 476 should be escaped |
464 :return: the joined `Markup` object | 477 :return: the joined `Markup` object |
465 :rtype: `Markup` | 478 :rtype: `Markup` |
466 :see: `escape` | 479 :see: `escape` |
467 """ | 480 """ |
468 return Markup(unicode(self).join([escape(item, quotes=escape_quotes) | 481 return Markup(unicode.join(self, [escape(item, quotes=escape_quotes) |
469 for item in seq])) | 482 for item in seq])) |
470 | 483 |
471 @classmethod | 484 @classmethod |
472 def escape(cls, text, quotes=True): | 485 def escape(cls, text, quotes=True): |
473 """Create a Markup instance from a string and escape special characters | 486 """Create a Markup instance from a string and escape special characters |
494 if type(text) is cls: | 507 if type(text) is cls: |
495 return text | 508 return text |
496 if hasattr(text, '__html__'): | 509 if hasattr(text, '__html__'): |
497 return Markup(text.__html__()) | 510 return Markup(text.__html__()) |
498 | 511 |
499 text = unicode(text).replace('&', '&amp;') \ | 512 text = text.replace('&', '&amp;') \ |
500 .replace('<', '&lt;') \ | 513 .replace('<', '&lt;') \ |
501 .replace('>', '&gt;') | 514 .replace('>', '&gt;') |
502 if quotes: | 515 if quotes: |
503 text = text.replace('"', '&#34;') | 516 text = text.replace('"', '&#34;') |
504 return cls(text) | 517 return cls(text) |
505 | 518 |
506 def unescape(self): | 519 def unescape(self): |
512 :return: the unescaped string | 525 :return: the unescaped string |
513 :rtype: `unicode` | 526 :rtype: `unicode` |
514 :see: `genshi.core.unescape` | 527 :see: `genshi.core.unescape` |
515 """ | 528 """ |
516 if not self: | 529 if not self: |
517 return u'' | 530 return '' |
518 return unicode(self).replace('&#34;', '"') \ | 531 return unicode(self).replace('&#34;', '"') \ |
519 .replace('&gt;', '>') \ | 532 .replace('&gt;', '>') \ |
520 .replace('&lt;', '<') \ | 533 .replace('&lt;', '<') \ |
521 .replace('&amp;', '&') | 534 .replace('&amp;', '&') |
522 | 535 |
547 try: | 560 try: |
548 from genshi._speedups import Markup | 561 from genshi._speedups import Markup |
549 except ImportError: | 562 except ImportError: |
550 pass # just use the Python implementation | 563 pass # just use the Python implementation |
551 | 564 |
565 | |
552 escape = Markup.escape | 566 escape = Markup.escape |
567 | |
553 | 568 |
554 def unescape(text): | 569 def unescape(text): |
555 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. | 570 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. |
556 | 571 |
557 >>> unescape(Markup('1 &lt; 2')) | 572 >>> unescape(Markup('1 &lt; 2')) |
581 | 596 |
582 A `Namespace` object is instantiated with the namespace URI. | 597 A `Namespace` object is instantiated with the namespace URI. |
583 | 598 |
584 >>> html = Namespace('http://www.w3.org/1999/xhtml') | 599 >>> html = Namespace('http://www.w3.org/1999/xhtml') |
585 >>> html | 600 >>> html |
586 <Namespace "http://www.w3.org/1999/xhtml"> | 601 Namespace('http://www.w3.org/1999/xhtml') |
587 >>> html.uri | 602 >>> html.uri |
588 u'http://www.w3.org/1999/xhtml' | 603 u'http://www.w3.org/1999/xhtml' |
589 | 604 |
590 The `Namespace` object can then be used to generate `QName` objects with | 605 The `Namespace` object can then be used to generate `QName` objects with |
591 that namespace: | 606 that namespace: |
592 | 607 |
593 >>> html.body | 608 >>> html.body |
594 QName(u'http://www.w3.org/1999/xhtml}body') | 609 QName('http://www.w3.org/1999/xhtml}body') |
595 >>> html.body.localname | 610 >>> html.body.localname |
596 u'body' | 611 u'body' |
597 >>> html.body.namespace | 612 >>> html.body.namespace |
598 u'http://www.w3.org/1999/xhtml' | 613 u'http://www.w3.org/1999/xhtml' |
599 | 614 |
600 The same works using item access notation, which is useful for element or | 615 The same works using item access notation, which is useful for element or |
601 attribute names that are not valid Python identifiers: | 616 attribute names that are not valid Python identifiers: |
602 | 617 |
603 >>> html['body'] | 618 >>> html['body'] |
604 QName(u'http://www.w3.org/1999/xhtml}body') | 619 QName('http://www.w3.org/1999/xhtml}body') |
605 | 620 |
606 A `Namespace` object can also be used to test whether a specific `QName` | 621 A `Namespace` object can also be used to test whether a specific `QName` |
607 belongs to that namespace using the ``in`` operator: | 622 belongs to that namespace using the ``in`` operator: |
608 | 623 |
609 >>> qname = html.body | 624 >>> qname = html.body |
639 if isinstance(other, Namespace): | 654 if isinstance(other, Namespace): |
640 return self.uri == other.uri | 655 return self.uri == other.uri |
641 return self.uri == other | 656 return self.uri == other |
642 | 657 |
643 def __getitem__(self, name): | 658 def __getitem__(self, name): |
644 return QName(self.uri + u'}' + name) | 659 return QName(self.uri + '}' + name) |
645 __getattr__ = __getitem__ | 660 __getattr__ = __getitem__ |
646 | 661 |
647 def __hash__(self): | 662 def __hash__(self): |
648 return hash(self.uri) | 663 return hash(self.uri) |
649 | 664 |
650 def __repr__(self): | 665 def __repr__(self): |
651 return '<Namespace "%s">' % self.uri | 666 return '%s(%s)' % (type(self).__name__, stringrepr(self.uri)) |
652 | 667 |
653 def __str__(self): | 668 def __str__(self): |
654 return self.uri.encode('utf-8') | 669 return self.uri.encode('utf-8') |
655 | 670 |
656 def __unicode__(self): | 671 def __unicode__(self): |
669 namespace URI can be obtained through the additional `namespace` attribute, | 684 namespace URI can be obtained through the additional `namespace` attribute, |
670 while the local name can be accessed through the `localname` attribute. | 685 while the local name can be accessed through the `localname` attribute. |
671 | 686 |
672 >>> qname = QName('foo') | 687 >>> qname = QName('foo') |
673 >>> qname | 688 >>> qname |
674 QName(u'foo') | 689 QName('foo') |
675 >>> qname.localname | 690 >>> qname.localname |
676 u'foo' | 691 u'foo' |
677 >>> qname.namespace | 692 >>> qname.namespace |
678 | 693 |
679 >>> qname = QName('http://www.w3.org/1999/xhtml}body') | 694 >>> qname = QName('http://www.w3.org/1999/xhtml}body') |
680 >>> qname | 695 >>> qname |
681 QName(u'http://www.w3.org/1999/xhtml}body') | 696 QName('http://www.w3.org/1999/xhtml}body') |
682 >>> qname.localname | 697 >>> qname.localname |
683 u'body' | 698 u'body' |
684 >>> qname.namespace | 699 >>> qname.namespace |
685 u'http://www.w3.org/1999/xhtml' | 700 u'http://www.w3.org/1999/xhtml' |
686 """ | 701 """ |
694 brace is optional | 709 brace is optional |
695 """ | 710 """ |
696 if type(qname) is cls: | 711 if type(qname) is cls: |
697 return qname | 712 return qname |
698 | 713 |
699 parts = qname.lstrip(u'{').split(u'}', 1) | 714 parts = qname.lstrip('{').split('}', 1) |
700 if len(parts) > 1: | 715 if len(parts) > 1: |
701 self = unicode.__new__(cls, u'{%s' % qname) | 716 self = unicode.__new__(cls, '{%s' % qname) |
702 self.namespace, self.localname = map(unicode, parts) | 717 self.namespace, self.localname = map(unicode, parts) |
703 else: | 718 else: |
704 self = unicode.__new__(cls, qname) | 719 self = unicode.__new__(cls, qname) |
705 self.namespace, self.localname = None, unicode(qname) | 720 self.namespace, self.localname = None, unicode(qname) |
706 return self | 721 return self |
707 | 722 |
708 def __getnewargs__(self): | 723 def __getnewargs__(self): |
709 return (self.lstrip('{'),) | 724 return (self.lstrip('{'),) |
710 | 725 |
711 def __repr__(self): | 726 def __repr__(self): |
712 return 'QName(%s)' % unicode.__repr__(self.lstrip('{')) | 727 return '%s(%s)' % (type(self).__name__, stringrepr(self.lstrip('{'))) |