Mercurial > genshi > genshi-test
comparison genshi/core.py @ 860:61d37796da98
A bit of cleanup of the `Markup` Python implementation.
author | cmlenz |
---|---|
date | Thu, 12 Nov 2009 17:31:40 +0000 |
parents | 24733a5854d9 |
children | e098d29c4de1 |
comparison
equal
deleted
inserted
replaced
859:fbe34d12acde | 860:61d37796da98 |
---|---|
440 needing to be escaped. | 440 needing to be escaped. |
441 """ | 441 """ |
442 __slots__ = [] | 442 __slots__ = [] |
443 | 443 |
444 def __add__(self, other): | 444 def __add__(self, other): |
445 return Markup(unicode(self) + unicode(escape(other))) | 445 return Markup(unicode.__add__(self, escape(other))) |
446 | 446 |
447 def __radd__(self, other): | 447 def __radd__(self, other): |
448 return Markup(unicode(escape(other)) + unicode(self)) | 448 return Markup(unicode.__add__(escape(other), self)) |
449 | 449 |
450 def __mod__(self, args): | 450 def __mod__(self, args): |
451 if isinstance(args, dict): | 451 if isinstance(args, dict): |
452 args = dict(zip(args.keys(), map(escape, args.values()))) | 452 args = dict(zip(args.keys(), map(escape, args.values()))) |
453 elif isinstance(args, (list, tuple)): | 453 elif isinstance(args, (list, tuple)): |
455 else: | 455 else: |
456 args = escape(args) | 456 args = escape(args) |
457 return Markup(unicode.__mod__(self, args)) | 457 return Markup(unicode.__mod__(self, args)) |
458 | 458 |
459 def __mul__(self, num): | 459 def __mul__(self, num): |
460 return Markup(unicode(self) * num) | 460 return Markup(unicode.__mul__(self, num)) |
461 | 461 __rmul__ = __mul__ |
462 def __rmul__(self, num): | |
463 return Markup(num * unicode(self)) | |
464 | 462 |
465 def __repr__(self): | 463 def __repr__(self): |
466 return "<%s %s>" % (self.__class__.__name__, unicode.__repr__(self)) | 464 return "<%s %s>" % (type(self).__name__, unicode.__repr__(self)) |
467 | 465 |
468 def join(self, seq, escape_quotes=True): | 466 def join(self, seq, escape_quotes=True): |
469 """Return a `Markup` object which is the concatenation of the strings | 467 """Return a `Markup` object which is the concatenation of the strings |
470 in the given sequence, where this `Markup` object is the separator | 468 in the given sequence, where this `Markup` object is the separator |
471 between the joined elements. | 469 between the joined elements. |
478 should be escaped | 476 should be escaped |
479 :return: the joined `Markup` object | 477 :return: the joined `Markup` object |
480 :rtype: `Markup` | 478 :rtype: `Markup` |
481 :see: `escape` | 479 :see: `escape` |
482 """ | 480 """ |
483 return Markup(unicode(self).join([escape(item, quotes=escape_quotes) | 481 return Markup(unicode.join(self, [escape(item, quotes=escape_quotes) |
484 for item in seq])) | 482 for item in seq])) |
485 | 483 |
486 @classmethod | 484 @classmethod |
487 def escape(cls, text, quotes=True): | 485 def escape(cls, text, quotes=True): |
488 """Create a Markup instance from a string and escape special characters | 486 """Create a Markup instance from a string and escape special characters |
509 if type(text) is cls: | 507 if type(text) is cls: |
510 return text | 508 return text |
511 if hasattr(text, '__html__'): | 509 if hasattr(text, '__html__'): |
512 return Markup(text.__html__()) | 510 return Markup(text.__html__()) |
513 | 511 |
514 text = unicode(text).replace('&', '&amp;') \ | 512 text = text.replace('&', '&amp;') \ |
515 .replace('<', '&lt;') \ | 513 .replace('<', '&lt;') \ |
516 .replace('>', '&gt;') | 514 .replace('>', '&gt;') |
517 if quotes: | 515 if quotes: |
518 text = text.replace('"', '&#34;') | 516 text = text.replace('"', '&#34;') |
519 return cls(text) | 517 return cls(text) |
520 | 518 |
521 def unescape(self): | 519 def unescape(self): |
725 def __getnewargs__(self): | 723 def __getnewargs__(self): |
726 return (self.lstrip('{'),) | 724 return (self.lstrip('{'),) |
727 | 725 |
728 def __repr__(self): | 726 def __repr__(self): |
729 return 'QName(%s)' % stringrepr(self.lstrip('{')) | 727 return 'QName(%s)' % stringrepr(self.lstrip('{')) |
728 # -*- coding: utf-8 -*- | |
729 # | |
730 # Copyright (C) 2006-2009 Edgewall Software | |
731 # All rights reserved. | |
732 # | |
733 # This software is licensed as described in the file COPYING, which | |
734 # you should have received as part of this distribution. The terms | |
735 # are also available at http://genshi.edgewall.org/wiki/License. | |
736 # | |
737 # This software consists of voluntary contributions made by many | |
738 # individuals. For the exact contribution history, see the revision | |
739 # history and logs, available at http://genshi.edgewall.org/log/. | |
740 | |
741 """Core classes for markup processing.""" | |
742 | |
743 try: | |
744 reduce # builtin in Python < 3 | |
745 except NameError: | |
746 from functools import reduce | |
747 from itertools import chain | |
748 import operator | |
749 | |
750 from genshi.util import plaintext, stripentities, striptags, stringrepr | |
751 | |
752 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', | |
753 'QName'] | |
754 __docformat__ = 'restructuredtext en' | |
755 | |
756 | |
757 class StreamEventKind(str): | |
758 """A kind of event on a markup stream.""" | |
759 __slots__ = [] | |
760 _instances = {} | |
761 | |
762 def __new__(cls, val): | |
763 return cls._instances.setdefault(val, str.__new__(cls, val)) | |
764 | |
765 | |
766 class Stream(object): | |
767 """Represents a stream of markup events. | |
768 | |
769 This class is basically an iterator over the events. | |
770 | |
771 Stream events are tuples of the form:: | |
772 | |
773 (kind, data, position) | |
774 | |
775 where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc), | |
776 ``data`` depends on the kind of event, and ``position`` is a | |
777 ``(filename, line, offset)`` tuple that contains the location of the | |
778 original element or text in the input. If the original location is unknown, | |
779 ``position`` is ``(None, -1, -1)``. | |
780 | |
781 Also provided are ways to serialize the stream to text. The `serialize()` | |
782 method will return an iterator over generated strings, while `render()` | |
783 returns the complete generated text at once. Both accept various parameters | |
784 that impact the way the stream is serialized. | |
785 """ | |
786 __slots__ = ['events', 'serializer'] | |
787 | |
788 START = StreamEventKind('START') #: a start tag | |
789 END = StreamEventKind('END') #: an end tag | |
790 TEXT = StreamEventKind('TEXT') #: literal text | |
791 XML_DECL = StreamEventKind('XML_DECL') #: XML declaration | |
792 DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration | |
793 START_NS = StreamEventKind('START_NS') #: start namespace mapping | |
794 END_NS = StreamEventKind('END_NS') #: end namespace mapping | |
795 START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section | |
796 END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section | |
797 PI = StreamEventKind('PI') #: processing instruction | |
798 COMMENT = StreamEventKind('COMMENT') #: comment | |
799 | |
800 def __init__(self, events, serializer=None): | |
801 """Initialize the stream with a sequence of markup events. | |
802 | |
803 :param events: a sequence or iterable providing the events | |
804 :param serializer: the default serialization method to use for this | |
805 stream | |
806 | |
807 :note: Changed in 0.5: added the `serializer` argument | |
808 """ | |
809 self.events = events #: The underlying iterable producing the events | |
810 self.serializer = serializer #: The default serialization method | |
811 | |
812 def __iter__(self): | |
813 return iter(self.events) | |
814 | |
815 def __or__(self, function): | |
816 """Override the "bitwise or" operator to apply filters or serializers | |
817 to the stream, providing a syntax similar to pipes on Unix shells. | |
818 | |
819 Assume the following stream produced by the `HTML` function: | |
820 | |
821 >>> from genshi.input import HTML | |
822 >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''') | |
823 >>> print(html) | |
824 <p onclick="alert('Whoa')">Hello, world!</p> | |
825 | |
826 A filter such as the HTML sanitizer can be applied to that stream using | |
827 the pipe notation as follows: | |
828 | |
829 >>> from genshi.filters import HTMLSanitizer | |
830 >>> sanitizer = HTMLSanitizer() | |
831 >>> print(html | sanitizer) | |
832 <p>Hello, world!</p> | |
833 | |
834 Filters can be any function that accepts and produces a stream (where | |
835 a stream is anything that iterates over events): | |
836 | |
837 >>> def uppercase(stream): | |
838 ... for kind, data, pos in stream: | |
839 ... if kind is TEXT: | |
840 ... data = data.upper() | |
841 ... yield kind, data, pos | |
842 >>> print(html | sanitizer | uppercase) | |
843 <p>HELLO, WORLD!</p> | |
844 | |
845 Serializers can also be used with this notation: | |
846 | |
847 >>> from genshi.output import TextSerializer | |
848 >>> output = TextSerializer() | |
849 >>> print(html | sanitizer | uppercase | output) | |
850 HELLO, WORLD! | |
851 | |
852 Commonly, serializers should be used at the end of the "pipeline"; | |
853 using them somewhere in the middle may produce unexpected results. | |
854 | |
855 :param function: the callable object that should be applied as a filter | |
856 :return: the filtered stream | |
857 :rtype: `Stream` | |
858 """ | |
859 return Stream(_ensure(function(self)), serializer=self.serializer) | |
860 | |
861 def filter(self, *filters): | |
862 """Apply filters to the stream. | |
863 | |
864 This method returns a new stream with the given filters applied. The | |
865 filters must be callables that accept the stream object as parameter, | |
866 and return the filtered stream. | |
867 | |
868 The call:: | |
869 | |
870 stream.filter(filter1, filter2) | |
871 | |
872 is equivalent to:: | |
873 | |
874 stream | filter1 | filter2 | |
875 | |
876 :param filters: one or more callable objects that should be applied as | |
877 filters | |
878 :return: the filtered stream | |
879 :rtype: `Stream` | |
880 """ | |
881 return reduce(operator.or_, (self,) + filters) | |
882 | |
883 def render(self, method=None, encoding='utf-8', out=None, **kwargs): | |
884 """Return a string representation of the stream. | |
885 | |
886 Any additional keyword arguments are passed to the serializer, and thus | |
887 depend on the `method` parameter value. | |
888 | |
889 :param method: determines how the stream is serialized; can be either | |
890 "xml", "xhtml", "html", "text", or a custom serializer | |
891 class; if `None`, the default serialization method of | |
892 the stream is used | |
893 :param encoding: how the output string should be encoded; if set to | |
894 `None`, this method returns a `unicode` object | |
895 :param out: a file-like object that the output should be written to | |
896 instead of being returned as one big string; note that if | |
897 this is a file or socket (or similar), the `encoding` must | |
898 not be `None` (that is, the output must be encoded) | |
899 :return: a `str` or `unicode` object (depending on the `encoding` | |
900 parameter), or `None` if the `out` parameter is provided | |
901 :rtype: `basestring` | |
902 | |
903 :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer | |
904 :note: Changed in 0.5: added the `out` parameter | |
905 """ | |
906 from genshi.output import encode | |
907 if method is None: | |
908 method = self.serializer or 'xml' | |
909 generator = self.serialize(method=method, **kwargs) | |
910 return encode(generator, method=method, encoding=encoding, out=out) | |
911 | |
912 def select(self, path, namespaces=None, variables=None): | |
913 """Return a new stream that contains the events matching the given | |
914 XPath expression. | |
915 | |
916 >>> from genshi import HTML | |
917 >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>') | |
918 >>> print(stream.select('elem')) | |
919 <elem>foo</elem><elem>bar</elem> | |
920 >>> print(stream.select('elem/text()')) | |
921 foobar | |
922 | |
923 Note that the outermost element of the stream becomes the *context | |
924 node* for the XPath test. That means that the expression "doc" would | |
925 not match anything in the example above, because it only tests against | |
926 child elements of the outermost element: | |
927 | |
928 >>> print(stream.select('doc')) | |
929 <BLANKLINE> | |
930 | |
931 You can use the "." expression to match the context node itself | |
932 (although that usually makes little sense): | |
933 | |
934 >>> print(stream.select('.')) | |
935 <doc><elem>foo</elem><elem>bar</elem></doc> | |
936 | |
937 :param path: a string containing the XPath expression | |
938 :param namespaces: mapping of namespace prefixes used in the path | |
939 :param variables: mapping of variable names to values | |
940 :return: the selected substream | |
941 :rtype: `Stream` | |
942 :raises PathSyntaxError: if the given path expression is invalid or not | |
943 supported | |
944 """ | |
945 from genshi.path import Path | |
946 return Path(path).select(self, namespaces, variables) | |
947 | |
948 def serialize(self, method='xml', **kwargs): | |
949 """Generate strings corresponding to a specific serialization of the | |
950 stream. | |
951 | |
952 Unlike the `render()` method, this method is a generator that returns | |
953 the serialized output incrementally, as opposed to returning a single | |
954 string. | |
955 | |
956 Any additional keyword arguments are passed to the serializer, and thus | |
957 depend on the `method` parameter value. | |
958 | |
959 :param method: determines how the stream is serialized; can be either | |
960 "xml", "xhtml", "html", "text", or a custom serializer | |
961 class; if `None`, the default serialization method of | |
962 the stream is used | |
963 :return: an iterator over the serialization results (`Markup` or | |
964 `unicode` objects, depending on the serialization method) | |
965 :rtype: ``iterator`` | |
966 :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer | |
967 """ | |
968 from genshi.output import get_serializer | |
969 if method is None: | |
970 method = self.serializer or 'xml' | |
971 return get_serializer(method, **kwargs)(_ensure(self)) | |
972 | |
973 def __str__(self): | |
974 return self.render() | |
975 | |
976 def __unicode__(self): | |
977 return self.render(encoding=None) | |
978 | |
979 def __html__(self): | |
980 return self | |
981 | |
982 | |
983 START = Stream.START | |
984 END = Stream.END | |
985 TEXT = Stream.TEXT | |
986 XML_DECL = Stream.XML_DECL | |
987 DOCTYPE = Stream.DOCTYPE | |
988 START_NS = Stream.START_NS | |
989 END_NS = Stream.END_NS | |
990 START_CDATA = Stream.START_CDATA | |
991 END_CDATA = Stream.END_CDATA | |
992 PI = Stream.PI | |
993 COMMENT = Stream.COMMENT | |
994 | |
995 | |
996 def _ensure(stream): | |
997 """Ensure that every item on the stream is actually a markup event.""" | |
998 stream = iter(stream) | |
999 event = stream.next() | |
1000 | |
1001 # Check whether the iterable is a real markup event stream by examining the | |
1002 # first item it yields; if it's not we'll need to do some conversion | |
1003 if type(event) is not tuple or len(event) != 3: | |
1004 for event in chain([event], stream): | |
1005 if hasattr(event, 'totuple'): | |
1006 event = event.totuple() | |
1007 else: | |
1008 event = TEXT, unicode(event), (None, -1, -1) | |
1009 yield event | |
1010 return | |
1011 | |
1012 # This looks like a markup event stream, so we'll just pass it through | |
1013 # unchanged | |
1014 yield event | |
1015 for event in stream: | |
1016 yield event | |
1017 | |
1018 | |
1019 class Attrs(tuple): | |
1020 """Immutable sequence type that stores the attributes of an element. | |
1021 | |
1022 Ordering of the attributes is preserved, while access by name is also | |
1023 supported. | |
1024 | |
1025 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
1026 >>> attrs | |
1027 Attrs([('href', '#'), ('title', 'Foo')]) | |
1028 | |
1029 >>> 'href' in attrs | |
1030 True | |
1031 >>> 'tabindex' in attrs | |
1032 False | |
1033 >>> attrs.get('title') | |
1034 'Foo' | |
1035 | |
1036 Instances may not be manipulated directly. Instead, the operators ``|`` and | |
1037 ``-`` can be used to produce new instances that have specific attributes | |
1038 added, replaced or removed. | |
1039 | |
1040 To remove an attribute, use the ``-`` operator. The right hand side can be | |
1041 either a string or a set/sequence of strings, identifying the name(s) of | |
1042 the attribute(s) to remove: | |
1043 | |
1044 >>> attrs - 'title' | |
1045 Attrs([('href', '#')]) | |
1046 >>> attrs - ('title', 'href') | |
1047 Attrs() | |
1048 | |
1049 The original instance is not modified, but the operator can of course be | |
1050 used with an assignment: | |
1051 | |
1052 >>> attrs | |
1053 Attrs([('href', '#'), ('title', 'Foo')]) | |
1054 >>> attrs -= 'title' | |
1055 >>> attrs | |
1056 Attrs([('href', '#')]) | |
1057 | |
1058 To add a new attribute, use the ``|`` operator, where the right hand value | |
1059 is a sequence of ``(name, value)`` tuples (which includes `Attrs` | |
1060 instances): | |
1061 | |
1062 >>> attrs | [('title', 'Bar')] | |
1063 Attrs([('href', '#'), ('title', 'Bar')]) | |
1064 | |
1065 If the attributes already contain an attribute with a given name, the value | |
1066 of that attribute is replaced: | |
1067 | |
1068 >>> attrs | [('href', 'http://example.org/')] | |
1069 Attrs([('href', 'http://example.org/')]) | |
1070 """ | |
1071 __slots__ = [] | |
1072 | |
1073 def __contains__(self, name): | |
1074 """Return whether the list includes an attribute with the specified | |
1075 name. | |
1076 | |
1077 :return: `True` if the list includes the attribute | |
1078 :rtype: `bool` | |
1079 """ | |
1080 for attr, _ in self: | |
1081 if attr == name: | |
1082 return True | |
1083 | |
1084 def __getitem__(self, i): | |
1085 """Return an item or slice of the attributes list. | |
1086 | |
1087 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
1088 >>> attrs[1] | |
1089 ('title', 'Foo') | |
1090 >>> attrs[1:] | |
1091 Attrs([('title', 'Foo')]) | |
1092 """ | |
1093 items = tuple.__getitem__(self, i) | |
1094 if type(i) is slice: | |
1095 return Attrs(items) | |
1096 return items | |
1097 | |
1098 def __getslice__(self, i, j): | |
1099 """Return a slice of the attributes list. | |
1100 | |
1101 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
1102 >>> attrs[1:] | |
1103 Attrs([('title', 'Foo')]) | |
1104 """ | |
1105 return Attrs(tuple.__getslice__(self, i, j)) | |
1106 | |
1107 def __or__(self, attrs): | |
1108 """Return a new instance that contains the attributes in `attrs` in | |
1109 addition to any already existing attributes. | |
1110 | |
1111 :return: a new instance with the merged attributes | |
1112 :rtype: `Attrs` | |
1113 """ | |
1114 repl = dict([(an, av) for an, av in attrs if an in self]) | |
1115 return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] + | |
1116 [(an, av) for an, av in attrs if an not in self]) | |
1117 | |
1118 def __repr__(self): | |
1119 if not self: | |
1120 return 'Attrs()' | |
1121 return 'Attrs([%s])' % ', '.join([repr(item) for item in self]) | |
1122 | |
1123 def __sub__(self, names): | |
1124 """Return a new instance in which all attributes with a name in `names` | |
1125 have been removed. | |
1126 | |
1127 :param names: the names of the attributes to remove | |
1128 :return: a new instance with the attribute removed | |
1129 :rtype: `Attrs` | |
1130 """ | |
1131 if isinstance(names, basestring): | |
1132 names = (names,) | |
1133 return Attrs([(name, val) for name, val in self if name not in names]) | |
1134 | |
1135 def get(self, name, default=None): | |
1136 """Return the value of the attribute with the specified name, or the | |
1137 value of the `default` parameter if no such attribute is found. | |
1138 | |
1139 :param name: the name of the attribute | |
1140 :param default: the value to return when the attribute does not exist | |
1141 :return: the attribute value, or the `default` value if that attribute | |
1142 does not exist | |
1143 :rtype: `object` | |
1144 """ | |
1145 for attr, value in self: | |
1146 if attr == name: | |
1147 return value | |
1148 return default | |
1149 | |
1150 def totuple(self): | |
1151 """Return the attributes as a markup event. | |
1152 | |
1153 The returned event is a `TEXT` event, the data is the value of all | |
1154 attributes joined together. | |
1155 | |
1156 >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple() | |
1157 ('TEXT', '#Foo', (None, -1, -1)) | |
1158 | |
1159 :return: a `TEXT` event | |
1160 :rtype: `tuple` | |
1161 """ | |
1162 return TEXT, ''.join([x[1] for x in self]), (None, -1, -1) | |
1163 | |
1164 | |
1165 class Markup(unicode): | |
1166 """Marks a string as being safe for inclusion in HTML/XML output without | |
1167 needing to be escaped. | |
1168 """ | |
1169 __slots__ = [] | |
1170 | |
1171 def __add__(self, other): | |
1172 return Markup(unicode.__add__(self, escape(other))) | |
1173 | |
1174 def __radd__(self, other): | |
1175 return Markup(unicode.__add__(escape(other), self)) | |
1176 | |
1177 def __mod__(self, args): | |
1178 if isinstance(args, dict): | |
1179 args = dict(zip(args.keys(), map(escape, args.values()))) | |
1180 elif isinstance(args, (list, tuple)): | |
1181 args = tuple(map(escape, args)) | |
1182 else: | |
1183 args = escape(args) | |
1184 return Markup(unicode.__mod__(self, args)) | |
1185 | |
1186 def __mul__(self, num): | |
1187 return Markup(unicode.__mul__(self, num)) | |
1188 __rmul__ = __mul__ | |
1189 | |
1190 def __repr__(self): | |
1191 return "<%s %s>" % (type(self).__name__, unicode.__repr__(self)) | |
1192 | |
1193 def join(self, seq, escape_quotes=True): | |
1194 """Return a `Markup` object which is the concatenation of the strings | |
1195 in the given sequence, where this `Markup` object is the separator | |
1196 between the joined elements. | |
1197 | |
1198 Any element in the sequence that is not a `Markup` instance is | |
1199 automatically escaped. | |
1200 | |
1201 :param seq: the sequence of strings to join | |
1202 :param escape_quotes: whether double quote characters in the elements | |
1203 should be escaped | |
1204 :return: the joined `Markup` object | |
1205 :rtype: `Markup` | |
1206 :see: `escape` | |
1207 """ | |
1208 return Markup(unicode.join(self, [escape(item, quotes=escape_quotes) | |
1209 for item in seq])) | |
1210 | |
1211 @classmethod | |
1212 def escape(cls, text, quotes=True): | |
1213 """Create a Markup instance from a string and escape special characters | |
1214 it may contain (<, >, & and \"). | |
1215 | |
1216 >>> escape('"1 < 2"') | |
1217 <Markup u'&#34;1 &lt; 2&#34;'> | |
1218 | |
1219 If the `quotes` parameter is set to `False`, the \" character is left | |
1220 as is. Escaping quotes is generally only required for strings that are | |
1221 to be used in attribute values. | |
1222 | |
1223 >>> escape('"1 < 2"', quotes=False) | |
1224 <Markup u'"1 &lt; 2"'> | |
1225 | |
1226 :param text: the text to escape | |
1227 :param quotes: if ``True``, double quote characters are escaped in | |
1228 addition to the other special characters | |
1229 :return: the escaped `Markup` string | |
1230 :rtype: `Markup` | |
1231 """ | |
1232 if not text: | |
1233 return cls() | |
1234 if type(text) is cls: | |
1235 return text | |
1236 if hasattr(text, '__html__'): | |
1237 return Markup(text.__html__()) | |
1238 | |
1239 text = text.replace('&', '&amp;') \ | |
1240 .replace('<', '&lt;') \ | |
1241 .replace('>', '&gt;') | |
1242 if quotes: | |
1243 text = text.replace('"', '&#34;') | |
1244 return cls(text) | |
1245 | |
1246 def unescape(self): | |
1247 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. | |
1248 | |
1249 >>> Markup('1 &lt; 2').unescape() | |
1250 u'1 < 2' | |
1251 | |
1252 :return: the unescaped string | |
1253 :rtype: `unicode` | |
1254 :see: `genshi.core.unescape` | |
1255 """ | |
1256 if not self: | |
1257 return '' | |
1258 return unicode(self).replace('&#34;', '"') \ | |
1259 .replace('&gt;', '>') \ | |
1260 .replace('&lt;', '<') \ | |
1261 .replace('&amp;', '&') | |
1262 | |
1263 def stripentities(self, keepxmlentities=False): | |
1264 """Return a copy of the text with any character or numeric entities | |
1265 replaced by the equivalent UTF-8 characters. | |
1266 | |
1267 If the `keepxmlentities` parameter is provided and evaluates to `True`, | |
1268 the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and | |
1269 ``&quot;``) are not stripped. | |
1270 | |
1271 :return: a `Markup` instance with entities removed | |
1272 :rtype: `Markup` | |
1273 :see: `genshi.util.stripentities` | |
1274 """ | |
1275 return Markup(stripentities(self, keepxmlentities=keepxmlentities)) | |
1276 | |
1277 def striptags(self): | |
1278 """Return a copy of the text with all XML/HTML tags removed. | |
1279 | |
1280 :return: a `Markup` instance with all tags removed | |
1281 :rtype: `Markup` | |
1282 :see: `genshi.util.striptags` | |
1283 """ | |
1284 return Markup(striptags(self)) | |
1285 | |
1286 | |
1287 try: | |
1288 from genshi._speedups import Markup | |
1289 except ImportError: | |
1290 pass # just use the Python implementation | |
1291 | |
1292 | |
1293 escape = Markup.escape | |
1294 | |
1295 | |
1296 def unescape(text): | |
1297 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. | |
1298 | |
1299 >>> unescape(Markup('1 &lt; 2')) | |
1300 u'1 < 2' | |
1301 | |
1302 If the provided `text` object is not a `Markup` instance, it is returned | |
1303 unchanged. | |
1304 | |
1305 >>> unescape('1 &lt; 2') | |
1306 '1 &lt; 2' | |
1307 | |
1308 :param text: the text to unescape | |
1309 :return: the unescaped string | |
1310 :rtype: `unicode` | |
1311 """ | |
1312 if not isinstance(text, Markup): | |
1313 return text | |
1314 return text.unescape() | |
1315 | |
1316 | |
1317 class Namespace(object): | |
1318 """Utility class creating and testing elements with a namespace. | |
1319 | |
1320 Internally, namespace URIs are encoded in the `QName` of any element or | |
1321 attribute, the namespace URI being enclosed in curly braces. This class | |
1322 helps create and test these strings. | |
1323 | |
1324 A `Namespace` object is instantiated with the namespace URI. | |
1325 | |
1326 >>> html = Namespace('http://www.w3.org/1999/xhtml') | |
1327 >>> html | |
1328 Namespace('http://www.w3.org/1999/xhtml') | |
1329 >>> html.uri | |
1330 u'http://www.w3.org/1999/xhtml' | |
1331 | |
1332 The `Namespace` object can then be used to generate `QName` objects with | |
1333 that namespace: | |
1334 | |
1335 >>> html.body | |
1336 QName('http://www.w3.org/1999/xhtml}body') | |
1337 >>> html.body.localname | |
1338 u'body' | |
1339 >>> html.body.namespace | |
1340 u'http://www.w3.org/1999/xhtml' | |
1341 | |
1342 The same works using item access notation, which is useful for element or | |
1343 attribute names that are not valid Python identifiers: | |
1344 | |
1345 >>> html['body'] | |
1346 QName('http://www.w3.org/1999/xhtml}body') | |
1347 | |
1348 A `Namespace` object can also be used to test whether a specific `QName` | |
1349 belongs to that namespace using the ``in`` operator: | |
1350 | |
1351 >>> qname = html.body | |
1352 >>> qname in html | |
1353 True | |
1354 >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2') | |
1355 False | |
1356 """ | |
1357 def __new__(cls, uri): | |
1358 if type(uri) is cls: | |
1359 return uri | |
1360 return object.__new__(cls) | |
1361 | |
1362 def __getnewargs__(self): | |
1363 return (self.uri,) | |
1364 | |
1365 def __getstate__(self): | |
1366 return self.uri | |
1367 | |
1368 def __setstate__(self, uri): | |
1369 self.uri = uri | |
1370 | |
1371 def __init__(self, uri): | |
1372 self.uri = unicode(uri) | |
1373 | |
1374 def __contains__(self, qname): | |
1375 return qname.namespace == self.uri | |
1376 | |
1377 def __ne__(self, other): | |
1378 return not self == other | |
1379 | |
1380 def __eq__(self, other): | |
1381 if isinstance(other, Namespace): | |
1382 return self.uri == other.uri | |
1383 return self.uri == other | |
1384 | |
1385 def __getitem__(self, name): | |
1386 return QName(self.uri + '}' + name) | |
1387 __getattr__ = __getitem__ | |
1388 | |
1389 def __hash__(self): | |
1390 return hash(self.uri) | |
1391 | |
1392 def __repr__(self): | |
1393 return 'Namespace(%s)' % stringrepr(self.uri) | |
1394 | |
1395 def __str__(self): | |
1396 return self.uri.encode('utf-8') | |
1397 | |
1398 def __unicode__(self): | |
1399 return self.uri | |
1400 | |
1401 | |
1402 # The namespace used by attributes such as xml:lang and xml:space | |
1403 XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace') | |
1404 | |
1405 | |
1406 class QName(unicode): | |
1407 """A qualified element or attribute name. | |
1408 | |
1409 The unicode value of instances of this class contains the qualified name of | |
1410 the element or attribute, in the form ``{namespace-uri}local-name``. The | |
1411 namespace URI can be obtained through the additional `namespace` attribute, | |
1412 while the local name can be accessed through the `localname` attribute. | |
1413 | |
1414 >>> qname = QName('foo') | |
1415 >>> qname | |
1416 QName('foo') | |
1417 >>> qname.localname | |
1418 u'foo' | |
1419 >>> qname.namespace | |
1420 | |
1421 >>> qname = QName('http://www.w3.org/1999/xhtml}body') | |
1422 >>> qname | |
1423 QName('http://www.w3.org/1999/xhtml}body') | |
1424 >>> qname.localname | |
1425 u'body' | |
1426 >>> qname.namespace | |
1427 u'http://www.w3.org/1999/xhtml' | |
1428 """ | |
1429 __slots__ = ['namespace', 'localname'] | |
1430 | |
1431 def __new__(cls, qname): | |
1432 """Create the `QName` instance. | |
1433 | |
1434 :param qname: the qualified name as a string of the form | |
1435 ``{namespace-uri}local-name``, where the leading curly | |
1436 brace is optional | |
1437 """ | |
1438 if type(qname) is cls: | |
1439 return qname | |
1440 | |
1441 parts = qname.lstrip('{').split('}', 1) | |
1442 if len(parts) > 1: | |
1443 self = unicode.__new__(cls, '{%s' % qname) | |
1444 self.namespace, self.localname = map(unicode, parts) | |
1445 else: | |
1446 self = unicode.__new__(cls, qname) | |
1447 self.namespace, self.localname = None, unicode(qname) | |
1448 return self | |
1449 | |
1450 def __getnewargs__(self): | |
1451 return (self.lstrip('{'),) | |
1452 | |
1453 def __repr__(self): | |
1454 return 'QName(%s)' % stringrepr(self.lstrip('{')) | |
1455 # -*- coding: utf-8 -*- | |
1456 # | |
1457 # Copyright (C) 2006-2009 Edgewall Software | |
1458 # All rights reserved. | |
1459 # | |
1460 # This software is licensed as described in the file COPYING, which | |
1461 # you should have received as part of this distribution. The terms | |
1462 # are also available at http://genshi.edgewall.org/wiki/License. | |
1463 # | |
1464 # This software consists of voluntary contributions made by many | |
1465 # individuals. For the exact contribution history, see the revision | |
1466 # history and logs, available at http://genshi.edgewall.org/log/. | |
1467 | |
1468 """Core classes for markup processing.""" | |
1469 | |
1470 try: | |
1471 reduce # builtin in Python < 3 | |
1472 except NameError: | |
1473 from functools import reduce | |
1474 from itertools import chain | |
1475 import operator | |
1476 | |
1477 from genshi.util import plaintext, stripentities, striptags, stringrepr | |
1478 | |
1479 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', | |
1480 'QName'] | |
1481 __docformat__ = 'restructuredtext en' | |
1482 | |
1483 | |
1484 class StreamEventKind(str): | |
1485 """A kind of event on a markup stream.""" | |
1486 __slots__ = [] | |
1487 _instances = {} | |
1488 | |
1489 def __new__(cls, val): | |
1490 return cls._instances.setdefault(val, str.__new__(cls, val)) | |
1491 | |
1492 | |
class Stream(object):
    """Represents a stream of markup events.

    This class is basically an iterator over the events.

    Stream events are tuples of the form::

      (kind, data, position)

    where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc),
    ``data`` depends on the kind of event, and ``position`` is a
    ``(filename, line, offset)`` tuple that contains the location of the
    original element or text in the input. If the original location is unknown,
    ``position`` is ``(None, -1, -1)``.

    Also provided are ways to serialize the stream to text. The `serialize()`
    method will return an iterator over generated strings, while `render()`
    returns the complete generated text at once. Both accept various parameters
    that impact the way the stream is serialized.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Initialize the stream with a sequence of markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the default serialization method to use for this
                           stream

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.events = events #: The underlying iterable producing the events
        self.serializer = serializer #: The default serialization method

    def __iter__(self):
        """Return an iterator over the underlying `events` iterable."""
        return iter(self.events)

    def __or__(self, function):
        """Override the "bitwise or" operator to apply filters or serializers
        to the stream, providing a syntax similar to pipes on Unix shells.

        Assume the following stream produced by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print(html)
        <p onclick="alert('Whoa')">Hello, world!</p>

        A filter such as the HTML sanitizer can be applied to that stream using
        the pipe notation as follows:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print(html | sanitizer)
        <p>Hello, world!</p>

        Filters can be any function that accepts and produces a stream (where
        a stream is anything that iterates over events):

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print(html | sanitizer | uppercase)
        <p>HELLO, WORLD!</p>

        Serializers can also be used with this notation:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print(html | sanitizer | uppercase | output)
        HELLO, WORLD!

        Commonly, serializers should be used at the end of the "pipeline";
        using them somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream
        :rtype: `Stream`
        """
        # The filter's output is passed through `_ensure()` so that the new
        # stream yields event tuples; the default serializer is carried over.
        return Stream(_ensure(function(self)), serializer=self.serializer)

    def filter(self, *filters):
        """Apply filters to the stream.

        This method returns a new stream with the given filters applied. The
        filters must be callables that accept the stream object as parameter,
        and return the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        # Fold the filters left to right through `__or__`, starting with self
        return reduce(operator.or_, (self,) + filters)

    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
        """Return a string representation of the stream.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        # Imported here rather than at module level
        from genshi.output import encode
        if method is None:
            # Fall back to the stream's own default, and to XML after that
            method = self.serializer or 'xml'
        generator = self.serialize(method=method, **kwargs)
        return encode(generator, method=method, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream that contains the events matching the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
        >>> print(stream.select('elem'))
        <elem>foo</elem><elem>bar</elem>
        >>> print(stream.select('elem/text()'))
        foobar

        Note that the outermost element of the stream becomes the *context
        node* for the XPath test. That means that the expression "doc" would
        not match anything in the example above, because it only tests against
        child elements of the outermost element:

        >>> print(stream.select('doc'))
        <BLANKLINE>

        You can use the "." expression to match the context node itself
        (although that usually makes little sense):

        >>> print(stream.select('.'))
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        # Imported here rather than at module level
        from genshi.path import Path
        return Path(path).select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Generate strings corresponding to a specific serialization of the
        stream.

        Unlike the `render()` method, this method is a generator that returns
        the serialized output incrementally, as opposed to returning a single
        string.

        Any additional keyword arguments are passed to the serializer, and thus
        depend on the `method` parameter value.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        # Imported here rather than at module level
        from genshi.output import get_serializer
        if method is None:
            method = self.serializer or 'xml'
        return get_serializer(method, **kwargs)(_ensure(self))

    def __str__(self):
        """Return the result of `render()` called with its default arguments."""
        return self.render()

    def __unicode__(self):
        """Return the result of `render()` called with ``encoding=None``."""
        return self.render(encoding=None)

    def __html__(self):
        """Return the stream itself.

        `Markup.escape()` checks for an ``__html__`` method and wraps its
        result in `Markup` instead of escaping the object as text.
        """
        return self
1708 | |
1709 | |
# Module-level aliases for the event kinds defined on `Stream`, so that they
# can be referenced without the class prefix.
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
1721 | |
1722 | |
1723 def _ensure(stream): | |
1724 """Ensure that every item on the stream is actually a markup event.""" | |
1725 stream = iter(stream) | |
1726 event = stream.next() | |
1727 | |
1728 # Check whether the iterable is a real markup event stream by examining the | |
1729 # first item it yields; if it's not we'll need to do some conversion | |
1730 if type(event) is not tuple or len(event) != 3: | |
1731 for event in chain([event], stream): | |
1732 if hasattr(event, 'totuple'): | |
1733 event = event.totuple() | |
1734 else: | |
1735 event = TEXT, unicode(event), (None, -1, -1) | |
1736 yield event | |
1737 return | |
1738 | |
1739 # This looks like a markup event stream, so we'll just pass it through | |
1740 # unchanged | |
1741 yield event | |
1742 for event in stream: | |
1743 yield event | |
1744 | |
1745 | |
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    Ordering of the attributes is preserved, while access by name is also
    supported.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the attribute name to look for
        :return: `True` if the list includes the attribute, `False` otherwise
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Return an explicit `False` rather than falling off the end with
        # `None`, so that direct calls honour the documented return type
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        items = tuple.__getitem__(self, i)
        if type(i) is slice:
            # Keep the `Attrs` type for slices instead of a plain tuple
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes.

        Attributes that already exist keep their position and take the new
        value; attributes that do not exist yet are appended in the order
        given.

        :param attrs: a sequence of ``(name, value)`` tuples
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        repl = dict([(an, av) for an, av in attrs if an in self])
        return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
                     [(an, av) for an, av in attrs if an not in self])

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance with all attributes with a name in `names` are
        removed.

        :param names: the names of the attributes to remove
        :return: a new instance with the attribute removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # A single name is treated as a one-element collection
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', '#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)
1890 | |
1891 | |
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Operands combined with a `Markup` instance through ``+``, ``%`` or
    `join()` are passed through `escape()` first, and the result is again a
    `Markup` instance.
    """
    __slots__ = []

    def __add__(self, other):
        # The right-hand operand is escaped before being appended
        return Markup(unicode.__add__(self, escape(other)))

    def __radd__(self, other):
        # The left-hand operand is escaped before this markup is appended
        return Markup(unicode.__add__(escape(other), self))

    def __mod__(self, args):
        # Escape every interpolated value, keeping the container shape that
        # the ``%`` operator expects (mapping, tuple, or single value)
        if isinstance(args, dict):
            args = dict(zip(args.keys(), map(escape, args.values())))
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        return Markup(unicode.__mul__(self, num))
    __rmul__ = __mul__

    def __repr__(self):
        return "<%s %s>" % (type(self).__name__, unicode.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        return Markup(unicode.join(self, [escape(item, quotes=escape_quotes)
                                          for item in seq]))

    @classmethod
    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            # Already markup: return it untouched to avoid double escaping
            return text
        if hasattr(text, '__html__'):
            # The object supplies its own markup representation
            return Markup(text.__html__())

        # The ampersand must be replaced first, otherwise the ampersands
        # introduced by the other entities would be escaped again
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return ''
        # The ampersand entity is handled last, so that text such as
        # ``&amp;lt;`` unescapes to ``&lt;`` and not to ``<``
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))
2012 | |
2013 | |
try:
    # If the optional `genshi._speedups` module can be imported, its `Markup`
    # replaces the class defined above
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation


# Module-level shortcut for `Markup.escape`, bound to whichever `Markup`
# implementation is in effect at this point
escape = Markup.escape
2021 | |
2022 | |
def unescape(text):
    """Undo the escaping of &, <, >, and \" on a `Markup` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    Anything that is not a `Markup` instance is handed back as it is:

    >>> unescape('1 < 2')
    '1 < 2'

    :param text: the text to unescape
    :return: the unescaped string, or `text` itself when it is not `Markup`
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
2042 | |
2043 | |
class Namespace(object):
    """Helper for building and testing names that belong to one namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """

    # -- construction and pickling support

    def __new__(cls, uri):
        # An object that already is of exactly this class is handed back
        # instead of being wrapped in a new instance
        if type(uri) is not cls:
            return object.__new__(cls)
        return uri

    def __init__(self, uri):
        self.uri = unicode(uri)

    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    # -- comparison and hashing, all based on the URI

    def __eq__(self, other):
        # Compare against the URI of another namespace, or against the other
        # object directly (for example a plain URI string)
        if isinstance(other, Namespace):
            other = other.uri
        return self.uri == other

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.uri)

    def __contains__(self, qname):
        return qname.namespace == self.uri

    # -- qualified name creation via item or attribute access

    def __getitem__(self, name):
        return QName(self.uri + '}' + name)
    __getattr__ = __getitem__

    # -- string conversions

    def __repr__(self):
        return 'Namespace(%s)' % stringrepr(self.uri)

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
2127 | |
2128 | |
2129 # The namespace used by attributes such as xml:lang and xml:space | |
2130 XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace') | |
2131 | |
2132 | |
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form ``{namespace-uri}local-name``. The
    namespace URI can be obtained through the additional `namespace` attribute,
    while the local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Strip the optional leading brace once, up front, so that the stored
        # value never ends up with a doubled ``{{`` when the caller already
        # supplied the brace
        qname = qname.lstrip('{')
        parts = qname.split('}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, '{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            # No closing brace: the name is not in any namespace
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self

    def __getnewargs__(self):
        # Hand back the brace-less form that `__new__` accepts
        return (self.lstrip('{'),)

    def __repr__(self):
        return 'QName(%s)' % stringrepr(self.lstrip('{'))
2182 # -*- coding: utf-8 -*- | |
2183 # | |
2184 # Copyright (C) 2006-2009 Edgewall Software | |
2185 # All rights reserved. | |
2186 # | |
2187 # This software is licensed as described in the file COPYING, which | |
2188 # you should have received as part of this distribution. The terms | |
2189 # are also available at http://genshi.edgewall.org/wiki/License. | |
2190 # | |
2191 # This software consists of voluntary contributions made by many | |
2192 # individuals. For the exact contribution history, see the revision | |
2193 # history and logs, available at http://genshi.edgewall.org/log/. | |
2194 | |
2195 """Core classes for markup processing.""" | |
2196 | |
2197 try: | |
2198 reduce # builtin in Python < 3 | |
2199 except NameError: | |
2200 from functools import reduce | |
2201 from itertools import chain | |
2202 import operator | |
2203 | |
2204 from genshi.util import plaintext, stripentities, striptags, stringrepr | |
2205 | |
2206 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace', | |
2207 'QName'] | |
2208 __docformat__ = 'restructuredtext en' | |
2209 | |
2210 | |
class StreamEventKind(str):
    """String subclass naming a kind of event on a markup stream.

    Instances are kept in a per-class registry keyed by the value they were
    created from, so asking for the same value twice hands back the very
    same object.
    """
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        registry = cls._instances
        if val not in registry:
            # First request for this value: create and remember the instance
            registry[val] = str.__new__(cls, val)
        return registry[val]
2218 | |
2219 | |
2220 class Stream(object): | |
2221 """Represents a stream of markup events. | |
2222 | |
2223 This class is basically an iterator over the events. | |
2224 | |
2225 Stream events are tuples of the form:: | |
2226 | |
2227 (kind, data, position) | |
2228 | |
2229 where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc), | |
2230 ``data`` depends on the kind of event, and ``position`` is a | |
2231 ``(filename, line, offset)`` tuple that contains the location of the | |
2232 original element or text in the input. If the original location is unknown, | |
2233 ``position`` is ``(None, -1, -1)``. | |
2234 | |
2235 Also provided are ways to serialize the stream to text. The `serialize()` | |
2236 method will return an iterator over generated strings, while `render()` | |
2237 returns the complete generated text at once. Both accept various parameters | |
2238 that impact the way the stream is serialized. | |
2239 """ | |
2240 __slots__ = ['events', 'serializer'] | |
2241 | |
2242 START = StreamEventKind('START') #: a start tag | |
2243 END = StreamEventKind('END') #: an end tag | |
2244 TEXT = StreamEventKind('TEXT') #: literal text | |
2245 XML_DECL = StreamEventKind('XML_DECL') #: XML declaration | |
2246 DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration | |
2247 START_NS = StreamEventKind('START_NS') #: start namespace mapping | |
2248 END_NS = StreamEventKind('END_NS') #: end namespace mapping | |
2249 START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section | |
2250 END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section | |
2251 PI = StreamEventKind('PI') #: processing instruction | |
2252 COMMENT = StreamEventKind('COMMENT') #: comment | |
2253 | |
2254 def __init__(self, events, serializer=None): | |
2255 """Initialize the stream with a sequence of markup events. | |
2256 | |
2257 :param events: a sequence or iterable providing the events | |
2258 :param serializer: the default serialization method to use for this | |
2259 stream | |
2260 | |
2261 :note: Changed in 0.5: added the `serializer` argument | |
2262 """ | |
2263 self.events = events #: The underlying iterable producing the events | |
2264 self.serializer = serializer #: The default serializion method | |
2265 | |
2266 def __iter__(self): | |
2267 return iter(self.events) | |
2268 | |
2269 def __or__(self, function): | |
2270 """Override the "bitwise or" operator to apply filters or serializers | |
2271 to the stream, providing a syntax similar to pipes on Unix shells. | |
2272 | |
2273 Assume the following stream produced by the `HTML` function: | |
2274 | |
2275 >>> from genshi.input import HTML | |
2276 >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''') | |
2277 >>> print(html) | |
2278 <p onclick="alert('Whoa')">Hello, world!</p> | |
2279 | |
2280 A filter such as the HTML sanitizer can be applied to that stream using | |
2281 the pipe notation as follows: | |
2282 | |
2283 >>> from genshi.filters import HTMLSanitizer | |
2284 >>> sanitizer = HTMLSanitizer() | |
2285 >>> print(html | sanitizer) | |
2286 <p>Hello, world!</p> | |
2287 | |
2288 Filters can be any function that accepts and produces a stream (where | |
2289 a stream is anything that iterates over events): | |
2290 | |
2291 >>> def uppercase(stream): | |
2292 ... for kind, data, pos in stream: | |
2293 ... if kind is TEXT: | |
2294 ... data = data.upper() | |
2295 ... yield kind, data, pos | |
2296 >>> print(html | sanitizer | uppercase) | |
2297 <p>HELLO, WORLD!</p> | |
2298 | |
2299 Serializers can also be used with this notation: | |
2300 | |
2301 >>> from genshi.output import TextSerializer | |
2302 >>> output = TextSerializer() | |
2303 >>> print(html | sanitizer | uppercase | output) | |
2304 HELLO, WORLD! | |
2305 | |
2306 Commonly, serializers should be used at the end of the "pipeline"; | |
2307 using them somewhere in the middle may produce unexpected results. | |
2308 | |
2309 :param function: the callable object that should be applied as a filter | |
2310 :return: the filtered stream | |
2311 :rtype: `Stream` | |
2312 """ | |
2313 return Stream(_ensure(function(self)), serializer=self.serializer) | |
2314 | |
2315 def filter(self, *filters): | |
2316 """Apply filters to the stream. | |
2317 | |
2318 This method returns a new stream with the given filters applied. The | |
2319 filters must be callables that accept the stream object as parameter, | |
2320 and return the filtered stream. | |
2321 | |
2322 The call:: | |
2323 | |
2324 stream.filter(filter1, filter2) | |
2325 | |
2326 is equivalent to:: | |
2327 | |
2328 stream | filter1 | filter2 | |
2329 | |
2330 :param filters: one or more callable objects that should be applied as | |
2331 filters | |
2332 :return: the filtered stream | |
2333 :rtype: `Stream` | |
2334 """ | |
2335 return reduce(operator.or_, (self,) + filters) | |
2336 | |
2337 def render(self, method=None, encoding='utf-8', out=None, **kwargs): | |
2338 """Return a string representation of the stream. | |
2339 | |
2340 Any additional keyword arguments are passed to the serializer, and thus | |
2341 depend on the `method` parameter value. | |
2342 | |
2343 :param method: determines how the stream is serialized; can be either | |
2344 "xml", "xhtml", "html", "text", or a custom serializer | |
2345 class; if `None`, the default serialization method of | |
2346 the stream is used | |
2347 :param encoding: how the output string should be encoded; if set to | |
2348 `None`, this method returns a `unicode` object | |
2349 :param out: a file-like object that the output should be written to | |
2350 instead of being returned as one big string; note that if | |
2351 this is a file or socket (or similar), the `encoding` must | |
2352 not be `None` (that is, the output must be encoded) | |
2353 :return: a `str` or `unicode` object (depending on the `encoding` | |
2354 parameter), or `None` if the `out` parameter is provided | |
2355 :rtype: `basestring` | |
2356 | |
2357 :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer | |
2358 :note: Changed in 0.5: added the `out` parameter | |
2359 """ | |
2360 from genshi.output import encode | |
2361 if method is None: | |
2362 method = self.serializer or 'xml' | |
2363 generator = self.serialize(method=method, **kwargs) | |
2364 return encode(generator, method=method, encoding=encoding, out=out) | |
2365 | |
2366 def select(self, path, namespaces=None, variables=None): | |
2367 """Return a new stream that contains the events matching the given | |
2368 XPath expression. | |
2369 | |
2370 >>> from genshi import HTML | |
2371 >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>') | |
2372 >>> print(stream.select('elem')) | |
2373 <elem>foo</elem><elem>bar</elem> | |
2374 >>> print(stream.select('elem/text()')) | |
2375 foobar | |
2376 | |
2377 Note that the outermost element of the stream becomes the *context | |
2378 node* for the XPath test. That means that the expression "doc" would | |
2379 not match anything in the example above, because it only tests against | |
2380 child elements of the outermost element: | |
2381 | |
2382 >>> print(stream.select('doc')) | |
2383 <BLANKLINE> | |
2384 | |
2385 You can use the "." expression to match the context node itself | |
2386 (although that usually makes little sense): | |
2387 | |
2388 >>> print(stream.select('.')) | |
2389 <doc><elem>foo</elem><elem>bar</elem></doc> | |
2390 | |
2391 :param path: a string containing the XPath expression | |
2392 :param namespaces: mapping of namespace prefixes used in the path | |
2393 :param variables: mapping of variable names to values | |
2394 :return: the selected substream | |
2395 :rtype: `Stream` | |
2396 :raises PathSyntaxError: if the given path expression is invalid or not | |
2397 supported | |
2398 """ | |
2399 from genshi.path import Path | |
2400 return Path(path).select(self, namespaces, variables) | |
2401 | |
2402 def serialize(self, method='xml', **kwargs): | |
2403 """Generate strings corresponding to a specific serialization of the | |
2404 stream. | |
2405 | |
2406 Unlike the `render()` method, this method is a generator that returns | |
2407 the serialized output incrementally, as opposed to returning a single | |
2408 string. | |
2409 | |
2410 Any additional keyword arguments are passed to the serializer, and thus | |
2411 depend on the `method` parameter value. | |
2412 | |
2413 :param method: determines how the stream is serialized; can be either | |
2414 "xml", "xhtml", "html", "text", or a custom serializer | |
2415 class; if `None`, the default serialization method of | |
2416 the stream is used | |
2417 :return: an iterator over the serialization results (`Markup` or | |
2418 `unicode` objects, depending on the serialization method) | |
2419 :rtype: ``iterator`` | |
2420 :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer | |
2421 """ | |
2422 from genshi.output import get_serializer | |
2423 if method is None: | |
2424 method = self.serializer or 'xml' | |
2425 return get_serializer(method, **kwargs)(_ensure(self)) | |
2426 | |
2427 def __str__(self): | |
2428 return self.render() | |
2429 | |
2430 def __unicode__(self): | |
2431 return self.render(encoding=None) | |
2432 | |
2433 def __html__(self): | |
2434 return self | |
2435 | |
2436 | |
2437 START = Stream.START | |
2438 END = Stream.END | |
2439 TEXT = Stream.TEXT | |
2440 XML_DECL = Stream.XML_DECL | |
2441 DOCTYPE = Stream.DOCTYPE | |
2442 START_NS = Stream.START_NS | |
2443 END_NS = Stream.END_NS | |
2444 START_CDATA = Stream.START_CDATA | |
2445 END_CDATA = Stream.END_CDATA | |
2446 PI = Stream.PI | |
2447 COMMENT = Stream.COMMENT | |
2448 | |
2449 | |
2450 def _ensure(stream): | |
2451 """Ensure that every item on the stream is actually a markup event.""" | |
2452 stream = iter(stream) | |
2453 event = stream.next() | |
2454 | |
2455 # Check whether the iterable is a real markup event stream by examining the | |
2456 # first item it yields; if it's not we'll need to do some conversion | |
2457 if type(event) is not tuple or len(event) != 3: | |
2458 for event in chain([event], stream): | |
2459 if hasattr(event, 'totuple'): | |
2460 event = event.totuple() | |
2461 else: | |
2462 event = TEXT, unicode(event), (None, -1, -1) | |
2463 yield event | |
2464 return | |
2465 | |
2466 # This looks like a markup event stream, so we'll just pass it through | |
2467 # unchanged | |
2468 yield event | |
2469 for event in stream: | |
2470 yield event | |
2471 | |
2472 | |
2473 class Attrs(tuple): | |
2474 """Immutable sequence type that stores the attributes of an element. | |
2475 | |
2476 Ordering of the attributes is preserved, while access by name is also | |
2477 supported. | |
2478 | |
2479 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
2480 >>> attrs | |
2481 Attrs([('href', '#'), ('title', 'Foo')]) | |
2482 | |
2483 >>> 'href' in attrs | |
2484 True | |
2485 >>> 'tabindex' in attrs | |
2486 False | |
2487 >>> attrs.get('title') | |
2488 'Foo' | |
2489 | |
2490 Instances may not be manipulated directly. Instead, the operators ``|`` and | |
2491 ``-`` can be used to produce new instances that have specific attributes | |
2492 added, replaced or removed. | |
2493 | |
2494 To remove an attribute, use the ``-`` operator. The right hand side can be | |
2495 either a string or a set/sequence of strings, identifying the name(s) of | |
2496 the attribute(s) to remove: | |
2497 | |
2498 >>> attrs - 'title' | |
2499 Attrs([('href', '#')]) | |
2500 >>> attrs - ('title', 'href') | |
2501 Attrs() | |
2502 | |
2503 The original instance is not modified, but the operator can of course be | |
2504 used with an assignment: | |
2505 | |
2506 >>> attrs | |
2507 Attrs([('href', '#'), ('title', 'Foo')]) | |
2508 >>> attrs -= 'title' | |
2509 >>> attrs | |
2510 Attrs([('href', '#')]) | |
2511 | |
2512 To add a new attribute, use the ``|`` operator, where the right hand value | |
2513 is a sequence of ``(name, value)`` tuples (which includes `Attrs` | |
2514 instances): | |
2515 | |
2516 >>> attrs | [('title', 'Bar')] | |
2517 Attrs([('href', '#'), ('title', 'Bar')]) | |
2518 | |
2519 If the attributes already contain an attribute with a given name, the value | |
2520 of that attribute is replaced: | |
2521 | |
2522 >>> attrs | [('href', 'http://example.org/')] | |
2523 Attrs([('href', 'http://example.org/')]) | |
2524 """ | |
2525 __slots__ = [] | |
2526 | |
2527 def __contains__(self, name): | |
2528 """Return whether the list includes an attribute with the specified | |
2529 name. | |
2530 | |
2531 :return: `True` if the list includes the attribute | |
2532 :rtype: `bool` | |
2533 """ | |
2534 for attr, _ in self: | |
2535 if attr == name: | |
2536 return True | |
2537 | |
2538 def __getitem__(self, i): | |
2539 """Return an item or slice of the attributes list. | |
2540 | |
2541 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
2542 >>> attrs[1] | |
2543 ('title', 'Foo') | |
2544 >>> attrs[1:] | |
2545 Attrs([('title', 'Foo')]) | |
2546 """ | |
2547 items = tuple.__getitem__(self, i) | |
2548 if type(i) is slice: | |
2549 return Attrs(items) | |
2550 return items | |
2551 | |
2552 def __getslice__(self, i, j): | |
2553 """Return a slice of the attributes list. | |
2554 | |
2555 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')]) | |
2556 >>> attrs[1:] | |
2557 Attrs([('title', 'Foo')]) | |
2558 """ | |
2559 return Attrs(tuple.__getslice__(self, i, j)) | |
2560 | |
2561 def __or__(self, attrs): | |
2562 """Return a new instance that contains the attributes in `attrs` in | |
2563 addition to any already existing attributes. | |
2564 | |
2565 :return: a new instance with the merged attributes | |
2566 :rtype: `Attrs` | |
2567 """ | |
2568 repl = dict([(an, av) for an, av in attrs if an in self]) | |
2569 return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] + | |
2570 [(an, av) for an, av in attrs if an not in self]) | |
2571 | |
2572 def __repr__(self): | |
2573 if not self: | |
2574 return 'Attrs()' | |
2575 return 'Attrs([%s])' % ', '.join([repr(item) for item in self]) | |
2576 | |
2577 def __sub__(self, names): | |
2578 """Return a new instance from which all attributes with a name in | |
2579 `names` have been removed. | |
2580 | |
2581 :param names: the names of the attributes to remove | |
2582 :return: a new instance with the attribute removed | |
2583 :rtype: `Attrs` | |
2584 """ | |
2585 if isinstance(names, basestring): | |
2586 names = (names,) | |
2587 return Attrs([(name, val) for name, val in self if name not in names]) | |
2588 | |
2589 def get(self, name, default=None): | |
2590 """Return the value of the attribute with the specified name, or the | |
2591 value of the `default` parameter if no such attribute is found. | |
2592 | |
2593 :param name: the name of the attribute | |
2594 :param default: the value to return when the attribute does not exist | |
2595 :return: the attribute value, or the `default` value if that attribute | |
2596 does not exist | |
2597 :rtype: `object` | |
2598 """ | |
2599 for attr, value in self: | |
2600 if attr == name: | |
2601 return value | |
2602 return default | |
2603 | |
2604 def totuple(self): | |
2605 """Return the attributes as a markup event. | |
2606 | |
2607 The returned event is a `TEXT` event, the data is the value of all | |
2608 attributes joined together. | |
2609 | |
2610 >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple() | |
2611 ('TEXT', '#Foo', (None, -1, -1)) | |
2612 | |
2613 :return: a `TEXT` event | |
2614 :rtype: `tuple` | |
2615 """ | |
2616 return TEXT, ''.join([x[1] for x in self]), (None, -1, -1) | |
2617 | |
2618 | |
2619 class Markup(unicode): | |
2620 """Marks a string as being safe for inclusion in HTML/XML output without | |
2621 needing to be escaped. | |
2622 """ | |
2623 __slots__ = [] | |
2624 | |
2625 def __add__(self, other): | |
2626 return Markup(unicode.__add__(self, escape(other))) | |
2627 | |
2628 def __radd__(self, other): | |
2629 return Markup(unicode.__add__(escape(other), self)) | |
2630 | |
2631 def __mod__(self, args): | |
2632 if isinstance(args, dict): | |
2633 args = dict(zip(args.keys(), map(escape, args.values()))) | |
2634 elif isinstance(args, (list, tuple)): | |
2635 args = tuple(map(escape, args)) | |
2636 else: | |
2637 args = escape(args) | |
2638 return Markup(unicode.__mod__(self, args)) | |
2639 | |
2640 def __mul__(self, num): | |
2641 return Markup(unicode.__mul__(self, num)) | |
2642 __rmul__ = __mul__ | |
2643 | |
2644 def __repr__(self): | |
2645 return "<%s %s>" % (type(self).__name__, unicode.__repr__(self)) | |
2646 | |
2647 def join(self, seq, escape_quotes=True): | |
2648 """Return a `Markup` object which is the concatenation of the strings | |
2649 in the given sequence, where this `Markup` object is the separator | |
2650 between the joined elements. | |
2651 | |
2652 Any element in the sequence that is not a `Markup` instance is | |
2653 automatically escaped. | |
2654 | |
2655 :param seq: the sequence of strings to join | |
2656 :param escape_quotes: whether double quote characters in the elements | |
2657 should be escaped | |
2658 :return: the joined `Markup` object | |
2659 :rtype: `Markup` | |
2660 :see: `escape` | |
2661 """ | |
2662 return Markup(unicode.join(self, [escape(item, quotes=escape_quotes) | |
2663 for item in seq])) | |
2664 | |
2665 @classmethod | |
2666 def escape(cls, text, quotes=True): | |
2667 """Create a Markup instance from a string and escape special characters | |
2668 it may contain (<, >, & and \"). | |
2669 | |
2670 >>> escape('"1 < 2"') | |
2671 <Markup u'&#34;1 &lt; 2&#34;'> | |
2672 | |
2673 If the `quotes` parameter is set to `False`, the \" character is left | |
2674 as is. Escaping quotes is generally only required for strings that are | |
2675 to be used in attribute values. | |
2676 | |
2677 >>> escape('"1 < 2"', quotes=False) | |
2678 <Markup u'"1 &lt; 2"'> | |
2679 | |
2680 :param text: the text to escape | |
2681 :param quotes: if ``True``, double quote characters are escaped in | |
2682 addition to the other special characters | |
2683 :return: the escaped `Markup` string | |
2684 :rtype: `Markup` | |
2685 """ | |
2686 if not text: | |
2687 return cls() | |
2688 if type(text) is cls: | |
2689 return text | |
2690 if hasattr(text, '__html__'): | |
2691 return Markup(text.__html__()) | |
2692 | |
2693 text = text.replace('&', '&amp;') \ | |
2694 .replace('<', '&lt;') \ | |
2695 .replace('>', '&gt;') | |
2696 if quotes: | |
2697 text = text.replace('"', '&#34;') | |
2698 return cls(text) | |
2699 | |
2700 def unescape(self): | |
2701 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. | |
2702 | |
2703 >>> Markup('1 &lt; 2').unescape() | |
2704 u'1 < 2' | |
2705 | |
2706 :return: the unescaped string | |
2707 :rtype: `unicode` | |
2708 :see: `genshi.core.unescape` | |
2709 """ | |
2710 if not self: | |
2711 return '' | |
2712 return unicode(self).replace('&#34;', '"') \ | |
2713 .replace('&gt;', '>') \ | |
2714 .replace('&lt;', '<') \ | |
2715 .replace('&amp;', '&') | |
2716 | |
2717 def stripentities(self, keepxmlentities=False): | |
2718 """Return a copy of the text with any character or numeric entities | |
2719 replaced by the equivalent UTF-8 characters. | |
2720 | |
2721 If the `keepxmlentities` parameter is provided and evaluates to `True`, | |
2722 the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and | |
2723 ``&quot;``) are not stripped. | |
2724 | |
2725 :return: a `Markup` instance with entities removed | |
2726 :rtype: `Markup` | |
2727 :see: `genshi.util.stripentities` | |
2728 """ | |
2729 return Markup(stripentities(self, keepxmlentities=keepxmlentities)) | |
2730 | |
2731 def striptags(self): | |
2732 """Return a copy of the text with all XML/HTML tags removed. | |
2733 | |
2734 :return: a `Markup` instance with all tags removed | |
2735 :rtype: `Markup` | |
2736 :see: `genshi.util.striptags` | |
2737 """ | |
2738 return Markup(striptags(self)) | |
2739 | |
2740 | |
2741 try: | |
2742 from genshi._speedups import Markup | |
2743 except ImportError: | |
2744 pass # just use the Python implementation | |
2745 | |
2746 | |
2747 escape = Markup.escape | |
2748 | |
2749 | |
2750 def unescape(text): | |
2751 """Reverse-escapes &, <, >, and \" and returns a `unicode` object. | |
2752 | |
2753 >>> unescape(Markup('1 &lt; 2')) | |
2754 u'1 < 2' | |
2755 | |
2756 If the provided `text` object is not a `Markup` instance, it is returned | |
2757 unchanged. | |
2758 | |
2759 >>> unescape('1 &lt; 2') | |
2760 '1 &lt; 2' | |
2761 | |
2762 :param text: the text to unescape | |
2763 :return: the unescaped string | |
2764 :rtype: `unicode` | |
2765 """ | |
2766 if not isinstance(text, Markup): | |
2767 return text | |
2768 return text.unescape() | |
2769 | |
2770 | |
2771 class Namespace(object): | |
2772 """Utility class creating and testing elements with a namespace. | |
2773 | |
2774 Internally, namespace URIs are encoded in the `QName` of any element or | |
2775 attribute, the namespace URI being enclosed in curly braces. This class | |
2776 helps create and test these strings. | |
2777 | |
2778 A `Namespace` object is instantiated with the namespace URI. | |
2779 | |
2780 >>> html = Namespace('http://www.w3.org/1999/xhtml') | |
2781 >>> html | |
2782 Namespace('http://www.w3.org/1999/xhtml') | |
2783 >>> html.uri | |
2784 u'http://www.w3.org/1999/xhtml' | |
2785 | |
2786 The `Namespace` object can then be used to generate `QName` objects with | |
2787 that namespace: | |
2788 | |
2789 >>> html.body | |
2790 QName('http://www.w3.org/1999/xhtml}body') | |
2791 >>> html.body.localname | |
2792 u'body' | |
2793 >>> html.body.namespace | |
2794 u'http://www.w3.org/1999/xhtml' | |
2795 | |
2796 The same works using item access notation, which is useful for element or | |
2797 attribute names that are not valid Python identifiers: | |
2798 | |
2799 >>> html['body'] | |
2800 QName('http://www.w3.org/1999/xhtml}body') | |
2801 | |
2802 A `Namespace` object can also be used to test whether a specific `QName` | |
2803 belongs to that namespace using the ``in`` operator: | |
2804 | |
2805 >>> qname = html.body | |
2806 >>> qname in html | |
2807 True | |
2808 >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2') | |
2809 False | |
2810 """ | |
2811 def __new__(cls, uri): | |
2812 if type(uri) is cls: | |
2813 return uri | |
2814 return object.__new__(cls) | |
2815 | |
2816 def __getnewargs__(self): | |
2817 return (self.uri,) | |
2818 | |
2819 def __getstate__(self): | |
2820 return self.uri | |
2821 | |
2822 def __setstate__(self, uri): | |
2823 self.uri = uri | |
2824 | |
2825 def __init__(self, uri): | |
2826 self.uri = unicode(uri) | |
2827 | |
2828 def __contains__(self, qname): | |
2829 return qname.namespace == self.uri | |
2830 | |
2831 def __ne__(self, other): | |
2832 return not self == other | |
2833 | |
2834 def __eq__(self, other): | |
2835 if isinstance(other, Namespace): | |
2836 return self.uri == other.uri | |
2837 return self.uri == other | |
2838 | |
2839 def __getitem__(self, name): | |
2840 return QName(self.uri + '}' + name) | |
2841 __getattr__ = __getitem__ | |
2842 | |
2843 def __hash__(self): | |
2844 return hash(self.uri) | |
2845 | |
2846 def __repr__(self): | |
2847 return 'Namespace(%s)' % stringrepr(self.uri) | |
2848 | |
2849 def __str__(self): | |
2850 return self.uri.encode('utf-8') | |
2851 | |
2852 def __unicode__(self): | |
2853 return self.uri | |
2854 | |
2855 | |
2856 # The namespace used by attributes such as xml:lang and xml:space | |
2857 XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace') | |
2858 | |
2859 | |
2860 class QName(unicode): | |
2861 """A qualified element or attribute name. | |
2862 | |
2863 The unicode value of instances of this class contains the qualified name of | |
2864 the element or attribute, in the form ``{namespace-uri}local-name``. The | |
2865 namespace URI can be obtained through the additional `namespace` attribute, | |
2866 while the local name can be accessed through the `localname` attribute. | |
2867 | |
2868 >>> qname = QName('foo') | |
2869 >>> qname | |
2870 QName('foo') | |
2871 >>> qname.localname | |
2872 u'foo' | |
2873 >>> qname.namespace | |
2874 | |
2875 >>> qname = QName('http://www.w3.org/1999/xhtml}body') | |
2876 >>> qname | |
2877 QName('http://www.w3.org/1999/xhtml}body') | |
2878 >>> qname.localname | |
2879 u'body' | |
2880 >>> qname.namespace | |
2881 u'http://www.w3.org/1999/xhtml' | |
2882 """ | |
2883 __slots__ = ['namespace', 'localname'] | |
2884 | |
2885 def __new__(cls, qname): | |
2886 """Create the `QName` instance. | |
2887 | |
2888 :param qname: the qualified name as a string of the form | |
2889 ``{namespace-uri}local-name``, where the leading curly | |
2890 brace is optional | |
2891 """ | |
2892 if type(qname) is cls: | |
2893 return qname | |
2894 | |
2895 parts = qname.lstrip('{').split('}', 1) | |
2896 if len(parts) > 1: | |
2897 self = unicode.__new__(cls, '{%s' % qname) | |
2898 self.namespace, self.localname = map(unicode, parts) | |
2899 else: | |
2900 self = unicode.__new__(cls, qname) | |
2901 self.namespace, self.localname = None, unicode(qname) | |
2902 return self | |
2903 | |
2904 def __getnewargs__(self): | |
2905 return (self.lstrip('{'),) | |
2906 | |
2907 def __repr__(self): | |
2908 return 'QName(%s)' % stringrepr(self.lstrip('{')) |