comparison genshi/core.py @ 860:61d37796da98

A bit of cleanup of the `Markup` Python implementation.
author cmlenz
date Thu, 12 Nov 2009 17:31:40 +0000
parents 24733a5854d9
children e098d29c4de1
comparison
equal deleted inserted replaced
859:fbe34d12acde 860:61d37796da98
440 needing to be escaped. 440 needing to be escaped.
441 """ 441 """
442 __slots__ = [] 442 __slots__ = []
443 443
444 def __add__(self, other): 444 def __add__(self, other):
445 return Markup(unicode(self) + unicode(escape(other))) 445 return Markup(unicode.__add__(self, escape(other)))
446 446
447 def __radd__(self, other): 447 def __radd__(self, other):
448 return Markup(unicode(escape(other)) + unicode(self)) 448 return Markup(unicode.__add__(escape(other), self))
449 449
450 def __mod__(self, args): 450 def __mod__(self, args):
451 if isinstance(args, dict): 451 if isinstance(args, dict):
452 args = dict(zip(args.keys(), map(escape, args.values()))) 452 args = dict(zip(args.keys(), map(escape, args.values())))
453 elif isinstance(args, (list, tuple)): 453 elif isinstance(args, (list, tuple)):
455 else: 455 else:
456 args = escape(args) 456 args = escape(args)
457 return Markup(unicode.__mod__(self, args)) 457 return Markup(unicode.__mod__(self, args))
458 458
459 def __mul__(self, num): 459 def __mul__(self, num):
460 return Markup(unicode(self) * num) 460 return Markup(unicode.__mul__(self, num))
461 461 __rmul__ = __mul__
462 def __rmul__(self, num):
463 return Markup(num * unicode(self))
464 462
465 def __repr__(self): 463 def __repr__(self):
466 return "<%s %s>" % (self.__class__.__name__, unicode.__repr__(self)) 464 return "<%s %s>" % (type(self).__name__, unicode.__repr__(self))
467 465
468 def join(self, seq, escape_quotes=True): 466 def join(self, seq, escape_quotes=True):
469 """Return a `Markup` object which is the concatenation of the strings 467 """Return a `Markup` object which is the concatenation of the strings
470 in the given sequence, where this `Markup` object is the separator 468 in the given sequence, where this `Markup` object is the separator
471 between the joined elements. 469 between the joined elements.
478 should be escaped 476 should be escaped
479 :return: the joined `Markup` object 477 :return: the joined `Markup` object
480 :rtype: `Markup` 478 :rtype: `Markup`
481 :see: `escape` 479 :see: `escape`
482 """ 480 """
483 return Markup(unicode(self).join([escape(item, quotes=escape_quotes) 481 return Markup(unicode.join(self, [escape(item, quotes=escape_quotes)
484 for item in seq])) 482 for item in seq]))
485 483
486 @classmethod 484 @classmethod
487 def escape(cls, text, quotes=True): 485 def escape(cls, text, quotes=True):
488 """Create a Markup instance from a string and escape special characters 486 """Create a Markup instance from a string and escape special characters
509 if type(text) is cls: 507 if type(text) is cls:
510 return text 508 return text
511 if hasattr(text, '__html__'): 509 if hasattr(text, '__html__'):
512 return Markup(text.__html__()) 510 return Markup(text.__html__())
513 511
514 text = unicode(text).replace('&', '&amp;') \ 512 text = text.replace('&', '&amp;') \
515 .replace('<', '&lt;') \ 513 .replace('<', '&lt;') \
516 .replace('>', '&gt;') 514 .replace('>', '&gt;')
517 if quotes: 515 if quotes:
518 text = text.replace('"', '&#34;') 516 text = text.replace('"', '&#34;')
519 return cls(text) 517 return cls(text)
520 518
521 def unescape(self): 519 def unescape(self):
725 def __getnewargs__(self): 723 def __getnewargs__(self):
726 return (self.lstrip('{'),) 724 return (self.lstrip('{'),)
727 725
728 def __repr__(self): 726 def __repr__(self):
729 return 'QName(%s)' % stringrepr(self.lstrip('{')) 727 return 'QName(%s)' % stringrepr(self.lstrip('{'))
728 # -*- coding: utf-8 -*-
729 #
730 # Copyright (C) 2006-2009 Edgewall Software
731 # All rights reserved.
732 #
733 # This software is licensed as described in the file COPYING, which
734 # you should have received as part of this distribution. The terms
735 # are also available at http://genshi.edgewall.org/wiki/License.
736 #
737 # This software consists of voluntary contributions made by many
738 # individuals. For the exact contribution history, see the revision
739 # history and logs, available at http://genshi.edgewall.org/log/.
740
741 """Core classes for markup processing."""
742
743 try:
744 reduce # builtin in Python < 3
745 except NameError:
746 from functools import reduce
747 from itertools import chain
748 import operator
749
750 from genshi.util import plaintext, stripentities, striptags, stringrepr
751
752 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
753 'QName']
754 __docformat__ = 'restructuredtext en'
755
756
757 class StreamEventKind(str):
758 """A kind of event on a markup stream."""
759 __slots__ = []
760 _instances = {}
761
762 def __new__(cls, val):
763 return cls._instances.setdefault(val, str.__new__(cls, val))
764
765
766 class Stream(object):
767 """Represents a stream of markup events.
768
769 This class is basically an iterator over the events.
770
771 Stream events are tuples of the form::
772
773 (kind, data, position)
774
775 where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc),
776 ``data`` depends on the kind of event, and ``position`` is a
777 ``(filename, line, offset)`` tuple that contains the location of the
778 original element or text in the input. If the original location is unknown,
779 ``position`` is ``(None, -1, -1)``.
780
781 Also provided are ways to serialize the stream to text. The `serialize()`
782 method will return an iterator over generated strings, while `render()`
783 returns the complete generated text at once. Both accept various parameters
784 that impact the way the stream is serialized.
785 """
786 __slots__ = ['events', 'serializer']
787
788 START = StreamEventKind('START') #: a start tag
789 END = StreamEventKind('END') #: an end tag
790 TEXT = StreamEventKind('TEXT') #: literal text
791 XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
792 DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
793 START_NS = StreamEventKind('START_NS') #: start namespace mapping
794 END_NS = StreamEventKind('END_NS') #: end namespace mapping
795 START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
796 END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
797 PI = StreamEventKind('PI') #: processing instruction
798 COMMENT = StreamEventKind('COMMENT') #: comment
799
800 def __init__(self, events, serializer=None):
801 """Initialize the stream with a sequence of markup events.
802
803 :param events: a sequence or iterable providing the events
804 :param serializer: the default serialization method to use for this
805 stream
806
807 :note: Changed in 0.5: added the `serializer` argument
808 """
809 self.events = events #: The underlying iterable producing the events
809 self.serializer = serializer #: The default serialization method
811
812 def __iter__(self):
813 return iter(self.events)
814
815 def __or__(self, function):
816 """Override the "bitwise or" operator to apply filters or serializers
817 to the stream, providing a syntax similar to pipes on Unix shells.
818
819 Assume the following stream produced by the `HTML` function:
820
821 >>> from genshi.input import HTML
822 >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
823 >>> print(html)
824 <p onclick="alert('Whoa')">Hello, world!</p>
825
826 A filter such as the HTML sanitizer can be applied to that stream using
827 the pipe notation as follows:
828
829 >>> from genshi.filters import HTMLSanitizer
830 >>> sanitizer = HTMLSanitizer()
831 >>> print(html | sanitizer)
832 <p>Hello, world!</p>
833
834 Filters can be any function that accepts and produces a stream (where
835 a stream is anything that iterates over events):
836
837 >>> def uppercase(stream):
838 ... for kind, data, pos in stream:
839 ... if kind is TEXT:
840 ... data = data.upper()
841 ... yield kind, data, pos
842 >>> print(html | sanitizer | uppercase)
843 <p>HELLO, WORLD!</p>
844
845 Serializers can also be used with this notation:
846
847 >>> from genshi.output import TextSerializer
848 >>> output = TextSerializer()
849 >>> print(html | sanitizer | uppercase | output)
850 HELLO, WORLD!
851
852 Commonly, serializers should be used at the end of the "pipeline";
853 using them somewhere in the middle may produce unexpected results.
854
855 :param function: the callable object that should be applied as a filter
856 :return: the filtered stream
857 :rtype: `Stream`
858 """
859 return Stream(_ensure(function(self)), serializer=self.serializer)
860
861 def filter(self, *filters):
862 """Apply filters to the stream.
863
864 This method returns a new stream with the given filters applied. The
865 filters must be callables that accept the stream object as parameter,
866 and return the filtered stream.
867
868 The call::
869
870 stream.filter(filter1, filter2)
871
872 is equivalent to::
873
874 stream | filter1 | filter2
875
876 :param filters: one or more callable objects that should be applied as
877 filters
878 :return: the filtered stream
879 :rtype: `Stream`
880 """
881 return reduce(operator.or_, (self,) + filters)
882
883 def render(self, method=None, encoding='utf-8', out=None, **kwargs):
884 """Return a string representation of the stream.
885
886 Any additional keyword arguments are passed to the serializer, and thus
887 depend on the `method` parameter value.
888
889 :param method: determines how the stream is serialized; can be either
890 "xml", "xhtml", "html", "text", or a custom serializer
891 class; if `None`, the default serialization method of
892 the stream is used
893 :param encoding: how the output string should be encoded; if set to
894 `None`, this method returns a `unicode` object
895 :param out: a file-like object that the output should be written to
896 instead of being returned as one big string; note that if
897 this is a file or socket (or similar), the `encoding` must
898 not be `None` (that is, the output must be encoded)
899 :return: a `str` or `unicode` object (depending on the `encoding`
900 parameter), or `None` if the `out` parameter is provided
901 :rtype: `basestring`
902
903 :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
904 :note: Changed in 0.5: added the `out` parameter
905 """
906 from genshi.output import encode
907 if method is None:
908 method = self.serializer or 'xml'
909 generator = self.serialize(method=method, **kwargs)
910 return encode(generator, method=method, encoding=encoding, out=out)
911
912 def select(self, path, namespaces=None, variables=None):
913 """Return a new stream that contains the events matching the given
914 XPath expression.
915
916 >>> from genshi import HTML
917 >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
918 >>> print(stream.select('elem'))
919 <elem>foo</elem><elem>bar</elem>
920 >>> print(stream.select('elem/text()'))
921 foobar
922
923 Note that the outermost element of the stream becomes the *context
924 node* for the XPath test. That means that the expression "doc" would
925 not match anything in the example above, because it only tests against
926 child elements of the outermost element:
927
928 >>> print(stream.select('doc'))
929 <BLANKLINE>
930
931 You can use the "." expression to match the context node itself
932 (although that usually makes little sense):
933
934 >>> print(stream.select('.'))
935 <doc><elem>foo</elem><elem>bar</elem></doc>
936
937 :param path: a string containing the XPath expression
938 :param namespaces: mapping of namespace prefixes used in the path
939 :param variables: mapping of variable names to values
940 :return: the selected substream
941 :rtype: `Stream`
942 :raises PathSyntaxError: if the given path expression is invalid or not
943 supported
944 """
945 from genshi.path import Path
946 return Path(path).select(self, namespaces, variables)
947
948 def serialize(self, method='xml', **kwargs):
949 """Generate strings corresponding to a specific serialization of the
950 stream.
951
952 Unlike the `render()` method, this method is a generator that returns
953 the serialized output incrementally, as opposed to returning a single
954 string.
955
956 Any additional keyword arguments are passed to the serializer, and thus
957 depend on the `method` parameter value.
958
959 :param method: determines how the stream is serialized; can be either
960 "xml", "xhtml", "html", "text", or a custom serializer
961 class; if `None`, the default serialization method of
962 the stream is used
963 :return: an iterator over the serialization results (`Markup` or
964 `unicode` objects, depending on the serialization method)
965 :rtype: ``iterator``
966 :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
967 """
968 from genshi.output import get_serializer
969 if method is None:
970 method = self.serializer or 'xml'
971 return get_serializer(method, **kwargs)(_ensure(self))
972
973 def __str__(self):
974 return self.render()
975
976 def __unicode__(self):
977 return self.render(encoding=None)
978
979 def __html__(self):
980 return self
981
982
983 START = Stream.START
984 END = Stream.END
985 TEXT = Stream.TEXT
986 XML_DECL = Stream.XML_DECL
987 DOCTYPE = Stream.DOCTYPE
988 START_NS = Stream.START_NS
989 END_NS = Stream.END_NS
990 START_CDATA = Stream.START_CDATA
991 END_CDATA = Stream.END_CDATA
992 PI = Stream.PI
993 COMMENT = Stream.COMMENT
994
995
996 def _ensure(stream):
997 """Ensure that every item on the stream is actually a markup event."""
998 stream = iter(stream)
999 event = stream.next()
1000
1001 # Check whether the iterable is a real markup event stream by examining the
1002 # first item it yields; if it's not we'll need to do some conversion
1003 if type(event) is not tuple or len(event) != 3:
1004 for event in chain([event], stream):
1005 if hasattr(event, 'totuple'):
1006 event = event.totuple()
1007 else:
1008 event = TEXT, unicode(event), (None, -1, -1)
1009 yield event
1010 return
1011
1012 # This looks like a markup event stream, so we'll just pass it through
1013 # unchanged
1014 yield event
1015 for event in stream:
1016 yield event
1017
1018
1019 class Attrs(tuple):
1020 """Immutable sequence type that stores the attributes of an element.
1021
1022 Ordering of the attributes is preserved, while access by name is also
1023 supported.
1024
1025 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
1026 >>> attrs
1027 Attrs([('href', '#'), ('title', 'Foo')])
1028
1029 >>> 'href' in attrs
1030 True
1031 >>> 'tabindex' in attrs
1032 False
1033 >>> attrs.get('title')
1034 'Foo'
1035
1036 Instances may not be manipulated directly. Instead, the operators ``|`` and
1037 ``-`` can be used to produce new instances that have specific attributes
1038 added, replaced or removed.
1039
1040 To remove an attribute, use the ``-`` operator. The right hand side can be
1041 either a string or a set/sequence of strings, identifying the name(s) of
1042 the attribute(s) to remove:
1043
1044 >>> attrs - 'title'
1045 Attrs([('href', '#')])
1046 >>> attrs - ('title', 'href')
1047 Attrs()
1048
1049 The original instance is not modified, but the operator can of course be
1050 used with an assignment:
1051
1052 >>> attrs
1053 Attrs([('href', '#'), ('title', 'Foo')])
1054 >>> attrs -= 'title'
1055 >>> attrs
1056 Attrs([('href', '#')])
1057
1058 To add a new attribute, use the ``|`` operator, where the right hand value
1059 is a sequence of ``(name, value)`` tuples (which includes `Attrs`
1060 instances):
1061
1062 >>> attrs | [('title', 'Bar')]
1063 Attrs([('href', '#'), ('title', 'Bar')])
1064
1065 If the attributes already contain an attribute with a given name, the value
1066 of that attribute is replaced:
1067
1068 >>> attrs | [('href', 'http://example.org/')]
1069 Attrs([('href', 'http://example.org/')])
1070 """
1071 __slots__ = []
1072
1073 def __contains__(self, name):
1074 """Return whether the list includes an attribute with the specified
1075 name.
1076
1077 :return: `True` if the list includes the attribute
1078 :rtype: `bool`
1079 """
1080 for attr, _ in self:
1081 if attr == name:
1082 return True
1083
1084 def __getitem__(self, i):
1085 """Return an item or slice of the attributes list.
1086
1087 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
1088 >>> attrs[1]
1089 ('title', 'Foo')
1090 >>> attrs[1:]
1091 Attrs([('title', 'Foo')])
1092 """
1093 items = tuple.__getitem__(self, i)
1094 if type(i) is slice:
1095 return Attrs(items)
1096 return items
1097
1098 def __getslice__(self, i, j):
1099 """Return a slice of the attributes list.
1100
1101 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
1102 >>> attrs[1:]
1103 Attrs([('title', 'Foo')])
1104 """
1105 return Attrs(tuple.__getslice__(self, i, j))
1106
1107 def __or__(self, attrs):
1108 """Return a new instance that contains the attributes in `attrs` in
1109 addition to any already existing attributes.
1110
1111 :return: a new instance with the merged attributes
1112 :rtype: `Attrs`
1113 """
1114 repl = dict([(an, av) for an, av in attrs if an in self])
1115 return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
1116 [(an, av) for an, av in attrs if an not in self])
1117
1118 def __repr__(self):
1119 if not self:
1120 return 'Attrs()'
1121 return 'Attrs([%s])' % ', '.join([repr(item) for item in self])
1122
1123 def __sub__(self, names):
1124 """Return a new instance from which all attributes with a name in
1125 `names` are removed.
1126
1127 :param names: the names of the attributes to remove
1128 :return: a new instance with the attributes removed
1129 :rtype: `Attrs`
1130 """
1131 if isinstance(names, basestring):
1132 names = (names,)
1133 return Attrs([(name, val) for name, val in self if name not in names])
1134
1135 def get(self, name, default=None):
1136 """Return the value of the attribute with the specified name, or the
1137 value of the `default` parameter if no such attribute is found.
1138
1139 :param name: the name of the attribute
1140 :param default: the value to return when the attribute does not exist
1141 :return: the attribute value, or the `default` value if that attribute
1142 does not exist
1143 :rtype: `object`
1144 """
1145 for attr, value in self:
1146 if attr == name:
1147 return value
1148 return default
1149
1150 def totuple(self):
1151 """Return the attributes as a markup event.
1152
1153 The returned event is a `TEXT` event, the data is the value of all
1154 attributes joined together.
1155
1156 >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
1157 ('TEXT', '#Foo', (None, -1, -1))
1158
1159 :return: a `TEXT` event
1160 :rtype: `tuple`
1161 """
1162 return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)
1163
1164
1165 class Markup(unicode):
1166 """Marks a string as being safe for inclusion in HTML/XML output without
1167 needing to be escaped.
1168 """
1169 __slots__ = []
1170
1171 def __add__(self, other):
1172 return Markup(unicode.__add__(self, escape(other)))
1173
1174 def __radd__(self, other):
1175 return Markup(unicode.__add__(escape(other), self))
1176
1177 def __mod__(self, args):
1178 if isinstance(args, dict):
1179 args = dict(zip(args.keys(), map(escape, args.values())))
1180 elif isinstance(args, (list, tuple)):
1181 args = tuple(map(escape, args))
1182 else:
1183 args = escape(args)
1184 return Markup(unicode.__mod__(self, args))
1185
1186 def __mul__(self, num):
1187 return Markup(unicode.__mul__(self, num))
1188 __rmul__ = __mul__
1189
1190 def __repr__(self):
1191 return "<%s %s>" % (type(self).__name__, unicode.__repr__(self))
1192
1193 def join(self, seq, escape_quotes=True):
1194 """Return a `Markup` object which is the concatenation of the strings
1195 in the given sequence, where this `Markup` object is the separator
1196 between the joined elements.
1197
1198 Any element in the sequence that is not a `Markup` instance is
1199 automatically escaped.
1200
1201 :param seq: the sequence of strings to join
1202 :param escape_quotes: whether double quote characters in the elements
1203 should be escaped
1204 :return: the joined `Markup` object
1205 :rtype: `Markup`
1206 :see: `escape`
1207 """
1208 return Markup(unicode.join(self, [escape(item, quotes=escape_quotes)
1209 for item in seq]))
1210
1211 @classmethod
1212 def escape(cls, text, quotes=True):
1213 """Create a Markup instance from a string and escape special characters
1214 it may contain (<, >, & and \").
1215
1216 >>> escape('"1 < 2"')
1217 <Markup u'&#34;1 &lt; 2&#34;'>
1218
1219 If the `quotes` parameter is set to `False`, the \" character is left
1220 as is. Escaping quotes is generally only required for strings that are
1221 to be used in attribute values.
1222
1223 >>> escape('"1 < 2"', quotes=False)
1224 <Markup u'"1 &lt; 2"'>
1225
1226 :param text: the text to escape
1227 :param quotes: if ``True``, double quote characters are escaped in
1228 addition to the other special characters
1229 :return: the escaped `Markup` string
1230 :rtype: `Markup`
1231 """
1232 if not text:
1233 return cls()
1234 if type(text) is cls:
1235 return text
1236 if hasattr(text, '__html__'):
1237 return Markup(text.__html__())
1238
1239 text = text.replace('&', '&amp;') \
1240 .replace('<', '&lt;') \
1241 .replace('>', '&gt;')
1242 if quotes:
1243 text = text.replace('"', '&#34;')
1244 return cls(text)
1245
1246 def unescape(self):
1247 """Reverse-escapes &, <, >, and \" and returns a `unicode` object.
1248
1249 >>> Markup('1 &lt; 2').unescape()
1250 u'1 < 2'
1251
1252 :return: the unescaped string
1253 :rtype: `unicode`
1254 :see: `genshi.core.unescape`
1255 """
1256 if not self:
1257 return ''
1258 return unicode(self).replace('&#34;', '"') \
1259 .replace('&gt;', '>') \
1260 .replace('&lt;', '<') \
1261 .replace('&amp;', '&')
1262
1263 def stripentities(self, keepxmlentities=False):
1264 """Return a copy of the text with any character or numeric entities
1265 replaced by the equivalent Unicode characters.
1266
1267 If the `keepxmlentities` parameter is provided and evaluates to `True`,
1268 the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
1269 ``&quot;``) are not stripped.
1270
1271 :return: a `Markup` instance with entities removed
1272 :rtype: `Markup`
1273 :see: `genshi.util.stripentities`
1274 """
1275 return Markup(stripentities(self, keepxmlentities=keepxmlentities))
1276
1277 def striptags(self):
1278 """Return a copy of the text with all XML/HTML tags removed.
1279
1280 :return: a `Markup` instance with all tags removed
1281 :rtype: `Markup`
1282 :see: `genshi.util.striptags`
1283 """
1284 return Markup(striptags(self))
1285
1286
1287 try:
1288 from genshi._speedups import Markup
1289 except ImportError:
1290 pass # just use the Python implementation
1291
1292
1293 escape = Markup.escape
1294
1295
1296 def unescape(text):
1297 """Reverse-escapes &, <, >, and \" and returns a `unicode` object.
1298
1299 >>> unescape(Markup('1 &lt; 2'))
1300 u'1 < 2'
1301
1302 If the provided `text` object is not a `Markup` instance, it is returned
1303 unchanged.
1304
1305 >>> unescape('1 &lt; 2')
1306 '1 &lt; 2'
1307
1308 :param text: the text to unescape
1309 :return: the unescaped string
1310 :rtype: `unicode`
1311 """
1312 if not isinstance(text, Markup):
1313 return text
1314 return text.unescape()
1315
1316
1317 class Namespace(object):
1318 """Utility class creating and testing elements with a namespace.
1319
1320 Internally, namespace URIs are encoded in the `QName` of any element or
1321 attribute, the namespace URI being enclosed in curly braces. This class
1322 helps create and test these strings.
1323
1324 A `Namespace` object is instantiated with the namespace URI.
1325
1326 >>> html = Namespace('http://www.w3.org/1999/xhtml')
1327 >>> html
1328 Namespace('http://www.w3.org/1999/xhtml')
1329 >>> html.uri
1330 u'http://www.w3.org/1999/xhtml'
1331
1332 The `Namespace` object can then be used to generate `QName` objects with
1333 that namespace:
1334
1335 >>> html.body
1336 QName('http://www.w3.org/1999/xhtml}body')
1337 >>> html.body.localname
1338 u'body'
1339 >>> html.body.namespace
1340 u'http://www.w3.org/1999/xhtml'
1341
1342 The same works using item access notation, which is useful for element or
1343 attribute names that are not valid Python identifiers:
1344
1345 >>> html['body']
1346 QName('http://www.w3.org/1999/xhtml}body')
1347
1348 A `Namespace` object can also be used to test whether a specific `QName`
1349 belongs to that namespace using the ``in`` operator:
1350
1351 >>> qname = html.body
1352 >>> qname in html
1353 True
1354 >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
1355 False
1356 """
1357 def __new__(cls, uri):
1358 if type(uri) is cls:
1359 return uri
1360 return object.__new__(cls)
1361
1362 def __getnewargs__(self):
1363 return (self.uri,)
1364
1365 def __getstate__(self):
1366 return self.uri
1367
1368 def __setstate__(self, uri):
1369 self.uri = uri
1370
1371 def __init__(self, uri):
1372 self.uri = unicode(uri)
1373
1374 def __contains__(self, qname):
1375 return qname.namespace == self.uri
1376
1377 def __ne__(self, other):
1378 return not self == other
1379
1380 def __eq__(self, other):
1381 if isinstance(other, Namespace):
1382 return self.uri == other.uri
1383 return self.uri == other
1384
1385 def __getitem__(self, name):
1386 return QName(self.uri + '}' + name)
1387 __getattr__ = __getitem__
1388
1389 def __hash__(self):
1390 return hash(self.uri)
1391
1392 def __repr__(self):
1393 return 'Namespace(%s)' % stringrepr(self.uri)
1394
1395 def __str__(self):
1396 return self.uri.encode('utf-8')
1397
1398 def __unicode__(self):
1399 return self.uri
1400
1401
1402 # The namespace used by attributes such as xml:lang and xml:space
1403 XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
1404
1405
1406 class QName(unicode):
1407 """A qualified element or attribute name.
1408
1409 The unicode value of instances of this class contains the qualified name of
1410 the element or attribute, in the form ``{namespace-uri}local-name``. The
1411 namespace URI can be obtained through the additional `namespace` attribute,
1412 while the local name can be accessed through the `localname` attribute.
1413
1414 >>> qname = QName('foo')
1415 >>> qname
1416 QName('foo')
1417 >>> qname.localname
1418 u'foo'
1419 >>> qname.namespace
1420
1421 >>> qname = QName('http://www.w3.org/1999/xhtml}body')
1422 >>> qname
1423 QName('http://www.w3.org/1999/xhtml}body')
1424 >>> qname.localname
1425 u'body'
1426 >>> qname.namespace
1427 u'http://www.w3.org/1999/xhtml'
1428 """
1429 __slots__ = ['namespace', 'localname']
1430
1431 def __new__(cls, qname):
1432 """Create the `QName` instance.
1433
1434 :param qname: the qualified name as a string of the form
1435 ``{namespace-uri}local-name``, where the leading curly
1436 brace is optional
1437 """
1438 if type(qname) is cls:
1439 return qname
1440
1441 parts = qname.lstrip('{').split('}', 1)
1442 if len(parts) > 1:
1443 self = unicode.__new__(cls, '{%s' % qname)
1444 self.namespace, self.localname = map(unicode, parts)
1445 else:
1446 self = unicode.__new__(cls, qname)
1447 self.namespace, self.localname = None, unicode(qname)
1448 return self
1449
1450 def __getnewargs__(self):
1451 return (self.lstrip('{'),)
1452
1453 def __repr__(self):
1454 return 'QName(%s)' % stringrepr(self.lstrip('{'))
1455 # -*- coding: utf-8 -*-
1456 #
1457 # Copyright (C) 2006-2009 Edgewall Software
1458 # All rights reserved.
1459 #
1460 # This software is licensed as described in the file COPYING, which
1461 # you should have received as part of this distribution. The terms
1462 # are also available at http://genshi.edgewall.org/wiki/License.
1463 #
1464 # This software consists of voluntary contributions made by many
1465 # individuals. For the exact contribution history, see the revision
1466 # history and logs, available at http://genshi.edgewall.org/log/.
1467
1468 """Core classes for markup processing."""
1469
1470 try:
1471 reduce # builtin in Python < 3
1472 except NameError:
1473 from functools import reduce
1474 from itertools import chain
1475 import operator
1476
1477 from genshi.util import plaintext, stripentities, striptags, stringrepr
1478
1479 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
1480 'QName']
1481 __docformat__ = 'restructuredtext en'
1482
1483
1484 class StreamEventKind(str):
1485 """A kind of event on a markup stream."""
1486 __slots__ = []
1487 _instances = {}
1488
1489 def __new__(cls, val):
1490 return cls._instances.setdefault(val, str.__new__(cls, val))
1491
1492
1493 class Stream(object):
1494 """Represents a stream of markup events.
1495
1496 This class is basically an iterator over the events.
1497
1498 Stream events are tuples of the form::
1499
1500 (kind, data, position)
1501
1502 where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc),
1503 ``data`` depends on the kind of event, and ``position`` is a
1504 ``(filename, line, offset)`` tuple that contains the location of the
1505 original element or text in the input. If the original location is unknown,
1506 ``position`` is ``(None, -1, -1)``.
1507
1508 Also provided are ways to serialize the stream to text. The `serialize()`
1509 method will return an iterator over generated strings, while `render()`
1510 returns the complete generated text at once. Both accept various parameters
1511 that impact the way the stream is serialized.
1512 """
1513 __slots__ = ['events', 'serializer']
1514
1515 START = StreamEventKind('START') #: a start tag
1516 END = StreamEventKind('END') #: an end tag
1517 TEXT = StreamEventKind('TEXT') #: literal text
1518 XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
1519 DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
1520 START_NS = StreamEventKind('START_NS') #: start namespace mapping
1521 END_NS = StreamEventKind('END_NS') #: end namespace mapping
1522 START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
1523 END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
1524 PI = StreamEventKind('PI') #: processing instruction
1525 COMMENT = StreamEventKind('COMMENT') #: comment
1526
1527 def __init__(self, events, serializer=None):
1528 """Initialize the stream with a sequence of markup events.
1529
1530 :param events: a sequence or iterable providing the events
1531 :param serializer: the default serialization method to use for this
1532 stream
1533
1534 :note: Changed in 0.5: added the `serializer` argument
1535 """
1536 self.events = events #: The underlying iterable producing the events
1536 self.serializer = serializer #: The default serialization method
1538
1539 def __iter__(self):
1540 return iter(self.events)
1541
def __or__(self, function):
    """Apply a filter or serializer to the stream through the ``|``
    operator, mirroring the pipe syntax of Unix shells.

    Given the following stream created by the `HTML` function:

    >>> from genshi.input import HTML
    >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
    >>> print(html)
    <p onclick="alert('Whoa')">Hello, world!</p>

    a filter such as the HTML sanitizer is applied with the pipe notation:

    >>> from genshi.filters import HTMLSanitizer
    >>> sanitizer = HTMLSanitizer()
    >>> print(html | sanitizer)
    <p>Hello, world!</p>

    Any function that takes a stream and produces a stream (a stream being
    anything that iterates over events) can act as a filter:

    >>> def uppercase(stream):
    ...     for kind, data, pos in stream:
    ...         if kind is TEXT:
    ...             data = data.upper()
    ...         yield kind, data, pos
    >>> print(html | sanitizer | uppercase)
    <p>HELLO, WORLD!</p>

    The same notation works for serializers:

    >>> from genshi.output import TextSerializer
    >>> output = TextSerializer()
    >>> print(html | sanitizer | uppercase | output)
    HELLO, WORLD!

    Serializers normally belong at the end of the "pipeline"; placing one
    somewhere in the middle may produce unexpected results.

    :param function: the callable object that should be applied as a filter
    :return: the filtered stream
    :rtype: `Stream`
    """
    # The result of the callable is normalized by `_ensure`, and the new
    # stream inherits the default serializer of this one
    filtered = function(self)
    return Stream(_ensure(filtered), serializer=self.serializer)
1587
def filter(self, *filters):
    """Return a new stream with the given filters applied in order.

    Each filter must be a callable that accepts the stream object as
    parameter and returns the filtered stream.

    The call::

        stream.filter(filter1, filter2)

    is equivalent to::

        stream | filter1 | filter2

    :param filters: one or more callable objects that should be applied as
                    filters
    :return: the filtered stream
    :rtype: `Stream`
    """
    stream = self
    for function in filters:
        stream = stream | function
    return stream
1609
def render(self, method=None, encoding='utf-8', out=None, **kwargs):
    """Serialize the stream and return the result as a single string.

    Keyword arguments not listed below are handed on to the serializer, so
    which ones are accepted depends on the value of `method`.

    :param method: determines how the stream is serialized; can be either
                   "xml", "xhtml", "html", "text", or a custom serializer
                   class; if `None`, the default serialization method of
                   the stream is used, or "xml" if the stream has none
    :param encoding: how the output string should be encoded; if set to
                     `None`, this method returns a `unicode` object
    :param out: a file-like object that the output should be written to
                instead of being returned as one big string; note that if
                this is a file or socket (or similar), the `encoding` must
                not be `None` (that is, the output must be encoded)
    :return: a `str` or `unicode` object (depending on the `encoding`
             parameter), or `None` if the `out` parameter is provided
    :rtype: `basestring`

    :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
    :note: Changed in 0.5: added the `out` parameter
    """
    from genshi.output import encode
    chosen = method
    if chosen is None:
        chosen = self.serializer or 'xml'
    chunks = self.serialize(method=chosen, **kwargs)
    return encode(chunks, method=chosen, encoding=encoding, out=out)
1638
def select(self, path, namespaces=None, variables=None):
    """Return a new stream made up of the events that match the given
    XPath expression.

    >>> from genshi import HTML
    >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
    >>> print(stream.select('elem'))
    <elem>foo</elem><elem>bar</elem>
    >>> print(stream.select('elem/text()'))
    foobar

    The outermost element of the stream serves as the *context node* of
    the XPath test. The expression "doc" therefore matches nothing in the
    example above, as it is only tested against the child elements of the
    outermost element:

    >>> print(stream.select('doc'))
    <BLANKLINE>

    The "." expression matches the context node itself (although that
    usually makes little sense):

    >>> print(stream.select('.'))
    <doc><elem>foo</elem><elem>bar</elem></doc>

    :param path: a string containing the XPath expression
    :param namespaces: mapping of namespace prefixes used in the path
    :param variables: mapping of variable names to values
    :return: the selected substream
    :rtype: `Stream`
    :raises PathSyntaxError: if the given path expression is invalid or not
                             supported
    """
    from genshi.path import Path
    compiled = Path(path)
    return compiled.select(self, namespaces, variables)
1674
def serialize(self, method='xml', **kwargs):
    """Return an iterator over the strings that make up a specific
    serialization of the stream.

    In contrast to `render()`, which returns one single string, the output
    is produced incrementally.

    Keyword arguments not listed below are handed on to the serializer, so
    which ones are accepted depends on the value of `method`.

    :param method: determines how the stream is serialized; can be either
                   "xml", "xhtml", "html", "text", or a custom serializer
                   class; if `None`, the default serialization method of
                   the stream is used, or "xml" if the stream has none
    :return: an iterator over the serialization results (`Markup` or
             `unicode` objects, depending on the serialization method)
    :rtype: ``iterator``
    :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
    """
    from genshi.output import get_serializer
    chosen = method
    if chosen is None:
        chosen = self.serializer or 'xml'
    serializer = get_serializer(chosen, **kwargs)
    return serializer(_ensure(self))
1699
1700 def __str__(self):
1701 return self.render()
1702
1703 def __unicode__(self):
1704 return self.render(encoding=None)
1705
1706 def __html__(self):
1707 return self
1708
1709
# Module-level aliases for the event kinds defined on `Stream`, so that they
# can be referenced without the class prefix
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
1721
1722
1723 def _ensure(stream):
1724 """Ensure that every item on the stream is actually a markup event."""
1725 stream = iter(stream)
1726 event = stream.next()
1727
1728 # Check whether the iterable is a real markup event stream by examining the
1729 # first item it yields; if it's not we'll need to do some conversion
1730 if type(event) is not tuple or len(event) != 3:
1731 for event in chain([event], stream):
1732 if hasattr(event, 'totuple'):
1733 event = event.totuple()
1734 else:
1735 event = TEXT, unicode(event), (None, -1, -1)
1736 yield event
1737 return
1738
1739 # This looks like a markup event stream, so we'll just pass it through
1740 # unchanged
1741 yield event
1742 for event in stream:
1743 yield event
1744
1745
class Attrs(tuple):
    """Immutable sequence type that stores the attributes of an element.

    The order of the attributes is retained, and attributes can additionally
    be looked up by name.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])

    >>> 'href' in attrs
    True
    >>> 'tabindex' in attrs
    False
    >>> attrs.get('title')
    'Foo'

    Instances may not be manipulated directly. Instead, the operators ``|`` and
    ``-`` can be used to produce new instances that have specific attributes
    added, replaced or removed.

    To remove an attribute, use the ``-`` operator. The right hand side can be
    either a string or a set/sequence of strings, identifying the name(s) of
    the attribute(s) to remove:

    >>> attrs - 'title'
    Attrs([('href', '#')])
    >>> attrs - ('title', 'href')
    Attrs()

    The original instance is not modified, but the operator can of course be
    used with an assignment:

    >>> attrs
    Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs -= 'title'
    >>> attrs
    Attrs([('href', '#')])

    To add a new attribute, use the ``|`` operator, where the right hand value
    is a sequence of ``(name, value)`` tuples (which includes `Attrs`
    instances):

    >>> attrs | [('title', 'Bar')]
    Attrs([('href', '#'), ('title', 'Bar')])

    If the attributes already contain an attribute with a given name, the value
    of that attribute is replaced:

    >>> attrs | [('href', 'http://example.org/')]
    Attrs([('href', 'http://example.org/')])
    """
    __slots__ = []

    def __contains__(self, name):
        """Return whether the list includes an attribute with the specified
        name.

        :param name: the name of the attribute to look for
        :return: `True` if the list includes the attribute, `False` otherwise
        :rtype: `bool`
        """
        for attr, _ in self:
            if attr == name:
                return True
        # Be explicit so that direct callers get a real boolean, not `None`
        return False

    def __getitem__(self, i):
        """Return an item or slice of the attributes list.

        A single item is returned as a ``(name, value)`` tuple, a slice as a
        new `Attrs` instance.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1]
        ('title', 'Foo')
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        items = tuple.__getitem__(self, i)
        if type(i) is slice:
            return Attrs(items)
        return items

    def __getslice__(self, i, j):
        """Return a slice of the attributes list.

        >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
        >>> attrs[1:]
        Attrs([('title', 'Foo')])
        """
        return Attrs(tuple.__getslice__(self, i, j))

    def __or__(self, attrs):
        """Return a new instance that contains the attributes in `attrs` in
        addition to any already existing attributes.

        Attributes whose name already exists keep their position but take the
        new value; all others are appended in the order given.

        :param attrs: an iterable of ``(name, value)`` tuples
        :return: a new instance with the merged attributes
        :rtype: `Attrs`
        """
        # `attrs` is traversed twice below; materialize it first so that the
        # additions are not silently lost when a one-shot iterator is passed
        attrs = tuple(attrs)
        repl = dict([(an, av) for an, av in attrs if an in self])
        return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
                     [(an, av) for an, av in attrs if an not in self])

    def __repr__(self):
        if not self:
            return 'Attrs()'
        return 'Attrs([%s])' % ', '.join([repr(item) for item in self])

    def __sub__(self, names):
        """Return a new instance from which all attributes with a name in
        `names` have been removed.

        :param names: the name of the attribute to remove as a string, or a
                      collection of such names
        :return: a new instance with the attribute(s) removed
        :rtype: `Attrs`
        """
        if isinstance(names, basestring):
            # A single name; wrap it so the membership test below compares
            # whole names rather than looking for substrings
            names = (names,)
        return Attrs([(name, val) for name, val in self if name not in names])

    def get(self, name, default=None):
        """Return the value of the attribute with the specified name, or the
        value of the `default` parameter if no such attribute is found.

        :param name: the name of the attribute
        :param default: the value to return when the attribute does not exist
        :return: the attribute value, or the `default` value if that attribute
                 does not exist
        :rtype: `object`
        """
        for attr, value in self:
            if attr == name:
                return value
        return default

    def totuple(self):
        """Return the attributes as a markup event.

        The returned event is a `TEXT` event, the data is the value of all
        attributes joined together.

        >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
        ('TEXT', '#Foo', (None, -1, -1))

        :return: a `TEXT` event
        :rtype: `tuple`
        """
        return TEXT, ''.join([x[1] for x in self]), (None, -1, -1)
1890
1891
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    The operators ``+`` and ``%`` as well as `join()` escape the other
    operand(s) before combining them with the markup; the result is again a
    `Markup` instance.
    """
    __slots__ = []

    def __add__(self, other):
        """Return this markup followed by the escaped `other`."""
        return Markup(unicode.__add__(self, escape(other)))

    def __radd__(self, other):
        """Return the escaped `other` followed by this markup."""
        return Markup(unicode.__add__(escape(other), self))

    def __mod__(self, args):
        """Apply string formatting, escaping the arguments beforehand.

        For a dict every value is escaped, for a list or tuple every item
        (the items are then passed on as a tuple); any other object is
        escaped as a single argument.
        """
        if isinstance(args, dict):
            args = dict(zip(args.keys(), map(escape, args.values())))
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        """Return the markup repeated `num` times."""
        return Markup(unicode.__mul__(self, num))
    __rmul__ = __mul__

    def __repr__(self):
        return "<%s %s>" % (type(self).__name__, unicode.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        return Markup(unicode.join(self, [escape(item, quotes=escape_quotes)
                                          for item in seq]))

    @classmethod
    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        Any false value results in an empty instance, an object that already
        is an instance of exactly this class is returned unchanged, and for
        an object that provides an ``__html__()`` method the result of that
        method is used without further escaping.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            # Build the result with `cls`, like every other return path, so
            # that subclasses consistently get instances of their own type
            return cls(text.__html__())

        # The ampersand has to be replaced first, as the other replacements
        # introduce ampersands themselves
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            return ''
        # The ampersand is restored last, so that an escaped entity such as
        # ``&amp;lt;`` yields ``&lt;`` rather than ``<``
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether the core XML entities should be left
                                in place
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))
2012
2013
# Replace the `Markup` class defined above with the implementation from the
# `genshi._speedups` module whenever that module can be imported
try:
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation


# Module-level shortcut for the `escape` class method of whichever `Markup`
# implementation is in use
escape = Markup.escape
2021
2022
def unescape(text):
    """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    An object that is not a `Markup` instance is handed back as it is:

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
2042
2043
class Namespace(object):
    """Utility class creating and testing elements with a namespace.

    Internally, namespace URIs are encoded in the `QName` of any element or
    attribute, the namespace URI being enclosed in curly braces. This class
    helps create and test these strings.

    A `Namespace` object is instantiated with the namespace URI.

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    The `Namespace` object can then be used to generate `QName` objects with
    that namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    The same works using item access notation, which is useful for element or
    attribute names that are not valid Python identifiers:

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    A `Namespace` object can also be used to test whether a specific `QName`
    belongs to that namespace using the ``in`` operator:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """

    # -- Construction and pickling support

    def __new__(cls, uri):
        # An object that already is an instance of exactly this class is
        # handed back instead of allocating a new one
        if type(uri) is not cls:
            return object.__new__(cls)
        return uri

    def __init__(self, uri):
        self.uri = unicode(uri)

    def __getnewargs__(self):
        uri = self.uri
        return (uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    # -- Comparison and hashing, both based on the URI

    def __eq__(self, other):
        # Another namespace is compared by its URI, anything else directly
        if isinstance(other, Namespace):
            other = other.uri
        return self.uri == other

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(self.uri)

    # -- Creating and testing qualified names

    def __contains__(self, qname):
        return self.uri == qname.namespace

    def __getitem__(self, name):
        qualified = self.uri + '}' + name
        return QName(qualified)
    # Attribute access for names not found otherwise builds a `QName`, too
    __getattr__ = __getitem__

    # -- String representations

    def __repr__(self):
        return 'Namespace(%s)' % stringrepr(self.uri)

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
2127
2128
# The namespace used by attributes such as xml:lang and xml:space, provided
# as a ready-made `Namespace` instance
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
2131
2132
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form ``{namespace-uri}local-name``. The
    namespace URI can be obtained through the additional `namespace` attribute,
    while the local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'

    The leading curly brace may be included as well, with the same result:

    >>> QName('{http://www.w3.org/1999/xhtml}body') == qname
    True
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that already is an instance of exactly this class is
        returned unchanged.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        """
        if type(qname) is cls:
            return qname

        # Normalize away the optional leading brace first; otherwise a name
        # given as ``{uri}local`` would be stored with a doubled brace once
        # the brace is prepended again below
        qname = qname.lstrip('{')
        parts = qname.split('}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, '{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self

    def __getnewargs__(self):
        # The brace-less form is accepted by `__new__`, see above
        return (self.lstrip('{'),)

    def __repr__(self):
        return 'QName(%s)' % stringrepr(self.lstrip('{'))
2182 # -*- coding: utf-8 -*-
2183 #
2184 # Copyright (C) 2006-2009 Edgewall Software
2185 # All rights reserved.
2186 #
2187 # This software is licensed as described in the file COPYING, which
2188 # you should have received as part of this distribution. The terms
2189 # are also available at http://genshi.edgewall.org/wiki/License.
2190 #
2191 # This software consists of voluntary contributions made by many
2192 # individuals. For the exact contribution history, see the revision
2193 # history and logs, available at http://genshi.edgewall.org/log/.
2194
2195 """Core classes for markup processing."""
2196
2197 try:
2198 reduce # builtin in Python < 3
2199 except NameError:
2200 from functools import reduce
2201 from itertools import chain
2202 import operator
2203
2204 from genshi.util import plaintext, stripentities, striptags, stringrepr
2205
2206 __all__ = ['Stream', 'Markup', 'escape', 'unescape', 'Attrs', 'Namespace',
2207 'QName']
2208 __docformat__ = 'restructuredtext en'
2209
2210
class StreamEventKind(str):
    """A kind of event on a markup stream.

    Instances are registered by value in a class-level dictionary, so that
    creating a kind for a value that is already registered returns the
    instance created first.
    """
    __slots__ = []
    _instances = {}

    def __new__(cls, val):
        candidate = str.__new__(cls, val)
        # `setdefault` stores the candidate only if the value is not yet
        # known, and returns the registered instance either way
        registry = cls._instances
        return registry.setdefault(val, candidate)
2218
2219
class Stream(object):
    """A stream of markup events that can be filtered and serialized.

    Iterating over a stream yields its events. Every event is a tuple of the
    form::

      (kind, data, position)

    where ``kind`` is the event kind (such as `START`, `END`, `TEXT`, etc),
    the structure of ``data`` depends on the kind of event, and ``position``
    is a ``(filename, line, offset)`` tuple locating the original element or
    text in the input. When that location is unknown, ``position`` is
    ``(None, -1, -1)``.

    A stream can also be turned into text: `serialize()` returns an iterator
    over the generated strings, whereas `render()` returns the complete
    generated text at once. Both accept various parameters that influence the
    serialization.
    """
    __slots__ = ['events', 'serializer']

    START = StreamEventKind('START') #: a start tag
    END = StreamEventKind('END') #: an end tag
    TEXT = StreamEventKind('TEXT') #: literal text
    XML_DECL = StreamEventKind('XML_DECL') #: XML declaration
    DOCTYPE = StreamEventKind('DOCTYPE') #: doctype declaration
    START_NS = StreamEventKind('START_NS') #: start namespace mapping
    END_NS = StreamEventKind('END_NS') #: end namespace mapping
    START_CDATA = StreamEventKind('START_CDATA') #: start CDATA section
    END_CDATA = StreamEventKind('END_CDATA') #: end CDATA section
    PI = StreamEventKind('PI') #: processing instruction
    COMMENT = StreamEventKind('COMMENT') #: comment

    def __init__(self, events, serializer=None):
        """Set up the stream around the given markup events.

        :param events: a sequence or iterable providing the events
        :param serializer: the serialization method this stream uses by
                           default when none is specified explicitly

        :note: Changed in 0.5: added the `serializer` argument
        """
        self.serializer = serializer #: The default serialization method
        self.events = events #: The underlying iterable producing the events

    def __iter__(self):
        """Return an iterator over the events of the underlying iterable."""
        events = self.events
        return iter(events)

    def __or__(self, function):
        """Apply a filter or serializer to the stream through the ``|``
        operator, mirroring the pipe syntax of Unix shells.

        Given the following stream created by the `HTML` function:

        >>> from genshi.input import HTML
        >>> html = HTML('''<p onclick="alert('Whoa')">Hello, world!</p>''')
        >>> print(html)
        <p onclick="alert('Whoa')">Hello, world!</p>

        a filter such as the HTML sanitizer is applied with the pipe notation:

        >>> from genshi.filters import HTMLSanitizer
        >>> sanitizer = HTMLSanitizer()
        >>> print(html | sanitizer)
        <p>Hello, world!</p>

        Any function that takes a stream and produces a stream (a stream being
        anything that iterates over events) can act as a filter:

        >>> def uppercase(stream):
        ...     for kind, data, pos in stream:
        ...         if kind is TEXT:
        ...             data = data.upper()
        ...         yield kind, data, pos
        >>> print(html | sanitizer | uppercase)
        <p>HELLO, WORLD!</p>

        The same notation works for serializers:

        >>> from genshi.output import TextSerializer
        >>> output = TextSerializer()
        >>> print(html | sanitizer | uppercase | output)
        HELLO, WORLD!

        Serializers normally belong at the end of the "pipeline"; placing one
        somewhere in the middle may produce unexpected results.

        :param function: the callable object that should be applied as a filter
        :return: the filtered stream
        :rtype: `Stream`
        """
        # The result of the callable is normalized by `_ensure`, and the new
        # stream inherits the default serializer of this one
        filtered = function(self)
        return Stream(_ensure(filtered), serializer=self.serializer)

    def filter(self, *filters):
        """Return a new stream with the given filters applied in order.

        Each filter must be a callable that accepts the stream object as
        parameter and returns the filtered stream.

        The call::

            stream.filter(filter1, filter2)

        is equivalent to::

            stream | filter1 | filter2

        :param filters: one or more callable objects that should be applied as
                        filters
        :return: the filtered stream
        :rtype: `Stream`
        """
        stream = self
        for function in filters:
            stream = stream | function
        return stream

    def render(self, method=None, encoding='utf-8', out=None, **kwargs):
        """Serialize the stream and return the result as a single string.

        Keyword arguments not listed below are handed on to the serializer, so
        which ones are accepted depends on the value of `method`.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used, or "xml" if the stream has none
        :param encoding: how the output string should be encoded; if set to
                         `None`, this method returns a `unicode` object
        :param out: a file-like object that the output should be written to
                    instead of being returned as one big string; note that if
                    this is a file or socket (or similar), the `encoding` must
                    not be `None` (that is, the output must be encoded)
        :return: a `str` or `unicode` object (depending on the `encoding`
                 parameter), or `None` if the `out` parameter is provided
        :rtype: `basestring`

        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        :note: Changed in 0.5: added the `out` parameter
        """
        from genshi.output import encode
        chosen = method
        if chosen is None:
            chosen = self.serializer or 'xml'
        chunks = self.serialize(method=chosen, **kwargs)
        return encode(chunks, method=chosen, encoding=encoding, out=out)

    def select(self, path, namespaces=None, variables=None):
        """Return a new stream made up of the events that match the given
        XPath expression.

        >>> from genshi import HTML
        >>> stream = HTML('<doc><elem>foo</elem><elem>bar</elem></doc>')
        >>> print(stream.select('elem'))
        <elem>foo</elem><elem>bar</elem>
        >>> print(stream.select('elem/text()'))
        foobar

        The outermost element of the stream serves as the *context node* of
        the XPath test. The expression "doc" therefore matches nothing in the
        example above, as it is only tested against the child elements of the
        outermost element:

        >>> print(stream.select('doc'))
        <BLANKLINE>

        The "." expression matches the context node itself (although that
        usually makes little sense):

        >>> print(stream.select('.'))
        <doc><elem>foo</elem><elem>bar</elem></doc>

        :param path: a string containing the XPath expression
        :param namespaces: mapping of namespace prefixes used in the path
        :param variables: mapping of variable names to values
        :return: the selected substream
        :rtype: `Stream`
        :raises PathSyntaxError: if the given path expression is invalid or not
                                 supported
        """
        from genshi.path import Path
        compiled = Path(path)
        return compiled.select(self, namespaces, variables)

    def serialize(self, method='xml', **kwargs):
        """Return an iterator over the strings that make up a specific
        serialization of the stream.

        In contrast to `render()`, which returns one single string, the output
        is produced incrementally.

        Keyword arguments not listed below are handed on to the serializer, so
        which ones are accepted depends on the value of `method`.

        :param method: determines how the stream is serialized; can be either
                       "xml", "xhtml", "html", "text", or a custom serializer
                       class; if `None`, the default serialization method of
                       the stream is used, or "xml" if the stream has none
        :return: an iterator over the serialization results (`Markup` or
                 `unicode` objects, depending on the serialization method)
        :rtype: ``iterator``
        :see: XMLSerializer, XHTMLSerializer, HTMLSerializer, TextSerializer
        """
        from genshi.output import get_serializer
        chosen = method
        if chosen is None:
            chosen = self.serializer or 'xml'
        serializer = get_serializer(chosen, **kwargs)
        return serializer(_ensure(self))

    def __str__(self):
        """Return the stream rendered with the default arguments of
        `render()`.
        """
        rendered = self.render()
        return rendered

    def __unicode__(self):
        """Return the stream rendered without applying any output encoding."""
        rendered = self.render(encoding=None)
        return rendered

    def __html__(self):
        """Return the stream itself as its markup representation."""
        markup = self
        return markup
2435
2436
# Module-level aliases for the event kinds defined on `Stream`, so that they
# can be referenced without the class prefix
START = Stream.START
END = Stream.END
TEXT = Stream.TEXT
XML_DECL = Stream.XML_DECL
DOCTYPE = Stream.DOCTYPE
START_NS = Stream.START_NS
END_NS = Stream.END_NS
START_CDATA = Stream.START_CDATA
END_CDATA = Stream.END_CDATA
PI = Stream.PI
COMMENT = Stream.COMMENT
2448
2449
2450 def _ensure(stream):
2451 """Ensure that every item on the stream is actually a markup event."""
2452 stream = iter(stream)
2453 event = stream.next()
2454
2455 # Check whether the iterable is a real markup event stream by examining the
2456 # first item it yields; if it's not we'll need to do some conversion
2457 if type(event) is not tuple or len(event) != 3:
2458 for event in chain([event], stream):
2459 if hasattr(event, 'totuple'):
2460 event = event.totuple()
2461 else:
2462 event = TEXT, unicode(event), (None, -1, -1)
2463 yield event
2464 return
2465
2466 # This looks like a markup event stream, so we'll just pass it through
2467 # unchanged
2468 yield event
2469 for event in stream:
2470 yield event
2471
2472
2473 class Attrs(tuple):
2474 """Immutable sequence type that stores the attributes of an element.
2475
2476 Ordering of the attributes is preserved, while access by name is also
2477 supported.
2478
2479 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
2480 >>> attrs
2481 Attrs([('href', '#'), ('title', 'Foo')])
2482
2483 >>> 'href' in attrs
2484 True
2485 >>> 'tabindex' in attrs
2486 False
2487 >>> attrs.get('title')
2488 'Foo'
2489
2490 Instances may not be manipulated directly. Instead, the operators ``|`` and
2491 ``-`` can be used to produce new instances that have specific attributes
2492 added, replaced or removed.
2493
2494 To remove an attribute, use the ``-`` operator. The right hand side can be
2495 either a string or a set/sequence of strings, identifying the name(s) of
2496 the attribute(s) to remove:
2497
2498 >>> attrs - 'title'
2499 Attrs([('href', '#')])
2500 >>> attrs - ('title', 'href')
2501 Attrs()
2502
2503 The original instance is not modified, but the operator can of course be
2504 used with an assignment:
2505
2506 >>> attrs
2507 Attrs([('href', '#'), ('title', 'Foo')])
2508 >>> attrs -= 'title'
2509 >>> attrs
2510 Attrs([('href', '#')])
2511
2512 To add a new attribute, use the ``|`` operator, where the right hand value
2513 is a sequence of ``(name, value)`` tuples (which includes `Attrs`
2514 instances):
2515
2516 >>> attrs | [('title', 'Bar')]
2517 Attrs([('href', '#'), ('title', 'Bar')])
2518
2519 If the attributes already contain an attribute with a given name, the value
2520 of that attribute is replaced:
2521
2522 >>> attrs | [('href', 'http://example.org/')]
2523 Attrs([('href', 'http://example.org/')])
2524 """
2525 __slots__ = []
2526
2527 def __contains__(self, name):
2528 """Return whether the list includes an attribute with the specified
2529 name.
2530
2531 :return: `True` if the list includes the attribute
2532 :rtype: `bool`
2533 """
2534 for attr, _ in self:
2535 if attr == name:
2536 return True
2537
2538 def __getitem__(self, i):
2539 """Return an item or slice of the attributes list.
2540
2541 >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
2542 >>> attrs[1]
2543 ('title', 'Foo')
2544 >>> attrs[1:]
2545 Attrs([('title', 'Foo')])
2546 """
2547 items = tuple.__getitem__(self, i)
2548 if type(i) is slice:
2549 return Attrs(items)
2550 return items
2551
def __getslice__(self, i, j):
    """Produce the attributes between positions `i` and `j` as a new
    `Attrs` instance.

    >>> attrs = Attrs([('href', '#'), ('title', 'Foo')])
    >>> attrs[1:]
    Attrs([('title', 'Foo')])
    """
    selected = tuple.__getslice__(self, i, j)
    return Attrs(selected)
2560
def __or__(self, attrs):
    """Return a new instance that contains the attributes in `attrs` in
    addition to any already existing attributes.

    Attributes whose name already exists keep their original position but
    take the new value; attributes with new names are appended in the
    order in which they appear in `attrs`.

    :param attrs: an iterable of ``(name, value)`` tuples (which includes
                  `Attrs` instances)
    :return: a new instance with the merged attributes
    :rtype: `Attrs`
    """
    # `attrs` is walked twice below; materialise it once so that one-shot
    # iterables (e.g. generators) are not exhausted after the first pass.
    attrs = tuple(attrs)
    # Collect the existing names once instead of scanning `self` linearly
    # for every incoming attribute.
    existing = set([sn for sn, _ in self])
    repl = dict([(an, av) for an, av in attrs if an in existing])
    return Attrs([(sn, repl.get(sn, sv)) for sn, sv in self] +
                 [(an, av) for an, av in attrs if an not in existing])
2571
2572 def __repr__(self):
2573 if not self:
2574 return 'Attrs()'
2575 return 'Attrs([%s])' % ', '.join([repr(item) for item in self])
2576
def __sub__(self, names):
    """Build a new instance that omits every attribute whose name is
    listed in `names`.

    :param names: a single attribute name (a string), or a container of
                  names, identifying the attribute(s) to drop
    :return: a new instance without the named attribute(s)
    :rtype: `Attrs`
    """
    # A lone string is treated as one name, not as a container of
    # characters.
    if isinstance(names, basestring):
        names = (names,)
    kept = []
    for name, val in self:
        if name not in names:
            kept.append((name, val))
    return Attrs(kept)
2588
def get(self, name, default=None):
    """Look up an attribute value by name.

    The attributes are scanned in order and the value of the first one
    whose name equals `name` is returned.

    :param name: the name of the attribute
    :param default: the value to return when the attribute does not exist
    :return: the attribute value, or the `default` value if that attribute
             does not exist
    :rtype: `object`
    """
    for key, found in self:
        if key != name:
            continue
        return found
    return default
2603
def totuple(self):
    """Convert the attributes into a single markup event.

    The result is a `TEXT` event whose data is the concatenation of all
    attribute values, in order, and whose position is ``(None, -1, -1)``.

    >>> Attrs([('href', '#'), ('title', 'Foo')]).totuple()
    ('TEXT', '#Foo', (None, -1, -1))

    :return: a `TEXT` event
    :rtype: `tuple`
    """
    values = [item[1] for item in self]
    text = ''.join(values)
    return TEXT, text, (None, -1, -1)
2617
2618
class Markup(unicode):
    """Marks a string as being safe for inclusion in HTML/XML output without
    needing to be escaped.

    Operations that combine a `Markup` instance with other values (``+``,
    ``%`` and `join`) escape those other values and return `Markup`.
    """
    __slots__ = []

    def __add__(self, other):
        """Return this markup followed by the escaped `other`."""
        return Markup(unicode.__add__(self, escape(other)))

    def __radd__(self, other):
        """Return the escaped `other` followed by this markup."""
        return Markup(unicode.__add__(escape(other), self))

    def __mod__(self, args):
        """Interpolate `args` into this markup, escaping every argument.

        :param args: a dict, a list/tuple, or a single value
        :return: the formatted `Markup` object
        :rtype: `Markup`
        """
        if isinstance(args, dict):
            args = dict(zip(args.keys(), map(escape, args.values())))
        elif isinstance(args, (list, tuple)):
            args = tuple(map(escape, args))
        else:
            args = escape(args)
        return Markup(unicode.__mod__(self, args))

    def __mul__(self, num):
        """Return this markup repeated `num` times, as `Markup`."""
        return Markup(unicode.__mul__(self, num))
    __rmul__ = __mul__

    def __repr__(self):
        return "<%s %s>" % (type(self).__name__, unicode.__repr__(self))

    def join(self, seq, escape_quotes=True):
        """Return a `Markup` object which is the concatenation of the strings
        in the given sequence, where this `Markup` object is the separator
        between the joined elements.

        Any element in the sequence that is not a `Markup` instance is
        automatically escaped.

        :param seq: the sequence of strings to join
        :param escape_quotes: whether double quote characters in the elements
                              should be escaped
        :return: the joined `Markup` object
        :rtype: `Markup`
        :see: `escape`
        """
        return Markup(unicode.join(self, [escape(item, quotes=escape_quotes)
                                          for item in seq]))

    @classmethod
    def escape(cls, text, quotes=True):
        """Create a Markup instance from a string and escape special characters
        it may contain (<, >, & and \").

        >>> escape('"1 < 2"')
        <Markup u'&#34;1 &lt; 2&#34;'>

        If the `quotes` parameter is set to `False`, the \" character is left
        as is. Escaping quotes is generally only required for strings that are
        to be used in attribute values.

        >>> escape('"1 < 2"', quotes=False)
        <Markup u'"1 &lt; 2"'>

        Any value that evaluates to false (such as `None` or the empty string)
        produces an empty instance. An instance of exactly this class is
        returned unchanged, and an object providing an ``__html__()`` method
        is converted using the result of that method without escaping. Other
        values that are not strings are converted to `unicode` before being
        escaped.

        :param text: the text to escape
        :param quotes: if ``True``, double quote characters are escaped in
                       addition to the other special characters
        :return: the escaped `Markup` string
        :rtype: `Markup`
        """
        if not text:
            return cls()
        if type(text) is cls:
            return text
        if hasattr(text, '__html__'):
            # Build the result with `cls` (not a hard-coded `Markup`) so
            # that subclasses get instances of their own type, like every
            # other return path of this method.
            return cls(text.__html__())

        # Values such as numbers have no `replace()` method; convert them
        # to text first so that e.g. ``Markup('%d items') % 5`` works.
        if not isinstance(text, basestring):
            text = unicode(text)

        # The ampersand must be replaced first, otherwise the entities
        # produced by the later replacements would be escaped again.
        text = text.replace('&', '&amp;') \
                   .replace('<', '&lt;') \
                   .replace('>', '&gt;')
        if quotes:
            text = text.replace('"', '&#34;')
        return cls(text)

    def unescape(self):
        """Reverse-escapes &, <, >, and \" and returns a `unicode` object.

        >>> Markup('1 &lt; 2').unescape()
        u'1 < 2'

        :return: the unescaped string
        :rtype: `unicode`
        :see: `genshi.core.unescape`
        """
        if not self:
            # Return a `unicode` object here as well, as documented.
            return u''
        # The ampersand is restored last so that text such as ``&amp;lt;``
        # becomes ``&lt;`` rather than ``<``.
        return unicode(self).replace('&#34;', '"') \
                            .replace('&gt;', '>') \
                            .replace('&lt;', '<') \
                            .replace('&amp;', '&')

    def stripentities(self, keepxmlentities=False):
        """Return a copy of the text with any character or numeric entities
        replaced by the equivalent UTF-8 characters.

        If the `keepxmlentities` parameter is provided and evaluates to `True`,
        the core XML entities (``&amp;``, ``&apos;``, ``&gt;``, ``&lt;`` and
        ``&quot;``) are not stripped.

        :param keepxmlentities: whether the core XML entities should be kept
        :return: a `Markup` instance with entities removed
        :rtype: `Markup`
        :see: `genshi.util.stripentities`
        """
        return Markup(stripentities(self, keepxmlentities=keepxmlentities))

    def striptags(self):
        """Return a copy of the text with all XML/HTML tags removed.

        :return: a `Markup` instance with all tags removed
        :rtype: `Markup`
        :see: `genshi.util.striptags`
        """
        return Markup(striptags(self))
2739
2740
try:
    # If the optional `genshi._speedups` module can be imported, its `Markup`
    # replaces the class defined above under the same module-level name.
    from genshi._speedups import Markup
except ImportError:
    pass # just use the Python implementation


# Module-level shortcut for `Markup.escape`, bound to whichever `Markup`
# implementation is in place after the import attempt above.
escape = Markup.escape
2748
2749
def unescape(text):
    """Undo the escaping of &, <, >, and \" in a `Markup` object.

    >>> unescape(Markup('1 &lt; 2'))
    u'1 < 2'

    Anything that is not a `Markup` instance is handed back untouched:

    >>> unescape('1 &lt; 2')
    '1 &lt; 2'

    :param text: the text to unescape
    :return: the unescaped string
    :rtype: `unicode`
    """
    if isinstance(text, Markup):
        return text.unescape()
    return text
2769
2770
class Namespace(object):
    """Helper for building and testing names that belong to an XML namespace.

    Namespace URIs are encoded in the `QName` of an element or attribute,
    with the URI enclosed in curly braces. Instances of this class make it
    convenient to produce such names and to check whether a given name
    belongs to the namespace.

    An instance is created from the namespace URI:

    >>> html = Namespace('http://www.w3.org/1999/xhtml')
    >>> html
    Namespace('http://www.w3.org/1999/xhtml')
    >>> html.uri
    u'http://www.w3.org/1999/xhtml'

    Attribute access on the instance then yields `QName` objects in that
    namespace:

    >>> html.body
    QName('http://www.w3.org/1999/xhtml}body')
    >>> html.body.localname
    u'body'
    >>> html.body.namespace
    u'http://www.w3.org/1999/xhtml'

    Item access does the same, which helps with element or attribute names
    that are not valid Python identifiers:

    >>> html['body']
    QName('http://www.w3.org/1999/xhtml}body')

    The ``in`` operator tests whether a specific `QName` belongs to the
    namespace:

    >>> qname = html.body
    >>> qname in html
    True
    >>> qname in Namespace('http://www.w3.org/2002/06/xhtml2')
    False
    """
    def __new__(cls, uri):
        # An object that is already exactly of this class is reused rather
        # than wrapped in a new instance.
        if type(uri) is not cls:
            return object.__new__(cls)
        return uri

    def __init__(self, uri):
        self.uri = unicode(uri)

    # Pickling support: the URI is both the constructor argument and the
    # complete instance state.
    def __getnewargs__(self):
        return (self.uri,)

    def __getstate__(self):
        return self.uri

    def __setstate__(self, uri):
        self.uri = uri

    def __contains__(self, qname):
        return qname.namespace == self.uri

    def __eq__(self, other):
        # Compare by URI, either against another namespace or against the
        # other object directly (e.g. a plain string).
        if not isinstance(other, Namespace):
            return self.uri == other
        return self.uri == other.uri

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(self.uri)

    def __getitem__(self, name):
        qualified = self.uri + '}' + name
        return QName(qualified)
    # Attribute access for names not found otherwise builds a `QName` too.
    __getattr__ = __getitem__

    def __repr__(self):
        return 'Namespace(%s)' % stringrepr(self.uri)

    def __str__(self):
        return self.uri.encode('utf-8')

    def __unicode__(self):
        return self.uri
2854
2855
# The namespace used by attributes such as xml:lang and xml:space, made
# available as a `Namespace` instance.
XML_NAMESPACE = Namespace('http://www.w3.org/XML/1998/namespace')
2858
2859
class QName(unicode):
    """A qualified element or attribute name.

    The unicode value of instances of this class contains the qualified name of
    the element or attribute, in the form ``{namespace-uri}local-name``. The
    namespace URI can be obtained through the additional `namespace` attribute,
    while the local name can be accessed through the `localname` attribute.

    >>> qname = QName('foo')
    >>> qname
    QName('foo')
    >>> qname.localname
    u'foo'
    >>> qname.namespace

    >>> qname = QName('http://www.w3.org/1999/xhtml}body')
    >>> qname
    QName('http://www.w3.org/1999/xhtml}body')
    >>> qname.localname
    u'body'
    >>> qname.namespace
    u'http://www.w3.org/1999/xhtml'
    """
    __slots__ = ['namespace', 'localname']

    def __new__(cls, qname):
        """Create the `QName` instance.

        An object that is already exactly of this class is returned as is.

        :param qname: the qualified name as a string of the form
                      ``{namespace-uri}local-name``, where the leading curly
                      brace is optional
        :return: the `QName` instance, with `namespace` set to `None` when
                 the name is not namespace-qualified
        """
        if type(qname) is cls:
            return qname

        # Strip the optional leading brace once, up front. Prepending '{' to
        # the unstripped input would store '{{uri}name' whenever the caller
        # already supplied the brace.
        qname = qname.lstrip('{')
        parts = qname.split('}', 1)
        if len(parts) > 1:
            self = unicode.__new__(cls, '{%s' % qname)
            self.namespace, self.localname = map(unicode, parts)
        else:
            self = unicode.__new__(cls, qname)
            self.namespace, self.localname = None, unicode(qname)
        return self

    def __getnewargs__(self):
        # Pickling support: the brace-less form round-trips through __new__.
        return (self.lstrip('{'),)

    def __repr__(self):
        return 'QName(%s)' % stringrepr(self.lstrip('{'))
Copyright (C) 2012-2017 Edgewall Software