# HG changeset patch # User osimons # Date 1250027228 0 # Node ID 0a9f2af1f711aa8853de07f31af74996971e0617 # Parent 73ed8c171063d7593cb8cd70075e8367dea356f8 0.6dev: Adding support for unicode/non-ascii input and output. UTF-8 is default, and it strips non-allowed XML characters. International characters should now be supported from recipe entry and through to slave execution and reporting. Closes #119. diff --git a/bitten/build/api.py b/bitten/build/api.py --- a/bitten/build/api.py +++ b/bitten/build/api.py @@ -15,6 +15,7 @@ import os import shlex import time +import sys log = logging.getLogger('bitten.build.api') @@ -45,6 +46,20 @@ break yield tuple(to_yield) +def _encode(text): + """Encode input for call. Input must be unicode or utf-8 string.""" + if not isinstance(text, unicode): + text = unicode(text, 'utf-8') + return text.encode( + sys.getfilesystemencoding() or sys.stdin.encoding, 'replace') + +def _decode(text): + """Decode output from call.""" + try: + return text.decode('utf-8') + except UnicodeDecodeError: + return text.decode(sys.stdout.encoding, 'replace') + class CommandLine(object): """Simple helper for executing subprocesses.""" @@ -60,7 +75,7 @@ command """ self.executable = executable - self.arguments = [str(arg) for arg in args] + self.arguments = [_encode(arg) for arg in args] self.input = input self.cwd = cwd if self.cwd: @@ -130,8 +145,10 @@ os.chdir(old_cwd) for out_line, err_line in _combine(out_lines, err_lines): - yield out_line and out_line.rstrip().replace('\x00', ''), \ - err_line and err_line.rstrip().replace('\x00', '') + yield out_line and _decode( + out_line.rstrip().replace('\x00', '')), \ + err_line and _decode( + err_line.rstrip().replace('\x00', '')) else: # posix @@ -191,7 +208,8 @@ out_lines = self._extract_lines(out_data) err_lines = self._extract_lines(err_data) for out_line, err_line in _combine(out_lines, err_lines): - yield out_line, err_line + yield out_line and _decode(out_line), \ + err_line and _decode(err_line) 
time.sleep(.1) self.returncode = pipe.wait() log.debug('%s exited with code %s', self.executable, diff --git a/bitten/util/tests/xmlio.py b/bitten/util/tests/xmlio.py --- a/bitten/util/tests/xmlio.py +++ b/bitten/util/tests/xmlio.py @@ -23,6 +23,43 @@ x = xmlio.parse(s) assert x.name == "build" + def test_ParsedElement_encoding(self): + u = u'' + s = '' + self.assertEquals(u, s.decode('utf-8')) + # unicode input + x = xmlio.parse(u) + out_u = str(x) + self.assertEquals(out_u, s) + self.assertEquals(out_u.decode('utf-8'), u) + # utf-8 input + x = xmlio.parse(s) + out_s = str(x) + self.assertEquals(out_s, s) + self.assertEquals(out_s.decode('utf-8'), u) + # identical results + self.assertEquals(out_u, out_s) + + def test_escape_text(self): + # unicode + self.assertEquals(u"Me &amp; you!", + xmlio._escape_text(u"Me\x01 & you\x86!")) + # str + self.assertEquals("Me &amp; you!", + xmlio._escape_text("Me\x01 & you\x86!")) + # not basestring + self.assertEquals(42, xmlio._escape_text(42)) + + def test_escape_attr(self): + # unicode + self.assertEquals(u'&quot;Me &amp; you!&quot;', + xmlio._escape_attr(u'"Me\x01 & you\x86!"')) + # str + self.assertEquals('&quot;Me &amp; you!&quot;', + xmlio._escape_attr('"Me\x01 & you\x86!"')) + # not basestring + self.assertEquals(42, xmlio._escape_text(42)) + def suite(): suite = unittest.TestSuite() suite.addTest(unittest.makeSuite(XMLIOTestCase, 'test')) diff --git a/bitten/util/xmlio.py b/bitten/util/xmlio.py --- a/bitten/util/xmlio.py +++ b/bitten/util/xmlio.py @@ -23,28 +23,50 @@ import cgi import string -__trans = string.maketrans ("", "") -__todel = "" -for c in range (0, 256): - c1 = chr (c) - if not c1 in string.printable: - __todel += c1 -del c, c1 - __all__ = ['Fragment', 'Element', 'ParsedElement', 'parse'] __docformat__ = 'restructuredtext en' +def _from_utf8(text): + """Convert utf-8 string to unicode. 
All other input returned as-is.""" + if isinstance(text, str): + return text.decode('utf-8') + else: + return text + +def _to_utf8(text): + """Convert any input to utf-8 byte string.""" + if isinstance(text, str): + return text # presumes utf-8 + elif not isinstance(text, unicode): + text = unicode(text) + return text.encode('utf-8') + +__trans = string.maketrans('', '') +# http://www.w3.org/TR/xml11/#charsets (partial) +__todel = ('\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x12\x13\x14' + '\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83' + '\x84\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94' + '\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f') +__uni_trans = dict([(ord(c), None) for c in __todel]) + def _escape_text(text): """Escape special characters in the provided text so that it can be safely included in XML text nodes. """ - return cgi.escape (str(text)).translate (__trans, __todel) + if isinstance(text, str): + text = cgi.escape(text.translate(__trans, __todel)) + elif isinstance(text, unicode): + text = cgi.escape(text.translate(__uni_trans)) + return text def _escape_attr(attr): """Escape special characters in the provided text so that it can be safely included in XML attribute values. 
""" - return _escape_text(attr).replace('"', '&quot;') + if isinstance(attr, basestring): + return _escape_text(attr).replace('"', '&quot;') + else: + return attr class Fragment(object): @@ -76,7 +98,10 @@ elif isinstance(node, Fragment): self.children += node.children elif node is not None and node != '': - self.children.append(str(node)) + if isinstance(node, basestring): + self.children.append(_from_utf8(node)) + else: + self.children.append(unicode(node)) def write(self, out, newlines=False): """Serializes the element and writes the XML to the given output @@ -87,9 +112,9 @@ child.write(out, newlines=newlines) else: if child.startswith('<'): - out.write('') + out.write('') else: - out.write(_escape_text(child)) + out.write(_to_utf8(_escape_text(child))) class Element(Fragment): @@ -109,7 +134,7 @@ >>> print Element('foo', bar='1 < 2') >>> print Element('foo', bar='"baz"') - + The order in which attributes are rendered is undefined. @@ -141,6 +166,12 @@ >>> print Element('foo')[''] ]]> + + Valid input are utf-8 or unicode strings, or any type easily converted + to unicode such as integers. Output is always utf-8: + + >>> print str(Element(u'\xf8\xfc', arg=u'\xe9\u20ac'.encode('utf-8'))) + <\xc3\xb8\xc3\xbc arg="\xc3\xa9\xe2\x82\xac"/> """ __slots__ = ['name', 'attr'] @@ -151,8 +182,9 @@ keyword arguments following it are handled as attributes of the element. """ Fragment.__init__(self) - self.name = name_ - self.attr = dict([(name, value) for name, value in attr.items() + self.name = _from_utf8(name_) + self.attr = dict([(_from_utf8(name), _from_utf8(value)) \ + for name, value in attr.items() \ if value is not None]) def write(self, out, newlines=False): @@ -160,13 +192,13 @@ stream. 
""" out.write('<') - out.write(self.name) + out.write(_to_utf8(self.name)) for name, value in self.attr.items(): - out.write(' %s="%s"' % (name, _escape_attr(value))) + out.write(_to_utf8(' %s="%s"' % (name, _escape_attr(value)))) if self.children: out.write('>') Fragment.write(self, out, newlines) - out.write('') + out.write('') else: out.write('/>') if newlines: @@ -186,8 +218,8 @@ from xml.dom import minidom from xml.parsers import expat try: - if isinstance(text_or_file, (str, unicode)): - dom = minidom.parseString(text_or_file) + if isinstance(text_or_file, basestring): + dom = minidom.parseString(_to_utf8(text_or_file)) else: dom = minidom.parse(text_or_file) return ParsedElement(dom.documentElement) @@ -248,6 +280,9 @@ >>> xml = parse('foo ]]>baz') >>> xml.gettext() 'foo baz' + + Valid input are utf-8 or unicode strings, or any type easily converted + to unicode such as integers. Output is always utf-8. """ __slots__ = ['_node', 'attr'] @@ -260,13 +295,13 @@ attr = self._node.getAttributeNode(name) if not attr: raise KeyError(name) - return attr.value.encode('utf-8') + return _to_utf8(attr.value) def __setitem__(self, name, value): self._node.setAttribute(name, value) def __delitem__(self, name): self._node.removeAttribute(name) def keys(self): - return [key.encode('utf-8') for key in self._node.attributes.keys()] + return [_to_utf8(key) for key in self._node.attributes.keys()] def __init__(self, node): self._node = node @@ -296,7 +331,7 @@ This concatenates the values of all text and CDATA nodes that are immediate children of this element. """ - return ''.join([c.nodeValue.encode('utf-8') + return ''.join([_to_utf8(c.nodeValue) for c in self._node.childNodes if c.nodeType in (3, 4)]) @@ -304,7 +339,8 @@ """Serializes the element and writes the XML to the given output stream. 
""" - self._node.writexml(out, newl=newlines and '\n' or '') + out.write(self._node.toprettyxml(newl=newlines and '\n' or '', + indent=newlines and '\t' or '', encoding='utf-8')) def __str__(self): """Return a string representation of the XML element."""