# HG changeset patch
# User osimons
# Date 1250027228 0
# Node ID 0a9f2af1f711aa8853de07f31af74996971e0617
# Parent 73ed8c171063d7593cb8cd70075e8367dea356f8
0.6dev: Adding support for unicode/non-ascii input and output. UTF-8 is the default, and non-allowed XML characters are stripped. International characters should now be supported from recipe entry through to slave execution and reporting.
Closes #119.
diff --git a/bitten/build/api.py b/bitten/build/api.py
--- a/bitten/build/api.py
+++ b/bitten/build/api.py
@@ -15,6 +15,7 @@
import os
import shlex
import time
+import sys
log = logging.getLogger('bitten.build.api')
@@ -45,6 +46,20 @@
break
yield tuple(to_yield)
+def _encode(text):
+ """Encode input for call. Input must be unicode or utf-8 string."""
+ if not isinstance(text, unicode):
+ text = unicode(text, 'utf-8')
+ return text.encode(
+ sys.getfilesystemencoding() or sys.stdin.encoding, 'replace')
+
+def _decode(text):
+ """Decode output from call."""
+ try:
+ return text.decode('utf-8')
+ except UnicodeDecodeError:
+ return text.decode(sys.stdout.encoding, 'replace')
+
class CommandLine(object):
"""Simple helper for executing subprocesses."""
@@ -60,7 +75,7 @@
command
"""
self.executable = executable
- self.arguments = [str(arg) for arg in args]
+ self.arguments = [_encode(arg) for arg in args]
self.input = input
self.cwd = cwd
if self.cwd:
@@ -130,8 +145,10 @@
os.chdir(old_cwd)
for out_line, err_line in _combine(out_lines, err_lines):
- yield out_line and out_line.rstrip().replace('\x00', ''), \
- err_line and err_line.rstrip().replace('\x00', '')
+ yield out_line and _decode(
+ out_line.rstrip().replace('\x00', '')), \
+ err_line and _decode(
+ err_line.rstrip().replace('\x00', ''))
else: # posix
@@ -191,7 +208,8 @@
out_lines = self._extract_lines(out_data)
err_lines = self._extract_lines(err_data)
for out_line, err_line in _combine(out_lines, err_lines):
- yield out_line, err_line
+ yield out_line and _decode(out_line), \
+ err_line and _decode(err_line)
time.sleep(.1)
self.returncode = pipe.wait()
log.debug('%s exited with code %s', self.executable,
diff --git a/bitten/util/tests/xmlio.py b/bitten/util/tests/xmlio.py
--- a/bitten/util/tests/xmlio.py
+++ b/bitten/util/tests/xmlio.py
@@ -23,6 +23,43 @@
x = xmlio.parse(s)
assert x.name == "build"
+ def test_ParsedElement_encoding(self):
+ u = u''
+ s = ''
+ self.assertEquals(u, s.decode('utf-8'))
+ # unicode input
+ x = xmlio.parse(u)
+ out_u = str(x)
+ self.assertEquals(out_u, s)
+ self.assertEquals(out_u.decode('utf-8'), u)
+ # utf-8 input
+ x = xmlio.parse(s)
+ out_s = str(x)
+ self.assertEquals(out_s, s)
+ self.assertEquals(out_s.decode('utf-8'), u)
+ # identical results
+ self.assertEquals(out_u, out_s)
+
+ def test_escape_text(self):
+ # unicode
+ self.assertEquals(u"Me & you!",
+ xmlio._escape_text(u"Me\x01 & you\x86!"))
+ # str
+ self.assertEquals("Me & you!",
+ xmlio._escape_text("Me\x01 & you\x86!"))
+ # not basestring
+ self.assertEquals(42, xmlio._escape_text(42))
+
+ def test_escape_attr(self):
+ # unicode
+ self.assertEquals(u'"Me & you!"',
+ xmlio._escape_attr(u'"Me\x01 & you\x86!"'))
+ # str
+ self.assertEquals('"Me & you!"',
+ xmlio._escape_attr('"Me\x01 & you\x86!"'))
+ # not basestring
+ self.assertEquals(42, xmlio._escape_text(42))
+
def suite():
suite = unittest.TestSuite()
suite.addTest(unittest.makeSuite(XMLIOTestCase, 'test'))
diff --git a/bitten/util/xmlio.py b/bitten/util/xmlio.py
--- a/bitten/util/xmlio.py
+++ b/bitten/util/xmlio.py
@@ -23,28 +23,50 @@
import cgi
import string
-__trans = string.maketrans ("", "")
-__todel = ""
-for c in range (0, 256):
- c1 = chr (c)
- if not c1 in string.printable:
- __todel += c1
-del c, c1
-
__all__ = ['Fragment', 'Element', 'ParsedElement', 'parse']
__docformat__ = 'restructuredtext en'
+def _from_utf8(text):
+ """Convert utf-8 string to unicode. All other input returned as-is."""
+ if isinstance(text, str):
+ return text.decode('utf-8')
+ else:
+ return text
+
+def _to_utf8(text):
+ """Convert any input to utf-8 byte string."""
+ if isinstance(text, str):
+ return text # presumes utf-8
+ elif not isinstance(text, unicode):
+ text = unicode(text)
+ return text.encode('utf-8')
+
+__trans = string.maketrans('', '')
+# http://www.w3.org/TR/xml11/#charsets (partial)
+__todel = ('\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x12\x13\x14'
+ '\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83'
+ '\x84\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94'
+ '\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f')
+__uni_trans = dict([(ord(c), None) for c in __todel])
+
def _escape_text(text):
"""Escape special characters in the provided text so that it can be safely
included in XML text nodes.
"""
- return cgi.escape (str(text)).translate (__trans, __todel)
+ if isinstance(text, str):
+ text = cgi.escape(text.translate(__trans, __todel))
+ elif isinstance(text, unicode):
+ text = cgi.escape(text.translate(__uni_trans))
+ return text
def _escape_attr(attr):
"""Escape special characters in the provided text so that it can be safely
included in XML attribute values.
"""
- return _escape_text(attr).replace('"', '"')
+ if isinstance(attr, basestring):
+ return _escape_text(attr).replace('"', '"')
+ else:
+ return attr
class Fragment(object):
@@ -76,7 +98,10 @@
elif isinstance(node, Fragment):
self.children += node.children
elif node is not None and node != '':
- self.children.append(str(node))
+ if isinstance(node, basestring):
+ self.children.append(_from_utf8(node))
+ else:
+ self.children.append(unicode(node))
def write(self, out, newlines=False):
"""Serializes the element and writes the XML to the given output
@@ -87,9 +112,9 @@
child.write(out, newlines=newlines)
else:
if child.startswith('<'):
- out.write('')
+ out.write('')
else:
- out.write(_escape_text(child))
+ out.write(_to_utf8(_escape_text(child)))
class Element(Fragment):
@@ -109,7 +134,7 @@
>>> print Element('foo', bar='1 < 2')
>>> print Element('foo', bar='"baz"')
-
+
The order in which attributes are rendered is undefined.
@@ -141,6 +166,12 @@
>>> print Element('foo')['']
]]>
+
+ Valid inputs are utf-8 or unicode strings, or any type easily converted
+ to unicode such as integers. Output is always utf-8:
+
+ >>> print str(Element(u'\xf8\xfc', arg=u'\xe9\u20ac'.encode('utf-8')))
+ <\xc3\xb8\xc3\xbc arg="\xc3\xa9\xe2\x82\xac"/>
"""
__slots__ = ['name', 'attr']
@@ -151,8 +182,9 @@
keyword arguments following it are handled as attributes of the element.
"""
Fragment.__init__(self)
- self.name = name_
- self.attr = dict([(name, value) for name, value in attr.items()
+ self.name = _from_utf8(name_)
+ self.attr = dict([(_from_utf8(name), _from_utf8(value)) \
+ for name, value in attr.items() \
if value is not None])
def write(self, out, newlines=False):
@@ -160,13 +192,13 @@
stream.
"""
out.write('<')
- out.write(self.name)
+ out.write(_to_utf8(self.name))
for name, value in self.attr.items():
- out.write(' %s="%s"' % (name, _escape_attr(value)))
+ out.write(_to_utf8(' %s="%s"' % (name, _escape_attr(value))))
if self.children:
out.write('>')
Fragment.write(self, out, newlines)
- out.write('' + self.name + '>')
+ out.write('' + _to_utf8(self.name) + '>')
else:
out.write('/>')
if newlines:
@@ -186,8 +218,8 @@
from xml.dom import minidom
from xml.parsers import expat
try:
- if isinstance(text_or_file, (str, unicode)):
- dom = minidom.parseString(text_or_file)
+ if isinstance(text_or_file, basestring):
+ dom = minidom.parseString(_to_utf8(text_or_file))
else:
dom = minidom.parse(text_or_file)
return ParsedElement(dom.documentElement)
@@ -248,6 +280,9 @@
>>> xml = parse('foo ]]>baz')
>>> xml.gettext()
'foo baz'
+
+ Valid inputs are utf-8 or unicode strings, or any type easily converted
+ to unicode such as integers. Output is always utf-8.
"""
__slots__ = ['_node', 'attr']
@@ -260,13 +295,13 @@
attr = self._node.getAttributeNode(name)
if not attr:
raise KeyError(name)
- return attr.value.encode('utf-8')
+ return _to_utf8(attr.value)
def __setitem__(self, name, value):
self._node.setAttribute(name, value)
def __delitem__(self, name):
self._node.removeAttribute(name)
def keys(self):
- return [key.encode('utf-8') for key in self._node.attributes.keys()]
+ return [_to_utf8(key) for key in self._node.attributes.keys()]
def __init__(self, node):
self._node = node
@@ -296,7 +331,7 @@
This concatenates the values of all text and CDATA nodes that are
immediate children of this element.
"""
- return ''.join([c.nodeValue.encode('utf-8')
+ return ''.join([_to_utf8(c.nodeValue)
for c in self._node.childNodes
if c.nodeType in (3, 4)])
@@ -304,7 +339,8 @@
"""Serializes the element and writes the XML to the given output
stream.
"""
- self._node.writexml(out, newl=newlines and '\n' or '')
+ out.write(self._node.toprettyxml(newl=newlines and '\n' or '',
+ indent=newlines and '\t' or '', encoding='utf-8'))
def __str__(self):
"""Return a string representation of the XML element."""