Mercurial > babel > old > mirror
diff babel/messages/mofile.py @ 336:6e86b862af57
Add basic MO file reading in preparation for #54.
author | cmlenz |
---|---|
date | Tue, 10 Jun 2008 17:05:52 +0000 |
parents | a7dff175b14f |
children | 662d332c0a2b |
line wrap: on
line diff
--- a/babel/messages/mofile.py +++ b/babel/messages/mofile.py @@ -21,9 +21,109 @@ import array import struct -__all__ = ['write_mo'] +from babel.messages.catalog import Catalog, Message + +__all__ = ['read_mo', 'write_mo'] __docformat__ = 'restructuredtext en' + +LE_MAGIC = 0x950412deL +BE_MAGIC = 0xde120495L + +def read_mo(fileobj): + """Read a binary MO file from the given file-like object and return a + corresponding `Catalog` object. + + :param fileobj: the file-like object to read the MO file from + :return: a catalog object representing the parsed MO file + :rtype: `Catalog` + + :note: The implementation of this function is heavily based on the + ``GNUTranslations._parse`` method of the ``gettext`` module in the + standard library. + """ + catalog = Catalog() + headers = {} + + unpack = struct.unpack + filename = getattr(fileobj, 'name', '') + charset = None + + buf = fileobj.read() + buflen = len(buf) + + # Parse the .mo file header, which consists of 5 little endian 32 + # bit words. + magic = unpack('<I', buf[:4])[0] # Are we big endian or little endian? + if magic == LE_MAGIC: + version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20]) + ii = '<II' + elif magic == BE_MAGIC: + version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20]) + ii = '>II' + else: + raise IOError(0, 'Bad magic number', filename) + + # Now put all messages from the .mo file buffer into the catalog + # dictionary + for i in xrange(0, msgcount): + mlen, moff = unpack(ii, buf[masteridx:masteridx + 8]) + mend = moff + mlen + tlen, toff = unpack(ii, buf[transidx:transidx + 8]) + tend = toff + tlen + if mend < buflen and tend < buflen: + msg = buf[moff:mend] + tmsg = buf[toff:tend] + else: + raise IOError(0, 'File is corrupt', filename) + + # See if we're looking at GNU .mo conventions for metadata + if mlen == 0: + # Catalog description + lastkey = key = None + for item in tmsg.splitlines(): + item = item.strip() + if not item: + continue + if ':' in item: + key, value = item.split(':', 1) + lastkey = key = key.strip().lower() + value = value.strip() + headers[key] = value + if key == 'content-type': + charset = value.split('charset=')[1] + elif lastkey: + self._info[lastkey] += '\n' + item + + # Note: we unconditionally convert both msgids and msgstrs to + # Unicode using the character encoding specified in the charset + # parameter of the Content-Type header. The gettext documentation + # strongly encourages msgids to be us-ascii, but some appliations + # require alternative encodings (e.g. Zope's ZCML and ZPT). For + # traditional gettext applications, the msgid conversion will + # cause no problems since us-ascii should always be a subset of + # the charset encoding. We may want to fall back to 8-bit msgids + # if the Unicode conversion fails. + if '\x00' in msg: + # Plural forms + msg = msg.split('\x00') + tmsg = tmsg.split('\x00') + if charset: + msg = [unicode(x, charset) for x in msg] + tmsg = [unicode(x, charset) for x in tmsg] + else: + if charset: + msg = unicode(msg, charset) + tmsg = unicode(tmsg, charset) + catalog[msg] = Message(msg, tmsg) + + # advance to next entry in the seek tables + masteridx += 8 + transidx += 8 + + catalog.mime_headers = headers.items() + return catalog + def write_mo(fileobj, catalog, use_fuzzy=False): """Write a catalog to the specified file-like object using the GNU MO file format. @@ -112,7 +212,7 @@ offsets = koffsets + voffsets fileobj.write(struct.pack('Iiiiiii', - 0x950412deL, # magic + LE_MAGIC, # magic 0, # version len(messages), # number of entries 7 * 4, # start of key index