Mercurial > babel > mirror
annotate babel/messages/mofile.py @ 334:1786dce4b1b0 trunk
Add basic MO file reading in preparation for #54.
author | cmlenz |
---|---|
date | Tue, 10 Jun 2008 17:05:52 +0000 |
parents | 465a0582d308 |
children | 4db404d0c19b |
rev | line source |
---|---|
160 | 1 # -*- coding: utf-8 -*- |
2 # | |
3 # Copyright (C) 2007 Edgewall Software | |
4 # All rights reserved. | |
5 # | |
6 # This software is licensed as described in the file COPYING, which | |
7 # you should have received as part of this distribution. The terms | |
8 # are also available at http://babel.edgewall.org/wiki/License. | |
9 # | |
10 # This software consists of voluntary contributions made by many | |
11 # individuals. For the exact contribution history, see the revision | |
12 # history and logs, available at http://babel.edgewall.org/log/. | |
13 | |
14 """Reading and writing of files in the ``gettext`` MO (machine object) format. | |
15 | |
234 | 16 :since: version 0.9 |
160 | 17 :see: `The Format of MO Files |
18 <http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>`_ | |
19 """ | |
20 | |
21 import array | |
22 import struct | |
23 | |
334 | 24 from babel.messages.catalog import Catalog, Message |
25 | |
26 __all__ = ['read_mo', 'write_mo'] | |
161 | 27 __docformat__ = 'restructuredtext en' |
28 | |
334 | 29 |
# Magic numbers identifying an MO file. The first four bytes of the file are
# always decoded as a little-endian unsigned 32-bit word: a result equal to
# LE_MAGIC means the rest of the file is little-endian, BE_MAGIC (the same
# four bytes in reverse order) means it is big-endian.
#
# The literals deliberately carry no ``L`` suffix: integers are promoted to
# long automatically where needed, and the suffix is a syntax error on
# Python 3.
LE_MAGIC = 0x950412de
BE_MAGIC = 0xde120495
32 | |
def read_mo(fileobj):
    """Read a binary MO file from the given file-like object and return a
    corresponding `Catalog` object.

    :param fileobj: the file-like object to read the MO file from
    :return: a catalog object representing the parsed MO file
    :rtype: `Catalog`
    :raise IOError: if the file does not start with a known MO magic number,
                    or if an index entry points outside of the file

    :note: The implementation of this function is heavily based on the
           ``GNUTranslations._parse`` method of the ``gettext`` module in the
           standard library.
    """
    catalog = Catalog()
    headers = {}

    unpack = struct.unpack
    filename = getattr(fileobj, 'name', '')
    charset = None

    buf = fileobj.read()
    buflen = len(buf)

    # Parse the .mo file header, which consists of 5 32 bit words. The magic
    # number is always decoded as little endian; its value tells us the byte
    # order of everything that follows.
    magic = unpack('<I', buf[:4])[0]
    if magic == LE_MAGIC:
        version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
        ii = '<II'
    elif magic == BE_MAGIC:
        version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
        ii = '>II'
    else:
        raise IOError(0, 'Bad magic number', filename)

    # Now put all messages from the .mo file buffer into the catalog
    # dictionary
    for i in xrange(0, msgcount):
        # Each index entry is a (length, offset) pair; the original strings
        # and the translations are described by two parallel tables.
        mlen, moff = unpack(ii, buf[masteridx:masteridx + 8])
        mend = moff + mlen
        tlen, toff = unpack(ii, buf[transidx:transidx + 8])
        tend = toff + tlen
        if mend < buflen and tend < buflen:
            msg = buf[moff:mend]
            tmsg = buf[toff:tend]
        else:
            raise IOError(0, 'File is corrupt', filename)

        # See if we're looking at GNU .mo conventions for metadata
        if mlen == 0:
            # Catalog description: the translation of the empty msgid is a
            # block of "Key: value" header lines.
            lastkey = key = None
            for item in tmsg.splitlines():
                item = item.strip()
                if not item:
                    continue
                if ':' in item:
                    key, value = item.split(':', 1)
                    lastkey = key = key.strip().lower()
                    value = value.strip()
                    headers[key] = value
                    # Only pick up the charset when the parameter is actually
                    # present; a bare "text/plain" leaves `charset` untouched.
                    if key == 'content-type' and 'charset=' in value:
                        charset = value.split('charset=')[1]
                elif lastkey:
                    # Continuation of the previous header line. (This used to
                    # reference ``self._info``, which does not exist in this
                    # module-level function.)
                    headers[lastkey] += '\n' + item

        # Note: we unconditionally convert both msgids and msgstrs to
        # Unicode using the character encoding specified in the charset
        # parameter of the Content-Type header. The gettext documentation
        # strongly encourages msgids to be us-ascii, but some applications
        # require alternative encodings (e.g. Zope's ZCML and ZPT). For
        # traditional gettext applications, the msgid conversion will
        # cause no problems since us-ascii should always be a subset of
        # the charset encoding. We may want to fall back to 8-bit msgids
        # if the Unicode conversion fails.
        if '\x00' in msg:
            # Plural forms: singular/plural ids and the individual plural
            # translations are separated by NUL bytes.
            msg = msg.split('\x00')
            tmsg = tmsg.split('\x00')
            if charset:
                msg = [unicode(x, charset) for x in msg]
                tmsg = [unicode(x, charset) for x in tmsg]
        else:
            if charset:
                msg = unicode(msg, charset)
                tmsg = unicode(tmsg, charset)
        catalog[msg] = Message(msg, tmsg)

        # advance to next entry in the seek tables
        masteridx += 8
        transidx += 8

    catalog.mime_headers = headers.items()
    return catalog
126 | |
def write_mo(fileobj, catalog, use_fuzzy=False):
    """Write a catalog to the specified file-like object using the GNU MO file
    format.

    >>> from babel.messages import Catalog
    >>> from gettext import GNUTranslations
    >>> from StringIO import StringIO

    >>> catalog = Catalog(locale='en_US')
    >>> catalog.add('foo', 'Voh')
    >>> catalog.add((u'bar', u'baz'), (u'Bahr', u'Batz'))
    >>> catalog.add('fuz', 'Futz', flags=['fuzzy'])
    >>> catalog.add('Fizz', '')
    >>> catalog.add(('Fuzz', 'Fuzzes'), ('', ''))
    >>> buf = StringIO()

    >>> write_mo(buf, catalog)
    >>> buf.seek(0)
    >>> translations = GNUTranslations(fp=buf)
    >>> translations.ugettext('foo')
    u'Voh'
    >>> translations.ungettext('bar', 'baz', 1)
    u'Bahr'
    >>> translations.ungettext('bar', 'baz', 2)
    u'Batz'
    >>> translations.ugettext('fuz')
    u'fuz'
    >>> translations.ugettext('Fizz')
    u'Fizz'
    >>> translations.ugettext('Fuzz')
    u'Fuzz'
    >>> translations.ugettext('Fuzzes')
    u'Fuzzes'

    The header and index tables are always written as little-endian 32-bit
    words, independent of the byte order of the host.

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param use_fuzzy: whether translations marked as "fuzzy" should be included
                      in the output
    """
    messages = list(catalog)
    if not use_fuzzy:
        # The first entry is never filtered -- presumably it is the catalog
        # header message; confirm against `Catalog.__iter__`.
        messages[1:] = [m for m in messages[1:] if not m.fuzzy]
    messages.sort()

    charset = catalog.charset

    # Encoded, NUL-terminated ids and strings are collected in lists and
    # joined once at the end; the running lengths give each entry's offset
    # inside its table without repeatedly concatenating strings.
    id_chunks = []
    str_chunks = []
    ids_len = strs_len = 0
    offsets = []

    for message in messages:
        # For each string, we need size and file offset.  Each string is NUL
        # terminated; the NUL does not count into the size.
        if message.pluralizable:
            msgid = '\x00'.join([
                part.encode(charset) for part in message.id
            ])
            msgstrs = []
            for idx, string in enumerate(message.string):
                if not string:
                    # Untranslated plural form: fall back to the singular id
                    # for index 0 and to the plural id for every other index.
                    msgstrs.append(message.id[min(int(idx), 1)])
                else:
                    msgstrs.append(string)
            msgstr = '\x00'.join([
                part.encode(charset) for part in msgstrs
            ])
        else:
            msgid = message.id.encode(charset)
            if not message.string:
                # Untranslated message: the id doubles as the translation.
                msgstr = message.id.encode(charset)
            else:
                msgstr = message.string.encode(charset)
        offsets.append((ids_len, len(msgid), strs_len, len(msgstr)))
        id_chunks.append(msgid + '\x00')
        str_chunks.append(msgstr + '\x00')
        ids_len += len(msgid) + 1
        strs_len += len(msgstr) + 1

    ids = ''.join(id_chunks)
    strs = ''.join(str_chunks)

    # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
    # the keys start right after the index tables.
    keystart = 7 * 4 + 16 * len(messages)
    valuestart = keystart + len(ids)

    # The string table first has the list of keys, then the list of values.
    # Each entry has first the size of the string, then the file offset.
    koffsets = []
    voffsets = []
    for o1, l1, o2, l2 in offsets:
        koffsets += [l1, o1 + keystart]
        voffsets += [l2, o2 + valuestart]
    index = koffsets + voffsets

    header = struct.pack('<7I',
        LE_MAGIC,                   # magic
        0,                          # version
        len(messages),              # number of entries
        7 * 4,                      # start of key index
        7 * 4 + len(messages) * 8,  # start of value index
        0, 0                        # size and offset of hash table
    )
    # Explicit '<' gives standard 4-byte words regardless of platform.
    index_table = struct.pack('<%dI' % len(index), *index)

    fileobj.write(header + index_table + ids + strs)