Mercurial > babel > old > mirror
annotate 0.9.x/scripts/import_cldr.py @ 348:05975a0e7021 stable
Merged revisions [358:360], [364:370], [373:378], [380:382] from [source:trunk].
author | cmlenz |
---|---|
date | Mon, 16 Jun 2008 12:48:43 +0000 |
parents | 5b7d3f9f7d74 |
children | 6a0e7205790f |
rev | line source |
---|---|
263 | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | |
3 # | |
4 # Copyright (C) 2007 Edgewall Software | |
5 # All rights reserved. | |
6 # | |
7 # This software is licensed as described in the file COPYING, which | |
8 # you should have received as part of this distribution. The terms | |
9 # are also available at http://babel.edgewall.org/wiki/License. | |
10 # | |
11 # This software consists of voluntary contributions made by many | |
12 # individuals. For the exact contribution history, see the revision | |
13 # history and logs, available at http://babel.edgewall.org/log/. | |
14 | |
15 import copy | |
16 from optparse import OptionParser | |
17 import os | |
18 import pickle | |
19 import sys | |
20 try: | |
21 from xml.etree.ElementTree import parse | |
22 except ImportError: | |
23 from elementtree.ElementTree import parse | |
24 | |
25 # Make sure we're using Babel source, and not some previously installed version | |
26 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..')) | |
27 | |
28 from babel import dates, numbers | |
29 | |
# Maps the three-letter day identifiers found in the CLDR XML to a
# zero-based day index, with Monday first.
weekdays = {
    'mon': 0,
    'tue': 1,
    'wed': 2,
    'thu': 3,
    'fri': 4,
    'sat': 5,
    'sun': 6,
}
32 | |
# ``any`` only became a builtin in Python 2.5; supply an equivalent for older
# interpreters so the rest of the script can rely on it.
try:
    any
except NameError:
    def any(iterable):
        """Return True if at least one item of ``iterable`` is true.

        Mirrors the builtin: stops at the first true item and always returns
        a real boolean (``False`` for an empty iterable).
        """
        for item in iterable:
            if item:
                return True
        return False
38 | |
39 def _text(elem): | |
40 buf = [elem.text or ''] | |
41 for child in elem: | |
42 buf.append(_text(child)) | |
43 buf.append(elem.tail or '') | |
44 return u''.join(filter(None, buf)).strip() | |
45 | |
def main():
    """Convert a CLDR source tree into Babel's pickled data files.

    Expects exactly one command-line argument, the path to the CLDR
    directory (the one containing ``supplemental`` and ``main``). Writes
    ``global.dat`` plus one ``localedata/<name>.dat`` per XML file found in
    ``main`` into the ``babel`` directory next to this script's directory.
    Progress messages and pattern parse errors are written to stderr.
    """
    def log(message):
        # One line on stderr, without relying on version-specific syntax.
        sys.stderr.write(message + '\n')

    def dump(obj, path):
        # Pickle ``obj`` to ``path`` with protocol 2, always closing the file.
        outfile = open(path, 'wb')
        try:
            pickle.dump(obj, outfile, 2)
        finally:
            outfile.close()

    def applies_to(elem, territory, parent_regions):
        # True if the element's ``territories`` list names the territory
        # itself or one of the regions that contain it.
        covered = elem.attrib['territories'].split()
        if territory in covered:
            return True
        return any([r in covered for r in parent_regions])

    def zone_names(elem):
        # Collect exemplar city and long/short names of a zone or metazone.
        info = {}
        city = elem.findtext('exemplarCity')
        if city:
            info['city'] = unicode(city)
        for form in ('long', 'short'):
            for child in elem.findall(form + '/*'):
                info.setdefault(form, {})[child.tag] = unicode(child.text)
        return info

    def numbered_names(calendar, target, context_path, width_tag, item_tag):
        # Fill ``target[context][width][number] = name`` for items whose
        # ``type`` attribute is an integer (months, quarters).
        for ctxt in calendar.findall(context_path):
            by_width = target.setdefault(ctxt.attrib['type'], {})
            for width in ctxt.findall(width_tag):
                names = by_width.setdefault(width.attrib['type'], {})
                for elem in width.findall(item_tag):
                    if 'draft' in elem.attrib:
                        # A draft never replaces an existing entry.
                        if int(elem.attrib['type']) in names:
                            continue
                    names[int(elem.attrib.get('type'))] = unicode(elem.text)

    parser = OptionParser(usage='%prog path/to/cldr')
    _options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('incorrect number of arguments')

    srcdir = args[0]
    script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    destdir = os.path.join(script_dir, '..', 'babel')

    sup = parse(os.path.join(srcdir, 'supplemental', 'supplementalData.xml'))

    # Import global data from the supplemental files
    global_data = {}

    territory_zones = global_data.setdefault('territory_zones', {})
    zone_aliases = global_data.setdefault('zone_aliases', {})
    zone_territories = global_data.setdefault('zone_territories', {})
    for elem in sup.findall('//timezoneData/zoneFormatting/zoneItem'):
        attrs = elem.attrib
        tzid = attrs['type']
        owner = attrs['territory']
        territory_zones.setdefault(owner, []).append(tzid)
        zone_territories[tzid] = owner
        for alias in attrs.get('aliases', '').split():
            zone_aliases[alias] = tzid

    # Import Metazone mapping
    meta_zones = global_data.setdefault('meta_zones', {})
    tzsup = parse(os.path.join(srcdir, 'supplemental', 'metazoneInfo.xml'))
    for zone in tzsup.findall('//timezone'):
        for usage in zone.findall('usesMetazone'):
            if 'to' in usage.attrib:
                # FIXME: support old mappings
                continue
            meta_zones[zone.attrib['type']] = usage.attrib['mzone']

    dump(global_data, os.path.join(destdir, 'global.dat'))

    # build a territory containment mapping for inheritance
    groups = {}
    for elem in sup.findall('//territoryContainment/group'):
        groups[elem.attrib['type']] = elem.attrib['contains'].split()

    # Resolve territory containment: every territory ends up with the set of
    # groups containing it, including whatever is already known to contain
    # those groups at the time they are visited.
    territory_containment = {}
    for group, members in sorted(groups.items()):
        for member in members:
            containers = territory_containment.setdefault(member, set())
            if group in territory_containment:
                containers.update(territory_containment[group])
            containers.add(group)

    # root.xml goes first; the remaining files follow ordered by the length
    # of their name.
    locale_files = os.listdir(os.path.join(srcdir, 'main'))
    locale_files.remove('root.xml')
    locale_files.sort(key=len)
    locale_files.insert(0, 'root.xml')

    era_widths = {
        'eraAbbr': 'abbreviated',
        'eraNames': 'wide',
        'eraNarrow': 'narrow',
    }

    for filename in locale_files:
        log('Processing input file %r' % filename)
        stem, ext = os.path.splitext(filename)
        if ext != '.xml':
            continue

        tree = parse(os.path.join(srcdir, 'main', filename))
        data = {}

        language = None
        elem = tree.find('//identity/language')
        if elem is not None:
            language = elem.attrib['type']
        log(' Language: %r' % language)

        elem = tree.find('//identity/territory')
        if elem is None:
            territory = '001' # world
        else:
            territory = elem.attrib['type']
        log(' Territory: %r' % territory)
        parent_regions = territory_containment.get(territory, [])
        log(' Regions: %r' % parent_regions)

        # <localeDisplayNames>

        for key, xpath in (
            ('territories', '//territories/territory'),
            ('languages', '//languages/language'),
            ('variants', '//variants/variant'),
            ('scripts', '//scripts/script'),
        ):
            names = data.setdefault(key, {})
            for elem in tree.findall(xpath):
                code = elem.attrib['type']
                # A draft never replaces an existing entry.
                if 'draft' in elem.attrib and code in names:
                    continue
                names[code] = _text(elem)

        # <dates>

        week_data = data.setdefault('week_data', {})
        supelem = sup.find('//weekData')

        for elem in supelem.findall('minDays'):
            if applies_to(elem, territory, parent_regions):
                week_data['min_days'] = int(elem.attrib['count'])

        for tag, key in (
            ('firstDay', 'first_day'),
            ('weekendStart', 'weekend_start'),
            ('weekendEnd', 'weekend_end'),
        ):
            for elem in supelem.findall(tag):
                if applies_to(elem, territory, parent_regions):
                    week_data[key] = weekdays[elem.attrib['day']]

        # Only the first non-draft element of each kind is used.
        zone_formats = data.setdefault('zone_formats', {})
        for key, xpath in (
            ('gmt', '//timeZoneNames/gmtFormat'),
            ('region', '//timeZoneNames/regionFormat'),
        ):
            for elem in tree.findall(xpath):
                if 'draft' in elem.attrib:
                    continue
                zone_formats[key] = unicode(elem.text).replace('{0}', '%s')
                break
        for elem in tree.findall('//timeZoneNames/fallbackFormat'):
            if 'draft' in elem.attrib:
                continue
            fallback = unicode(elem.text)
            fallback = fallback.replace('{0}', '%(0)s')
            zone_formats['fallback'] = fallback.replace('{1}', '%(1)s')
            break

        time_zones = data.setdefault('time_zones', {})
        for elem in tree.findall('//timeZoneNames/zone'):
            time_zones[elem.attrib['type']] = zone_names(elem)

        locale_meta_zones = data.setdefault('meta_zones', {})
        for elem in tree.findall('//timeZoneNames/metazone'):
            info = zone_names(elem)
            info['common'] = elem.findtext('commonlyUsed') == 'true'
            locale_meta_zones[elem.attrib['type']] = info

        for calendar in tree.findall('//calendars/calendar'):
            if calendar.attrib['type'] != 'gregorian':
                # TODO: support other calendar types
                continue

            months = data.setdefault('months', {})
            numbered_names(calendar, months, 'months/monthContext',
                           'monthWidth', 'month')

            days = data.setdefault('days', {})
            for ctxt in calendar.findall('days/dayContext'):
                by_width = days.setdefault(ctxt.attrib['type'], {})
                for width in ctxt.findall('dayWidth'):
                    names = by_width.setdefault(width.attrib['type'], {})
                    for elem in width.findall('day'):
                        day_index = weekdays[elem.attrib['type']]
                        if 'draft' in elem.attrib and day_index in names:
                            continue
                        names[day_index] = unicode(elem.text)

            quarters = data.setdefault('quarters', {})
            numbered_names(calendar, quarters, 'quarters/quarterContext',
                           'quarterWidth', 'quarter')

            eras = data.setdefault('eras', {})
            for width in calendar.findall('eras/*'):
                names = eras.setdefault(era_widths[width.tag], {})
                for elem in width.findall('era'):
                    if 'draft' in elem.attrib:
                        if int(elem.attrib['type']) in names:
                            continue
                    names[int(elem.attrib.get('type'))] = unicode(elem.text)

            # AM/PM
            periods = data.setdefault('periods', {})
            for tag in ('am', 'pm'):
                for elem in calendar.findall(tag):
                    if 'draft' in elem.attrib and elem.tag in periods:
                        continue
                    periods[elem.tag] = unicode(elem.text)

            for key, xpath, pattern_path in (
                ('date_formats', 'dateFormats/dateFormatLength',
                 'dateFormat/pattern'),
                ('time_formats', 'timeFormats/timeFormatLength',
                 'timeFormat/pattern'),
            ):
                formats = data.setdefault(key, {})
                for elem in calendar.findall(xpath):
                    length = elem.attrib.get('type')
                    if 'draft' in elem.attrib and length in formats:
                        continue
                    try:
                        pattern = unicode(elem.findtext(pattern_path))
                        formats[length] = dates.parse_pattern(pattern)
                    except ValueError:
                        # Report the problem and carry on with the next one.
                        log('ERROR: %s' % sys.exc_info()[1])

            datetime_formats = data.setdefault('datetime_formats', {})
            for elem in calendar.findall('dateTimeFormats/dateTimeFormatLength'):
                length = elem.attrib.get('type')
                if 'draft' in elem.attrib and length in datetime_formats:
                    continue
                try:
                    datetime_formats[length] = \
                        unicode(elem.findtext('dateTimeFormat/pattern'))
                except ValueError:
                    log('ERROR: %s' % sys.exc_info()[1])

        # <numbers>

        number_symbols = data.setdefault('number_symbols', {})
        for elem in tree.findall('//numbers/symbols/*'):
            number_symbols[elem.tag] = unicode(elem.text)

        for key, xpath, pattern_path in (
            ('decimal_formats', '//decimalFormats/decimalFormatLength',
             'decimalFormat/pattern'),
            ('scientific_formats', '//scientificFormats/scientificFormatLength',
             'scientificFormat/pattern'),
            ('currency_formats', '//currencyFormats/currencyFormatLength',
             'currencyFormat/pattern'),
            ('percent_formats', '//percentFormats/percentFormatLength',
             'percentFormat/pattern'),
        ):
            formats = data.setdefault(key, {})
            for elem in tree.findall(xpath):
                length = elem.attrib.get('type')
                if 'draft' in elem.attrib and length in formats:
                    continue
                pattern = unicode(elem.findtext(pattern_path))
                formats[length] = numbers.parse_pattern(pattern)

        currency_names = data.setdefault('currency_names', {})
        currency_symbols = data.setdefault('currency_symbols', {})
        for elem in tree.findall('//currencies/currency'):
            code = elem.attrib
            name = elem.findtext('displayName')
            if name:
                currency_names[code['type']] = unicode(name)
            symbol = elem.findtext('symbol')
            if symbol:
                currency_symbols[code['type']] = unicode(symbol)

        dump(data, os.path.join(destdir, 'localedata', stem + '.dat'))
362 | |
# Run the import only when this file is executed as a script.
if __name__ == '__main__':
    main()