Mercurial > babel > old > babel-test
comparison scripts/import_cldr.py @ 379:1c0915da48c6 stable-0.9.x
Ported [407:415/trunk] back to 0.9.x branch.
author | cmlenz |
---|---|
date | Tue, 08 Jul 2008 21:01:28 +0000 |
parents | faf0ead3a132 |
children | cd8761c6f1a6 |
comparison legend (row states): equal | deleted | inserted | replaced
369:c2ae38340540 | 379:1c0915da48c6 |
---|---|
14 | 14 |
15 import copy | 15 import copy |
16 from optparse import OptionParser | 16 from optparse import OptionParser |
17 import os | 17 import os |
18 import pickle | 18 import pickle |
19 import re | |
19 import sys | 20 import sys |
20 try: | 21 try: |
21 from xml.etree.ElementTree import parse | 22 from xml.etree.ElementTree import parse |
22 except ImportError: | 23 except ImportError: |
23 from elementtree.ElementTree import parse | 24 from elementtree.ElementTree import parse |
24 | 25 |
25 # Make sure we're using Babel source, and not some previously installed version | 26 # Make sure we're using Babel source, and not some previously installed version |
26 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..')) | 27 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..')) |
27 | 28 |
28 from babel import dates, numbers | 29 from babel import dates, numbers |
30 from babel.localedata import Alias | |
29 | 31 |
30 weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, | 32 weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, |
31 'sun': 6} | 33 'sun': 6} |
32 | 34 |
33 try: | 35 try: |
34 any | 36 any |
35 except NameError: | 37 except NameError: |
36 def any(iterable): | 38 def any(iterable): |
37 return filter(None, list(iterable)) | 39 return filter(None, list(iterable)) |
40 | |
38 | 41 |
39 def _text(elem): | 42 def _text(elem): |
40 buf = [elem.text or ''] | 43 buf = [elem.text or ''] |
41 for child in elem: | 44 for child in elem: |
42 buf.append(_text(child)) | 45 buf.append(_text(child)) |
43 buf.append(elem.tail or '') | 46 buf.append(elem.tail or '') |
44 return u''.join(filter(None, buf)).strip() | 47 return u''.join(filter(None, buf)).strip() |
48 | |
49 | |
50 NAME_RE = re.compile(r"^\w+$") | |
51 TYPE_ATTR_RE = re.compile(r"^\w+\[@type='(.*?)'\]$") | |
52 | |
53 NAME_MAP = { | |
54 'dateFormats': 'date_formats', | |
55 'dateTimeFormats': 'datetime_formats', | |
56 'eraAbbr': 'abbreviated', | |
57 'eraNames': 'wide', | |
58 'eraNarrow': 'narrow', | |
59 'timeFormats': 'time_formats' | |
60 } | |
61 | |
62 def _translate_alias(ctxt, path): | |
63 parts = path.split('/') | |
64 keys = ctxt[:] | |
65 for part in parts: | |
66 if part == '..': | |
67 keys.pop() | |
68 else: | |
69 match = TYPE_ATTR_RE.match(part) | |
70 if match: | |
71 keys.append(match.group(1)) | |
72 else: | |
73 assert NAME_RE.match(part) | |
74 keys.append(NAME_MAP.get(part, part)) | |
75 return keys | |
76 | |
45 | 77 |
46 def main(): | 78 def main(): |
47 parser = OptionParser(usage='%prog path/to/cldr') | 79 parser = OptionParser(usage='%prog path/to/cldr') |
48 options, args = parser.parse_args() | 80 options, args = parser.parse_args() |
49 if len(args) != 1: | 81 if len(args) != 1: |
107 for filename in filenames: | 139 for filename in filenames: |
108 print>>sys.stderr, 'Processing input file %r' % filename | 140 print>>sys.stderr, 'Processing input file %r' % filename |
109 stem, ext = os.path.splitext(filename) | 141 stem, ext = os.path.splitext(filename) |
110 if ext != '.xml': | 142 if ext != '.xml': |
111 continue | 143 continue |
144 #if stem != 'root': | |
145 # break | |
112 | 146 |
113 tree = parse(os.path.join(srcdir, 'main', filename)) | 147 tree = parse(os.path.join(srcdir, 'main', filename)) |
114 data = {} | 148 data = {} |
115 | 149 |
116 language = None | 150 language = None |
131 | 165 |
132 # <localeDisplayNames> | 166 # <localeDisplayNames> |
133 | 167 |
134 territories = data.setdefault('territories', {}) | 168 territories = data.setdefault('territories', {}) |
135 for elem in tree.findall('//territories/territory'): | 169 for elem in tree.findall('//territories/territory'): |
136 if 'draft' in elem.attrib and elem.attrib['type'] in territories: | 170 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
171 and elem.attrib['type'] in territories: | |
137 continue | 172 continue |
138 territories[elem.attrib['type']] = _text(elem) | 173 territories[elem.attrib['type']] = _text(elem) |
139 | 174 |
140 languages = data.setdefault('languages', {}) | 175 languages = data.setdefault('languages', {}) |
141 for elem in tree.findall('//languages/language'): | 176 for elem in tree.findall('//languages/language'): |
142 if 'draft' in elem.attrib and elem.attrib['type'] in languages: | 177 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
178 and elem.attrib['type'] in languages: | |
143 continue | 179 continue |
144 languages[elem.attrib['type']] = _text(elem) | 180 languages[elem.attrib['type']] = _text(elem) |
145 | 181 |
146 variants = data.setdefault('variants', {}) | 182 variants = data.setdefault('variants', {}) |
147 for elem in tree.findall('//variants/variant'): | 183 for elem in tree.findall('//variants/variant'): |
148 if 'draft' in elem.attrib and elem.attrib['type'] in variants: | 184 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
185 and elem.attrib['type'] in variants: | |
149 continue | 186 continue |
150 variants[elem.attrib['type']] = _text(elem) | 187 variants[elem.attrib['type']] = _text(elem) |
151 | 188 |
152 scripts = data.setdefault('scripts', {}) | 189 scripts = data.setdefault('scripts', {}) |
153 for elem in tree.findall('//scripts/script'): | 190 for elem in tree.findall('//scripts/script'): |
154 if 'draft' in elem.attrib and elem.attrib['type'] in scripts: | 191 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
192 and elem.attrib['type'] in scripts: | |
155 continue | 193 continue |
156 scripts[elem.attrib['type']] = _text(elem) | 194 scripts[elem.attrib['type']] = _text(elem) |
157 | 195 |
158 # <dates> | 196 # <dates> |
159 | 197 |
180 if territory in territories or any([r in territories for r in regions]): | 218 if territory in territories or any([r in territories for r in regions]): |
181 week_data['weekend_end'] = weekdays[elem.attrib['day']] | 219 week_data['weekend_end'] = weekdays[elem.attrib['day']] |
182 | 220 |
183 zone_formats = data.setdefault('zone_formats', {}) | 221 zone_formats = data.setdefault('zone_formats', {}) |
184 for elem in tree.findall('//timeZoneNames/gmtFormat'): | 222 for elem in tree.findall('//timeZoneNames/gmtFormat'): |
185 if 'draft' not in elem.attrib: | 223 if 'draft' not in elem.attrib and 'alt' not in elem.attrib: |
186 zone_formats['gmt'] = unicode(elem.text).replace('{0}', '%s') | 224 zone_formats['gmt'] = unicode(elem.text).replace('{0}', '%s') |
187 break | 225 break |
188 for elem in tree.findall('//timeZoneNames/regionFormat'): | 226 for elem in tree.findall('//timeZoneNames/regionFormat'): |
189 if 'draft' not in elem.attrib: | 227 if 'draft' not in elem.attrib and 'alt' not in elem.attrib: |
190 zone_formats['region'] = unicode(elem.text).replace('{0}', '%s') | 228 zone_formats['region'] = unicode(elem.text).replace('{0}', '%s') |
191 break | 229 break |
192 for elem in tree.findall('//timeZoneNames/fallbackFormat'): | 230 for elem in tree.findall('//timeZoneNames/fallbackFormat'): |
193 if 'draft' not in elem.attrib: | 231 if 'draft' not in elem.attrib and 'alt' not in elem.attrib: |
194 zone_formats['fallback'] = unicode(elem.text) \ | 232 zone_formats['fallback'] = unicode(elem.text) \ |
195 .replace('{0}', '%(0)s').replace('{1}', '%(1)s') | 233 .replace('{0}', '%(0)s').replace('{1}', '%(1)s') |
196 break | 234 break |
197 | 235 |
198 time_zones = data.setdefault('time_zones', {}) | 236 time_zones = data.setdefault('time_zones', {}) |
225 # TODO: support other calendar types | 263 # TODO: support other calendar types |
226 continue | 264 continue |
227 | 265 |
228 months = data.setdefault('months', {}) | 266 months = data.setdefault('months', {}) |
229 for ctxt in calendar.findall('months/monthContext'): | 267 for ctxt in calendar.findall('months/monthContext'): |
230 ctxts = months.setdefault(ctxt.attrib['type'], {}) | 268 ctxt_type = ctxt.attrib['type'] |
269 ctxts = months.setdefault(ctxt_type, {}) | |
231 for width in ctxt.findall('monthWidth'): | 270 for width in ctxt.findall('monthWidth'): |
232 widths = ctxts.setdefault(width.attrib['type'], {}) | 271 width_type = width.attrib['type'] |
233 for elem in width.findall('month'): | 272 widths = ctxts.setdefault(width_type, {}) |
234 if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: | 273 for elem in width.getiterator(): |
274 if elem.tag == 'month': | |
275 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ | |
276 and int(elem.attrib['type']) in widths: | |
277 continue | |
278 widths[int(elem.attrib.get('type'))] = unicode(elem.text) | |
279 elif elem.tag == 'alias': | |
280 ctxts[width_type] = Alias( | |
281 _translate_alias(['months', ctxt_type, width_type], | |
282 elem.attrib['path']) | |
283 ) | |
284 | |
285 days = data.setdefault('days', {}) | |
286 for ctxt in calendar.findall('days/dayContext'): | |
287 ctxt_type = ctxt.attrib['type'] | |
288 ctxts = days.setdefault(ctxt_type, {}) | |
289 for width in ctxt.findall('dayWidth'): | |
290 width_type = width.attrib['type'] | |
291 widths = ctxts.setdefault(width_type, {}) | |
292 for elem in width.getiterator(): | |
293 if elem.tag == 'day': | |
294 dtype = weekdays[elem.attrib['type']] | |
295 if ('draft' in elem.attrib or 'alt' not in elem.attrib) \ | |
296 and dtype in widths: | |
297 continue | |
298 widths[dtype] = unicode(elem.text) | |
299 elif elem.tag == 'alias': | |
300 ctxts[width_type] = Alias( | |
301 _translate_alias(['days', ctxt_type, width_type], | |
302 elem.attrib['path']) | |
303 ) | |
304 | |
305 quarters = data.setdefault('quarters', {}) | |
306 for ctxt in calendar.findall('quarters/quarterContext'): | |
307 ctxt_type = ctxt.attrib['type'] | |
308 ctxts = quarters.setdefault(ctxt.attrib['type'], {}) | |
309 for width in ctxt.findall('quarterWidth'): | |
310 width_type = width.attrib['type'] | |
311 widths = ctxts.setdefault(width_type, {}) | |
312 for elem in width.getiterator(): | |
313 if elem.tag == 'quarter': | |
314 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ | |
315 and int(elem.attrib['type']) in widths: | |
316 continue | |
317 widths[int(elem.attrib['type'])] = unicode(elem.text) | |
318 elif elem.tag == 'alias': | |
319 ctxts[width_type] = Alias( | |
320 _translate_alias(['quarters', ctxt_type, width_type], | |
321 elem.attrib['path']) | |
322 ) | |
323 | |
324 eras = data.setdefault('eras', {}) | |
325 for width in calendar.findall('eras/*'): | |
326 width_type = NAME_MAP[width.tag] | |
327 widths = eras.setdefault(width_type, {}) | |
328 for elem in width.getiterator(): | |
329 if elem.tag == 'era': | |
330 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ | |
331 and int(elem.attrib['type']) in widths: | |
235 continue | 332 continue |
236 widths[int(elem.attrib.get('type'))] = unicode(elem.text) | 333 widths[int(elem.attrib.get('type'))] = unicode(elem.text) |
237 | 334 elif elem.tag == 'alias': |
238 days = data.setdefault('days', {}) | 335 eras[width_type] = Alias( |
239 for ctxt in calendar.findall('days/dayContext'): | 336 _translate_alias(['eras', width_type], |
240 ctxts = days.setdefault(ctxt.attrib['type'], {}) | 337 elem.attrib['path']) |
241 for width in ctxt.findall('dayWidth'): | 338 ) |
242 widths = ctxts.setdefault(width.attrib['type'], {}) | |
243 for elem in width.findall('day'): | |
244 dtype = weekdays[elem.attrib['type']] | |
245 if 'draft' in elem.attrib and dtype in widths: | |
246 continue | |
247 widths[dtype] = unicode(elem.text) | |
248 | |
249 quarters = data.setdefault('quarters', {}) | |
250 for ctxt in calendar.findall('quarters/quarterContext'): | |
251 ctxts = quarters.setdefault(ctxt.attrib['type'], {}) | |
252 for width in ctxt.findall('quarterWidth'): | |
253 widths = ctxts.setdefault(width.attrib['type'], {}) | |
254 for elem in width.findall('quarter'): | |
255 if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: | |
256 continue | |
257 widths[int(elem.attrib.get('type'))] = unicode(elem.text) | |
258 | |
259 eras = data.setdefault('eras', {}) | |
260 for width in calendar.findall('eras/*'): | |
261 ewidth = { | |
262 'eraAbbr': 'abbreviated', | |
263 'eraNames': 'wide', | |
264 'eraNarrow': 'narrow', | |
265 }[width.tag] | |
266 widths = eras.setdefault(ewidth, {}) | |
267 for elem in width.findall('era'): | |
268 if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: | |
269 continue | |
270 widths[int(elem.attrib.get('type'))] = unicode(elem.text) | |
271 | 339 |
272 # AM/PM | 340 # AM/PM |
273 periods = data.setdefault('periods', {}) | 341 periods = data.setdefault('periods', {}) |
274 for elem in calendar.findall('am'): | 342 for elem in calendar.findall('am'): |
275 if 'draft' in elem.attrib and elem.tag in periods: | 343 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
344 and elem.tag in periods: | |
276 continue | 345 continue |
277 periods[elem.tag] = unicode(elem.text) | 346 periods[elem.tag] = unicode(elem.text) |
278 for elem in calendar.findall('pm'): | 347 for elem in calendar.findall('pm'): |
279 if 'draft' in elem.attrib and elem.tag in periods: | 348 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
349 and elem.tag in periods: | |
280 continue | 350 continue |
281 periods[elem.tag] = unicode(elem.text) | 351 periods[elem.tag] = unicode(elem.text) |
282 | 352 |
283 date_formats = data.setdefault('date_formats', {}) | 353 date_formats = data.setdefault('date_formats', {}) |
284 for elem in calendar.findall('dateFormats/dateFormatLength'): | 354 for format in calendar.findall('dateFormats'): |
285 if 'draft' in elem.attrib and elem.attrib.get('type') in date_formats: | 355 for elem in format.getiterator(): |
286 continue | 356 if elem.tag == 'dateFormatLength': |
287 try: | 357 if 'draft' in elem.attrib and \ |
288 date_formats[elem.attrib.get('type')] = \ | 358 elem.attrib.get('type') in date_formats: |
289 dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern'))) | 359 continue |
290 except ValueError, e: | 360 try: |
291 print>>sys.stderr, 'ERROR: %s' % e | 361 date_formats[elem.attrib.get('type')] = \ |
362 dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern'))) | |
363 except ValueError, e: | |
364 print>>sys.stderr, 'ERROR: %s' % e | |
365 elif elem.tag == 'alias': | |
366 date_formats = Alias(_translate_alias( | |
367 ['date_formats'], elem.attrib['path']) | |
368 ) | |
292 | 369 |
293 time_formats = data.setdefault('time_formats', {}) | 370 time_formats = data.setdefault('time_formats', {}) |
294 for elem in calendar.findall('timeFormats/timeFormatLength'): | 371 for format in calendar.findall('timeFormats'): |
295 if 'draft' in elem.attrib and elem.attrib.get('type') in time_formats: | 372 for elem in format.getiterator(): |
296 continue | 373 if elem.tag == 'timeFormatLength': |
297 try: | 374 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
298 time_formats[elem.attrib.get('type')] = \ | 375 and elem.attrib.get('type') in time_formats: |
299 dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern'))) | 376 continue |
300 except ValueError, e: | 377 try: |
301 print>>sys.stderr, 'ERROR: %s' % e | 378 time_formats[elem.attrib.get('type')] = \ |
379 dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern'))) | |
380 except ValueError, e: | |
381 print>>sys.stderr, 'ERROR: %s' % e | |
382 elif elem.tag == 'alias': | |
383 time_formats = Alias(_translate_alias( | |
384 ['time_formats'], elem.attrib['path']) | |
385 ) | |
302 | 386 |
303 datetime_formats = data.setdefault('datetime_formats', {}) | 387 datetime_formats = data.setdefault('datetime_formats', {}) |
304 for elem in calendar.findall('dateTimeFormats/dateTimeFormatLength'): | 388 for format in calendar.findall('dateTimeFormats'): |
305 if 'draft' in elem.attrib and elem.attrib.get('type') in datetime_formats: | 389 for elem in format.getiterator(): |
306 continue | 390 if elem.tag == 'dateTimeFormatLength': |
307 try: | 391 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
308 datetime_formats[elem.attrib.get('type')] = \ | 392 and elem.attrib.get('type') in datetime_formats: |
309 unicode(elem.findtext('dateTimeFormat/pattern')) | 393 continue |
310 except ValueError, e: | 394 try: |
311 print>>sys.stderr, 'ERROR: %s' % e | 395 datetime_formats[elem.attrib.get('type')] = \ |
396 unicode(elem.findtext('dateTimeFormat/pattern')) | |
397 except ValueError, e: | |
398 print>>sys.stderr, 'ERROR: %s' % e | |
399 elif elem.tag == 'alias': | |
400 datetime_formats = Alias(_translate_alias( | |
401 ['datetime_formats'], elem.attrib['path']) | |
402 ) | |
312 | 403 |
313 # <numbers> | 404 # <numbers> |
314 | 405 |
315 number_symbols = data.setdefault('number_symbols', {}) | 406 number_symbols = data.setdefault('number_symbols', {}) |
316 for elem in tree.findall('//numbers/symbols/*'): | 407 for elem in tree.findall('//numbers/symbols/*'): |
317 number_symbols[elem.tag] = unicode(elem.text) | 408 number_symbols[elem.tag] = unicode(elem.text) |
318 | 409 |
319 decimal_formats = data.setdefault('decimal_formats', {}) | 410 decimal_formats = data.setdefault('decimal_formats', {}) |
320 for elem in tree.findall('//decimalFormats/decimalFormatLength'): | 411 for elem in tree.findall('//decimalFormats/decimalFormatLength'): |
321 if 'draft' in elem.attrib and elem.attrib.get('type') in decimal_formats: | 412 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
413 and elem.attrib.get('type') in decimal_formats: | |
322 continue | 414 continue |
323 pattern = unicode(elem.findtext('decimalFormat/pattern')) | 415 pattern = unicode(elem.findtext('decimalFormat/pattern')) |
324 decimal_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) | 416 decimal_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) |
325 | 417 |
326 scientific_formats = data.setdefault('scientific_formats', {}) | 418 scientific_formats = data.setdefault('scientific_formats', {}) |
327 for elem in tree.findall('//scientificFormats/scientificFormatLength'): | 419 for elem in tree.findall('//scientificFormats/scientificFormatLength'): |
328 if 'draft' in elem.attrib and elem.attrib.get('type') in scientific_formats: | 420 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
421 and elem.attrib.get('type') in scientific_formats: | |
329 continue | 422 continue |
330 pattern = unicode(elem.findtext('scientificFormat/pattern')) | 423 pattern = unicode(elem.findtext('scientificFormat/pattern')) |
331 scientific_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) | 424 scientific_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) |
332 | 425 |
333 currency_formats = data.setdefault('currency_formats', {}) | 426 currency_formats = data.setdefault('currency_formats', {}) |
334 for elem in tree.findall('//currencyFormats/currencyFormatLength'): | 427 for elem in tree.findall('//currencyFormats/currencyFormatLength'): |
335 if 'draft' in elem.attrib and elem.attrib.get('type') in currency_formats: | 428 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
429 and elem.attrib.get('type') in currency_formats: | |
336 continue | 430 continue |
337 pattern = unicode(elem.findtext('currencyFormat/pattern')) | 431 pattern = unicode(elem.findtext('currencyFormat/pattern')) |
338 currency_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) | 432 currency_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) |
339 | 433 |
340 percent_formats = data.setdefault('percent_formats', {}) | 434 percent_formats = data.setdefault('percent_formats', {}) |
341 for elem in tree.findall('//percentFormats/percentFormatLength'): | 435 for elem in tree.findall('//percentFormats/percentFormatLength'): |
342 if 'draft' in elem.attrib and elem.attrib.get('type') in percent_formats: | 436 if ('draft' in elem.attrib or 'alt' in elem.attrib) \ |
437 and elem.attrib.get('type') in percent_formats: | |
343 continue | 438 continue |
344 pattern = unicode(elem.findtext('percentFormat/pattern')) | 439 pattern = unicode(elem.findtext('percentFormat/pattern')) |
345 percent_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) | 440 percent_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) |
346 | 441 |
347 currency_names = data.setdefault('currency_names', {}) | 442 currency_names = data.setdefault('currency_names', {}) |
358 try: | 453 try: |
359 pickle.dump(data, outfile, 2) | 454 pickle.dump(data, outfile, 2) |
360 finally: | 455 finally: |
361 outfile.close() | 456 outfile.close() |
362 | 457 |
458 | |
363 if __name__ == '__main__': | 459 if __name__ == '__main__': |
364 main() | 460 main() |