comparison scripts/import_cldr.py @ 379:1c0915da48c6 stable-0.9.x

Ported [407:415/trunk] back to 0.9.x branch.
author cmlenz
date Tue, 08 Jul 2008 21:01:28 +0000
parents faf0ead3a132
children cd8761c6f1a6
comparison
equal deleted inserted replaced
369:c2ae38340540 379:1c0915da48c6
14 14
15 import copy 15 import copy
16 from optparse import OptionParser 16 from optparse import OptionParser
17 import os 17 import os
18 import pickle 18 import pickle
19 import re
19 import sys 20 import sys
20 try: 21 try:
21 from xml.etree.ElementTree import parse 22 from xml.etree.ElementTree import parse
22 except ImportError: 23 except ImportError:
23 from elementtree.ElementTree import parse 24 from elementtree.ElementTree import parse
24 25
25 # Make sure we're using Babel source, and not some previously installed version 26 # Make sure we're using Babel source, and not some previously installed version
26 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..')) 27 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..'))
27 28
28 from babel import dates, numbers 29 from babel import dates, numbers
30 from babel.localedata import Alias
29 31
30 weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 32 weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5,
31 'sun': 6} 33 'sun': 6}
32 34
try:
    any
except NameError:
    # Fallback for interpreters that do not provide the ``any`` builtin.
    def any(iterable):
        """Return True if at least one item of ``iterable`` is truthy.

        Mirrors the builtin: the result is always a ``bool`` (never the
        filtered sequence), iteration stops at the first truthy item, and an
        empty iterable yields False.
        """
        for item in iterable:
            if item:
                return True
        return False
40
38 41
def _text(elem):
    """Collect the text of ``elem`` and its descendants as one string.

    The element's own text, the recursively collected text of each child and
    finally the element's tail are concatenated in that order; empty pieces
    are dropped and the result is stripped of surrounding whitespace.
    """
    pieces = [elem.text or '']
    pieces.extend(_text(child) for child in elem)
    pieces.append(elem.tail or '')
    return u''.join(piece for piece in pieces if piece).strip()
48
49
# A path segment that is a bare name made up of word characters only.
NAME_RE = re.compile(r"^\w+$")
# A path segment of the form ``name[@type='value']``; group 1 is the value.
TYPE_ATTR_RE = re.compile(r"^\w+\[@type='(.*?)'\]$")

# Element names found in alias paths mapped to the keys used in the
# generated locale data; names that are not listed are used unchanged.
NAME_MAP = {
    'dateFormats': 'date_formats',
    'dateTimeFormats': 'datetime_formats',
    'eraAbbr': 'abbreviated',
    'eraNames': 'wide',
    'eraNarrow': 'narrow',
    'timeFormats': 'time_formats'
}


def _translate_alias(ctxt, path):
    """Resolve a relative alias ``path`` against the key list ``ctxt``.

    :param ctxt: sequence of keys locating the element that holds the alias;
                 it is copied, never modified
    :param path: '/'-separated relative path; each segment is either ``..``
                 (drop the last key), ``name[@type='value']`` (append
                 ``value``) or a bare name (append its ``NAME_MAP``
                 translation, or the name itself when unmapped)
    :return: a new list of keys identifying the alias target
    :raise ValueError: if a segment has an unsupported form or the path
                       climbs above the start of ``ctxt``
    """
    keys = list(ctxt)
    for part in path.split('/'):
        if part == '..':
            if not keys:
                raise ValueError('Alias path %r climbs above its context %r'
                                 % (path, ctxt))
            keys.pop()
            continue
        match = TYPE_ATTR_RE.match(part)
        if match:
            keys.append(match.group(1))
        elif NAME_RE.match(part):
            keys.append(NAME_MAP.get(part, part))
        else:
            # Raised explicitly (rather than asserted) so that malformed
            # segments are still rejected when Python runs with -O.
            raise ValueError('Unsupported segment %r in alias path %r'
                             % (part, path))
    return keys
76
45 77
46 def main(): 78 def main():
47 parser = OptionParser(usage='%prog path/to/cldr') 79 parser = OptionParser(usage='%prog path/to/cldr')
48 options, args = parser.parse_args() 80 options, args = parser.parse_args()
49 if len(args) != 1: 81 if len(args) != 1:
107 for filename in filenames: 139 for filename in filenames:
108 print>>sys.stderr, 'Processing input file %r' % filename 140 print>>sys.stderr, 'Processing input file %r' % filename
109 stem, ext = os.path.splitext(filename) 141 stem, ext = os.path.splitext(filename)
110 if ext != '.xml': 142 if ext != '.xml':
111 continue 143 continue
144 #if stem != 'root':
145 # break
112 146
113 tree = parse(os.path.join(srcdir, 'main', filename)) 147 tree = parse(os.path.join(srcdir, 'main', filename))
114 data = {} 148 data = {}
115 149
116 language = None 150 language = None
131 165
132 # <localeDisplayNames> 166 # <localeDisplayNames>
133 167
134 territories = data.setdefault('territories', {}) 168 territories = data.setdefault('territories', {})
135 for elem in tree.findall('//territories/territory'): 169 for elem in tree.findall('//territories/territory'):
136 if 'draft' in elem.attrib and elem.attrib['type'] in territories: 170 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
171 and elem.attrib['type'] in territories:
137 continue 172 continue
138 territories[elem.attrib['type']] = _text(elem) 173 territories[elem.attrib['type']] = _text(elem)
139 174
140 languages = data.setdefault('languages', {}) 175 languages = data.setdefault('languages', {})
141 for elem in tree.findall('//languages/language'): 176 for elem in tree.findall('//languages/language'):
142 if 'draft' in elem.attrib and elem.attrib['type'] in languages: 177 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
178 and elem.attrib['type'] in languages:
143 continue 179 continue
144 languages[elem.attrib['type']] = _text(elem) 180 languages[elem.attrib['type']] = _text(elem)
145 181
146 variants = data.setdefault('variants', {}) 182 variants = data.setdefault('variants', {})
147 for elem in tree.findall('//variants/variant'): 183 for elem in tree.findall('//variants/variant'):
148 if 'draft' in elem.attrib and elem.attrib['type'] in variants: 184 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
185 and elem.attrib['type'] in variants:
149 continue 186 continue
150 variants[elem.attrib['type']] = _text(elem) 187 variants[elem.attrib['type']] = _text(elem)
151 188
152 scripts = data.setdefault('scripts', {}) 189 scripts = data.setdefault('scripts', {})
153 for elem in tree.findall('//scripts/script'): 190 for elem in tree.findall('//scripts/script'):
154 if 'draft' in elem.attrib and elem.attrib['type'] in scripts: 191 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
192 and elem.attrib['type'] in scripts:
155 continue 193 continue
156 scripts[elem.attrib['type']] = _text(elem) 194 scripts[elem.attrib['type']] = _text(elem)
157 195
158 # <dates> 196 # <dates>
159 197
180 if territory in territories or any([r in territories for r in regions]): 218 if territory in territories or any([r in territories for r in regions]):
181 week_data['weekend_end'] = weekdays[elem.attrib['day']] 219 week_data['weekend_end'] = weekdays[elem.attrib['day']]
182 220
183 zone_formats = data.setdefault('zone_formats', {}) 221 zone_formats = data.setdefault('zone_formats', {})
184 for elem in tree.findall('//timeZoneNames/gmtFormat'): 222 for elem in tree.findall('//timeZoneNames/gmtFormat'):
185 if 'draft' not in elem.attrib: 223 if 'draft' not in elem.attrib and 'alt' not in elem.attrib:
186 zone_formats['gmt'] = unicode(elem.text).replace('{0}', '%s') 224 zone_formats['gmt'] = unicode(elem.text).replace('{0}', '%s')
187 break 225 break
188 for elem in tree.findall('//timeZoneNames/regionFormat'): 226 for elem in tree.findall('//timeZoneNames/regionFormat'):
189 if 'draft' not in elem.attrib: 227 if 'draft' not in elem.attrib and 'alt' not in elem.attrib:
190 zone_formats['region'] = unicode(elem.text).replace('{0}', '%s') 228 zone_formats['region'] = unicode(elem.text).replace('{0}', '%s')
191 break 229 break
192 for elem in tree.findall('//timeZoneNames/fallbackFormat'): 230 for elem in tree.findall('//timeZoneNames/fallbackFormat'):
193 if 'draft' not in elem.attrib: 231 if 'draft' not in elem.attrib and 'alt' not in elem.attrib:
194 zone_formats['fallback'] = unicode(elem.text) \ 232 zone_formats['fallback'] = unicode(elem.text) \
195 .replace('{0}', '%(0)s').replace('{1}', '%(1)s') 233 .replace('{0}', '%(0)s').replace('{1}', '%(1)s')
196 break 234 break
197 235
198 time_zones = data.setdefault('time_zones', {}) 236 time_zones = data.setdefault('time_zones', {})
225 # TODO: support other calendar types 263 # TODO: support other calendar types
226 continue 264 continue
227 265
228 months = data.setdefault('months', {}) 266 months = data.setdefault('months', {})
229 for ctxt in calendar.findall('months/monthContext'): 267 for ctxt in calendar.findall('months/monthContext'):
230 ctxts = months.setdefault(ctxt.attrib['type'], {}) 268 ctxt_type = ctxt.attrib['type']
269 ctxts = months.setdefault(ctxt_type, {})
231 for width in ctxt.findall('monthWidth'): 270 for width in ctxt.findall('monthWidth'):
232 widths = ctxts.setdefault(width.attrib['type'], {}) 271 width_type = width.attrib['type']
233 for elem in width.findall('month'): 272 widths = ctxts.setdefault(width_type, {})
234 if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: 273 for elem in width.getiterator():
274 if elem.tag == 'month':
275 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
276 and int(elem.attrib['type']) in widths:
277 continue
278 widths[int(elem.attrib.get('type'))] = unicode(elem.text)
279 elif elem.tag == 'alias':
280 ctxts[width_type] = Alias(
281 _translate_alias(['months', ctxt_type, width_type],
282 elem.attrib['path'])
283 )
284
# Weekday names, keyed first by the dayContext type and then by the
# dayWidth type; each innermost dict maps a weekday index (see the
# module-level ``weekdays`` table) to its name.
days = data.setdefault('days', {})
for ctxt in calendar.findall('days/dayContext'):
    ctxt_type = ctxt.attrib['type']
    ctxts = days.setdefault(ctxt_type, {})
    for width in ctxt.findall('dayWidth'):
        width_type = width.attrib['type']
        widths = ctxts.setdefault(width_type, {})
        for elem in width.getiterator():
            if elem.tag == 'day':
                dtype = weekdays[elem.attrib['type']]
                # A draft or alternate ("alt") entry must not replace a
                # value already stored for this weekday.  The test was
                # previously inverted ('alt' not in ...), which skipped
                # regular entries and let alt variants overwrite them.
                if ('draft' in elem.attrib or 'alt' in elem.attrib) \
                        and dtype in widths:
                    continue
                widths[dtype] = unicode(elem.text)
            elif elem.tag == 'alias':
                # The whole width is an alias: store a reference to the
                # target keys instead of a dict of names.
                ctxts[width_type] = Alias(
                    _translate_alias(['days', ctxt_type, width_type],
                                     elem.attrib['path'])
                )
304
305 quarters = data.setdefault('quarters', {})
306 for ctxt in calendar.findall('quarters/quarterContext'):
307 ctxt_type = ctxt.attrib['type']
308 ctxts = quarters.setdefault(ctxt.attrib['type'], {})
309 for width in ctxt.findall('quarterWidth'):
310 width_type = width.attrib['type']
311 widths = ctxts.setdefault(width_type, {})
312 for elem in width.getiterator():
313 if elem.tag == 'quarter':
314 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
315 and int(elem.attrib['type']) in widths:
316 continue
317 widths[int(elem.attrib['type'])] = unicode(elem.text)
318 elif elem.tag == 'alias':
319 ctxts[width_type] = Alias(
320 _translate_alias(['quarters', ctxt_type, width_type],
321 elem.attrib['path'])
322 )
323
324 eras = data.setdefault('eras', {})
325 for width in calendar.findall('eras/*'):
326 width_type = NAME_MAP[width.tag]
327 widths = eras.setdefault(width_type, {})
328 for elem in width.getiterator():
329 if elem.tag == 'era':
330 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
331 and int(elem.attrib['type']) in widths:
235 continue 332 continue
236 widths[int(elem.attrib.get('type'))] = unicode(elem.text) 333 widths[int(elem.attrib.get('type'))] = unicode(elem.text)
237 334 elif elem.tag == 'alias':
238 days = data.setdefault('days', {}) 335 eras[width_type] = Alias(
239 for ctxt in calendar.findall('days/dayContext'): 336 _translate_alias(['eras', width_type],
240 ctxts = days.setdefault(ctxt.attrib['type'], {}) 337 elem.attrib['path'])
241 for width in ctxt.findall('dayWidth'): 338 )
242 widths = ctxts.setdefault(width.attrib['type'], {})
243 for elem in width.findall('day'):
244 dtype = weekdays[elem.attrib['type']]
245 if 'draft' in elem.attrib and dtype in widths:
246 continue
247 widths[dtype] = unicode(elem.text)
248
249 quarters = data.setdefault('quarters', {})
250 for ctxt in calendar.findall('quarters/quarterContext'):
251 ctxts = quarters.setdefault(ctxt.attrib['type'], {})
252 for width in ctxt.findall('quarterWidth'):
253 widths = ctxts.setdefault(width.attrib['type'], {})
254 for elem in width.findall('quarter'):
255 if 'draft' in elem.attrib and int(elem.attrib['type']) in widths:
256 continue
257 widths[int(elem.attrib.get('type'))] = unicode(elem.text)
258
259 eras = data.setdefault('eras', {})
260 for width in calendar.findall('eras/*'):
261 ewidth = {
262 'eraAbbr': 'abbreviated',
263 'eraNames': 'wide',
264 'eraNarrow': 'narrow',
265 }[width.tag]
266 widths = eras.setdefault(ewidth, {})
267 for elem in width.findall('era'):
268 if 'draft' in elem.attrib and int(elem.attrib['type']) in widths:
269 continue
270 widths[int(elem.attrib.get('type'))] = unicode(elem.text)
271 339
272 # AM/PM 340 # AM/PM
273 periods = data.setdefault('periods', {}) 341 periods = data.setdefault('periods', {})
274 for elem in calendar.findall('am'): 342 for elem in calendar.findall('am'):
275 if 'draft' in elem.attrib and elem.tag in periods: 343 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
344 and elem.tag in periods:
276 continue 345 continue
277 periods[elem.tag] = unicode(elem.text) 346 periods[elem.tag] = unicode(elem.text)
278 for elem in calendar.findall('pm'): 347 for elem in calendar.findall('pm'):
279 if 'draft' in elem.attrib and elem.tag in periods: 348 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
349 and elem.tag in periods:
280 continue 350 continue
281 periods[elem.tag] = unicode(elem.text) 351 periods[elem.tag] = unicode(elem.text)
282 352
283 date_formats = data.setdefault('date_formats', {}) 353 date_formats = data.setdefault('date_formats', {})
284 for elem in calendar.findall('dateFormats/dateFormatLength'): 354 for format in calendar.findall('dateFormats'):
285 if 'draft' in elem.attrib and elem.attrib.get('type') in date_formats: 355 for elem in format.getiterator():
286 continue 356 if elem.tag == 'dateFormatLength':
287 try: 357 if 'draft' in elem.attrib and \
288 date_formats[elem.attrib.get('type')] = \ 358 elem.attrib.get('type') in date_formats:
289 dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern'))) 359 continue
290 except ValueError, e: 360 try:
291 print>>sys.stderr, 'ERROR: %s' % e 361 date_formats[elem.attrib.get('type')] = \
362 dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern')))
363 except ValueError, e:
364 print>>sys.stderr, 'ERROR: %s' % e
365 elif elem.tag == 'alias':
366 date_formats = Alias(_translate_alias(
367 ['date_formats'], elem.attrib['path'])
368 )
292 369
293 time_formats = data.setdefault('time_formats', {}) 370 time_formats = data.setdefault('time_formats', {})
294 for elem in calendar.findall('timeFormats/timeFormatLength'): 371 for format in calendar.findall('timeFormats'):
295 if 'draft' in elem.attrib and elem.attrib.get('type') in time_formats: 372 for elem in format.getiterator():
296 continue 373 if elem.tag == 'timeFormatLength':
297 try: 374 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
298 time_formats[elem.attrib.get('type')] = \ 375 and elem.attrib.get('type') in time_formats:
299 dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern'))) 376 continue
300 except ValueError, e: 377 try:
301 print>>sys.stderr, 'ERROR: %s' % e 378 time_formats[elem.attrib.get('type')] = \
379 dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern')))
380 except ValueError, e:
381 print>>sys.stderr, 'ERROR: %s' % e
382 elif elem.tag == 'alias':
383 time_formats = Alias(_translate_alias(
384 ['time_formats'], elem.attrib['path'])
385 )
302 386
303 datetime_formats = data.setdefault('datetime_formats', {}) 387 datetime_formats = data.setdefault('datetime_formats', {})
304 for elem in calendar.findall('dateTimeFormats/dateTimeFormatLength'): 388 for format in calendar.findall('dateTimeFormats'):
305 if 'draft' in elem.attrib and elem.attrib.get('type') in datetime_formats: 389 for elem in format.getiterator():
306 continue 390 if elem.tag == 'dateTimeFormatLength':
307 try: 391 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
308 datetime_formats[elem.attrib.get('type')] = \ 392 and elem.attrib.get('type') in datetime_formats:
309 unicode(elem.findtext('dateTimeFormat/pattern')) 393 continue
310 except ValueError, e: 394 try:
311 print>>sys.stderr, 'ERROR: %s' % e 395 datetime_formats[elem.attrib.get('type')] = \
396 unicode(elem.findtext('dateTimeFormat/pattern'))
397 except ValueError, e:
398 print>>sys.stderr, 'ERROR: %s' % e
399 elif elem.tag == 'alias':
400 datetime_formats = Alias(_translate_alias(
401 ['datetime_formats'], elem.attrib['path'])
402 )
312 403
313 # <numbers> 404 # <numbers>
314 405
315 number_symbols = data.setdefault('number_symbols', {}) 406 number_symbols = data.setdefault('number_symbols', {})
316 for elem in tree.findall('//numbers/symbols/*'): 407 for elem in tree.findall('//numbers/symbols/*'):
317 number_symbols[elem.tag] = unicode(elem.text) 408 number_symbols[elem.tag] = unicode(elem.text)
318 409
319 decimal_formats = data.setdefault('decimal_formats', {}) 410 decimal_formats = data.setdefault('decimal_formats', {})
320 for elem in tree.findall('//decimalFormats/decimalFormatLength'): 411 for elem in tree.findall('//decimalFormats/decimalFormatLength'):
321 if 'draft' in elem.attrib and elem.attrib.get('type') in decimal_formats: 412 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
413 and elem.attrib.get('type') in decimal_formats:
322 continue 414 continue
323 pattern = unicode(elem.findtext('decimalFormat/pattern')) 415 pattern = unicode(elem.findtext('decimalFormat/pattern'))
324 decimal_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) 416 decimal_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern)
325 417
326 scientific_formats = data.setdefault('scientific_formats', {}) 418 scientific_formats = data.setdefault('scientific_formats', {})
327 for elem in tree.findall('//scientificFormats/scientificFormatLength'): 419 for elem in tree.findall('//scientificFormats/scientificFormatLength'):
328 if 'draft' in elem.attrib and elem.attrib.get('type') in scientific_formats: 420 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
421 and elem.attrib.get('type') in scientific_formats:
329 continue 422 continue
330 pattern = unicode(elem.findtext('scientificFormat/pattern')) 423 pattern = unicode(elem.findtext('scientificFormat/pattern'))
331 scientific_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) 424 scientific_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern)
332 425
333 currency_formats = data.setdefault('currency_formats', {}) 426 currency_formats = data.setdefault('currency_formats', {})
334 for elem in tree.findall('//currencyFormats/currencyFormatLength'): 427 for elem in tree.findall('//currencyFormats/currencyFormatLength'):
335 if 'draft' in elem.attrib and elem.attrib.get('type') in currency_formats: 428 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
429 and elem.attrib.get('type') in currency_formats:
336 continue 430 continue
337 pattern = unicode(elem.findtext('currencyFormat/pattern')) 431 pattern = unicode(elem.findtext('currencyFormat/pattern'))
338 currency_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) 432 currency_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern)
339 433
340 percent_formats = data.setdefault('percent_formats', {}) 434 percent_formats = data.setdefault('percent_formats', {})
341 for elem in tree.findall('//percentFormats/percentFormatLength'): 435 for elem in tree.findall('//percentFormats/percentFormatLength'):
342 if 'draft' in elem.attrib and elem.attrib.get('type') in percent_formats: 436 if ('draft' in elem.attrib or 'alt' in elem.attrib) \
437 and elem.attrib.get('type') in percent_formats:
343 continue 438 continue
344 pattern = unicode(elem.findtext('percentFormat/pattern')) 439 pattern = unicode(elem.findtext('percentFormat/pattern'))
345 percent_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) 440 percent_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern)
346 441
347 currency_names = data.setdefault('currency_names', {}) 442 currency_names = data.setdefault('currency_names', {})
358 try: 453 try:
359 pickle.dump(data, outfile, 2) 454 pickle.dump(data, outfile, 2)
360 finally: 455 finally:
361 outfile.close() 456 outfile.close()
362 457
458
363 if __name__ == '__main__': 459 if __name__ == '__main__':
364 main() 460 main()
Copyright (C) 2012-2017 Edgewall Software