#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

import copy
from optparse import OptionParser
import os
import pickle
import sys
try:
    from xml.etree.ElementTree import parse
except ImportError:
    from elementtree.ElementTree import parse

from babel import dates, numbers

# Maps the three-letter weekday codes used in the CLDR XML to the integers
# stored in the generated locale data (Monday is 1, Sunday is 7).
weekdays = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6,
            'sun': 7}

# Fallback for interpreters that do not provide the ``any`` builtin.
try:
    any
except NameError:
    def any(iterable):
        """Return True if at least one item of ``iterable`` is true.

        Stops at the first true item and always returns a bool; an empty
        iterable yields False.
        """
        for item in iterable:
            if item:
                return True
        return False

cmlenz@1: def _parent(locale):
cmlenz@1:     parts = locale.split('_')
cmlenz@1:     if len(parts) == 1:
cmlenz@1:         return 'root'
cmlenz@1:     else:
cmlenz@1:         return '_'.join(parts[:-1])
cmlenz@1: 
cmlenz@1: def _text(elem):
cmlenz@1:     buf = [elem.text or '']
cmlenz@1:     for child in elem:
cmlenz@1:         buf.append(_text(child))
cmlenz@1:     buf.append(elem.tail or '')
cmlenz@1:     return u''.join(filter(None, buf)).strip()
cmlenz@1: 
def _log(message):
    """Write one progress line to standard error."""
    sys.stderr.write(message + '\n')


def _resolve_containment(groups):
    """Return a dict mapping each code to the set of all groups containing it.

    ``groups`` maps a group code to the list of codes it directly contains.
    The result is the transitive closure, so a territory is associated with
    its direct group as well as every group that (indirectly) contains that
    group, regardless of the order in which the groups are defined.
    """
    parents = {}
    for group, members in groups.items():
        for member in members:
            parents.setdefault(member, set()).add(group)

    resolved = {}
    for code in parents:
        ancestors = set()
        pending = list(parents[code])
        while pending:
            group = pending.pop()
            # the membership test also guards against cyclic definitions
            if group not in ancestors:
                ancestors.add(group)
                pending.extend(parents.get(group, ()))
        resolved[code] = ancestors
    return resolved


def _applies_to(elem, territory, regions):
    """Tell whether the ``territories`` attribute of ``elem`` lists the given
    territory or any of the regions containing it."""
    listed = elem.attrib['territories'].split()
    if territory in listed:
        return True
    for region in regions:
        if region in listed:
            return True
    return False


def _import_names(tree, path, target):
    """Copy the text of all elements matching ``path`` into ``target``, keyed
    by their ``type`` attribute.

    Elements carrying a ``draft`` attribute do not replace an existing entry.
    """
    for elem in tree.findall(path):
        code = elem.attrib['type']
        if 'draft' in elem.attrib and code in target:
            continue
        target[code] = _text(elem)


def _import_items(parent, tag, target, key):
    """Store the text of every ``tag`` child of ``parent`` in ``target``.

    The dictionary key is ``key(type attribute)``. Elements carrying a
    ``draft`` attribute do not replace an existing entry.
    """
    for elem in parent.findall(tag):
        index = key(elem.attrib['type'])
        if 'draft' in elem.attrib and index in target:
            continue
        target[index] = unicode(elem.text)


def _import_contexts(calendar, context_path, width_tag, item_tag, target,
                     key):
    """Fill the nested ``context -> width -> item`` dictionaries in ``target``
    from the matching elements below ``calendar``."""
    for ctxt in calendar.findall(context_path):
        widths = target.setdefault(ctxt.attrib['type'], {})
        for width in ctxt.findall(width_tag):
            items = widths.setdefault(width.attrib['type'], {})
            _import_items(width, item_tag, items, key)


def _import_number_formats(tree, length_path, pattern_path, target,
                           convert=None):
    """Store the pattern of every format length matching ``length_path`` in
    ``target``, keyed by the ``type`` attribute (``None`` if absent).

    If ``convert`` is given, it is applied to the pattern string before it is
    stored. Elements carrying a ``draft`` attribute do not replace an existing
    entry.
    """
    for elem in tree.findall(length_path):
        length = elem.attrib.get('type')
        if 'draft' in elem.attrib and length in target:
            continue
        pattern = unicode(elem.findtext(pattern_path))
        if convert is not None:
            pattern = convert(pattern)
        target[length] = pattern


def main():
    """Convert the CLDR XML locale files into pickled dictionaries.

    Expects exactly one command-line argument, the path to a CLDR directory
    containing ``supplemental/supplementalData.xml`` and a ``main`` directory
    with the locale files, including ``root.xml``. For every ``.xml`` file in
    ``main`` a ``<name>.dat`` file is written (pickle protocol 2) to the
    ``../babel/localedata`` directory relative to the location of this script.

    The root locale is processed first and the remaining files are processed
    in order of increasing file name length, so that each locale can start out
    with a deep copy of the data of its parent (see ``_parent``).
    """
    parser = OptionParser(usage='%prog path/to/cldr')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('incorrect number of arguments')

    srcdir = args[0]
    destdir = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])),
                           '..', 'babel', 'localedata')

    sup = parse(os.path.join(srcdir, 'supplemental', 'supplementalData.xml'))

    # build a territory containment mapping for inheritance
    groups = {}
    for elem in sup.findall('//territoryContainment/group'):
        groups[elem.attrib['type']] = elem.attrib['contains'].split()
    territory_containment = _resolve_containment(groups)

    # the week data is the same for every locale file, so look it up once
    week_elem = sup.find('//weekData')

    filenames = os.listdir(os.path.join(srcdir, 'main'))
    filenames.remove('root.xml')
    # shorter names first, so that a parent is always processed before the
    # locales derived from it
    filenames.sort(key=len)
    filenames.insert(0, 'root.xml')

    era_widths = {'eraNames': 'wide', 'eraAbbr': 'abbreviated'}
    dicts = {}

    for filename in filenames:
        _log('Processing input file %r' % (filename,))
        stem, ext = os.path.splitext(filename)
        if ext != '.xml':
            continue

        data = {}
        if stem != 'root':
            # start out with everything the parent locale defines
            data.update(copy.deepcopy(dicts[_parent(stem)]))
        tree = parse(os.path.join(srcdir, 'main', filename))

        language = None
        elem = tree.find('//identity/language')
        if elem is not None:
            language = elem.attrib['type']
        _log('  Language:  %r' % (language,))

        territory = None
        elem = tree.find('//identity/territory')
        if elem is not None:
            territory = elem.attrib['type']
        _log('  Territory: %r' % (territory,))
        regions = territory_containment.get(territory, [])
        _log('  Regions:    %r' % (regions,))

        # <localeDisplayNames>

        _import_names(tree, '//territories/territory',
                      data.setdefault('territories', {}))
        _import_names(tree, '//languages/language',
                      data.setdefault('languages', {}))
        _import_names(tree, '//variants/variant',
                      data.setdefault('variants', {}))
        _import_names(tree, '//scripts/script',
                      data.setdefault('scripts', {}))

        # <dates>

        # Elements are applied in document order, so a later matching element
        # overrides the value taken from an earlier one.
        week_data = data.setdefault('week_data', {})
        for elem in week_elem.findall('minDays'):
            if _applies_to(elem, territory, regions):
                week_data['min_days'] = int(elem.attrib['count'])
        for key, tag in (('first_day', 'firstDay'),
                         ('weekend_start', 'weekendStart'),
                         ('weekend_end', 'weekendEnd')):
            for elem in week_elem.findall(tag):
                if _applies_to(elem, territory, regions):
                    week_data[key] = weekdays[elem.attrib['day']]

        time_zones = data.setdefault('time_zones', {})
        for elem in tree.findall('//timeZoneNames/zone'):
            # Key by the zone identifier: the tag is the same for every zone
            # element, so using it would collapse all zones into one entry.
            # NOTE(review): the value is u'None' for zones without a direct
            # <displayName> child -- confirm which child holds the name.
            time_zones[elem.attrib['type']] = \
                unicode(elem.findtext('displayName'))

        for calendar in tree.findall('//calendars/calendar'):
            if calendar.attrib['type'] != 'gregorian':
                # TODO: support other calendar types
                continue

            _import_contexts(calendar, 'months/monthContext', 'monthWidth',
                             'month', data.setdefault('months', {}), int)
            _import_contexts(calendar, 'days/dayContext', 'dayWidth',
                             'day', data.setdefault('days', {}),
                             weekdays.__getitem__)
            _import_contexts(calendar, 'quarters/quarterContext',
                             'quarterWidth', 'quarter',
                             data.setdefault('quarters', {}), int)

            eras = data.setdefault('eras', {})
            for width in calendar.findall('eras/*'):
                # raises KeyError for any child other than the two known ones
                ewidth = era_widths[width.tag]
                _import_items(width, 'era', eras.setdefault(ewidth, {}), int)

            # AM/PM
            periods = data.setdefault('periods', {})
            for tag in ('am', 'pm'):
                for elem in calendar.findall(tag):
                    if 'draft' in elem.attrib and elem.tag in periods:
                        continue
                    periods[elem.tag] = unicode(elem.text)

            for key, length_path, pattern_path in (
                    ('date_formats', 'dateFormats/dateFormatLength',
                     'dateFormat/pattern'),
                    ('time_formats', 'timeFormats/timeFormatLength',
                     'timeFormat/pattern')):
                formats = data.setdefault(key, {})
                for elem in calendar.findall(length_path):
                    length = elem.attrib.get('type')
                    if 'draft' in elem.attrib and length in formats:
                        continue
                    try:
                        formats[length] = dates.parse_pattern(
                            unicode(elem.findtext(pattern_path)))
                    except ValueError:
                        # report the unparsable pattern and carry on with the
                        # remaining formats
                        sys.stdout.write('%s\n' % (sys.exc_info()[1],))

        # <numbers>

        number_symbols = data.setdefault('number_symbols', {})
        for elem in tree.findall('//numbers/symbols/*'):
            number_symbols[elem.tag] = unicode(elem.text)

        # only the decimal patterns are stored in parsed form, the other
        # number patterns are stored as plain strings
        _import_number_formats(tree, '//decimalFormats/decimalFormatLength',
                               'decimalFormat/pattern',
                               data.setdefault('decimal_formats', {}),
                               numbers.parse_pattern)
        _import_number_formats(tree,
                               '//scientificFormats/scientificFormatLength',
                               'scientificFormat/pattern',
                               data.setdefault('scientific_formats', {}))
        _import_number_formats(tree, '//currencyFormats/currencyFormatLength',
                               'currencyFormat/pattern',
                               data.setdefault('currency_formats', {}))
        _import_number_formats(tree, '//percentFormats/percentFormatLength',
                               'percentFormat/pattern',
                               data.setdefault('percent_formats', {}))

        currencies = data.setdefault('currencies', {})
        for elem in tree.findall('//currencies/currency'):
            currencies[elem.attrib['type']] = {
                'display_name': unicode(elem.findtext('displayName')),
                'symbol': unicode(elem.findtext('symbol'))
            }

        # keep the data around for the locales derived from this one
        dicts[stem] = data
        outfile = open(os.path.join(destdir, stem + '.dat'), 'wb')
        try:
            pickle.dump(data, outfile, 2)
        finally:
            outfile.close()

# Run the import only when executed as a script, not when imported.
if __name__ == '__main__':
    main()