#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""Convert CLDR XML locale data into pickled per-locale data files.

Run with a single argument: the path to a CLDR directory that contains
``main/*.xml`` and ``supplemental/supplementalData.xml``.  One ``.dat``
pickle per locale file is written to ``../babel/localedata`` relative to
this script.
"""

import copy
from optparse import OptionParser
import os
import pickle
import sys
try:
    from xml.etree.ElementTree import parse
except ImportError:
    from elementtree.ElementTree import parse

# Make sure we're using Babel source, and not some previously installed version
sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..'))

from babel import dates, numbers

weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5,
            'sun': 6}

# (key in the output dict, XPath of the elements to read)
_NAME_SECTIONS = (
    ('territories', '//territories/territory'),
    ('languages', '//languages/language'),
    ('variants', '//variants/variant'),
    ('scripts', '//scripts/script'),
)

# (weekData child tag, key in the ``week_data`` dict)
_WEEKDAY_FIELDS = (
    ('firstDay', 'first_day'),
    ('weekendStart', 'weekend_start'),
    ('weekendEnd', 'weekend_end'),
)

# (key in the output dict, path of the length elements, path of the pattern)
_DATE_TIME_FORMATS = (
    ('date_formats', 'dateFormats/dateFormatLength', 'dateFormat/pattern'),
    ('time_formats', 'timeFormats/timeFormatLength', 'timeFormat/pattern'),
)

# (key in the output dict, XPath of the length elements, path of the pattern)
_NUMBER_FORMATS = (
    ('decimal_formats', '//decimalFormats/decimalFormatLength',
     'decimalFormat/pattern'),
    ('scientific_formats', '//scientificFormats/scientificFormatLength',
     'scientificFormat/pattern'),
    ('currency_formats', '//currencyFormats/currencyFormatLength',
     'currencyFormat/pattern'),
    ('percent_formats', '//percentFormats/percentFormatLength',
     'percentFormat/pattern'),
)

try:
    any
except NameError:
    # Fallback for interpreters whose builtins do not provide any().
    def any(iterable):
        """Return True if at least one item of `iterable` is true."""
        for item in iterable:
            if item:
                return True
        return False


def _log(message):
    """Write `message` followed by a newline to standard error."""
    sys.stderr.write(message + '\n')


def _text(elem):
    """Return the text of `elem` and all of its descendants, stripped.

    The tail text of `elem` itself is included in the result.
    """
    parts = [elem.text or '']
    parts.extend([_text(child) for child in elem])
    parts.append(elem.tail or '')
    return u''.join([part for part in parts if part]).strip()


def _keep_existing(elem, key, mapping):
    """Return True if `elem` is marked draft and `key` is already stored.

    A draft element never replaces a value that was read earlier.
    """
    return 'draft' in elem.attrib and key in mapping


def _territory_containment(sup):
    """Map each territory to the set of groups that contain it.

    Groups are processed in sorted order; when a group has itself already
    been recorded as a member of other groups, those groups are added to
    each of its members as well.
    """
    groups = {}
    for elem in sup.findall('//territoryContainment/group'):
        groups[elem.attrib['type']] = elem.attrib['contains'].split()

    containment = {}
    for group, members in sorted(groups.items()):
        for territory in members:
            containers = containment.setdefault(territory, set())
            if group in containment:
                containers |= containment[group]
            containers.add(group)
    return containment


def _read_display_names(tree, data):
    """Store territory, language, variant and script names in `data`."""
    for section, path in _NAME_SECTIONS:
        names = data.setdefault(section, {})
        for elem in tree.findall(path):
            key = elem.attrib['type']
            if _keep_existing(elem, key, names):
                continue
            names[key] = _text(elem)


def _applies_to(elem, territory, containers):
    """Return true if `elem` lists `territory` or any of `containers`."""
    listed = elem.attrib['territories'].split()
    return territory in listed or any([r in listed for r in containers])


def _read_week_data(week_elem, territory, containers, data):
    """Store the week settings that apply to `territory` in `data`.

    When several elements match, the last one in document order wins.
    """
    week_data = data.setdefault('week_data', {})
    for elem in week_elem.findall('minDays'):
        if _applies_to(elem, territory, containers):
            week_data['min_days'] = int(elem.attrib['count'])
    for tag, key in _WEEKDAY_FIELDS:
        for elem in week_elem.findall(tag):
            if _applies_to(elem, territory, containers):
                week_data[key] = weekdays[elem.attrib['day']]


def _read_time_zones(tree, sup, stem, data):
    """Store time zone names and, for the root locale, zone aliases."""
    time_zones = data.setdefault('time_zones', {})
    for elem in tree.findall('//timeZoneNames/zone'):
        info = {}
        city = elem.findtext('exemplarCity')
        if city:
            info['city'] = unicode(city)
        for width in ('long', 'short'):
            for child in elem.findall(width + '/*'):
                # NOTE(review): unicode(None) is u'None', so a child without
                # text is stored as the string u'None' -- confirm intended.
                info.setdefault(width, {})[child.tag] = unicode(child.text)
        time_zones[elem.attrib['type']] = info

    zone_aliases = data.setdefault('zone_aliases', {})
    if stem == 'root':
        # Aliases come from the supplemental data, so they are only
        # recorded once, on the root locale.
        for elem in sup.findall('//timezoneData/zoneFormatting/zoneItem'):
            if 'aliases' in elem.attrib:
                canonical_id = elem.attrib['type']
                for alias in elem.attrib['aliases'].split():
                    zone_aliases[alias] = canonical_id


def _read_contexts(calendar, data, name, context_path, width_tag, item_tag,
                   to_key):
    """Store a context -> width -> key -> text mapping under `data[name]`.

    `to_key` converts the ``type`` attribute of each item element into the
    key used in the innermost dict.
    """
    contexts = data.setdefault(name, {})
    for ctxt in calendar.findall(context_path):
        by_width = contexts.setdefault(ctxt.attrib['type'], {})
        for width in ctxt.findall(width_tag):
            names = by_width.setdefault(width.attrib['type'], {})
            for elem in width.findall(item_tag):
                key = to_key(elem.attrib['type'])
                if _keep_existing(elem, key, names):
                    continue
                names[key] = unicode(elem.text)


def _read_calendar(calendar, data):
    """Store names and format patterns of one calendar element in `data`."""
    _read_contexts(calendar, data, 'months', 'months/monthContext',
                   'monthWidth', 'month', int)
    _read_contexts(calendar, data, 'days', 'days/dayContext',
                   'dayWidth', 'day', weekdays.__getitem__)
    _read_contexts(calendar, data, 'quarters', 'quarters/quarterContext',
                   'quarterWidth', 'quarter', int)

    eras = data.setdefault('eras', {})
    for width in calendar.findall('eras/*'):
        # Any other child tag raises KeyError here.
        ewidth = {'eraNames': 'wide', 'eraAbbr': 'abbreviated'}[width.tag]
        names = eras.setdefault(ewidth, {})
        for elem in width.findall('era'):
            key = int(elem.attrib['type'])
            if _keep_existing(elem, key, names):
                continue
            names[key] = unicode(elem.text)

    # AM/PM
    periods = data.setdefault('periods', {})
    for tag in ('am', 'pm'):
        for elem in calendar.findall(tag):
            if _keep_existing(elem, elem.tag, periods):
                continue
            periods[elem.tag] = unicode(elem.text)

    for section, length_path, pattern_path in _DATE_TIME_FORMATS:
        formats = data.setdefault(section, {})
        for elem in calendar.findall(length_path):
            key = elem.attrib.get('type')
            if _keep_existing(elem, key, formats):
                continue
            try:
                formats[key] = dates.parse_pattern(
                    unicode(elem.findtext(pattern_path)))
            except ValueError:
                # sys.exc_info() avoids version-specific syntax for binding
                # the exception; a bad pattern is reported and skipped.
                _log('ERROR: %s' % sys.exc_info()[1])

    datetime_formats = data.setdefault('datetime_formats', {})
    for elem in calendar.findall('dateTimeFormats/dateTimeFormatLength'):
        key = elem.attrib.get('type')
        if _keep_existing(elem, key, datetime_formats):
            continue
        try:
            # Stored as plain text; this pattern is not parsed here.
            datetime_formats[key] = \
                unicode(elem.findtext('dateTimeFormat/pattern'))
        except ValueError:
            _log('ERROR: %s' % sys.exc_info()[1])


def _read_numbers(tree, data):
    """Store number symbols and parsed number format patterns in `data`."""
    number_symbols = data.setdefault('number_symbols', {})
    for elem in tree.findall('//numbers/symbols/*'):
        number_symbols[elem.tag] = unicode(elem.text)

    for section, length_path, pattern_path in _NUMBER_FORMATS:
        formats = data.setdefault(section, {})
        for elem in tree.findall(length_path):
            key = elem.attrib.get('type')
            if _keep_existing(elem, key, formats):
                continue
            pattern = unicode(elem.findtext(pattern_path))
            formats[key] = numbers.parse_pattern(pattern)


def _read_currencies(tree, data):
    """Store currency display names and symbols in `data`."""
    currency_names = data.setdefault('currency_names', {})
    currency_symbols = data.setdefault('currency_symbols', {})
    for elem in tree.findall('//currencies/currency'):
        name = elem.findtext('displayName')
        if name:
            currency_names[elem.attrib['type']] = unicode(name)
        symbol = elem.findtext('symbol')
        if symbol:
            currency_symbols[elem.attrib['type']] = unicode(symbol)


def main():
    """Read the CLDR directory named on the command line and write pickles.

    Exits through ``OptionParser.error`` unless exactly one positional
    argument is given.  ``root.xml`` is processed first, then the remaining
    entries of ``main/`` in order of increasing file name length; entries
    whose extension is not ``.xml`` are skipped.  Progress is reported on
    standard error.
    """
    parser = OptionParser(usage='%prog path/to/cldr')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('incorrect number of arguments')

    srcdir = args[0]
    destdir = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])),
                           '..', 'babel', 'localedata')

    sup = parse(os.path.join(srcdir, 'supplemental', 'supplementalData.xml'))

    # build a territory containment mapping for inheritance
    territory_containment = _territory_containment(sup)

    # The supplemental week data is the same for every locale file, so it
    # is looked up once rather than per file.
    week_elem = sup.find('//weekData')

    filenames = os.listdir(os.path.join(srcdir, 'main'))
    filenames.remove('root.xml')
    filenames.sort(key=len)
    filenames.insert(0, 'root.xml')

    for filename in filenames:
        _log('Processing input file %r' % filename)
        stem, ext = os.path.splitext(filename)
        if ext != '.xml':
            continue

        tree = parse(os.path.join(srcdir, 'main', filename))
        data = {}

        language = None
        elem = tree.find('//identity/language')
        if elem is not None:
            language = elem.attrib['type']
        _log(' Language: %r' % language)

        elem = tree.find('//identity/territory')
        if elem is not None:
            territory = elem.attrib['type']
        else:
            territory = '001' # world
        _log(' Territory: %r' % territory)
        containers = territory_containment.get(territory, [])
        _log(' Regions: %r' % containers)

        _read_display_names(tree, data)
        _read_week_data(week_elem, territory, containers, data)
        _read_time_zones(tree, sup, stem, data)

        for calendar in tree.findall('//calendars/calendar'):
            if calendar.attrib['type'] != 'gregorian':
                # TODO: support other calendar types
                continue
            _read_calendar(calendar, data)

        _read_numbers(tree, data)
        _read_currencies(tree, data)

        # Each locale is written out as soon as it is complete; nothing is
        # kept in memory across files.
        outfile = open(os.path.join(destdir, stem + '.dat'), 'wb')
        try:
            pickle.dump(data, outfile, 2)
        finally:
            outfile.close()


if __name__ == '__main__':
    main()