#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

import copy
from optparse import OptionParser
import os
import pickle
import re
import sys
try:
    from xml.etree.ElementTree import parse
except ImportError:
    from elementtree.ElementTree import parse

# Make sure we're using Babel source, and not some previously installed version
sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..'))

from babel import dates, numbers
from babel.localedata import Alias
from babel.util import set

# Maps the weekday abbreviations used as keys in this script to integer
# indexes (Monday is 0, Sunday is 6).
weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5,
            'sun': 6}

try:
    any
except NameError:
    # Fallback for interpreters that do not provide the ``any`` builtin.
    def any(iterable):
        """Return True if at least one item of `iterable` is true.

        Stops at the first true item and always returns a bool, like the
        builtin of the same name.
        """
        for item in iterable:
            if item:
                return True
        return False


def _text(elem):
    """Return the text content of `elem` as a stripped unicode string.

    The result is the element's own text, followed by the (recursively
    collected) text of each child, followed by the element's tail text.
    Empty pieces are dropped before joining, and leading/trailing
    whitespace is removed from the joined string.
    """
    buf = [elem.text or '']
    for child in elem:
        buf.append(_text(child))
    buf.append(elem.tail or '')
    return u''.join(filter(None, buf)).strip()


# A plain path step consisting only of word characters, e.g. ``eraAbbr``.
NAME_RE = re.compile(r"^\w+$")
# A path step with a type predicate, e.g. ``monthWidth[@type='wide']``;
# group 1 captures the value of the ``type`` attribute.
TYPE_ATTR_RE = re.compile(r"^\w+\[@type='(.*?)'\]$")

cmlenz@381: NAME_MAP = { cmlenz@381: 'dateFormats': 'date_formats', cmlenz@381: 'dateTimeFormats': 'datetime_formats', cmlenz@381: 'eraAbbr': 'abbreviated', cmlenz@381: 'eraNames': 'wide', cmlenz@381: 'eraNarrow': 'narrow', cmlenz@381: 'timeFormats': 'time_formats' cmlenz@381: } cmlenz@381: cmlenz@381: def _translate_alias(ctxt, path): cmlenz@381: parts = path.split('/') cmlenz@381: keys = ctxt[:] cmlenz@381: for part in parts: cmlenz@381: if part == '..': cmlenz@381: keys.pop() cmlenz@381: else: cmlenz@381: match = TYPE_ATTR_RE.match(part) cmlenz@381: if match: cmlenz@381: keys.append(match.group(1)) cmlenz@381: else: cmlenz@381: assert NAME_RE.match(part) cmlenz@381: keys.append(NAME_MAP.get(part, part)) cmlenz@381: return keys cmlenz@381: cmlenz@381: cmlenz@263: def main(): cmlenz@263: parser = OptionParser(usage='%prog path/to/cldr') cmlenz@263: options, args = parser.parse_args() cmlenz@263: if len(args) != 1: cmlenz@263: parser.error('incorrect number of arguments') cmlenz@263: cmlenz@263: srcdir = args[0] cmlenz@263: destdir = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), cmlenz@263: '..', 'babel') cmlenz@263: cmlenz@263: sup = parse(os.path.join(srcdir, 'supplemental', 'supplementalData.xml')) cmlenz@263: cmlenz@348: # Import global data from the supplemental files cmlenz@263: global_data = {} cmlenz@263: cmlenz@263: territory_zones = global_data.setdefault('territory_zones', {}) cmlenz@263: zone_aliases = global_data.setdefault('zone_aliases', {}) cmlenz@263: zone_territories = global_data.setdefault('zone_territories', {}) jruigrok@471: for elem in sup.findall('.//timezoneData/zoneFormatting/zoneItem'): cmlenz@263: tzid = elem.attrib['type'] cmlenz@263: territory_zones.setdefault(elem.attrib['territory'], []).append(tzid) cmlenz@263: zone_territories[tzid] = elem.attrib['territory'] cmlenz@263: if 'aliases' in elem.attrib: cmlenz@263: for alias in elem.attrib['aliases'].split(): cmlenz@263: zone_aliases[alias] = tzid cmlenz@263: cmlenz@348: 
# Import Metazone mapping cmlenz@348: meta_zones = global_data.setdefault('meta_zones', {}) cmlenz@348: tzsup = parse(os.path.join(srcdir, 'supplemental', 'metazoneInfo.xml')) jruigrok@471: for elem in tzsup.findall('.//timezone'): cmlenz@348: for child in elem.findall('usesMetazone'): cmlenz@348: if 'to' not in child.attrib: # FIXME: support old mappings cmlenz@348: meta_zones[elem.attrib['type']] = child.attrib['mzone'] cmlenz@348: cmlenz@263: outfile = open(os.path.join(destdir, 'global.dat'), 'wb') cmlenz@263: try: cmlenz@263: pickle.dump(global_data, outfile, 2) cmlenz@263: finally: cmlenz@263: outfile.close() cmlenz@263: cmlenz@263: # build a territory containment mapping for inheritance cmlenz@263: regions = {} jruigrok@471: for elem in sup.findall('.//territoryContainment/group'): cmlenz@263: regions[elem.attrib['type']] = elem.attrib['contains'].split() cmlenz@263: cmlenz@263: # Resolve territory containment cmlenz@263: territory_containment = {} cmlenz@263: region_items = regions.items() cmlenz@263: region_items.sort() cmlenz@263: for group, territory_list in region_items: cmlenz@263: for territory in territory_list: cmlenz@263: containers = territory_containment.setdefault(territory, set([])) cmlenz@263: if group in territory_containment: cmlenz@263: containers |= territory_containment[group] cmlenz@263: containers.add(group) cmlenz@263: cmlenz@263: filenames = os.listdir(os.path.join(srcdir, 'main')) cmlenz@263: filenames.remove('root.xml') cmlenz@263: filenames.sort(lambda a,b: len(a)-len(b)) cmlenz@263: filenames.insert(0, 'root.xml') cmlenz@263: cmlenz@263: for filename in filenames: cmlenz@263: stem, ext = os.path.splitext(filename) cmlenz@263: if ext != '.xml': cmlenz@263: continue cmlenz@263: cmlenz@391: print>>sys.stderr, 'Processing input file %r' % filename cmlenz@263: tree = parse(os.path.join(srcdir, 'main', filename)) cmlenz@263: data = {} cmlenz@263: cmlenz@263: language = None jruigrok@471: elem = tree.find('.//identity/language') 
cmlenz@263: if elem is not None: cmlenz@263: language = elem.attrib['type'] cmlenz@263: print>>sys.stderr, ' Language: %r' % language cmlenz@263: cmlenz@263: territory = None jruigrok@471: elem = tree.find('.//identity/territory') cmlenz@263: if elem is not None: cmlenz@263: territory = elem.attrib['type'] cmlenz@263: else: cmlenz@263: territory = '001' # world cmlenz@263: print>>sys.stderr, ' Territory: %r' % territory cmlenz@263: regions = territory_containment.get(territory, []) cmlenz@263: print>>sys.stderr, ' Regions: %r' % regions cmlenz@263: cmlenz@263: # cmlenz@263: cmlenz@263: territories = data.setdefault('territories', {}) jruigrok@471: for elem in tree.findall('.//territories/territory'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib['type'] in territories: cmlenz@263: continue cmlenz@263: territories[elem.attrib['type']] = _text(elem) cmlenz@263: cmlenz@263: languages = data.setdefault('languages', {}) jruigrok@471: for elem in tree.findall('.//languages/language'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib['type'] in languages: cmlenz@263: continue cmlenz@263: languages[elem.attrib['type']] = _text(elem) cmlenz@263: cmlenz@263: variants = data.setdefault('variants', {}) jruigrok@471: for elem in tree.findall('.//variants/variant'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib['type'] in variants: cmlenz@263: continue cmlenz@263: variants[elem.attrib['type']] = _text(elem) cmlenz@263: cmlenz@263: scripts = data.setdefault('scripts', {}) jruigrok@471: for elem in tree.findall('.//scripts/script'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib['type'] in scripts: cmlenz@263: continue cmlenz@263: scripts[elem.attrib['type']] = _text(elem) cmlenz@263: cmlenz@263: # cmlenz@263: cmlenz@263: week_data = data.setdefault('week_data', {}) jruigrok@471: supelem = 
sup.find('.//weekData') cmlenz@263: cmlenz@263: for elem in supelem.findall('minDays'): cmlenz@263: territories = elem.attrib['territories'].split() cmlenz@263: if territory in territories or any([r in territories for r in regions]): cmlenz@263: week_data['min_days'] = int(elem.attrib['count']) cmlenz@263: cmlenz@263: for elem in supelem.findall('firstDay'): cmlenz@263: territories = elem.attrib['territories'].split() cmlenz@263: if territory in territories or any([r in territories for r in regions]): cmlenz@263: week_data['first_day'] = weekdays[elem.attrib['day']] cmlenz@263: cmlenz@263: for elem in supelem.findall('weekendStart'): cmlenz@263: territories = elem.attrib['territories'].split() cmlenz@263: if territory in territories or any([r in territories for r in regions]): cmlenz@263: week_data['weekend_start'] = weekdays[elem.attrib['day']] cmlenz@263: cmlenz@263: for elem in supelem.findall('weekendEnd'): cmlenz@263: territories = elem.attrib['territories'].split() cmlenz@263: if territory in territories or any([r in territories for r in regions]): cmlenz@263: week_data['weekend_end'] = weekdays[elem.attrib['day']] cmlenz@263: cmlenz@263: zone_formats = data.setdefault('zone_formats', {}) jruigrok@471: for elem in tree.findall('.//timeZoneNames/gmtFormat'): cmlenz@381: if 'draft' not in elem.attrib and 'alt' not in elem.attrib: cmlenz@263: zone_formats['gmt'] = unicode(elem.text).replace('{0}', '%s') cmlenz@263: break jruigrok@471: for elem in tree.findall('.//timeZoneNames/regionFormat'): cmlenz@381: if 'draft' not in elem.attrib and 'alt' not in elem.attrib: cmlenz@263: zone_formats['region'] = unicode(elem.text).replace('{0}', '%s') cmlenz@263: break jruigrok@471: for elem in tree.findall('.//timeZoneNames/fallbackFormat'): cmlenz@381: if 'draft' not in elem.attrib and 'alt' not in elem.attrib: cmlenz@263: zone_formats['fallback'] = unicode(elem.text) \ cmlenz@263: .replace('{0}', '%(0)s').replace('{1}', '%(1)s') cmlenz@263: break cmlenz@263: cmlenz@263: 
time_zones = data.setdefault('time_zones', {}) jruigrok@471: for elem in tree.findall('.//timeZoneNames/zone'): cmlenz@263: info = {} cmlenz@263: city = elem.findtext('exemplarCity') cmlenz@263: if city: cmlenz@263: info['city'] = unicode(city) cmlenz@263: for child in elem.findall('long/*'): cmlenz@263: info.setdefault('long', {})[child.tag] = unicode(child.text) cmlenz@263: for child in elem.findall('short/*'): cmlenz@263: info.setdefault('short', {})[child.tag] = unicode(child.text) cmlenz@263: time_zones[elem.attrib['type']] = info cmlenz@263: cmlenz@263: meta_zones = data.setdefault('meta_zones', {}) jruigrok@471: for elem in tree.findall('.//timeZoneNames/metazone'): cmlenz@263: info = {} cmlenz@263: city = elem.findtext('exemplarCity') cmlenz@263: if city: cmlenz@263: info['city'] = unicode(city) cmlenz@263: for child in elem.findall('long/*'): cmlenz@263: info.setdefault('long', {})[child.tag] = unicode(child.text) cmlenz@263: for child in elem.findall('short/*'): cmlenz@263: info.setdefault('short', {})[child.tag] = unicode(child.text) cmlenz@263: info['common'] = elem.findtext('commonlyUsed') == 'true' cmlenz@263: meta_zones[elem.attrib['type']] = info cmlenz@263: jruigrok@471: for calendar in tree.findall('.//calendars/calendar'): cmlenz@263: if calendar.attrib['type'] != 'gregorian': cmlenz@263: # TODO: support other calendar types cmlenz@263: continue cmlenz@263: cmlenz@263: months = data.setdefault('months', {}) cmlenz@263: for ctxt in calendar.findall('months/monthContext'): cmlenz@381: ctxt_type = ctxt.attrib['type'] cmlenz@381: ctxts = months.setdefault(ctxt_type, {}) cmlenz@263: for width in ctxt.findall('monthWidth'): cmlenz@381: width_type = width.attrib['type'] cmlenz@381: widths = ctxts.setdefault(width_type, {}) cmlenz@381: for elem in width.getiterator(): cmlenz@381: if elem.tag == 'month': cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and int(elem.attrib['type']) in widths: cmlenz@381: continue cmlenz@381: 
widths[int(elem.attrib.get('type'))] = unicode(elem.text) cmlenz@381: elif elem.tag == 'alias': cmlenz@381: ctxts[width_type] = Alias( cmlenz@381: _translate_alias(['months', ctxt_type, width_type], cmlenz@381: elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: days = data.setdefault('days', {}) cmlenz@263: for ctxt in calendar.findall('days/dayContext'): cmlenz@381: ctxt_type = ctxt.attrib['type'] cmlenz@381: ctxts = days.setdefault(ctxt_type, {}) cmlenz@263: for width in ctxt.findall('dayWidth'): cmlenz@381: width_type = width.attrib['type'] cmlenz@381: widths = ctxts.setdefault(width_type, {}) cmlenz@381: for elem in width.getiterator(): cmlenz@381: if elem.tag == 'day': cmlenz@381: dtype = weekdays[elem.attrib['type']] cmlenz@381: if ('draft' in elem.attrib or 'alt' not in elem.attrib) \ cmlenz@381: and dtype in widths: cmlenz@381: continue cmlenz@381: widths[dtype] = unicode(elem.text) cmlenz@381: elif elem.tag == 'alias': cmlenz@381: ctxts[width_type] = Alias( cmlenz@381: _translate_alias(['days', ctxt_type, width_type], cmlenz@381: elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: quarters = data.setdefault('quarters', {}) cmlenz@263: for ctxt in calendar.findall('quarters/quarterContext'): cmlenz@381: ctxt_type = ctxt.attrib['type'] cmlenz@263: ctxts = quarters.setdefault(ctxt.attrib['type'], {}) cmlenz@263: for width in ctxt.findall('quarterWidth'): cmlenz@381: width_type = width.attrib['type'] cmlenz@381: widths = ctxts.setdefault(width_type, {}) cmlenz@381: for elem in width.getiterator(): cmlenz@381: if elem.tag == 'quarter': cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and int(elem.attrib['type']) in widths: cmlenz@381: continue cmlenz@381: widths[int(elem.attrib['type'])] = unicode(elem.text) cmlenz@381: elif elem.tag == 'alias': cmlenz@381: ctxts[width_type] = Alias( cmlenz@381: _translate_alias(['quarters', ctxt_type, width_type], cmlenz@381: elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: 
eras = data.setdefault('eras', {}) cmlenz@263: for width in calendar.findall('eras/*'): cmlenz@381: width_type = NAME_MAP[width.tag] cmlenz@381: widths = eras.setdefault(width_type, {}) cmlenz@381: for elem in width.getiterator(): cmlenz@381: if elem.tag == 'era': cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and int(elem.attrib['type']) in widths: cmlenz@381: continue cmlenz@381: widths[int(elem.attrib.get('type'))] = unicode(elem.text) cmlenz@381: elif elem.tag == 'alias': cmlenz@381: eras[width_type] = Alias( cmlenz@381: _translate_alias(['eras', width_type], cmlenz@381: elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: # AM/PM cmlenz@263: periods = data.setdefault('periods', {}) cmlenz@263: for elem in calendar.findall('am'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.tag in periods: cmlenz@263: continue cmlenz@263: periods[elem.tag] = unicode(elem.text) cmlenz@263: for elem in calendar.findall('pm'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.tag in periods: cmlenz@263: continue cmlenz@263: periods[elem.tag] = unicode(elem.text) cmlenz@263: cmlenz@263: date_formats = data.setdefault('date_formats', {}) cmlenz@381: for format in calendar.findall('dateFormats'): cmlenz@381: for elem in format.getiterator(): cmlenz@381: if elem.tag == 'dateFormatLength': cmlenz@381: if 'draft' in elem.attrib and \ cmlenz@381: elem.attrib.get('type') in date_formats: cmlenz@381: continue cmlenz@381: try: cmlenz@381: date_formats[elem.attrib.get('type')] = \ cmlenz@381: dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern'))) cmlenz@381: except ValueError, e: cmlenz@381: print>>sys.stderr, 'ERROR: %s' % e cmlenz@381: elif elem.tag == 'alias': cmlenz@381: date_formats = Alias(_translate_alias( cmlenz@381: ['date_formats'], elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: time_formats = data.setdefault('time_formats', {}) cmlenz@381: for 
format in calendar.findall('timeFormats'): cmlenz@381: for elem in format.getiterator(): cmlenz@381: if elem.tag == 'timeFormatLength': cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib.get('type') in time_formats: cmlenz@381: continue cmlenz@381: try: cmlenz@381: time_formats[elem.attrib.get('type')] = \ cmlenz@381: dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern'))) cmlenz@381: except ValueError, e: cmlenz@381: print>>sys.stderr, 'ERROR: %s' % e cmlenz@381: elif elem.tag == 'alias': cmlenz@381: time_formats = Alias(_translate_alias( cmlenz@381: ['time_formats'], elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: datetime_formats = data.setdefault('datetime_formats', {}) cmlenz@381: for format in calendar.findall('dateTimeFormats'): cmlenz@381: for elem in format.getiterator(): cmlenz@381: if elem.tag == 'dateTimeFormatLength': cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib.get('type') in datetime_formats: cmlenz@381: continue cmlenz@381: try: cmlenz@381: datetime_formats[elem.attrib.get('type')] = \ cmlenz@381: unicode(elem.findtext('dateTimeFormat/pattern')) cmlenz@381: except ValueError, e: cmlenz@381: print>>sys.stderr, 'ERROR: %s' % e cmlenz@381: elif elem.tag == 'alias': cmlenz@381: datetime_formats = Alias(_translate_alias( cmlenz@381: ['datetime_formats'], elem.attrib['path']) cmlenz@381: ) cmlenz@263: cmlenz@263: # cmlenz@263: cmlenz@263: number_symbols = data.setdefault('number_symbols', {}) jruigrok@471: for elem in tree.findall('.//numbers/symbols/*'): fschwarz@512: if ('draft' in elem.attrib or 'alt' in elem.attrib): fschwarz@512: continue cmlenz@263: number_symbols[elem.tag] = unicode(elem.text) cmlenz@263: cmlenz@263: decimal_formats = data.setdefault('decimal_formats', {}) jruigrok@471: for elem in tree.findall('.//decimalFormats/decimalFormatLength'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and 
elem.attrib.get('type') in decimal_formats: cmlenz@263: continue cmlenz@263: pattern = unicode(elem.findtext('decimalFormat/pattern')) cmlenz@263: decimal_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) cmlenz@263: cmlenz@263: scientific_formats = data.setdefault('scientific_formats', {}) jruigrok@471: for elem in tree.findall('.//scientificFormats/scientificFormatLength'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib.get('type') in scientific_formats: cmlenz@263: continue cmlenz@263: pattern = unicode(elem.findtext('scientificFormat/pattern')) cmlenz@263: scientific_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) cmlenz@263: cmlenz@263: currency_formats = data.setdefault('currency_formats', {}) jruigrok@471: for elem in tree.findall('.//currencyFormats/currencyFormatLength'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib.get('type') in currency_formats: cmlenz@263: continue cmlenz@263: pattern = unicode(elem.findtext('currencyFormat/pattern')) cmlenz@263: currency_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) cmlenz@263: cmlenz@263: percent_formats = data.setdefault('percent_formats', {}) jruigrok@471: for elem in tree.findall('.//percentFormats/percentFormatLength'): cmlenz@381: if ('draft' in elem.attrib or 'alt' in elem.attrib) \ cmlenz@381: and elem.attrib.get('type') in percent_formats: cmlenz@263: continue cmlenz@263: pattern = unicode(elem.findtext('percentFormat/pattern')) cmlenz@263: percent_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) cmlenz@263: cmlenz@263: currency_names = data.setdefault('currency_names', {}) cmlenz@263: currency_symbols = data.setdefault('currency_symbols', {}) jruigrok@471: for elem in tree.findall('.//currencies/currency'): cmlenz@391: code = elem.attrib['type'] cmlenz@391: # TODO: support plural rules for currency name selection cmlenz@391: for name in 
elem.findall('displayName'): cmlenz@391: if ('draft' in name.attrib or 'count' in name.attrib) \ cmlenz@391: and code in currency_names: cmlenz@391: continue cmlenz@391: currency_names[code] = unicode(name.text) cmlenz@391: # TODO: support choice patterns for currency symbol selection cmlenz@391: symbol = elem.find('symbol') cmlenz@391: if symbol is not None and 'draft' not in symbol.attrib \ cmlenz@391: and 'choice' not in symbol.attrib: cmlenz@391: currency_symbols[code] = unicode(symbol.text) cmlenz@263: cmlenz@263: outfile = open(os.path.join(destdir, 'localedata', stem + '.dat'), 'wb') cmlenz@263: try: cmlenz@263: pickle.dump(data, outfile, 2) cmlenz@263: finally: cmlenz@263: outfile.close() cmlenz@263: cmlenz@381: cmlenz@263: if __name__ == '__main__': cmlenz@263: main()