Mercurial > babel > old > mirror
diff scripts/import_cldr.py @ 377:841858d5b567
Implement support for aliases in the CLDR data. Closes #68. Also, update to CLDR 1.6, and a much improved `dump_data` script.
author | cmlenz |
---|---|
date | Mon, 07 Jul 2008 14:49:16 +0000 |
parents | c22f292731be |
children | 88e3589ca8df |
line wrap: on
line diff
--- a/scripts/import_cldr.py +++ b/scripts/import_cldr.py @@ -16,6 +16,7 @@ from optparse import OptionParser import os import pickle +import re import sys try: from xml.etree.ElementTree import parse @@ -26,6 +27,7 @@ sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), '..')) from babel import dates, numbers +from babel.localedata import Alias weekdays = {'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5, 'sun': 6} @@ -36,6 +38,7 @@ def any(iterable): return filter(None, list(iterable)) + def _text(elem): buf = [elem.text or ''] for child in elem: @@ -43,6 +46,35 @@ buf.append(elem.tail or '') return u''.join(filter(None, buf)).strip() + +NAME_RE = re.compile(r"^\w+$") +TYPE_ATTR_RE = re.compile(r"^\w+\[@type='(.*?)'\]$") + +NAME_MAP = { + 'dateFormats': 'date_formats', + 'dateTimeFormats': 'datetime_formats', + 'eraAbbr': 'abbreviated', + 'eraNames': 'wide', + 'eraNarrow': 'narrow', + 'timeFormats': 'time_formats' +} + +def _translate_alias(ctxt, path): + parts = path.split('/') + keys = ctxt[:] + for part in parts: + if part == '..': + keys.pop() + else: + match = TYPE_ATTR_RE.match(part) + if match: + keys.append(match.group(1)) + else: + assert NAME_RE.match(part) + keys.append(NAME_MAP.get(part, part)) + return keys + + def main(): parser = OptionParser(usage='%prog path/to/cldr') options, args = parser.parse_args() @@ -109,6 +141,8 @@ stem, ext = os.path.splitext(filename) if ext != '.xml': continue + #if stem != 'root': + # break tree = parse(os.path.join(srcdir, 'main', filename)) data = {} @@ -133,25 +167,29 @@ territories = data.setdefault('territories', {}) for elem in tree.findall('//territories/territory'): - if 'draft' in elem.attrib and elem.attrib['type'] in territories: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib['type'] in territories: continue territories[elem.attrib['type']] = _text(elem) languages = data.setdefault('languages', {}) for elem in tree.findall('//languages/language'): - if 'draft' in elem.attrib and elem.attrib['type'] in languages: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib['type'] in languages: continue languages[elem.attrib['type']] = _text(elem) variants = data.setdefault('variants', {}) for elem in tree.findall('//variants/variant'): - if 'draft' in elem.attrib and elem.attrib['type'] in variants: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib['type'] in variants: continue variants[elem.attrib['type']] = _text(elem) scripts = data.setdefault('scripts', {}) for elem in tree.findall('//scripts/script'): - if 'draft' in elem.attrib and elem.attrib['type'] in scripts: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib['type'] in scripts: continue scripts[elem.attrib['type']] = _text(elem) @@ -182,15 +220,15 @@ zone_formats = data.setdefault('zone_formats', {}) for elem in tree.findall('//timeZoneNames/gmtFormat'): - if 'draft' not in elem.attrib: + if 'draft' not in elem.attrib and 'alt' not in elem.attrib: zone_formats['gmt'] = unicode(elem.text).replace('{0}', '%s') break for elem in tree.findall('//timeZoneNames/regionFormat'): - if 'draft' not in elem.attrib: + if 'draft' not in elem.attrib and 'alt' not in elem.attrib: zone_formats['region'] = unicode(elem.text).replace('{0}', '%s') break for elem in tree.findall('//timeZoneNames/fallbackFormat'): - if 'draft' not in elem.attrib: + if 'draft' not in elem.attrib and 'alt' not in elem.attrib: zone_formats['fallback'] = unicode(elem.text) \ .replace('{0}', '%(0)s').replace('{1}', '%(1)s') break @@ -227,88 +265,141 @@ months = data.setdefault('months', {}) for ctxt in calendar.findall('months/monthContext'): - ctxts = months.setdefault(ctxt.attrib['type'], {}) + ctxt_type = ctxt.attrib['type'] + ctxts = months.setdefault(ctxt_type, {}) for width in ctxt.findall('monthWidth'): - widths = ctxts.setdefault(width.attrib['type'], {}) - for elem in width.findall('month'): - if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: - continue - widths[int(elem.attrib.get('type'))] = unicode(elem.text) + width_type = width.attrib['type'] + widths = ctxts.setdefault(width_type, {}) + for elem in width.getiterator(): + if elem.tag == 'month': + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and int(elem.attrib['type']) in widths: + continue + widths[int(elem.attrib.get('type'))] = unicode(elem.text) + elif elem.tag == 'alias': + ctxts[width_type] = Alias( + _translate_alias(['months', ctxt_type, width_type], + elem.attrib['path']) + ) days = data.setdefault('days', {}) for ctxt in calendar.findall('days/dayContext'): - ctxts = days.setdefault(ctxt.attrib['type'], {}) + ctxt_type = ctxt.attrib['type'] + ctxts = days.setdefault(ctxt_type, {}) for width in ctxt.findall('dayWidth'): - widths = ctxts.setdefault(width.attrib['type'], {}) - for elem in width.findall('day'): - dtype = weekdays[elem.attrib['type']] - if 'draft' in elem.attrib and dtype in widths: - continue - widths[dtype] = unicode(elem.text) + width_type = width.attrib['type'] + widths = ctxts.setdefault(width_type, {}) + for elem in width.getiterator(): + if elem.tag == 'day': + dtype = weekdays[elem.attrib['type']] + if ('draft' in elem.attrib or 'alt' not in elem.attrib) \ + and dtype in widths: + continue + widths[dtype] = unicode(elem.text) + elif elem.tag == 'alias': + ctxts[width_type] = Alias( + _translate_alias(['days', ctxt_type, width_type], + elem.attrib['path']) + ) quarters = data.setdefault('quarters', {}) for ctxt in calendar.findall('quarters/quarterContext'): + ctxt_type = ctxt.attrib['type'] ctxts = quarters.setdefault(ctxt.attrib['type'], {}) for width in ctxt.findall('quarterWidth'): - widths = ctxts.setdefault(width.attrib['type'], {}) - for elem in width.findall('quarter'): - if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: - continue - widths[int(elem.attrib.get('type'))] = unicode(elem.text) + width_type = width.attrib['type'] + widths = ctxts.setdefault(width_type, {}) + for elem in width.getiterator(): + if elem.tag == 'quarter': + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and int(elem.attrib['type']) in widths: + continue + widths[int(elem.attrib['type'])] = unicode(elem.text) + elif elem.tag == 'alias': + ctxts[width_type] = Alias( + _translate_alias(['quarters', ctxt_type, width_type], + elem.attrib['path']) + ) eras = data.setdefault('eras', {}) for width in calendar.findall('eras/*'): - ewidth = { - 'eraAbbr': 'abbreviated', - 'eraNames': 'wide', - 'eraNarrow': 'narrow', - }[width.tag] - widths = eras.setdefault(ewidth, {}) - for elem in width.findall('era'): - if 'draft' in elem.attrib and int(elem.attrib['type']) in widths: - continue - widths[int(elem.attrib.get('type'))] = unicode(elem.text) + width_type = NAME_MAP[width.tag] + widths = eras.setdefault(width_type, {}) + for elem in width.getiterator(): + if elem.tag == 'era': + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and int(elem.attrib['type']) in widths: + continue + widths[int(elem.attrib.get('type'))] = unicode(elem.text) + elif elem.tag == 'alias': + eras[width_type] = Alias( + _translate_alias(['eras', width_type], + elem.attrib['path']) + ) # AM/PM periods = data.setdefault('periods', {}) for elem in calendar.findall('am'): - if 'draft' in elem.attrib and elem.tag in periods: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.tag in periods: continue periods[elem.tag] = unicode(elem.text) for elem in calendar.findall('pm'): - if 'draft' in elem.attrib and elem.tag in periods: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.tag in periods: continue periods[elem.tag] = unicode(elem.text) date_formats = data.setdefault('date_formats', {}) - for elem in calendar.findall('dateFormats/dateFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in date_formats: - continue - try: - date_formats[elem.attrib.get('type')] = \ - dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern'))) - except ValueError, e: - print>>sys.stderr, 'ERROR: %s' % e + for format in calendar.findall('dateFormats'): + for elem in format.getiterator(): + if elem.tag == 'dateFormatLength': + if 'draft' in elem.attrib and \ + elem.attrib.get('type') in date_formats: + continue + try: + date_formats[elem.attrib.get('type')] = \ + dates.parse_pattern(unicode(elem.findtext('dateFormat/pattern'))) + except ValueError, e: + print>>sys.stderr, 'ERROR: %s' % e + elif elem.tag == 'alias': + date_formats = Alias(_translate_alias( + ['date_formats'], elem.attrib['path']) + ) time_formats = data.setdefault('time_formats', {}) - for elem in calendar.findall('timeFormats/timeFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in time_formats: - continue - try: - time_formats[elem.attrib.get('type')] = \ - dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern'))) - except ValueError, e: - print>>sys.stderr, 'ERROR: %s' % e + for format in calendar.findall('timeFormats'): + for elem in format.getiterator(): + if elem.tag == 'timeFormatLength': + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib.get('type') in time_formats: + continue + try: + time_formats[elem.attrib.get('type')] = \ + dates.parse_pattern(unicode(elem.findtext('timeFormat/pattern'))) + except ValueError, e: + print>>sys.stderr, 'ERROR: %s' % e + elif elem.tag == 'alias': + time_formats = Alias(_translate_alias( + ['time_formats'], elem.attrib['path']) + ) datetime_formats = data.setdefault('datetime_formats', {}) - for elem in calendar.findall('dateTimeFormats/dateTimeFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in datetime_formats: - continue - try: - datetime_formats[elem.attrib.get('type')] = \ - unicode(elem.findtext('dateTimeFormat/pattern')) - except ValueError, e: - print>>sys.stderr, 'ERROR: %s' % e + for format in calendar.findall('dateTimeFormats'): + for elem in format.getiterator(): + if elem.tag == 'dateTimeFormatLength': + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib.get('type') in datetime_formats: + continue + try: + datetime_formats[elem.attrib.get('type')] = \ + unicode(elem.findtext('dateTimeFormat/pattern')) + except ValueError, e: + print>>sys.stderr, 'ERROR: %s' % e + elif elem.tag == 'alias': + datetime_formats = Alias(_translate_alias( + ['datetime_formats'], elem.attrib['path']) + ) # <numbers> @@ -318,28 +409,32 @@ decimal_formats = data.setdefault('decimal_formats', {}) for elem in tree.findall('//decimalFormats/decimalFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in decimal_formats: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib.get('type') in decimal_formats: continue pattern = unicode(elem.findtext('decimalFormat/pattern')) decimal_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) scientific_formats = data.setdefault('scientific_formats', {}) for elem in tree.findall('//scientificFormats/scientificFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in scientific_formats: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib.get('type') in scientific_formats: continue pattern = unicode(elem.findtext('scientificFormat/pattern')) scientific_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) currency_formats = data.setdefault('currency_formats', {}) for elem in tree.findall('//currencyFormats/currencyFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in currency_formats: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib.get('type') in currency_formats: continue pattern = unicode(elem.findtext('currencyFormat/pattern')) currency_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) percent_formats = data.setdefault('percent_formats', {}) for elem in tree.findall('//percentFormats/percentFormatLength'): - if 'draft' in elem.attrib and elem.attrib.get('type') in percent_formats: + if ('draft' in elem.attrib or 'alt' in elem.attrib) \ + and elem.attrib.get('type') in percent_formats: continue pattern = unicode(elem.findtext('percentFormat/pattern')) percent_formats[elem.attrib.get('type')] = numbers.parse_pattern(pattern) @@ -360,5 +455,6 @@ finally: outfile.close() + if __name__ == '__main__': main()