Mercurial > babel > mirror
annotate scripts/import_cldr.py @ 10:4130d9c6cb34 trunk
Both Babel's [source:trunk/babel/catalog/frontend.py frontend] and [source:trunk/babel/catalog/extract.py extract] now handle keyword indices. Also added an extra boolean flag so that the default keywords defined by Babel are not included in the keywords to search for when extracting strings.
author | palgarvio |
---|---|
date | Wed, 30 May 2007 22:48:11 +0000 |
parents | 9ed6cf5975a1 |
children | 368650dc3423 |
rev | line source |
---|---|
1 | 1 #!/usr/bin/env python |
2 # -*- coding: utf-8 -*- | |
3 # | |
4 # Copyright (C) 2007 Edgewall Software | |
5 # All rights reserved. | |
6 # | |
7 # This software is licensed as described in the file COPYING, which | |
8 # you should have received as part of this distribution. The terms | |
9 # are also available at http://babel.edgewall.org/wiki/License. | |
10 # | |
11 # This software consists of voluntary contributions made by many | |
12 # individuals. For the exact contribution history, see the revision | |
13 # history and logs, available at http://babel.edgewall.org/log/. | |
14 | |
15 import copy | |
16 from optparse import OptionParser | |
17 import os | |
18 import pickle | |
19 import sys | |
20 try: | |
21 from xml.etree.ElementTree import parse | |
22 except ImportError: | |
23 from elementtree.ElementTree import parse | |
24 | |
9 | 25 from babel import dates, numbers |
1 | 26 |
8
29f6f9a90f14
Pull in some supplemental data from the CLDR, for things like the first day of the week.
cmlenz
parents:
1
diff
changeset
|
# Maps the three-letter day abbreviations used as attribute values in the
# CLDR XML files to day numbers, Monday = 1 through Sunday = 7.
weekdays = {
    'mon': 1,
    'tue': 2,
    'wed': 3,
    'thu': 4,
    'fri': 5,
    'sat': 6,
    'sun': 7,
}
29f6f9a90f14
Pull in some supplemental data from the CLDR, for things like the first day of the week.
cmlenz
parents:
1
diff
changeset
|
29 |
29f6f9a90f14
Pull in some supplemental data from the CLDR, for things like the first day of the week.
cmlenz
parents:
1
diff
changeset
|
# Fallback for Python versions whose builtins lack ``any`` (added in 2.5).
try:
    any
except NameError:
    def any(iterable):
        """Return True if at least one item of `iterable` is true.

        Stops at the first true item and always returns a real boolean,
        matching the contract of the builtin this stands in for.
        """
        for item in iterable:
            if item:
                return True
        return False
29f6f9a90f14
Pull in some supplemental data from the CLDR, for things like the first day of the week.
cmlenz
parents:
1
diff
changeset
|
35 |
1 | 36 def _parent(locale): |
37 parts = locale.split('_') | |
38 if len(parts) == 1: | |
39 return 'root' | |
40 else: | |
41 return '_'.join(parts[:-1]) | |
42 | |
43 def _text(elem): | |
44 buf = [elem.text or ''] | |
45 for child in elem: | |
46 buf.append(_text(child)) | |
47 buf.append(elem.tail or '') | |
48 return u''.join(filter(None, buf)).strip() | |
49 | |
def _log(message):
    """Write `message` followed by a newline to stderr."""
    sys.stderr.write('%s\n' % (message,))


def _resolve_territory_containment(sup):
    """Map every territory code to the set of groups that contain it.

    Reads the ``territoryContainment/group`` elements of the supplemental
    data tree `sup`. Groups are visited in sorted order, and a territory
    inherits the containers already recorded for its group at that point.
    """
    groups = {}
    for elem in sup.findall('.//territoryContainment/group'):
        groups[elem.attrib['type']] = elem.attrib['contains'].split()

    containment = {}
    for group, members in sorted(groups.items()):
        for member in members:
            containers = containment.setdefault(member, set())
            if group in containment:
                containers.update(containment[group])
            containers.add(group)
    return containment


def _import_display_names(tree, data, key, path):
    """Collect the display names matched by `path` into ``data[key]``.

    Entries are keyed by the element's ``type`` attribute. An element that
    carries a ``draft`` attribute never replaces an existing entry.
    """
    names = data.setdefault(key, {})
    for elem in tree.findall(path):
        code = elem.attrib['type']
        if 'draft' in elem.attrib and code in names:
            continue
        names[code] = _text(elem)


def _import_week_data(week_elem, week_data, territory, regions):
    """Fill `week_data` from the ``weekData`` element of the supplemental data.

    A child element applies when its ``territories`` attribute lists either
    `territory` itself or one of the `regions` containing it. Later matching
    elements override earlier ones.
    """
    scope = set(regions)
    scope.add(territory)
    fields = [
        ('minDays', 'min_days', 'count', int),
        ('firstDay', 'first_day', 'day', weekdays.__getitem__),
        ('weekendStart', 'weekend_start', 'day', weekdays.__getitem__),
        ('weekendEnd', 'weekend_end', 'day', weekdays.__getitem__),
    ]
    for tag, key, attr, convert in fields:
        for elem in week_elem.findall(tag):
            if scope.intersection(elem.attrib['territories'].split()):
                week_data[key] = convert(elem.attrib[attr])


def _import_widths(calendar, target, context_path, width_tag, item_tag,
                   to_key):
    """Collect context/width-nested calendar names (months, days, quarters).

    The result is stored as ``target[context][width][to_key(type)]``. An
    element that carries a ``draft`` attribute never replaces an existing
    entry.
    """
    for ctxt in calendar.findall(context_path):
        by_width = target.setdefault(ctxt.attrib['type'], {})
        for width in ctxt.findall(width_tag):
            entries = by_width.setdefault(width.attrib['type'], {})
            for elem in width.findall(item_tag):
                key = to_key(elem.attrib['type'])
                if 'draft' in elem.attrib and key in entries:
                    continue
                entries[key] = unicode(elem.text)


def _import_formats(root, target, length_path, pattern_path, convert=None,
                    skip_invalid=False):
    """Collect format patterns, keyed by the ``type`` of the length element.

    `convert`, when given, is applied to the pattern text before it is
    stored. With `skip_invalid` set, a ``ValueError`` raised while reading or
    converting a pattern is reported on stderr and that entry is skipped;
    otherwise the error propagates.
    """
    for elem in root.findall(length_path):
        length = elem.attrib.get('type')
        if 'draft' in elem.attrib and length in target:
            continue
        try:
            value = unicode(elem.findtext(pattern_path))
            if convert is not None:
                value = convert(value)
        except ValueError:
            if not skip_invalid:
                raise
            # sys.exc_info() keeps this valid on every Python 2 release the
            # script supports.
            _log(sys.exc_info()[1])
            continue
        target[length] = value


def main():
    """Convert the CLDR XML locale files into pickled per-locale data files.

    Usage: ``import_cldr.py path/to/cldr``. This is a Python 2 script.

    The single command-line argument is the CLDR directory. It must contain
    ``supplemental/supplementalData.xml`` and a ``main`` directory holding
    ``root.xml`` plus one XML file per locale. For every ``.xml`` file in
    ``main`` a ``<stem>.dat`` pickle (protocol 2) is written to
    ``../babel/localedata`` relative to the directory of this script.

    Files are processed shortest name first, after ``root.xml``, so that a
    locale's parent (see ``_parent``) has been processed before the locale
    itself; each locale starts from a deep copy of its parent's data and
    overrides it with its own.
    """
    parser = OptionParser(usage='%prog path/to/cldr')
    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error('incorrect number of arguments')

    srcdir = args[0]
    destdir = os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])),
                           '..', 'babel', 'localedata')

    sup = parse(os.path.join(srcdir, 'supplemental', 'supplementalData.xml'))

    # The territory containment mapping and the week data element are the
    # same for every locale, so look them up once.
    territory_containment = _resolve_territory_containment(sup)
    week_elem = sup.find('.//weekData')

    filenames = os.listdir(os.path.join(srcdir, 'main'))
    filenames.remove('root.xml')
    filenames.sort(key=len)
    filenames.insert(0, 'root.xml')

    dicts = {}

    for filename in filenames:
        _log('Processing input file %r' % filename)
        stem, ext = os.path.splitext(filename)
        if ext != '.xml':
            continue

        data = {}
        if stem != 'root':
            data.update(copy.deepcopy(dicts[_parent(stem)]))
        tree = parse(os.path.join(srcdir, 'main', filename))

        language = None
        elem = tree.find('.//identity/language')
        if elem is not None:
            language = elem.attrib['type']
        _log(' Language: %r' % language)

        elem = tree.find('.//identity/territory')
        if elem is not None:
            territory = elem.attrib['type']
        else:
            # Locales without a territory (root, language-only and
            # language+script locales) use '001', the UN M.49 code CLDR uses
            # for "World", so they pick up the world-wide defaults instead
            # of nothing or a previous file's values.
            territory = '001'
        _log(' Territory: %r' % territory)
        regions = territory_containment.get(territory, [])
        _log(' Regions: %r' % regions)

        # <localeDisplayNames>

        _import_display_names(tree, data, 'territories',
                              './/territories/territory')
        _import_display_names(tree, data, 'languages',
                              './/languages/language')
        _import_display_names(tree, data, 'variants', './/variants/variant')
        _import_display_names(tree, data, 'scripts', './/scripts/script')

        # <dates>

        _import_week_data(week_elem, data.setdefault('week_data', {}),
                          territory, regions)

        # Key by the zone identifier; the tag is 'zone' for every match and
        # would make all zones overwrite one another.
        # NOTE(review): findtext() returns None when the child is missing,
        # which unicode() turns into u'None' here and in the sections below.
        time_zones = data.setdefault('time_zones', {})
        for elem in tree.findall('.//timeZoneNames/zone'):
            time_zones[elem.attrib['type']] = \
                unicode(elem.findtext('displayName'))

        for calendar in tree.findall('.//calendars/calendar'):
            if calendar.attrib['type'] != 'gregorian':
                # TODO: support other calendar types
                continue

            _import_widths(calendar, data.setdefault('months', {}),
                           'months/monthContext', 'monthWidth', 'month', int)
            _import_widths(calendar, data.setdefault('days', {}),
                           'days/dayContext', 'dayWidth', 'day',
                           weekdays.__getitem__)
            _import_widths(calendar, data.setdefault('quarters', {}),
                           'quarters/quarterContext', 'quarterWidth',
                           'quarter', int)

            eras = data.setdefault('eras', {})
            for width in calendar.findall('eras/*'):
                ewidth = {'eraNames': 'wide',
                          'eraAbbr': 'abbreviated'}[width.tag]
                widths = eras.setdefault(ewidth, {})
                for elem in width.findall('era'):
                    era = int(elem.attrib['type'])
                    if 'draft' in elem.attrib and era in widths:
                        continue
                    widths[era] = unicode(elem.text)

            # AM/PM
            periods = data.setdefault('periods', {})
            for tag in ('am', 'pm'):
                for elem in calendar.findall(tag):
                    if 'draft' in elem.attrib and elem.tag in periods:
                        continue
                    periods[elem.tag] = unicode(elem.text)

            _import_formats(calendar, data.setdefault('date_formats', {}),
                            'dateFormats/dateFormatLength',
                            'dateFormat/pattern',
                            convert=dates.parse_pattern, skip_invalid=True)
            _import_formats(calendar, data.setdefault('time_formats', {}),
                            'timeFormats/timeFormatLength',
                            'timeFormat/pattern',
                            convert=dates.parse_pattern, skip_invalid=True)

        # <numbers>

        number_symbols = data.setdefault('number_symbols', {})
        for elem in tree.findall('.//numbers/symbols/*'):
            number_symbols[elem.tag] = unicode(elem.text)

        _import_formats(tree, data.setdefault('decimal_formats', {}),
                        './/decimalFormats/decimalFormatLength',
                        'decimalFormat/pattern',
                        convert=numbers.parse_pattern)
        _import_formats(tree, data.setdefault('scientific_formats', {}),
                        './/scientificFormats/scientificFormatLength',
                        'scientificFormat/pattern')
        _import_formats(tree, data.setdefault('currency_formats', {}),
                        './/currencyFormats/currencyFormatLength',
                        'currencyFormat/pattern')
        _import_formats(tree, data.setdefault('percent_formats', {}),
                        './/percentFormats/percentFormatLength',
                        'percentFormat/pattern')

        currencies = data.setdefault('currencies', {})
        for elem in tree.findall('.//currencies/currency'):
            currencies[elem.attrib['type']] = {
                'display_name': unicode(elem.findtext('displayName')),
                'symbol': unicode(elem.findtext('symbol'))
            }

        # Remember the merged data so that child locales can inherit it.
        dicts[stem] = data
        outfile = open(os.path.join(destdir, stem + '.dat'), 'wb')
        try:
            pickle.dump(data, outfile, 2)
        finally:
            outfile.close()
285 | |
# Run the import only when executed as a script, not when imported.
if __name__ == '__main__':
    main()