# -*- coding: utf-8 -*-
#
# Copyright (C) 2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""Reading and writing of files in the ``gettext`` PO (portable object)
format.

:see: `The Format of PO Files
       <http://www.gnu.org/software/gettext/manual/gettext.html#PO-Files>`_
"""

from datetime import date, datetime
import os
import re

from babel import __version__ as VERSION
from babel.messages.catalog import Catalog, Message
from babel.util import set, wraptext, LOCALTZ

__all__ = ['read_po', 'write_po']
__docformat__ = 'restructuredtext en'

# A backslash followed by one of the characters that `escape` produces.
_UNESCAPE_RE = re.compile(r'\\([\\trn"])')
# Escape letters that stand for a control character; a backslash or a double
# quote simply stands for itself.
_UNESCAPE_MAP = {'n': '\n', 't': '\t', 'r': '\r'}


def unescape(string):
    r"""Reverse `escape` the given string.

    The surrounding double quotes are dropped and every escape sequence is
    decoded in a single left-to-right pass, so that an escaped backslash is
    never mistaken for the start of another escape sequence.

    >>> print unescape('"Say:\\n  \\"hello, world!\\"\\n"')
    Say:
      "hello, world!"
    <BLANKLINE>

    >>> unescape(r'"a\\nb"')
    'a\\nb'

    :param string: the string to unescape
    :return: the unescaped string
    :rtype: `str` or `unicode`
    """
    def _replace(match):
        char = match.group(1)
        return _UNESCAPE_MAP.get(char, char)
    return _UNESCAPE_RE.sub(_replace, string[1:-1])


def denormalize(string):
    r"""Reverse the normalization done by the `normalize` function.

    >>> print denormalize(r'''""
    ... "Say:\n"
    ... "  \"hello, world!\"\n"''')
    Say:
      "hello, world!"
    <BLANKLINE>

    >>> print denormalize(r'''""
    ... "Say:\n"
    ... "  \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"''')
    Say:
      "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    :param string: the string to denormalize
    :return: the denormalized string
    :rtype: `unicode` or `str`
    """
    if string.startswith('""'):
        # Multi-line form: the first line is the empty string marker, every
        # following line is a quoted fragment of the actual value.
        return ''.join([unescape(line) for line in string.splitlines()[1:]])
    return unescape(string)


def read_po(fileobj, locale=None, domain=None, ignore_obsolete=False):
    """Read messages from a ``gettext`` PO (portable object) file from the given
    file-like object and return a `Catalog`.

    >>> from StringIO import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr ""
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] ""
    ... msgstr[1] ""
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 04, 01)

    >>> for message in catalog:
    ...     if message.id:
    ...         print (message.id, message.string)
    ...         print ' ', (message.locations, message.flags)
    ...         print ' ', (message.user_comments, message.auto_comments)
    (u'foo %(name)s', '')
      ([(u'main.py', 1)], set([u'fuzzy', u'python-format']))
      ([], [])
    ((u'bar', u'baz'), ('', ''))
      ([(u'main.py', 3)], set([]))
      ([u'A user comment'], [u'An auto comment'])

    :param fileobj: the file-like object to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :return: the catalog populated with the messages read from the file
    :rtype: `Catalog`
    """
    catalog = Catalog(locale=locale, domain=domain)

    # Parser state. Scalars are wrapped in one-element lists so that the
    # nested helper functions can rebind them.
    counter = [0]       # number of messages added so far
    offset = [0]        # zero-based line index of the current ``msgid``
    messages = []       # raw msgid (and msgid_plural) of the current entry
    translations = []   # [index, raw msgstr] pairs of the current entry
    locations = []
    flags = []
    user_comments = []
    auto_comments = []
    obsolete = [False]  # whether the current entry is an obsolete one
    in_msgid = [False]
    in_msgstr = [False]

    def _add_message():
        """Turn the accumulated state into a `Message`, store it in the
        catalog, and reset the state for the next entry."""
        translations.sort()
        if len(messages) > 1:
            msgid = tuple([denormalize(m) for m in messages])
        else:
            msgid = denormalize(messages[0])
        if isinstance(msgid, (list, tuple)):
            # Pad missing plural forms with empty strings
            string = []
            for idx in range(catalog.num_plurals):
                try:
                    string.append(translations[idx])
                except IndexError:
                    string.append((idx, ''))
            string = tuple([denormalize(t[1]) for t in string])
        else:
            string = denormalize(translations[0][1])
        # Pass copies of the accumulators, because they are cleared below
        message = Message(msgid, string, list(locations), set(flags),
                          list(auto_comments), list(user_comments),
                          lineno=offset[0] + 1)
        if obsolete[0]:
            if not ignore_obsolete:
                catalog.obsolete[msgid] = message
        else:
            catalog[msgid] = message
        del messages[:]
        del translations[:]
        del locations[:]
        del flags[:]
        del auto_comments[:]
        del user_comments[:]
        obsolete[0] = False
        counter[0] += 1

    def _process_message_line(lineno, line, is_obsolete=False):
        """Handle a ``msgid``, ``msgid_plural``, ``msgstr`` or string
        continuation line; `is_obsolete` is true for ``#~`` lines."""
        if line.startswith('msgid_plural'):
            in_msgid[0] = True
            messages.append(line[12:].lstrip())
        elif line.startswith('msgid'):
            # Flush the previous entry *before* recording the new offset,
            # so that the previous message keeps its own line number
            if messages:
                _add_message()
            in_msgid[0] = True
            offset[0] = lineno
            obsolete[0] = is_obsolete
            messages.append(line[5:].lstrip())
            return
        elif line.startswith('msgstr'):
            in_msgid[0] = False
            in_msgstr[0] = True
            msg = line[6:].lstrip()
            if msg.startswith('['):
                idx, msg = msg[1:].split(']', 1)
                translations.append([int(idx), msg.lstrip()])
            else:
                translations.append([0, msg])
        elif line.startswith('"'):
            if in_msgid[0]:
                messages[-1] += u'\n' + line.rstrip()
            elif in_msgstr[0]:
                translations[-1][1] += u'\n' + line.rstrip()
        if is_obsolete:
            obsolete[0] = True

    for lineno, line in enumerate(fileobj.readlines()):
        line = line.strip()
        if not isinstance(line, unicode):
            line = line.decode(catalog.charset)
        if line.startswith('#~'):
            # Obsolete entry. This is handled before the generic comment
            # branch so that the msgid/msgstr state survives from one ``#~``
            # line to the next; otherwise the continuation lines of
            # multi-line obsolete entries would be lost.
            _process_message_line(lineno, line[2:].lstrip(), is_obsolete=True)
        elif line.startswith('#'):
            in_msgid[0] = in_msgstr[0] = False
            if messages and translations:
                _add_message()
            marker = line[1:2]
            if marker == ':':
                for location in line[2:].lstrip().split():
                    pos = location.rfind(':')
                    if pos >= 0:
                        try:
                            loc_lineno = int(location[pos + 1:])
                        except ValueError:
                            continue
                        locations.append((location[:pos], loc_lineno))
            elif marker == ',':
                for flag in line[2:].lstrip().split(','):
                    flags.append(flag.strip())
            elif marker == '.':
                # These are called auto-comments
                comment = line[2:].strip()
                if comment: # Just check that we're not adding empty comments
                    auto_comments.append(comment)
            else:
                # These are called user comments
                user_comments.append(line[1:].strip())
        else:
            _process_message_line(lineno, line)

    if messages:
        _add_message()

    # No actual messages found, but there was some info in comments, from which
    # we'll construct an empty header message
    elif not counter[0] and (flags or user_comments or auto_comments):
        messages.append(u'')
        translations.append([0, u''])
        _add_message()

    return catalog


# Pattern used by `normalize` to split a line into chunks at which it may be
# wrapped.
WORD_SEP = re.compile('('
    r'\s+|'                                 # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'   # em-dash
')')


def escape(string):
    r"""Escape the given string so that it can be included in double-quoted
    strings in ``PO`` files.

    >>> escape('''Say:
    ...   "hello, world!"
    ... ''')
    '"Say:\\n  \\"hello, world!\\"\\n"'

    :param string: the string to escape
    :return: the escaped string
    :rtype: `str` or `unicode`
    """
    # Backslashes must be escaped first, so that the backslashes introduced
    # by the following replacements are not escaped again.
    return '"%s"' % string.replace('\\', '\\\\') \
                          .replace('\t', '\\t') \
                          .replace('\r', '\\r') \
                          .replace('\n', '\\n') \
                          .replace('\"', '\\"')


def normalize(string, prefix='', width=76):
    r"""Convert a string into a format that is appropriate for .po files.

    >>> print normalize('''Say:
    ...   "hello, world!"
    ... ''', width=None)
    ""
    "Say:\n"
    "  \"hello, world!\"\n"

    >>> print normalize('''Say:
    ...   "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32)
    ""
    "Say:\n"
    "  \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    :return: the normalized string
    :rtype: `unicode`
    """
    if width and width > 0:
        prefixlen = len(prefix)
        lines = []
        for line in string.splitlines(True):
            if len(escape(line)) + prefixlen > width:
                # Reversed, so that chunks can be consumed with ``pop()``
                chunks = WORD_SEP.split(line)
                chunks.reverse()
                while chunks:
                    buf = []
                    size = 2  # the two enclosing double quotes
                    while chunks:
                        # NOTE(review): the prefix length is counted once per
                        # chunk rather than once per line -- kept as is, since
                        # changing it would alter the wrapping of the output
                        l = len(escape(chunks[-1])) - 2 + prefixlen
                        if size + l < width:
                            buf.append(chunks.pop())
                            size += l
                        else:
                            if not buf:
                                # handle long chunks by putting them on a
                                # separate line
                                buf.append(chunks.pop())
                            break
                    lines.append(u''.join(buf))
            else:
                lines.append(line)
    else:
        lines = string.splitlines(True)

    if len(lines) <= 1:
        return escape(string)

    # Remove empty trailing line
    if lines and not lines[-1]:
        del lines[-1]
        lines[-1] += '\n'
    return u'""\n' + u'\n'.join([(prefix + escape(l)) for l in lines])


def write_po(fileobj, catalog, width=76, no_location=False, omit_header=False,
             sort_output=False, sort_by_file=False, ignore_obsolete=False,
             include_previous=False):
    r"""Write a ``gettext`` PO (portable object) template file for a given
    message catalog to the provided file-like object.

    >>> catalog = Catalog()
    >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
    >>> from StringIO import StringIO
    >>> buf = StringIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print buf.getvalue()
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    """
    def _normalize(key, prefix=''):
        """Normalize `key` for output and encode it in the catalog charset."""
        return normalize(key, prefix=prefix, width=width) \
            .encode(catalog.charset, 'backslashreplace')

    def _write(text):
        """Write `text` to the output, encoding unicode strings first."""
        if isinstance(text, unicode):
            text = text.encode(catalog.charset)
        fileobj.write(text)

    def _write_comment(comment, prefix=''):
        """Write `comment` as one or more ``#<prefix> `` comment lines."""
        # xgettext always wraps comments even if --no-wrap is passed;
        # provide the same behaviour
        if width and width > 0:
            _width = width
        else:
            _width = 76
        for line in wraptext(comment, _width):
            _write('#%s %s\n' % (prefix, line.strip()))

    def _write_message(message, prefix=''):
        """Write the msgid/msgstr lines of `message`, each line starting
        with `prefix`."""
        if isinstance(message.id, (list, tuple)):
            _write('%smsgid %s\n' % (prefix, _normalize(message.id[0], prefix)))
            _write('%smsgid_plural %s\n' % (
                prefix, _normalize(message.id[1], prefix)
            ))

            # Emit one msgstr per plural form of the catalog, padding
            # missing translations with empty strings
            for idx in range(catalog.num_plurals):
                try:
                    string = message.string[idx]
                except IndexError:
                    string = ''
                _write('%smsgstr[%d] %s\n' % (
                    prefix, idx, _normalize(string, prefix)
                ))
        else:
            _write('%smsgid %s\n' % (prefix, _normalize(message.id, prefix)))
            _write('%smsgstr %s\n' % (
                prefix, _normalize(message.string or '', prefix)
            ))

    messages = list(catalog)
    if sort_output:
        messages.sort()
    elif sort_by_file:
        messages.sort(lambda x,y: cmp(x.locations, y.locations))

    for message in messages:
        if not message.id: # This is the header "message"
            if omit_header:
                continue
            comment_header = catalog.header_comment
            if width and width > 0:
                lines = []
                for line in comment_header.splitlines():
                    lines += wraptext(line, width=width,
                                      subsequent_indent='# ')
                comment_header = u'\n'.join(lines) + u'\n'
            _write(comment_header)

        for comment in message.user_comments:
            _write_comment(comment)
        for comment in message.auto_comments:
            _write_comment(comment, prefix='.')

        if not no_location:
            locs = u' '.join([u'%s:%d' % (filename.replace(os.sep, '/'), lineno)
                              for filename, lineno in message.locations])
            _write_comment(locs, prefix=':')
        if message.flags:
            _write('#%s\n' % ', '.join([''] + list(message.flags)))

        if message.previous_id and include_previous:
            _write_comment('msgid %s' % _normalize(message.previous_id[0]),
                           prefix='|')
            if len(message.previous_id) > 1:
                _write_comment('msgid_plural %s' % _normalize(
                    message.previous_id[1]
                ), prefix='|')

        _write_message(message)
        _write('\n')

    if not ignore_obsolete:
        for message in catalog.obsolete.values():
            for comment in message.user_comments:
                _write_comment(comment)
            _write_message(message, prefix='#~ ')
            _write('\n')