# -*- coding: utf-8 -*-
#
# Copyright (C) 2007 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""Reading and writing of files in the ``gettext`` PO (portable object)
format.

:see: `The Format of PO Files
       <http://www.gnu.org/software/gettext/manual/gettext.html#PO-Files>`_
"""

from datetime import date, datetime
import os
import re
try:
    set
except NameError:
    from sets import Set as set
from textwrap import wrap

from babel import __version__ as VERSION
from babel.messages.catalog import Catalog
from babel.util import LOCALTZ

__all__ = ['unescape', 'denormalize', 'read_po', 'escape', 'normalize',
           'write_po']

# Matches one escape sequence produced by `escape`; group 1 is the character
# following the backslash.
_UNESCAPE_RE = re.compile(r'\\([\\trn"])')

# Escape characters whose decoded form differs from the character itself.
_UNESCAPE_MAP = {'n': '\n', 't': '\t', 'r': '\r'}


def unescape(string):
    r"""Reverse `escape` the given string.

    The surrounding double quotes are dropped and the escape sequences
    ``\\``, ``\t``, ``\r``, ``\n`` and ``\"`` are decoded.

    >>> print unescape('"Say:\\n  \\"hello, world!\\"\\n"')
    Say:
      "hello, world!"
    <BLANKLINE>

    All escapes are decoded in a single pass, so an escaped backslash
    followed by ``n`` is not mistaken for a newline:

    >>> print unescape(r'"C:\\new"')
    C:\new

    :param string: the string to unescape
    :return: the unescaped string
    :rtype: `str` or `unicode`
    """
    def _decode(match):
        char = match.group(1)
        # A backslash or double quote simply stands for itself.
        return _UNESCAPE_MAP.get(char, char)
    # Decoding sequentially (first ``\\``, then ``\n`` ...) would decode the
    # output of an earlier step a second time; a single regex pass does not.
    return _UNESCAPE_RE.sub(_decode, string[1:-1])


def denormalize(string):
    r"""Reverse the normalization done by the `normalize` function.

    >>> print denormalize(r'''""
    ... "Say:\n"
    ... "  \"hello, world!\"\n"''')
    Say:
      "hello, world!"
    <BLANKLINE>

    >>> print denormalize(r'''""
    ... "Say:\n"
    ... "  \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"''')
    Say:
      "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    :param string: the string to denormalize
    :return: the denormalized string
    :rtype: `unicode` or `str`
    """
    if not string.startswith('""'):
        return unescape(string)
    # Multi-line form: the first line is the empty ``""`` marker, every
    # following line is one quoted fragment of the value.
    return ''.join([unescape(line) for line in string.splitlines()[1:]])


def read_po(fileobj):
    """Read messages from a ``gettext`` PO (portable object) file from the given
    file-like object and return a `Catalog`.

    >>> from StringIO import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr ""
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] ""
    ... msgstr[1] ""
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 04, 01)

    >>> for message in catalog:
    ...     if message.id:
    ...         print (message.id, message.string)
    ...         print ' ', (message.locations, message.flags)
    ...         print ' ', (message.user_comments, message.auto_comments)
    (u'foo %(name)s', '')
      ([(u'main.py', 1)], set([u'fuzzy', u'python-format']))
      ([], [])
    ((u'bar', u'baz'), ('', ''))
      ([(u'main.py', 3)], set([]))
      ([u'A user comment'], [u'An auto comment'])

    :param fileobj: the file-like object to read the PO file from
    :return: the catalog holding the messages that were read
    :rtype: `Catalog`
    """
    catalog = Catalog()

    # State of the entry currently being collected; flushed into the catalog
    # by `_add_message` and then cleared in place.
    messages = []
    translations = []
    locations = []
    flags = []
    user_comments = []
    auto_comments = []
    in_msgid = in_msgstr = False

    def _add_message():
        # Order plural forms by their ``msgstr[N]`` index.
        translations.sort()
        if len(messages) > 1:
            msgid = tuple([denormalize(m) for m in messages])
        else:
            msgid = denormalize(messages[0])
        if len(translations) > 1:
            string = tuple([denormalize(t[1]) for t in translations])
        else:
            string = denormalize(translations[0][1])
        catalog.add(msgid, string, list(locations), set(flags),
                    list(auto_comments), list(user_comments))
        # The lists are shared with the enclosing scope, so empty them in
        # place rather than rebinding the names.
        del messages[:]; del translations[:]; del locations[:];
        del flags[:]; del auto_comments[:]; del user_comments[:]

    for line in fileobj.readlines():
        line = line.strip().decode(catalog.charset)
        if line.startswith('#'):
            in_msgid = in_msgstr = False
            # A comment line starts the next entry: flush the pending one.
            if messages:
                _add_message()
            if line[1:].startswith(':'):
                for location in line[2:].lstrip().split():
                    filename, lineno = location.split(':', 1)
                    locations.append((filename, int(lineno)))
            elif line[1:].startswith(','):
                for flag in line[2:].lstrip().split(','):
                    flags.append(flag.strip())
            elif line[1:].startswith('.'):
                # These are called auto-comments
                comment = line[2:].strip()
                if comment:
                    # Just check that we're not adding empty comments
                    auto_comments.append(comment)
            else:
                # These are called user comments
                user_comments.append(line[1:].strip())
        elif line.startswith('msgid_plural'):
            in_msgid = True
            messages.append(line[12:].lstrip())
        elif line.startswith('msgid'):
            in_msgid = True
            if messages:
                _add_message()
            messages.append(line[5:].lstrip())
        elif line.startswith('msgstr'):
            in_msgid = False
            in_msgstr = True
            msg = line[6:].lstrip()
            if msg.startswith('['):
                # Split on the first ``]`` only: the translation text itself
                # may contain further brackets.
                idx, msg = msg[1:].split(']', 1)
                translations.append([int(idx), msg.lstrip()])
            else:
                translations.append([0, msg])
        elif line.startswith('"'):
            # Continuation line of a multi-line msgid or msgstr.
            if in_msgid:
                messages[-1] += u'\n' + line.rstrip()
            elif in_msgstr:
                translations[-1][1] += u'\n' + line.rstrip()

    if messages:
        _add_message()
    return catalog


WORD_SEP = re.compile('('
    r'\s+|'                                 # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'   # em-dash
')')


def escape(string):
    r"""Escape the given string so that it can be included in double-quoted
    strings in ``PO`` files.

    >>> escape('''Say:
    ...   "hello, world!"
    ... ''')
    '"Say:\\n  \\"hello, world!\\"\\n"'

    :param string: the string to escape
    :return: the escaped string
    :rtype: `str` or `unicode`
    """
    # Backslashes must be doubled first so that the backslashes introduced by
    # the later replacements are not doubled again.
    return '"%s"' % string.replace('\\', '\\\\') \
                          .replace('\t', '\\t') \
                          .replace('\r', '\\r') \
                          .replace('\n', '\\n') \
                          .replace('\"', '\\"')


def normalize(string, width=76):
    r"""Convert a string into a format that is appropriate for .po files.

    >>> print normalize('''Say:
    ...   "hello, world!"
    ... ''', width=None)
    ""
    "Say:\n"
    "  \"hello, world!\"\n"

    >>> print normalize('''Say:
    ...   "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32)
    ""
    "Say:\n"
    "  \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    :return: the normalized string
    :rtype: `unicode`
    """
    if width and width > 0:
        lines = []
        for line in string.splitlines(True):
            if len(escape(line)) > width:
                # Reverse the chunks so they can be consumed with ``pop()``.
                chunks = WORD_SEP.split(line)
                chunks.reverse()
                while chunks:
                    buf = []
                    size = 2  # the two surrounding double quotes
                    while chunks:
                        length = len(escape(chunks[-1])) - 2
                        if size + length < width:
                            buf.append(chunks.pop())
                            size += length
                        else:
                            if not buf:
                                # handle long chunks by putting them on a
                                # separate line
                                buf.append(chunks.pop())
                            break
                    lines.append(u''.join(buf))
            else:
                lines.append(line)
    else:
        lines = string.splitlines(True)

    if len(lines) <= 1:
        return escape(string)

    # Remove empty trailing line
    if lines and not lines[-1]:
        del lines[-1]
        lines[-1] += '\n'
    return u'""\n' + u'\n'.join([escape(part) for part in lines])


def write_po(fileobj, catalog, width=76, no_location=False, omit_header=False,
             sort_output=False, sort_by_file=False):
    r"""Write a ``gettext`` PO (portable object) template file for a given
    message catalog to the provided file-like object.

    >>> catalog = Catalog()
    >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
    >>> from StringIO import StringIO
    >>> buf = StringIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print buf.getvalue()
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: write the messages sorted by their message ID
    :param sort_by_file: write the messages sorted by their locations; only
                         considered when `sort_output` is not set
    """
    wrapping = width and width > 0

    def _normalize(key):
        return normalize(key, width=width).encode(catalog.charset,
                                                  'backslashreplace')

    def _write(text):
        if isinstance(text, unicode):
            text = text.encode(catalog.charset)
        fileobj.write(text)

    def _wrap(text):
        # Return `text` as a list of output lines.  With wrapping disabled
        # the text is only split at embedded line breaks, so that every
        # resulting line still receives its comment prefix.
        if wrapping:
            return wrap(text, width, break_long_words=False)
        return text.splitlines()

    messages = list(catalog)
    if sort_output:
        messages.sort(lambda x,y: cmp(x.id, y.id))
    elif sort_by_file:
        messages.sort(lambda x,y: cmp(x.locations, y.locations))

    for message in messages:
        if not message.id: # This is the header "message"
            if omit_header:
                continue
            comment_header = catalog.header_comment
            if wrapping:
                lines = []
                for line in comment_header.splitlines():
                    lines += wrap(line, width=width, subsequent_indent='# ',
                                  break_long_words=False)
                comment_header = u'\n'.join(lines) + u'\n'
            elif not comment_header.endswith('\n'):
                # Keep the following ``msgid`` line from being glued to the
                # last line of the header comment.
                comment_header += u'\n'
            _write(comment_header)

        if message.user_comments:
            for comment in message.user_comments:
                for line in _wrap(comment):
                    _write('# %s\n' % line.strip())

        if message.auto_comments:
            for comment in message.auto_comments:
                for line in _wrap(comment):
                    _write('#. %s\n' % line.strip())

        if not no_location:
            locs = u' '.join([u'%s:%d' % (filename.replace(os.sep, '/'), lineno)
                              for filename, lineno in message.locations])
            # `_wrap` always yields a list of lines; iterating the joined
            # string directly would emit one comment per character.
            for line in _wrap(locs):
                _write('#: %s\n' % line.strip())
        if message.flags:
            # The leading empty item yields the ``#, flag, flag`` form.
            _write('#%s\n' % ', '.join([''] + list(message.flags)))

        if isinstance(message.id, (list, tuple)):
            _write('msgid %s\n' % _normalize(message.id[0]))
            _write('msgid_plural %s\n' % _normalize(message.id[1]))
            for i, string in enumerate(message.string):
                _write('msgstr[%d] %s\n' % (i, _normalize(string)))
        else:
            _write('msgid %s\n' % _normalize(message.id))
            _write('msgstr %s\n' % _normalize(message.string or ''))
        _write('\n')