cmlenz@3: # -*- coding: utf-8 -*-
cmlenz@3: #
jruigrok@532: # Copyright (C) 2007-2011 Edgewall Software
cmlenz@3: # All rights reserved.
cmlenz@3: #
cmlenz@3: # This software is licensed as described in the file COPYING, which
cmlenz@3: # you should have received as part of this distribution. The terms
cmlenz@3: # are also available at http://babel.edgewall.org/wiki/License.
cmlenz@3: #
cmlenz@3: # This software consists of voluntary contributions made by many
cmlenz@3: # individuals. For the exact contribution history, see the revision
cmlenz@3: # history and logs, available at http://babel.edgewall.org/log/.
cmlenz@3: 
cmlenz@3: """Reading and writing of files in the ``gettext`` PO (portable object)
cmlenz@3: format.
cmlenz@3: 
cmlenz@3: :see: `The Format of PO Files
cmlenz@3:        <http://www.gnu.org/software/gettext/manual/gettext.html#PO-Files>`_
cmlenz@3: """
cmlenz@3: 
fschwarz@533: from datetime import datetime
cmlenz@136: import os
cmlenz@3: import re
cmlenz@3: 
cmlenz@201: from babel.messages.catalog import Catalog, Message
fschwarz@533: from babel.util import wraptext
cmlenz@3: 
# Names exported by ``from <this module> import *``.
__all__ = ['read_po', 'write_po']
# Markup language used by the docstrings in this module.
__docformat__ = 'restructuredtext en'
cmlenz@160: 
def unescape(string):
    r"""Reverse `escape` the given string.

    The first and last character of `string` (the surrounding double quotes)
    are dropped, then the escape sequences ``\\``, ``\t``, ``\r``, ``\n`` and
    ``\"`` are decoded.

    >>> print unescape('"Say:\\n  \\"hello, world!\\"\\n"')
    Say:
      "hello, world!"
    <BLANKLINE>

    Decoding happens in a single left-to-right pass, so an escaped backslash
    that is followed by a letter is not mistaken for a second escape:

    >>> print unescape(r'"C:\\new"')
    C:\new

    :param string: the quoted string to unescape
    :return: the unescaped string
    :rtype: `str` or `unicode`
    """
    decoded = {'n': '\n', 't': '\t', 'r': '\r'}

    def _replace_escape(match):
        char = match.group(1)
        # A backslash or double quote simply stands for itself
        return decoded.get(char, char)

    # One pass over the text: every backslash consumes exactly the character
    # that follows it, which chained str.replace() calls cannot guarantee.
    return re.sub(r'\\([\\trn"])', _replace_escape, string[1:-1])
cmlenz@160: 
def denormalize(string):
    r"""Reverse the normalization done by the `normalize` function.

    >>> print denormalize(r'''""
    ... "Say:\n"
    ... "  \"hello, world!\"\n"''')
    Say:
      "hello, world!"
    <BLANKLINE>

    >>> print denormalize(r'''""
    ... "Say:\n"
    ... "  \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"''')
    Say:
      "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    A string continued over several lines does not have to start with an
    empty ``""`` line:

    >>> print denormalize(r'''"Say: "
    ... "hello"''')
    Say: hello

    :param string: the string to denormalize
    :return: the denormalized string
    :rtype: `unicode` or `str`
    """
    if '\n' not in string:
        return unescape(string)

    quoted_lines = string.splitlines()
    if string.startswith('""'):
        # The leading empty string only introduces the continuation lines
        quoted_lines = quoted_lines[1:]
    # Every line is a quoted string of its own and must be unescaped
    # separately; unescaping the whole text at once would keep the inner
    # quotes and line breaks.
    return ''.join([unescape(line) for line in quoted_lines])
cmlenz@3: 
def read_po(fileobj, locale=None, domain=None, ignore_obsolete=False):
    """Read messages from a ``gettext`` PO (portable object) file from the given
    file-like object and return a `Catalog`.

    >>> from StringIO import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr ""
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] ""
    ... msgstr[1] ""
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 04, 01)

    >>> for message in catalog:
    ...     if message.id:
    ...         print (message.id, message.string)
    ...         print ' ', (message.locations, message.flags)
    ...         print ' ', (message.user_comments, message.auto_comments)
    (u'foo %(name)s', '')
      ([(u'main.py', 1)], set([u'fuzzy', u'python-format']))
      ([], [])
    ((u'bar', u'baz'), ('', ''))
      ([(u'main.py', 3)], set([]))
      ([u'A user comment'], [u'An auto comment'])

    :param fileobj: the file-like object to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :return: a catalog object representing the parsed PO file
    :rtype: `Catalog`
    """
    catalog = Catalog(locale=locale, domain=domain)

    # Parser state shared with the nested helpers. Scalars are kept in
    # one-element lists so that the helpers can update them in place.
    counter = [0]           # number of entries added so far
    offset = [0]            # zero-based line index of the current ``msgid``
    messages = []           # raw msgid (and msgid_plural) of the current entry
    translations = []       # [plural index, raw msgstr] pairs
    locations = []
    flags = []
    user_comments = []
    auto_comments = []
    obsolete = [False]      # whether the current entry is marked with ``#~``
    context = []            # raw msgctxt lines of the current entry
    in_msgid = [False]
    in_msgstr = [False]
    in_msgctxt = [False]

    def _add_message():
        # Turn the collected state into a `Message`, store it in the catalog
        # and reset the state for the next entry.
        translations.sort()
        if len(messages) > 1:
            msgid = tuple([denormalize(m) for m in messages])
        else:
            msgid = denormalize(messages[0])
        if isinstance(msgid, (list, tuple)):
            # Fill plural forms missing from the input with empty strings
            string = []
            for idx in range(catalog.num_plurals):
                try:
                    string.append(translations[idx])
                except IndexError:
                    string.append((idx, ''))
            string = tuple([denormalize(t[1]) for t in string])
        else:
            string = denormalize(translations[0][1])
        if context:
            msgctxt = denormalize('\n'.join(context))
        else:
            msgctxt = None
        # `offset` is zero-based, the reported line number is one-based
        message = Message(msgid, string, list(locations), set(flags),
                          auto_comments, user_comments, lineno=offset[0] + 1,
                          context=msgctxt)
        if obsolete[0]:
            if not ignore_obsolete:
                catalog.obsolete[msgid] = message
        else:
            catalog[msgid] = message
        del messages[:]
        del translations[:]
        del context[:]
        del locations[:]
        del flags[:]
        del auto_comments[:]
        del user_comments[:]
        obsolete[0] = False
        counter[0] += 1

    def _process_message_line(lineno, line):
        # Handle one keyword or continuation line of an entry; `line` has
        # already been stripped of any ``#~`` marker.
        if line.startswith('msgid_plural'):
            in_msgid[0] = True
            in_msgctxt[0] = False
            msg = line[12:].lstrip()
            messages.append(msg)
        elif line.startswith('msgid'):
            in_msgid[0] = True
            in_msgctxt[0] = False
            txt = line[5:].lstrip()
            if messages:
                _add_message()
            # Record the position only after the previous entry has been
            # flushed, otherwise that entry would get this line's number.
            offset[0] = lineno
            messages.append(txt)
        elif line.startswith('msgstr'):
            in_msgid[0] = False
            in_msgctxt[0] = False
            in_msgstr[0] = True
            msg = line[6:].lstrip()
            if msg.startswith('['):
                idx, msg = msg[1:].split(']', 1)
                translations.append([int(idx), msg.lstrip()])
            else:
                translations.append([0, msg])
        elif line.startswith('msgctxt'):
            if messages:
                _add_message()
            in_msgid[0] = in_msgstr[0] = False
            # Remember that following quoted lines continue the context
            in_msgctxt[0] = True
            context.append(line[7:].lstrip())
        elif line.startswith('"'):
            if in_msgid[0]:
                messages[-1] += u'\n' + line.rstrip()
            elif in_msgstr[0]:
                translations[-1][1] += u'\n' + line.rstrip()
            elif in_msgctxt[0]:
                context.append(line.rstrip())

    for lineno, line in enumerate(fileobj.readlines()):
        line = line.strip()
        if not isinstance(line, unicode):
            line = line.decode(catalog.charset)
        if line.startswith('#'):
            marker = line[1:2]
            body = line[2:].lstrip()
            # A ``#~`` line that continues the obsolete entry currently being
            # collected (quoted continuation, plural id or translation) must
            # neither reset the parser state nor flush the entry.
            continues_obsolete = (
                marker == '~' and obsolete[0] and
                (body.startswith('"') or body.startswith('msgstr') or
                 body.startswith('msgid_plural'))
            )
            if not continues_obsolete:
                in_msgid[0] = in_msgstr[0] = in_msgctxt[0] = False
                if messages and translations:
                    _add_message()
            if marker == ':':
                for location in body.split():
                    pos = location.rfind(':')
                    if pos >= 0:
                        try:
                            location_lineno = int(location[pos + 1:])
                        except ValueError:
                            # Not a "file:line" reference, skip it
                            continue
                        locations.append((location[:pos], location_lineno))
            elif marker == ',':
                for flag in body.split(','):
                    flags.append(flag.strip())
            elif marker == '~':
                obsolete[0] = True
                _process_message_line(lineno, body)
            elif marker == '.':
                # These are called auto-comments
                comment = line[2:].strip()
                if comment: # Just check that we're not adding empty comments
                    auto_comments.append(comment)
            else:
                # These are called user comments
                user_comments.append(line[1:].strip())
        else:
            _process_message_line(lineno, line)

    if messages:
        _add_message()

    # No actual messages found, but there was some info in comments, from which
    # we'll construct an empty header message
    elif not counter[0] and (flags or user_comments or auto_comments):
        messages.append(u'')
        translations.append([0, u''])
        _add_message()

    return catalog
cmlenz@3: 
# Pattern used to split a line into the chunks between which it may be wrapped.
# The alternatives are wrapped in one capturing group so that ``split()`` also
# returns the separators themselves.
WORD_SEP = re.compile('(%s)' % '|'.join([
    r'\s+',                                  # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])',  # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)',   # em-dash
]))
cmlenz@26: 
def escape(string):
    r"""Quote the given string for use in a ``PO`` file.

    Backslashes, tabs, carriage returns, newlines and double quotes are
    replaced by their backslash escapes and the result is wrapped in double
    quotes.

    >>> escape('''Say:
    ...   "hello, world!"
    ... ''')
    '"Say:\\n  \\"hello, world!\\"\\n"'

    :param string: the string to escape
    :return: the escaped string
    :rtype: `str` or `unicode`
    """
    # The backslash has to come first so that the backslashes introduced by
    # the later replacements are not escaped a second time.
    substitutions = (
        ('\\', '\\\\'),
        ('\t', '\\t'),
        ('\r', '\\r'),
        ('\n', '\\n'),
        ('\"', '\\"'),
    )
    escaped = string
    for plain, sequence in substitutions:
        escaped = escaped.replace(plain, sequence)
    return '"%s"' % escaped
cmlenz@26: 
def normalize(string, prefix='', width=76):
    r"""Render a string the way it is written to .po files.

    Text that ends up as a single line is returned as one quoted string;
    anything else is returned as an empty ``""`` followed by one quoted
    string per line.

    >>> print normalize('''Say:
    ...   "hello, world!"
    ... ''', width=None)
    ""
    "Say:\n"
    "  \"hello, world!\"\n"

    >>> print normalize('''Say:
    ...   "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32)
    ""
    "Say:\n"
    "  \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    :return: the normalized string
    :rtype: `unicode`
    """
    if not (width and width > 0):
        # Wrapping disabled: only break at the existing line ends
        pieces = string.splitlines(True)
    else:
        overhead = len(prefix)
        pieces = []
        for raw_line in string.splitlines(True):
            if len(escape(raw_line)) + overhead <= width:
                pieces.append(raw_line)
                continue
            # Too long: split into chunks and greedily refill output lines.
            # The list is reversed so that chunks can be taken with pop().
            pending = WORD_SEP.split(raw_line)
            pending.reverse()
            while pending:
                current = []
                used = 2  # the two surrounding quote characters
                while pending:
                    cost = len(escape(pending[-1])) - 2 + overhead
                    if used + cost >= width:
                        if not current:
                            # handle long chunks by putting them on a
                            # separate line
                            current.append(pending.pop())
                        break
                    current.append(pending.pop())
                    used += cost
                pieces.append(u''.join(current))

    if len(pieces) <= 1:
        return escape(string)

    # Remove empty trailing line
    if not pieces[-1]:
        del pieces[-1]
        pieces[-1] += '\n'
    quoted = [prefix + escape(piece) for piece in pieces]
    return u'""\n' + u'\n'.join(quoted)
cmlenz@26: 
def write_po(fileobj, catalog, width=76, no_location=False, omit_header=False,
             sort_output=False, sort_by_file=False, ignore_obsolete=False,
             include_previous=False):
    r"""Serialize a message catalog to the given file-like object in the
    ``gettext`` PO (portable object) format.

    >>> catalog = Catalog()
    >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    <Message...>
    >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
    <Message...>
    >>> from StringIO import StringIO
    >>> buf = StringIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print buf.getvalue()
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    """
    wrap_enabled = bool(width and width > 0)
    # xgettext always wraps comments even if --no-wrap is passed;
    # provide the same behaviour
    if wrap_enabled:
        comment_width = width
    else:
        comment_width = 76

    def _emit(text):
        # Unicode text is encoded with the catalog charset before writing
        if isinstance(text, unicode):
            text = text.encode(catalog.charset, 'backslashreplace')
        fileobj.write(text)

    def _quote(text, prefix=''):
        return normalize(text, prefix=prefix, width=width)

    def _emit_comment(comment, prefix=''):
        # `prefix` is the character following the ``#`` of the comment
        for piece in wraptext(comment, comment_width):
            _emit('#%s %s\n' % (prefix, piece.strip()))

    def _emit_entry(message, prefix=''):
        # Write the msgctxt/msgid/msgstr lines of one message
        if message.context:
            _emit('%smsgctxt %s\n' % (prefix,
                                      _quote(message.context, prefix)))

        if not isinstance(message.id, (list, tuple)):
            _emit('%smsgid %s\n' % (prefix, _quote(message.id, prefix)))
            _emit('%smsgstr %s\n' % (
                prefix, _quote(message.string or '', prefix)
            ))
            return

        _emit('%smsgid %s\n' % (prefix, _quote(message.id[0], prefix)))
        _emit('%smsgid_plural %s\n' % (
            prefix, _quote(message.id[1], prefix)
        ))
        for idx in range(catalog.num_plurals):
            # Plural forms without a translation are written as empty strings
            try:
                translated = message.string[idx]
            except IndexError:
                translated = ''
            _emit('%smsgstr[%d] %s\n' % (
                prefix, idx, _quote(translated, prefix)
            ))

    entries = list(catalog)
    if sort_output:
        entries.sort()
    elif sort_by_file:
        entries.sort(key=lambda entry: entry.locations)

    for message in entries:
        if not message.id: # This is the header "message"
            if omit_header:
                continue
            header = catalog.header_comment
            if wrap_enabled:
                wrapped = []
                for header_line in header.splitlines():
                    wrapped.extend(wraptext(header_line, width=width,
                                            subsequent_indent='# '))
                header = u'\n'.join(wrapped) + u'\n'
            _emit(header)

        for comment in message.user_comments:
            _emit_comment(comment)
        for comment in message.auto_comments:
            _emit_comment(comment, prefix='.')

        if not no_location:
            references = [
                u'%s:%d' % (filename.replace(os.sep, '/'), lineno)
                for filename, lineno in message.locations
            ]
            _emit_comment(u' '.join(references), prefix=':')
        if message.flags:
            _emit('#%s\n' % ', '.join([''] + list(message.flags)))

        if message.previous_id and include_previous:
            _emit_comment('msgid %s' % _quote(message.previous_id[0]),
                          prefix='|')
            if len(message.previous_id) > 1:
                _emit_comment('msgid_plural %s' % _quote(
                    message.previous_id[1]
                ), prefix='|')

        _emit_entry(message)
        _emit('\n')

    if ignore_obsolete:
        return
    for message in catalog.obsolete.values():
        for comment in message.user_comments:
            _emit_comment(comment)
        _emit_entry(message, prefix='#~ ')
        _emit('\n')