# -*- coding: utf-8 -*-
# Copyright (C) 2004-2006 Edgewall Software
# Copyright (C) 2004 Daniel Lundin <>
# Copyright (C) 2005-2006 Christopher Lenz <>
# Copyright (C) 2006 Christian Boos <>
# All rights reserved.
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at
# Author: Daniel Lundin <>
#         Christopher Lenz <>
#         Christian Boos <>

"""
The `trac.mimeview` module centralizes the intelligence related to
file metadata, principally concerning the `type` (MIME type) of the content
and, if relevant, concerning the text encoding (charset) used by the content.

There are primarily two approaches for getting the MIME type of a given file:
 * taking advantage of existing conventions for the file name
 * examining the file content and applying various heuristics

The module also knows how to convert the file content from one type
to another type.

In some cases, only the `url` pointing to the file's content is actually
needed, which is why we avoid reading the file's content when it's not needed.

The actual `content` to be converted might be a `unicode` object,
but it can also be the raw byte string (`str`) object, or simply
an object that can be `read()`.
"""

import re
from StringIO import StringIO

from trac.config import IntOption, ListOption, Option
from trac.core import *
from trac.util import sorted
from trac.util.text import to_utf8, to_unicode
from trac.util.markup import escape, Markup, Fragment, html

# Names exported by a star-import of this module.
__all__ = ['get_mimetype', 'is_binary', 'detect_unicode', 'Mimeview',
           'content_to_unicode']

# Some common MIME types and their associated keywords and/or file extensions.
# Each key is a MIME type; each value lists the keywords (file suffixes,
# well-known file names, editor mode names) that map to it.

KNOWN_MIME_TYPES = {
    'application/pdf':        ['pdf'],
    'application/postscript': ['ps'],
    'application/rtf':        ['rtf'],
    'application/x-sh':       ['sh'],
    'application/x-csh':      ['csh'],
    'application/x-troff':    ['nroff', 'roff', 'troff'],

    'image/x-icon':           ['ico'],
    'image/svg+xml':          ['svg'],
    'model/vrml':             ['vrml', 'wrl'],
    'text/css':               ['css'],
    'text/html':              ['html'],
    'text/plain':             ['txt', 'TXT', 'text', 'README', 'INSTALL',
                               'AUTHORS', 'COPYING', 'ChangeLog', 'RELEASE'],
    'text/xml':               ['xml'],
    'text/xsl':               ['xsl'],
    'text/x-csrc':            ['c', 'xs'],
    'text/x-chdr':            ['h'],
    'text/x-c++src':          ['cc', 'CC', 'cpp', 'C'],
    'text/x-c++hdr':          ['hh', 'HH', 'hpp', 'H'],
    'text/x-diff':            ['diff', 'patch'],
    'text/x-eiffel':          ['e'],
    'text/x-elisp':           ['el'],
    'text/x-fortran':         ['f'],
    'text/x-haskell':         ['hs'],
    'text/x-javascript':      ['js'],
    'text/x-objc':            ['m', 'mm'],
    'text/x-makefile':        ['make', 'mk',
                               'Makefile', 'makefile', 'GNUMakefile'],
    'text/x-pascal':          ['pas'],
    'text/x-perl':            ['pl', 'pm', 'PL', 'perl'],
    'text/x-php':             ['php', 'php3', 'php4'],
    'text/x-python':          ['py', 'python'],
    'text/x-pyrex':           ['pyx'],
    'text/x-ruby':            ['rb', 'ruby'],
    'text/x-scheme':          ['scm'],
    'text/x-textile':         ['txtl', 'textile'],
    'text/x-vba':             ['vb', 'vba', 'bas'],
    'text/x-verilog':         ['v', 'verilog'],
    'text/x-vhdl':            ['vhd'],
}

# Register in bulk the languages whose MIME type is simply
# `text/x-<name>` and whose only keyword/extension is `<name>` itself.

for x in ('ada asm asp awk idl inf java ksh lua '
          'm4 mail psp rfc rst sql tcl tex zsh').split():
    KNOWN_MIME_TYPES.setdefault('text/x-' + x, []).append(x)

# Default mapping from keywords/extensions to known MIME types:

MIME_MAP = {}
for t, exts in KNOWN_MIME_TYPES.items():
    # A MIME type is also registered as a keyword for itself, so that a
    # lookup by MIME type yields that same MIME type.
    MIME_MAP[t] = t
    for e in exts:
        MIME_MAP[e] = t

# Simple builtin autodetection from the content using a regexp.
# Group 1 is the interpreter named by a shebang line, group 2 the Emacs
# mode and group 3 the Vim syntax name.
# NOTE(review): inside the first character class, `.-_` is a character
# *range* (from '.' to '_'), not three literal characters -- confirm
# whether that permissiveness is intended before tightening it.
MODE_RE = re.compile(
    r"#!(?:[/\w.-_]+/)?(\w+)|"               # look for shebang
    r"-\*-\s*(?:mode:\s*)?([\w+-]+)\s*-\*-|" # look for Emacs' -*- mode -*-
    r"vim:.*?syntax=(\w+)"                   # look for VIM's syntax=<n>
    )

def get_mimetype(filename, content=None, mime_map=MIME_MAP):
    """Guess the most probable MIME type of a file with the given name.

    `filename` is either a filename (the lookup will then use the suffix)
    or some arbitrary keyword.

    `content` is either a `str` or an `unicode` string.

    `mime_map` maps keywords and file extensions to MIME types.

    The following sources are tried in order:
     1. the suffix of `filename`, looked up in `mime_map`
     2. the suffix of `filename`, using the standard `mimetypes` module
     3. the mode found by `MODE_RE` in the first 1000 characters of
        `content`, looked up in `mime_map`
     4. `is_binary(content)`, which yields 'application/octet-stream'

    Return the MIME type, or `None` if it could not be determined.
    """
    # A missing filename must not prevent the content based detection.
    filename = filename or ''
    suffix = filename.split('.')[-1]
    if suffix in mime_map:
        # 1) mimetype from the suffix, using the `mime_map`
        return mime_map[suffix]

    mimetype = None
    try:
        import mimetypes
        # 2) mimetype from the suffix, using the `mimetypes` module
        mimetype = mimetypes.guess_type(filename)[0]
    except Exception:
        # Best effort only: fall through to the content based detection.
        pass
    if not mimetype and content:
        match = MODE_RE.search(content[:1000])
        if match:
            # Only the Emacs mode name (group 2) is lower-cased.
            mode = match.group(1) or match.group(3) or \
                   match.group(2).lower()
            if mode in mime_map:
                # 3) mimetype from the content, using the `MODE_RE`
                return mime_map[mode]
        elif is_binary(content):
            # 4) mimetype from the content, using `is_binary`
            return 'application/octet-stream'
    return mimetype

def is_binary(data):
    """Detect binary content by checking the first thousand bytes for zeroes.

    Operate on either `str` or `unicode` strings.

    Return `True` if `data` looks binary, `False` otherwise.
    """
    # A `str` for which `detect_unicode` recognizes a BOM is reported as
    # text without looking for NUL characters.
    if isinstance(data, str) and detect_unicode(data):
        return False
    return '\0' in data[:1000]

def detect_unicode(data):
    """Detect different unicode charsets by looking for BOMs (Byte Order Marks).

    Operate obviously only on `str` objects.

    Return 'utf-16-le', 'utf-16-be' or 'utf-8' according to the BOM that
    `data` starts with, or `None` if it starts with none of them.
    """
    for bom, charset in (('\xff\xfe', 'utf-16-le'),
                         ('\xfe\xff', 'utf-16-be'),
                         ('\xef\xbb\xbf', 'utf-8')):
        if data.startswith(bom):
            return charset
    return None

def content_to_unicode(env, content, mimetype):
    """Retrieve an `unicode` object from a `content` to be previewed.

    `content` is either a string or an object with a `read` method; in the
    latter case at most `Mimeview.max_preview_size` is read from it.
    The charset is determined by `Mimeview.to_unicode`, using `mimetype`.
    """
    mimeview = Mimeview(env)
    if hasattr(content, 'read'):
        content = content.read(mimeview.max_preview_size)
    return mimeview.to_unicode(content, mimetype)

class IHTMLPreviewRenderer(Interface):
    """Extension point interface for components that add HTML renderers of
    specific content types to the `Mimeview` component.
    """

    # implementing classes should set this property to True if they
    # support text content where Trac should expand tabs into spaces
    expand_tabs = False

    def get_quality_ratio(mimetype):
        """Return the level of support this renderer provides for the `content`
        of the specified MIME type. The return value must be a number between
        0 and 9, where 0 means no support and 9 means "perfect" support.
        """

    def render(req, mimetype, content, filename=None, url=None):
        """Render an XHTML preview of the raw `content`.

        The `content` might be:
         * a `str` object
         * an `unicode` string
         * any object with a `read` method, returning one of the above

        It is assumed that the content will correspond to the given `mimetype`.

        Besides the `content` value, the same content may eventually
        be available through the `filename` or `url` parameters.
        This is useful for renderers that embed objects, using <object> or
        <img> instead of including the content inline.

        Can return the generated XHTML text as a single string or as an
        iterable that yields strings. In the latter case, the list will
        be considered to correspond to lines of text in the original content.
        """

class IHTMLPreviewAnnotator(Interface):
    """Extension point interface for components that can annotate an XHTML
    representation of file contents with additional information."""

    def get_annotation_type():
        """Return a (type, label, description) tuple that defines the type of
        annotation and provides human readable names. The `type` element should
        be unique to the annotator. The `label` element is used as column
        heading for the table, while `description` is used as a display name to
        let the user toggle the appearance of the annotation type.
        """

    def annotate_line(number, content):
        """Return the XHTML markup for the table cell that contains the
        annotation data.

        `number` is the 1-based line number and `content` the markup of
        that line."""

class IContentConverter(Interface):
    """An extension point interface for generic MIME based content
    conversion."""

    def get_supported_conversions():
        """Return an iterable of tuples in the form (key, name, extension,
        in_mimetype, out_mimetype, quality) representing the MIME conversions
        supported and the quality ratio of the conversion in the range 0 to 9,
        where 0 means no support and 9 means "perfect" support.
        eg. ('latex', 'LaTeX', 'tex', 'text/x-trac-wiki', 'text/plain', 8)"""

    def convert_content(req, mimetype, content, key):
        """Convert the given content from mimetype to the output MIME type
        represented by key. Returns a tuple in the form (content,
        output_mime_type) or None if conversion is not possible."""

class Mimeview(Component):
    """A generic class to prettify data, typically source code."""

    renderers = ExtensionPoint(IHTMLPreviewRenderer)
    annotators = ExtensionPoint(IHTMLPreviewAnnotator)
    converters = ExtensionPoint(IContentConverter)

    default_charset = Option('trac', 'default_charset', 'iso-8859-15',
        """Charset to be used when in doubt.""")

    tab_width = IntOption('mimeviewer', 'tab_width', 8,
        """Displayed tab width in file preview (''since 0.9'').""")

    max_preview_size = IntOption('mimeviewer', 'max_preview_size', 262144,
        """Maximum file size for HTML preview. (''since 0.9'').""")

    mime_map = ListOption('mimeviewer', 'mime_map',
        """List of additional MIME types and keyword mappings.
        Mappings are comma-separated, and for each MIME type,
        there's a colon (":") separated list of associated keywords
        or file extensions. (''since 0.10'').""")

    def __init__(self):
        self._mime_map = None
    # Public API

    def get_supported_conversions(self, mimetype):
        """Return a list of target MIME types in same form as
        `IContentConverter.get_supported_conversions()`, but with the converter
        component appended. Output is ordered from best to worst quality."""
        converters = []
        for converter in self.converters:
            for k, n, e, im, om, q in converter.get_supported_conversions():
                if im == mimetype and q > 0:
                    converters.append((k, n, e, im, om, q, converter))
        converters = sorted(converters, key=lambda i: i[-1], reverse=True)
        return converters

    def convert_content(self, req, mimetype, content, key, filename=None,
        """Convert the given content to the target MIME type represented by
        `key`, which can be either a MIME type or a key. Returns a tuple of
        (content, output_mime_type, extension)."""
        if not content:
            return ('', 'text/plain;charset=utf-8')

        # Ensure we have a MIME type for this content
        full_mimetype = mimetype
        if not full_mimetype:
            if hasattr(content, 'read'):
                content =
            full_mimetype = self.get_mimetype(filename, content)
        if full_mimetype:
            mimetype = full_mimetype.split(';')[0].strip() # split off charset
            mimetype = full_mimetype = 'text/plain' # fallback if not binary

        # Choose best converter
        candidates = list(self.get_supported_conversions(mimetype))
        candidates = [c for c in candidates if key in (c[0], c[4])]
        if not candidates:
            raise TracError('No available MIME conversions from %s to %s' %
                            (mimetype, key))

        # First successful conversion wins
        for ck, name, ext, input_mimettype, output_mimetype, quality, \
                converter in candidates:
            output = converter.convert_content(req, mimetype, content, ck)
            if not output:
            return (output[0], output[1], ext)
        raise TracError('No available MIME conversions from %s to %s' %
                        (mimetype, key))

    def get_annotation_types(self):
        """Generator that returns all available annotation types."""
        for annotator in self.annotators:
            yield annotator.get_annotation_type()

    def render(self, req, mimetype, content, filename=None, url=None,
        """Render an XHTML preview of the given `content`.

        `content` is the same as an `IHTMLPreviewRenderer.render`'s
        `content` argument.

        The specified `mimetype` will be used to select the most appropriate
        `IHTMLPreviewRenderer` implementation available for this MIME type.
        If not given, the MIME type will be infered from the filename or the

        Return a string containing the XHTML text.
        if not content:
            return ''

        # Ensure we have a MIME type for this content
        full_mimetype = mimetype
        if not full_mimetype:
            if hasattr(content, 'read'):
                content =
            full_mimetype = self.get_mimetype(filename, content)
        if full_mimetype:
            mimetype = full_mimetype.split(';')[0].strip() # split off charset
            mimetype = full_mimetype = 'text/plain' # fallback if not binary

        # Determine candidate `IHTMLPreviewRenderer`s
        candidates = []
        for renderer in self.renderers:
            qr = renderer.get_quality_ratio(mimetype)
            if qr > 0:
                candidates.append((qr, renderer))
        candidates.sort(lambda x,y: cmp(y[0], x[0]))

        # First candidate which renders successfully wins.
        # Also, we don't want to expand tabs more than once.
        expanded_content = None
        for qr, renderer in candidates:
                self.log.debug('Trying to render HTML preview using %s'
                               % renderer.__class__.__name__)
                # check if we need to perform a tab expansion
                rendered_content = content
                if getattr(renderer, 'expand_tabs', False):
                    if expanded_content is None:
                        content = content_to_unicode(self.env, content,
                        expanded_content = content.expandtabs(self.tab_width)
                    rendered_content = expanded_content
                result = renderer.render(req, full_mimetype, rendered_content,
                                         filename, url)
                if not result:
                elif isinstance(result, Fragment):
                    return result
                elif isinstance(result, basestring):
                    return Markup(to_unicode(result))
                elif annotations:
                    return Markup(self._annotate(result, annotations))
                    buf = StringIO()
                    buf.write('<div class="code"><pre>')
                    for line in result:
                        buf.write(line + '\n')
                    return Markup(buf.getvalue())
            except Exception, e:
                self.log.warning('HTML preview using %s failed (%s)'
                                 % (renderer, e), exc_info=True)

    def _annotate(self, lines, annotations):
        buf = StringIO()
        buf.write('<table class="code"><thead><tr>')
        annotators = []
        for annotator in self.annotators:
            atype, alabel, adesc = annotator.get_annotation_type()
            if atype in annotations:
                buf.write('<th class="%s">%s</th>' % (atype, alabel))
        buf.write('<th class="content">&nbsp;</th>')

        space_re = re.compile('(?P<spaces> (?: +))|'
                              '^(?P<tag><\w+.*?>)?( )')
        def htmlify(match):
            m ='spaces')
            if m:
                div, mod = divmod(len(m), 2)
                return div * '&nbsp; ' + mod * '&nbsp;'
            return ('tag') or '') + '&nbsp;'

        num = -1
        for num, line in enumerate(_html_splitlines(lines)):
            cells = []
            for annotator in annotators:
                cells.append(annotator.annotate_line(num + 1, line))
            cells.append('<td>%s</td>\n' % space_re.sub(htmlify, line))
            buf.write('<tr>' + '\n'.join(cells) + '</tr>')
            if num < 0:
                return ''
        return buf.getvalue()

    def get_max_preview_size(self):
        """Deprecated: use `max_preview_size` attribute directly."""
        return self.max_preview_size

    def get_charset(self, content='', mimetype=None):
        """Infer the character encoding from the `content` or the `mimetype`.

        `content` is either a `str` or an `unicode` object.
        The charset will be determined using this order:
         * from the charset information present in the `mimetype` argument
         * auto-detection of the charset from the `content`
         * the configured `default_charset` 
        if mimetype:
            ctpos = mimetype.find('charset=')
            if ctpos >= 0:
                return mimetype[ctpos + 8:].strip()
        if isinstance(content, str):
            utf = detect_unicode(content)
            if utf is not None:
                return utf
        return self.default_charset

    def get_mimetype(self, filename, content=None):
        """Infer the MIME type from the `filename` or the `content`.

        `content` is either a `str` or an `unicode` object.

        Return the detected MIME type, augmented by the
        charset information (i.e. "<mimetype>; charset=..."),
        or `None` if detection failed.
        # Extend default extension to MIME type mappings with configured ones
        if not self._mime_map:
            self._mime_map = MIME_MAP
            for mapping in self.config['mimeviewer'].getlist('mime_map'):
                if ':' in mapping:
                    assocations = mapping.split(':')
                    for keyword in assocations: # Note: [0] kept on purpose
                        self._mime_map[keyword] = assocations[0]

        mimetype = get_mimetype(filename, content, self._mime_map)
        charset = None
        if mimetype:
            charset = self.get_charset(content, mimetype)
        if mimetype and charset and not 'charset' in mimetype:
            mimetype += '; charset=' + charset
        return mimetype

    def to_utf8(self, content, mimetype=None):
        """Convert an encoded `content` to utf-8.

        ''Deprecated in 0.10. You should use `unicode` strings only.''
        return to_utf8(content, self.get_charset(content, mimetype))

    def to_unicode(self, content, mimetype=None, charset=None):
        """Convert `content` (an encoded `str` object) to an `unicode` object.

        This calls `trac.util.to_unicode` with the `charset` provided,
        or the one obtained by `Mimeview.get_charset()`.
        if not charset:
            charset = self.get_charset(content, mimetype)
        return to_unicode(content, charset)

    def configured_modes_mapping(self, renderer):
        """Return a MIME type to `(mode,quality)` mapping for given `option`"""
        types, option = {}, '%s_modes' % renderer
        for mapping in self.config['mimeviewer'].getlist(option):
            if not mapping:
                mimetype, mode, quality = mapping.split(':')
                types[mimetype] = (mode, int(quality))
            except (TypeError, ValueError):
                self.log.warning("Invalid mapping '%s' specified in '%s' "
                                 "option." % (mapping, option))
        return types
    def preview_to_hdf(self, req, content, length, mimetype, filename,
                       url=None, annotations=None):
        """Prepares a rendered preview of the given `content`.

        Note: `content` will usually be an object with a `read` method.
        if length >= self.max_preview_size:
            return {'max_file_size_reached': True,
                    'max_file_size': self.max_preview_size,
                    'raw_href': url}
            return {'preview': self.render(req, mimetype, content, filename,
                                           url, annotations),
                    'raw_href': url}

    def send_converted(self, req, in_type, content, selector, filename='file'):
        """Helper method for converting `content` and sending it directly.

        `selector` can be either a key or a MIME Type."""
        from trac.web import RequestDone
        content, output_type, ext = self.convert_content(req, in_type,
                                                         content, selector)
        req.send_header('Content-Type', output_type)
        req.send_header('Content-Disposition', 'filename=%s.%s' % (filename,
        raise RequestDone        

def _html_splitlines(lines):
    """Tracks open and close tags in lines of HTML text and yields lines that
    have no tags spanning more than one line."""
    open_tag_re = re.compile(r'<(\w+)(\s.*?)?[^/]?>')
    close_tag_re = re.compile(r'</(\w+)>')
    open_tags = []
    for line in lines:
        # Reopen tags still open from the previous line
        for tag in open_tags:
            line = + line
        open_tags = []

        # Find all tags opened on this line
        for tag in open_tag_re.finditer(line):


        # Find all tags closed on this line
        for ctag in close_tag_re.finditer(line):
            for otag in open_tags:
                if ==

        # Close all tags still open at the end of line, they'll get reopened at
        # the beginning of the next line
        for tag in open_tags:
            line += '</%s>' %

        yield line

# -- Default annotators

class LineNumberAnnotator(Component):
    """Text annotator that adds a column with line numbers."""

    implements(IHTMLPreviewAnnotator)

    # IHTMLPreviewAnnotator methods

    def get_annotation_type(self):
        """Return the (type, label, description) of this annotation."""
        return 'lineno', 'Line', 'Line numbers'

    def annotate_line(self, number, content):
        """Return the header cell for line `number`: it carries the `L<number>`
        anchor and a link to that same anchor. `content` is not used."""
        return '<th id="L%s"><a href="#L%s">%s</a></th>' % (number, number,
                                                            number)

# -- Default renderers

class PlainTextRenderer(Component):
    """HTML preview renderer for plain text, and fallback for any kind of text
    for which no more specific renderer is available.
    """

    implements(IHTMLPreviewRenderer)

    expand_tabs = True

    # MIME types for which no plain text preview is attempted.
    # NOTE(review): `get_quality_ratio` needs this attribute, but its
    # definition was missing; confirm this list against upstream.
    TREAT_AS_BINARY = [
        'application/pdf',
        'application/postscript',
        'application/rtf'
    ]

    def get_quality_ratio(self, mimetype):
        """Return 0 for the `TREAT_AS_BINARY` types, and the lowest positive
        quality (1) for anything else."""
        if mimetype in self.TREAT_AS_BINARY:
            return 0
        return 1

    def render(self, req, mimetype, content, filename=None, url=None):
        """Generate the escaped lines of `content`.

        Nothing is generated if the content is detected to be binary.
        """
        if is_binary(content):
            self.env.log.debug("Binary data; no preview available")
            return

        self.env.log.debug("Using default plain text mimeviewer")
        content = content_to_unicode(self.env, content, mimetype)
        for line in content.splitlines():
            yield escape(line)

class ImageRenderer(Component):
    """Inline image display. Here we don't need the `content` at all."""

    implements(IHTMLPreviewRenderer)

    def get_quality_ratio(self, mimetype):
        """Return 8 for any `image/*` MIME type, 0 otherwise."""
        if mimetype.startswith('image/'):
            return 8
        return 0

    def render(self, req, mimetype, content, filename=None, url=None):
        """Return an <img> element referring to `url`, wrapped in a <div>.

        Return `None` if no `url` is given.
        """
        if url:
            # NOTE(review): the CSS class was lost with the end of the
            # original statement; confirm "image-file" against the stylesheet.
            return html.DIV(html.IMG(src=url,alt=filename),
                            class_="image-file")

class WikiTextRenderer(Component):
    """Render files containing Trac's own Wiki formatting markup."""

    implements(IHTMLPreviewRenderer)

    def get_quality_ratio(self, mimetype):
        """Return 8 for the Trac wiki MIME types, 0 otherwise."""
        if mimetype in ('text/x-trac-wiki', 'application/x-trac-wiki'):
            return 8
        return 0

    def render(self, req, mimetype, content, filename=None, url=None):
        """Return the `content`, decoded to `unicode`, formatted as wiki
        text."""
        # Imported here rather than at module level
        # (presumably to avoid a circular import -- TODO confirm).
        from trac.wiki import wiki_to_html
        return wiki_to_html(content_to_unicode(self.env, content, mimetype),
                            self.env, req)
