# -*- coding: utf-8 -*-
#
# Copyright (C) 2006-2009 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.

"""Various utility classes and functions."""

import htmlentitydefs as entities
import re

from compat import any, all, stringrepr

# Markup format used by the docstrings in this module.
__docformat__ = 'restructuredtext en'


class LRUCache(dict):
    """A dictionary-like object that stores only a certain number of items, and
    discards its least recently used item when full.

    >>> cache = LRUCache(3)
    >>> cache['A'] = 0
    >>> cache['B'] = 1
    >>> cache['C'] = 2
    >>> len(cache)
    3

    >>> cache['A']
    0

    Adding new items to the cache does not increase its size. Instead, the least
    recently used item is dropped:

    >>> cache['D'] = 3
    >>> len(cache)
    3
    >>> 'B' in cache
    False

    Iterating over the cache returns the keys, starting with the most recently
    used:

    >>> for key in cache:
    ...     print(key)
    D
    A
    C

    This code is based on the LRUCache class from ``myghtyutils.util``, written
    by Mike Bayer and released under the MIT license. See:

      http://svn.myghty.org/myghtyutils/trunk/lib/myghtyutils/util.py

    :note: the entries are kept in an internal dictionary and a doubly linked
           list, not in the storage of the `dict` base class. Only the
           operations defined on this class (item lookup and assignment,
           ``in``, ``len()``, iteration and ``repr()``) reflect the contents
           of the cache.
    """

    class _Item(object):
        """Node of the doubly linked list that records the usage order."""

        def __init__(self, key, value):
            self.prv = self.nxt = None
            self.key = key
            self.value = value

        def __repr__(self):
            # Show only the value so that the repr of the cache looks like
            # the repr of a plain dictionary.
            return repr(self.value)

    def __init__(self, capacity):
        """Create an empty cache.

        :param capacity: the maximum number of items to keep
        """
        self._dict = dict()
        self.capacity = capacity
        self.head = None  # most recently used item
        self.tail = None  # least recently used item

    def __contains__(self, key):
        # Membership tests do not count as a "use" of the item.
        return key in self._dict

    def __iter__(self):
        """Iterate over the keys, most recently used first."""
        cur = self.head
        while cur is not None:
            yield cur.key
            cur = cur.nxt

    def __len__(self):
        return len(self._dict)

    def __getitem__(self, key):
        """Return the value for `key` and mark it as most recently used.

        :raise KeyError: if the key is not in the cache
        """
        item = self._dict[key]
        self._update_item(item)
        return item.value

    def __setitem__(self, key, value):
        """Store `value` under `key` and mark it as most recently used,
        evicting the least recently used items if the cache is over capacity.
        """
        item = self._dict.get(key)
        if item is None:
            item = self._Item(key, value)
            self._dict[key] = item
            # Inserting also trims the cache back to its capacity.
            self._insert_item(item)
        else:
            item.value = value
            self._update_item(item)
            self._manage_size()

    def __repr__(self):
        return repr(self._dict)

    def _insert_item(self, item):
        """Link a new item in at the head of the list and enforce the
        capacity.
        """
        item.prv = None
        item.nxt = self.head
        if self.head is not None:
            self.head.prv = item
        else:
            # First item in an empty list: it is also the tail.
            self.tail = item
        self.head = item
        self._manage_size()

    def _manage_size(self):
        """Drop items from the tail until the cache fits its capacity."""
        while len(self._dict) > self.capacity:
            oldest = self.tail
            del self._dict[oldest.key]
            if oldest is not self.head:
                self.tail = oldest.prv
                self.tail.nxt = None
            else:
                # The only remaining item was removed.
                self.head = self.tail = None

    def _update_item(self, item):
        """Move an item that is already linked to the head of the list."""
        if self.head is item:
            return

        # Unlink the item. It is not the head, so it has a predecessor.
        prv = item.prv
        prv.nxt = item.nxt
        if item.nxt is not None:
            item.nxt.prv = prv
        else:
            self.tail = prv

        # Relink it in front of the current head.
        item.prv = None
        item.nxt = self.head
        self.head.prv = item
        self.head = item


def flatten(items):
    """Flattens a potentially nested sequence into a flat list.

    Only nested `frozenset`, `list`, `set` and `tuple` objects are expanded;
    any other item (strings included) is copied to the result as is.

    :param items: the sequence to flatten
    :return: a new list with the items of all nested sequences

    >>> flatten((1, 2))
    [1, 2]
    >>> flatten([1, (2, 3), 4])
    [1, 2, 3, 4]
    >>> flatten([1, (2, [3, 4]), 5])
    [1, 2, 3, 4, 5]
    """
    retval = []
    for item in items:
        if isinstance(item, (frozenset, list, set, tuple)):
            retval.extend(flatten(item))
        else:
            retval.append(item)
    return retval


def plaintext(text, keeplinebreaks=True):
    """Return the text with all entities and tags removed.

    >>> plaintext('<b>1 &lt; 2</b>')
    u'1 < 2'

    The `keeplinebreaks` parameter can be set to ``False`` to replace any line
    breaks by simple spaces:

    >>> plaintext('''<b>1
    ... &lt;
    ... 2</b>''', keeplinebreaks=False)
    u'1 < 2'

    :param text: the text to convert to plain text
    :param keeplinebreaks: whether line breaks in the text should be kept intact
    :return: the text with tags and entities removed
    """
    # Tags are removed first, then the entities in the remaining text are
    # replaced.
    text = stripentities(striptags(text))
    if not keeplinebreaks:
        text = text.replace('\n', ' ')
    return text


_STRIPENTITIES_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')
def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent Unicode characters.

    >>> stripentities('1 &lt; 2')
    u'1 < 2'
    >>> stripentities('more &hellip;')
    u'more \u2026'
    >>> stripentities('&#8230;')
    u'\u2026'
    >>> stripentities('&#x2026;')
    u'\u2026'

    If the `keepxmlentities` parameter is provided and is a truth value, the
    core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact.

    >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True)
    u'1 &lt; 2 \u2026'

    Numeric references that do not denote a valid code point are left
    untouched. Unknown named entities are reduced to their bare name, or kept
    as they are if `keepxmlentities` is set.

    :param text: the text in which to replace the entities
    :param keepxmlentities: whether the core XML entities should be kept
    :return: the text with the entities replaced
    """
    def _replace_entity(match):
        if match.group(1): # numeric entity
            ref = match.group(1)
            # The pattern accepts both ``&#x...;`` and ``&#X...;``.
            if ref[0] in 'xX':
                codepoint = int(ref[1:], 16)
            else:
                codepoint = int(ref, 10)
            try:
                return unichr(codepoint)
            except (ValueError, OverflowError):
                # Not a code point that can be represented: keep the
                # reference as written instead of failing on the whole text.
                return match.group(0)
        else: # character entity
            ref = match.group(2)
            if keepxmlentities and ref in ('amp', 'apos', 'gt', 'lt', 'quot'):
                return '&%s;' % ref
            try:
                return unichr(entities.name2codepoint[ref])
            except KeyError:
                if keepxmlentities:
                    return '&%s;' % ref
                else:
                    return ref
    return _STRIPENTITIES_RE.sub(_replace_entity, text)


# Comments come first so that markup inside a comment goes with it; DOTALL
# lets a comment span several lines.
_STRIPTAGS_RE = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)
def striptags(text):
    """Return a copy of the text with any XML/HTML tags removed.

    >>> striptags('<span>Foo</span> bar')
    'Foo bar'
    >>> striptags('<span class="bar">Foo</span>')
    'Foo'
    >>> striptags('Foo<br />')
    'Foo'

    HTML/XML comments are stripped, too:

    >>> striptags('<!-- <blub>hehe</blah> -->test')
    'test'

    :param text: the string to remove tags from
    :return: the text with tags removed
    """
    return _STRIPTAGS_RE.sub('', text)
