# -*- coding: utf-8 -*-
#
# Copyright (C) 2006-2009 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.

"""Various utility classes and functions."""

try:
    import htmlentitydefs as entities  # Python 2
except ImportError:
    from html import entities  # Python 3
import re

try:
    # Not referenced by the code in this module; kept so that these names
    # remain importable from here (presumably re-exports -- TODO confirm).
    from compat import any, all, stringrepr
except ImportError:
    # The sibling ``compat`` module is not reachable (e.g. this file is being
    # used standalone). Nothing below depends on these names.
    pass

__docformat__ = 'restructuredtext en'

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3


class LRUCache(dict):
    """A dictionary-like object that stores only a certain number of items, and
    discards its least recently used item when full.

    >>> cache = LRUCache(3)
    >>> cache['A'] = 0
    >>> cache['B'] = 1
    >>> cache['C'] = 2
    >>> len(cache)
    3

    >>> cache['A']
    0

    Adding new items to the cache does not increase its size. Instead, the least
    recently used item is dropped:

    >>> cache['D'] = 3
    >>> len(cache)
    3
    >>> 'B' in cache
    False

    Iterating over the cache returns the keys, starting with the most recently
    used:

    >>> for key in cache:
    ...     print(key)
    D
    A
    C

    Only the operations defined on this class (item get/set, ``in``, ``len``,
    iteration and ``repr``) go through the LRU bookkeeping; the entries live
    in the private ``_dict`` attribute, not in the inherited ``dict`` storage.

    This code is based on the LRUCache class from ``myghtyutils.util``, written
    by Mike Bayer and released under the MIT license. See:

      http://svn.myghty.org/myghtyutils/trunk/lib/myghtyutils/util.py
    """

    class _Item(object):
        """Node of the doubly linked list that records the usage order."""

        def __init__(self, key, value):
            self.prv = self.nxt = None
            self.key = key
            self.value = value

        def __repr__(self):
            return repr(self.value)

    def __init__(self, capacity):
        """Create an empty cache.

        :param capacity: the maximum number of items to keep
        """
        self._dict = dict()
        self.capacity = capacity
        self.head = None  # most recently used item
        self.tail = None  # least recently used item

    def __contains__(self, key):
        return key in self._dict

    def __iter__(self):
        cur = self.head
        while cur is not None:
            yield cur.key
            cur = cur.nxt

    def __len__(self):
        return len(self._dict)

    def __getitem__(self, key):
        """Return the value for `key` and mark it as most recently used.

        :raise KeyError: if `key` is not in the cache
        """
        item = self._dict[key]
        self._update_item(item)
        return item.value

    def __setitem__(self, key, value):
        """Store `value` under `key`, evicting the least recently used
        item(s) if the capacity is exceeded."""
        item = self._dict.get(key)
        if item is None:
            item = self._Item(key, value)
            self._dict[key] = item
            self._insert_item(item)
        else:
            item.value = value
            self._update_item(item)
        self._manage_size()

    def __repr__(self):
        return repr(self._dict)

    def _insert_item(self, item):
        """Link a new item in at the head of the usage list."""
        item.prv = None
        item.nxt = self.head
        if self.head is not None:
            self.head.prv = item
        else:
            # The list was empty, so the new item is also the tail.
            self.tail = item
        self.head = item
        self._manage_size()

    def _manage_size(self):
        """Drop items from the tail until the cache fits its capacity."""
        while len(self._dict) > self.capacity:
            del self._dict[self.tail.key]
            if self.tail is not self.head:
                self.tail = self.tail.prv
                self.tail.nxt = None
            else:
                # The last remaining item was just removed.
                self.head = self.tail = None

    def _update_item(self, item):
        """Move an already linked item to the head of the usage list."""
        if self.head is item:
            return

        # Unlink the item; it is not the head, so it has a predecessor.
        prv = item.prv
        prv.nxt = item.nxt
        if item.nxt is not None:
            item.nxt.prv = prv
        else:
            self.tail = prv

        # Re-link it in front of the current head.
        item.prv = None
        item.nxt = self.head
        self.head.prv = item
        self.head = item


def flatten(items):
    """Flattens a potentially nested sequence into a flat list.

    :param items: the sequence to flatten
    :return: a new list with the contents of any nested ``frozenset``,
             ``list``, ``set`` or ``tuple`` spliced in, recursively

    >>> flatten((1, 2))
    [1, 2]
    >>> flatten([1, (2, 3), 4])
    [1, 2, 3, 4]
    >>> flatten([1, (2, [3, 4]), 5])
    [1, 2, 3, 4, 5]
    """
    retval = []
    for item in items:
        if isinstance(item, (frozenset, list, set, tuple)):
            retval.extend(flatten(item))
        else:
            retval.append(item)
    return retval


def plaintext(text, keeplinebreaks=True):
    """Return the text with all entities and tags removed.

    >>> plaintext('<b>1 &lt; 2</b>') == u'1 < 2'
    True

    The `keeplinebreaks` parameter can be set to ``False`` to replace any line
    breaks by simple spaces:

    >>> plaintext('''<b>1
    ... &lt;
    ... 2</b>''', keeplinebreaks=False) == u'1 < 2'
    True

    :param text: the text to convert to plain text
    :param keeplinebreaks: whether line breaks in the text should be kept intact
    :return: the text with tags and entities removed
    """
    text = stripentities(striptags(text))
    if not keeplinebreaks:
        text = text.replace('\n', ' ')
    return text


_STRIPENTITIES_RE = re.compile(r'&(?:#((?:\d+)|(?:[xX][0-9a-fA-F]+));?|(\w+);)')
_XML_ENTITIES = frozenset(['amp', 'apos', 'gt', 'lt', 'quot'])


def stripentities(text, keepxmlentities=False):
    """Return a copy of the given text with any character or numeric entities
    replaced by the equivalent Unicode characters.

    >>> stripentities('1 &lt; 2') == u'1 < 2'
    True
    >>> stripentities('more &hellip;') == u'more \u2026'
    True
    >>> stripentities('&#8230;') == u'\u2026'
    True
    >>> stripentities('&#x2026;') == u'\u2026'
    True
    >>> stripentities('&#X2026;') == u'\u2026'
    True

    If the `keepxmlentities` parameter is provided and is a truth value, the
    core XML entities (&amp;, &apos;, &gt;, &lt; and &quot;) are left intact.

    >>> stripentities('1 &lt; 2 &hellip;', keepxmlentities=True) == u'1 &lt; 2 \u2026'
    True

    Unknown named entities are reduced to their bare name (or left intact if
    `keepxmlentities` is set). Numeric references that do not denote a valid
    code point are left intact.

    :param text: the text in which to replace entities
    :param keepxmlentities: whether the core XML entities should be preserved
    :return: the text with entities replaced
    """
    def _replace_entity(match):
        numeric = match.group(1)
        if numeric:  # numeric entity
            # The pattern accepts both ``x`` and ``X`` as the hex prefix.
            if numeric[0] in 'xX':
                codepoint = int(numeric[1:], 16)
            else:
                codepoint = int(numeric, 10)
            try:
                return _unichr(codepoint)
            except (ValueError, OverflowError):
                # Not a representable code point: keep the reference as is
                # rather than failing on malformed markup.
                return match.group(0)

        name = match.group(2)  # character entity
        if keepxmlentities and name in _XML_ENTITIES:
            return '&%s;' % name
        try:
            return _unichr(entities.name2codepoint[name])
        except KeyError:
            if keepxmlentities:
                return '&%s;' % name
            return name

    return _STRIPENTITIES_RE.sub(_replace_entity, text)


# The comment alternative must come first so that a comment containing ``>``
# is removed as a whole; DOTALL lets it span line breaks.
_STRIPTAGS_RE = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)


def striptags(text):
    """Return a copy of the text with any XML/HTML tags removed.

    >>> striptags('<span>Foo</span> bar')
    'Foo bar'
    >>> striptags('<span class="bar">Foo</span>')
    'Foo'
    >>> striptags('Foo<br />')
    'Foo'

    HTML/XML comments are stripped, too:

    >>> striptags('<!-- <blub>hehe</blah> -->test')
    'test'

    :param text: the string to remove tags from
    :return: the text with tags removed
    """
    return _STRIPTAGS_RE.sub('', text)