# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Christopher Lenz
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://trac.edgewall.com/license.html.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://projects.edgewall.com/trac/.

from xml.parsers import expat
try:
    frozenset
except NameError:
    from sets import ImmutableSet as frozenset
import HTMLParser as html
import htmlentitydefs
import re
from StringIO import StringIO

from markup.core import Attributes, Markup, QName, Stream


class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    Iterating over an instance reads `source` in chunks, feeds them to an
    Expat parser and yields `(kind, data, pos)` event tuples, where `pos` is
    the value returned by `getpos()` at the time the event was reported.
    """

    def __init__(self, source):
        """Create the parser.

        `source` must provide a `read(size)` method that returns an empty
        string once the input is exhausted.
        """
        self.source = source

        # Setup the Expat parser
        parser = expat.ParserCreate('utf-8', '}')
        parser.buffer_text = True
        parser.returns_unicode = True
        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.XmlDeclHandler = self._handle_prolog
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.CommentHandler = self._handle_comment
        parser.DefaultHandler = self._handle_other

        # Location reporting is only supported in Python >= 2.4
        if not hasattr(parser, 'CurrentLineNumber'):
            self.getpos = self._getpos_unknown

        self.expat = parser
        self.queue = []

    def __iter__(self):
        """Parse the source incrementally and yield the resulting events.

        Events are queued by the Expat handler callbacks while a chunk is
        being parsed, and handed out before the next chunk is read.
        """
        bufsize = 4 * 1024 # 4K
        done = False
        while True:
            while not done and not self.queue:
                data = self.source.read(bufsize)
                if not data: # end of data
                    if hasattr(self, 'expat'):
                        self.expat.Parse('', True)
                        del self.expat # get rid of circular references
                    done = True
                else:
                    self.expat.Parse(data, False)
            for event in self.queue:
                yield event
            self.queue = []
            if done:
                break

    def _getpos_unknown(self):
        """Stand-in for `getpos()` when Expat cannot report locations."""
        return (-1, -1)

    def getpos(self):
        """Return the current `(line, column)` as reported by Expat.

        This reads from `self.expat`, which is deleted once the end of the
        input has been processed.
        """
        return self.expat.CurrentLineNumber, self.expat.CurrentColumnNumber

    def _handle_start(self, tag, attrib):
        """Queue a START event for an opening tag and its attributes."""
        self.queue.append((Stream.START, (QName(tag), Attributes(attrib.items())),
                           self.getpos()))

    def _handle_end(self, tag):
        """Queue an END event for a closing tag."""
        self.queue.append((Stream.END, QName(tag), self.getpos()))

    def _handle_data(self, text):
        """Queue a TEXT event for character data."""
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def _handle_prolog(self, version, encoding, standalone):
        """Queue a PROLOG event for the XML declaration."""
        self.queue.append((Stream.PROLOG, (version, encoding, standalone),
                           self.getpos()))

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        """Queue a DOCTYPE event.

        Note that the event data is ordered `(name, pubid, sysid)`, which
        differs from the order in which Expat passes the arguments.
        """
        self.queue.append((Stream.DOCTYPE, (name, pubid, sysid), self.getpos()))

    def _handle_start_ns(self, prefix, uri):
        """Queue a START_NS event; a missing prefix is reported as `''`."""
        self.queue.append((Stream.START_NS, (prefix or '', uri), self.getpos()))

    def _handle_end_ns(self, prefix):
        """Queue an END_NS event; a missing prefix is reported as `''`."""
        self.queue.append((Stream.END_NS, prefix or '', self.getpos()))

    def _handle_pi(self, target, data):
        """Queue a PI event for a processing instruction."""
        self.queue.append((Stream.PI, (target, data), self.getpos()))

    def _handle_comment(self, text):
        """Queue a COMMENT event."""
        self.queue.append((Stream.COMMENT, text, self.getpos()))

    def _handle_other(self, text):
        """Handle input that no other Expat handler has claimed.

        Text starting with `&` is treated as an entity reference and looked
        up in the HTML entity table; a known name is queued as a TEXT event
        holding the corresponding character, while an unknown name raises
        `expat.error`. Any other text is ignored.
        """
        if text.startswith('&'):
            # deal with undefined entities
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                self.queue.append((Stream.TEXT, text, self.getpos()))
            except KeyError:
                lineno, offset = self.getpos()
                raise expat.error("undefined entity %s: line %d, column %d" %
                                  (text, lineno, offset))


def XML(text):
    """Parse the given XML string and return the events as a `Stream`."""
    return Stream(list(XMLParser(StringIO(text))))


class HTMLParser(html.HTMLParser):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.
    """

    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source):
        """Create the parser.

        `source` must provide a `read(size)` method that returns an empty
        string once the input is exhausted.
        """
        html.HTMLParser.__init__(self)
        self.source = source
        self.queue = []
        self._open_tags = []

    def __iter__(self):
        """Parse the source incrementally and yield the resulting events.

        Once the input is exhausted, an END event is generated for every
        element that is still open, innermost first.
        """
        bufsize = 4 * 1024 # 4K
        done = False
        while True:
            while not done and not self.queue:
                data = self.source.read(bufsize)
                if not data: # end of data
                    self.close()
                    done = True
                else:
                    self.feed(data)
            for kind, data, pos in self.queue:
                yield kind, data, pos
            self.queue = []
            if done:
                # Detach the list of open tags before closing them, so that
                # the parser state is not left holding a reversed list of
                # elements that have already been closed.
                open_tags = self._open_tags
                self._open_tags = []
                open_tags.reverse()
                # `pos` is the position of the last event that was yielded;
                # there is at least one if any element is still open.
                for tag in open_tags:
                    yield Stream.END, QName(tag), pos
                break

    def handle_starttag(self, tag, attrib):
        """Queue a START event for an opening tag.

        Elements listed in `_EMPTY_ELEMS` are closed immediately; all others
        are remembered as open so that they can be balanced later.
        """
        pos = self.getpos()
        self.queue.append((Stream.START, (QName(tag), Attributes(attrib)), pos))
        if tag in self._EMPTY_ELEMS:
            self.queue.append((Stream.END, QName(tag), pos))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        """Queue the END events needed to close the element named `tag`.

        Elements opened after the matching start tag and not yet closed are
        closed first. End tags of empty elements, and end tags that do not
        match any open element, are ignored.
        """
        if tag in self._EMPTY_ELEMS:
            return
        wanted = tag.lower()

        # A stray end tag must not unwind (and thereby close) every element
        # that is currently open, so make sure there is a match first.
        is_open = False
        for open_tag in self._open_tags:
            if open_tag.lower() == wanted:
                is_open = True
                break
        if not is_open:
            return

        pos = self.getpos()
        while self._open_tags:
            open_tag = self._open_tags.pop()
            if open_tag.lower() == wanted:
                break
            self.queue.append((Stream.END, QName(open_tag), pos))
        self.queue.append((Stream.END, QName(tag), pos))

    def handle_data(self, text):
        """Queue a TEXT event for character data."""
        self.queue.append((Stream.TEXT, text, self.getpos()))

    def handle_charref(self, name):
        """Queue a TEXT event holding the character reference as `Markup`."""
        self.queue.append((Stream.TEXT, Markup('&#%s;' % name), self.getpos()))

    def handle_entityref(self, name):
        """Queue a TEXT event holding the entity reference as `Markup`."""
        self.queue.append((Stream.TEXT, Markup('&%s;' % name), self.getpos()))

    def handle_pi(self, data):
        """Queue a PI event for a processing instruction.

        `data` is split at the first run of whitespace into the target and
        the remaining data, and trailing `?` characters are removed. A
        processing instruction without any data is reported with an empty
        data string.
        """
        # str.split() does not accept keyword arguments in Python 2
        parts = data.split(None, 1)
        if len(parts) == 2:
            target, data = parts
            data = data.rstrip('?')
        else:
            # No whitespace, so there is only a target (or nothing at all)
            target = ''
            if parts:
                target = parts[0].rstrip('?')
            data = ''
        self.queue.append((Stream.PI, (target.strip(), data.strip()),
                           self.getpos()))

    def handle_comment(self, text):
        """Queue a COMMENT event."""
        self.queue.append((Stream.COMMENT, text, self.getpos()))


def HTML(text):
    """Parse the given HTML string and return the events as a `Stream`."""
    return Stream(list(HTMLParser(StringIO(text))))