# -*- coding: utf-8 -*-
#
# Copyright (C) 2006 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://genshi.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://genshi.edgewall.org/log/.

from itertools import chain
from xml.parsers import expat
try:
    frozenset
except NameError:
    from sets import ImmutableSet as frozenset
import HTMLParser as html
import htmlentitydefs
from StringIO import StringIO

from genshi.core import Attrs, QName, Stream, stripentities
from genshi.core import DOCTYPE, START, END, START_NS, END_NS, TEXT, \
                        START_CDATA, END_CDATA, PI, COMMENT

__all__ = ['ET', 'ParseError', 'XMLParser', 'XML', 'HTMLParser', 'HTML']


def ET(element):
    """Convert a given ElementTree element to a markup stream."""
    tag_name = QName(element.tag.lstrip('{'))
    attrs = Attrs(element.items())

    yield START, (tag_name, attrs), (None, -1, -1)
    if element.text:
        yield TEXT, element.text, (None, -1, -1)
    for child in element.getchildren():
        for item in ET(child):
            yield item
    yield END, tag_name, (None, -1, -1)
    if element.tail:
        yield TEXT, element.tail, (None, -1, -1)


class ParseError(Exception):
    """Exception raised when fatal syntax errors are found in the input being
    parsed."""

    def __init__(self, message, filename='', lineno=-1, offset=-1):
        Exception.__init__(self, message)
        self.msg = message
        self.filename = filename
        self.lineno = lineno
        self.offset = offset


class XMLParser(object):
    """Generator-based XML parser based on roughly equivalent code in
    Kid/ElementTree.

    The parsing is initiated by iterating over the parser object:

    >>> parser = XMLParser(StringIO('<root id="2"><child>Foo</child></root>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (u'root', [(u'id', u'2')])
    START (u'child', [])
    TEXT Foo
    END child
    END root
    """

    _entitydefs = ['<!ENTITY %s "&#%d;">' % (name, value) for name, value in
                   htmlentitydefs.name2codepoint.items()]
    _external_dtd = '\n'.join(_entitydefs)

    def __init__(self, source, filename=None):
        """Initialize the parser for the given XML text.

        @param source: the XML text as a file-like object
        @param filename: the name of the file, if appropriate
        """
        self.source = source
        self.filename = filename

        # Set up the Expat parser
        parser = expat.ParserCreate('utf-8', '}')
        parser.buffer_text = True
        parser.returns_unicode = True
        parser.ordered_attributes = True

        parser.StartElementHandler = self._handle_start
        parser.EndElementHandler = self._handle_end
        parser.CharacterDataHandler = self._handle_data
        parser.StartDoctypeDeclHandler = self._handle_doctype
        parser.StartNamespaceDeclHandler = self._handle_start_ns
        parser.EndNamespaceDeclHandler = self._handle_end_ns
        parser.StartCdataSectionHandler = self._handle_start_cdata
        parser.EndCdataSectionHandler = self._handle_end_cdata
        parser.ProcessingInstructionHandler = self._handle_pi
        parser.CommentHandler = self._handle_comment

        # Tell Expat that we'll handle non-XML entities ourselves
        # (in _handle_other)
        parser.DefaultHandler = self._handle_other
        parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_ALWAYS)
        parser.UseForeignDTD()
        parser.ExternalEntityRefHandler = self._build_foreign

        # Location reporting is only supported in Python >= 2.4
        if not hasattr(parser, 'CurrentLineNumber'):
            self._getpos = self._getpos_unknown

        self.expat = parser
        self._queue = []

    def parse(self):
        """Parse the XML source and return a markup event stream."""
        def _generate():
            try:
                bufsize = 4 * 1024 # 4K
                done = False
                while 1:
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if data == '': # end of data
                            if hasattr(self, 'expat'):
                                self.expat.Parse('', True)
                                del self.expat # get rid of circular references
                            done = True
                        else:
                            if isinstance(data, unicode):
                                data = data.encode('utf-8')
                            self.expat.Parse(data, False)
                    for event in self._queue:
                        yield event
                    self._queue = []
                    if done:
                        break
            except expat.ExpatError, e:
                msg = str(e)
                if self.filename:
                    msg += ', in ' + self.filename
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        return iter(self.parse())

    def _build_foreign(self, context, base, sysid, pubid):
        """External entity handler: parse the fake external DTD that defines
        the HTML entities, so that Expat can expand them."""
        parser = self.expat.ExternalEntityParserCreate(context)
        parser.ParseFile(StringIO(self._external_dtd))
        return 1

    def _enqueue(self, kind, data=None, pos=None):
        if pos is None:
            pos = self._getpos()
        if kind is TEXT:
            # Expat reports the *end* of the text event as the current
            # position. We try to fix that up here as much as possible.
            # Unfortunately, the offset is only valid for single-line text.
            # For multi-line text, it is apparently not possible to determine
            # the offset at which it started.
            if '\n' in data:
                lines = data.splitlines()
                lineno = pos[1] - len(lines) + 1
                offset = -1
            else:
                lineno = pos[1]
                offset = pos[2] - len(data)
            pos = (pos[0], lineno, offset)
        self._queue.append((kind, data, pos))

    def _getpos_unknown(self):
        return (self.filename, -1, -1)

    def _getpos(self):
        return (self.filename, self.expat.CurrentLineNumber,
                self.expat.CurrentColumnNumber)

    def _handle_start(self, tag, attrib):
        # Expat delivers ordered attributes as a flat list; pair them up here
        self._enqueue(START, (QName(tag), Attrs(zip(*[iter(attrib)] * 2))))

    def _handle_end(self, tag):
        self._enqueue(END, QName(tag))

    def _handle_data(self, text):
        self._enqueue(TEXT, text)

    def _handle_doctype(self, name, sysid, pubid, has_internal_subset):
        self._enqueue(DOCTYPE, (name, pubid, sysid))

    def _handle_start_ns(self, prefix, uri):
        self._enqueue(START_NS, (prefix or '', uri))

    def _handle_end_ns(self, prefix):
        self._enqueue(END_NS, prefix or '')

    def _handle_start_cdata(self):
        self._enqueue(START_CDATA)

    def _handle_end_cdata(self):
        self._enqueue(END_CDATA)

    def _handle_pi(self, target, data):
        self._enqueue(PI, (target, data))

    def _handle_comment(self, text):
        self._enqueue(COMMENT, text)

    def _handle_other(self, text):
        if text.startswith('&'):
            # deal with undefined entities
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
                self._enqueue(TEXT, text)
            except KeyError:
                filename, lineno, offset = self._getpos()
                error = expat.error('undefined entity "%s": line %d, column %d'
                                    % (text, lineno, offset))
                error.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY
                error.lineno = lineno
                error.offset = offset
                raise error


def XML(text):
    """Parse the given XML text and return a markup stream."""
    return Stream(list(XMLParser(StringIO(text))))


class HTMLParser(html.HTMLParser, object):
    """Parser for HTML input based on the Python `HTMLParser` module.

    This class provides the same interface for generating stream events as
    `XMLParser`, and attempts to automatically balance tags.

    The parsing is initiated by iterating over the parser object:

    >>> parser = HTMLParser(StringIO('<UL compact><LI>Foo</UL>'))
    >>> for kind, data, pos in parser:
    ...     print kind, data
    START (u'ul', [(u'compact', u'compact')])
    START (u'li', [])
    TEXT Foo
    END li
    END ul
    """

    _EMPTY_ELEMS = frozenset(['area', 'base', 'basefont', 'br', 'col', 'frame',
                              'hr', 'img', 'input', 'isindex', 'link', 'meta',
                              'param'])

    def __init__(self, source, filename=None):
        html.HTMLParser.__init__(self)
        self.source = source
        self.filename = filename
        self._queue = []
        self._open_tags = []

    def parse(self):
        """Parse the HTML source and return a markup event stream."""
        def _generate():
            try:
                bufsize = 4 * 1024 # 4K
                done = False
                while 1:
                    while not done and len(self._queue) == 0:
                        data = self.source.read(bufsize)
                        if data == '': # end of data
                            self.close()
                            done = True
                        else:
                            self.feed(data)
                    for kind, data, pos in self._queue:
                        yield kind, data, pos
                    self._queue = []
                    if done:
                        # Close any tags still left open at the end of input
                        open_tags = self._open_tags
                        open_tags.reverse()
                        for tag in open_tags:
                            yield END, QName(tag), pos
                        break
            except html.HTMLParseError, e:
                msg = '%s: line %d, column %d' % (e.msg, e.lineno, e.offset)
                if self.filename:
                    msg += ', in %s' % self.filename
                raise ParseError(msg, self.filename, e.lineno, e.offset)
        return Stream(_generate()).filter(_coalesce)

    def __iter__(self):
        return iter(self.parse())

    def _enqueue(self, kind, data, pos=None):
        if pos is None:
            pos = self._getpos()
        self._queue.append((kind, data, pos))

    def _getpos(self):
        lineno, column = self.getpos()
        return (self.filename, lineno, column)

    def handle_starttag(self, tag, attrib):
        fixed_attrib = []
        for name, value in attrib: # Fixup minimized attributes
            if value is None:
                value = name
            fixed_attrib.append((name, unicode(stripentities(value))))

        self._enqueue(START, (QName(tag), Attrs(fixed_attrib)))
        if tag in self._EMPTY_ELEMS:
            self._enqueue(END, QName(tag))
        else:
            self._open_tags.append(tag)

    def handle_endtag(self, tag):
        if tag not in self._EMPTY_ELEMS:
            while self._open_tags:
                open_tag = self._open_tags.pop()
                if open_tag.lower() == tag.lower():
                    break
                self._enqueue(END, QName(open_tag))
            self._enqueue(END, QName(tag))

    def handle_data(self, text):
        self._enqueue(TEXT, text)

    def handle_charref(self, name):
        if name.lower().startswith('x'):
            # hexadecimal character reference, e.g. &#x20AC;
            text = unichr(int(name[1:], 16))
        else:
            text = unichr(int(name))
        self._enqueue(TEXT, text)

    def handle_entityref(self, name):
        try:
            text = unichr(htmlentitydefs.name2codepoint[name])
        except KeyError:
            text = '&%s;' % name
        self._enqueue(TEXT, text)

    def handle_pi(self, data):
        # str.split() does not accept maxsplit as a keyword argument in
        # Python 2, so pass it positionally
        target, data = data.split(None, 1)
        data = data.rstrip('?')
        self._enqueue(PI, (target.strip(), data.strip()))

    def handle_comment(self, text):
        self._enqueue(COMMENT, text)

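# For instance, elements listed in `_EMPTY_ELEMS` such as <br> are closed
# immediately, recognized entity references become text, and adjacent TEXT
# events are merged by the `_coalesce` filter applied in `parse()`. An
# illustrative sketch in doctest style:
#
#   >>> for kind, data, pos in HTMLParser(StringIO('<p>x &amp; y<br></p>')):
#   ...     print kind, data
#   START (u'p', [])
#   TEXT x & y
#   START (u'br', [])
#   END br
#   END p
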
def HTML(text):
    """Parse the given HTML text and return a markup stream."""
    return Stream(list(HTMLParser(StringIO(text))))


def _coalesce(stream):
    """Coalesce adjacent TEXT events into a single event."""
    textbuf = []
    textpos = None
    for kind, data, pos in chain(stream, [(None, None, None)]):
        if kind is TEXT:
            textbuf.append(data)
            if textpos is None:
                textpos = pos
        else:
            if textbuf:
                yield TEXT, u''.join(textbuf), textpos
                del textbuf[:]
                textpos = None
            if kind:
                yield kind, data, pos
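
# A minimal usage sketch: running this module directly prints the coalesced
# event streams produced by the two convenience functions for a couple of
# arbitrary example documents.
if __name__ == '__main__':
    for kind, data, pos in XML('<doc id="1">Hello <em>world</em></doc>'):
        print kind, data
    for kind, data, pos in HTML('<UL compact><LI>Foo</UL>'):
        print kind, data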