# $Id$
# silly xml-ish parser.  this is good enough to parse the output from
# "tidy -n -asxml", but that's about all it's designed for...

##
# Simple and silly XML parser, for machine-generated XML that uses
# only a small subset of the full standard.
##

# (in case you wonder, I had to deploy some ET-based code on a server
# with Python 2.2, but no pyexpat.  I could have used xmllib, but this
# was more fun... and the result is several times faster (within 10%
# of a pyexpat-based solution, in fact...) /F)

import re

# Prefer the historical standalone ElementTree packages; fall back to the
# copy that ships in the standard library when neither is installed.
try:
    import cElementTree as ET
except ImportError:
    try:
        import elementtree.ElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

# unichr() only exists on Python 2; on Python 3 chr() already returns text.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

# The five entities predefined by XML, keyed by name (without "&" and ";").
ENTITIES = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "apos": "'",
    "quot": '"',
}


def _fixentity(tok):
    """Return the character denoted by an entity or character reference.

    *tok* is the reference body with the leading "&" / "&#" and the
    trailing ";" already removed: a decimal number ("38"), a hexadecimal
    number prefixed with "x" ("x26"), or an entity name ("amp").

    Raises KeyError when *tok* is neither a usable character number nor
    a name listed in ENTITIES.
    """
    try:
        if tok[0] == "x":
            return _unichr(int(tok[1:], 16))
        return _unichr(int(tok))
    except ValueError:
        # Not a number (or not a valid code point): treat it as a name.
        return ENTITIES[tok]


# Tokenizer.  Exactly one group matches per token:
#   1: tag body (text between "<" and the next ">")
#   2: entity / character reference body (the optional "#" is dropped)
#   3: run of plain character data
_parse_xml = re.compile(r"<([^>]+)>|&#?([^;]+);|([^<&]+)")

# name=value pairs inside a start tag; the value keeps its quotes.  Names
# may contain ":", "." and "-" (e.g. xml:lang, http-equiv), and whitespace
# is allowed around "=".
_parse_attrib = re.compile(r"""([\w:.-]+)\s*=\s*("[^"]*"|'[^']*')""")

# Entity / character references embedded in an attribute value.
_parse_entity = re.compile(r"&#?(\w+);")

##
# Parses a subset of XML into an element tree.  This parser assumes
# that the XML is well-formed, encoded using US-ASCII or UTF-8, and
# only uses numeric (decimal or "x"-prefixed hexadecimal) character
# references, the five predefined entities, and default namespaces.
# If you're parsing XML from an arbitrary source, please use a real
# parser.
#
# @param file Source file or file name.
# @param target Optional target.  If omitted, a standard TreeBuilder
#     is used to build the element tree.
# @param encoding Optional encoding.  If omitted, it's set to UTF-8.
#     The parser does not look at the XML header.
# @return An ElementTree instance.

def parse(file, target=None, encoding="utf-8"):
    """Parse a small subset of XML into an element tree.

    *file* is a file name or an object with a ``read()`` method.
    *target* receives ``start(tag, attrib)``, ``end(tag)`` and
    ``data(text)`` calls and is finished with ``close()``; when omitted
    an ``ET.TreeBuilder`` is used.  *encoding* is used to decode the
    input when ``read()`` returns bytes; the XML header is not consulted.

    Markup starting with "!" (doctype, comments) or "?" (XML declaration,
    processing instructions) is skipped.  The input is assumed to be
    well-formed; tags are cut at the first ">", so a ">" inside a comment
    or attribute value is not supported.

    Returns ``ET.ElementTree`` wrapping whatever ``target.close()`` returns.
    """
    opened_here = not hasattr(file, "read")
    if opened_here:
        # Binary mode so the bytes reach the decoder untouched.
        file = open(file, "rb")
    try:
        raw = file.read()
    finally:
        # Only close what we opened; a caller-supplied file stays open.
        if opened_here:
            file.close()
    # type(u"") is unicode on Python 2 and str on Python 3: decode only
    # when read() handed back encoded bytes.
    if not isinstance(raw, type(u"")):
        raw = raw.decode(encoding)

    # "is None" rather than truthiness, so a caller-supplied target that
    # happens to be falsy is not silently replaced.
    if target is None:
        target = ET.TreeBuilder()

    for match in _parse_xml.finditer(raw):
        typ = match.lastindex
        tok = match.group(typ)
        if typ == 1:
            # <tag>
            if tok[0] == "/":
                # </tag>
                target.end(tok[1:].strip())
            elif tok[0] in "!?":
                # <!something> or <?something?>
                continue
            else:
                # <tag> or <tag/>
                close = tok[-1] == "/"
                if close:
                    # Drop the slash so "<br/>" yields "br", not "br/".
                    tok = tok[:-1]
                tok = tok.split(None, 1)
                attrib = {}
                if len(tok) > 1:
                    for key, value in _parse_attrib.findall(tok[1]):
                        # Default namespace declarations are ignored.
                        if key != "xmlns":
                            # Strip the quotes, then expand references.
                            value = _parse_entity.sub(
                                lambda m: _fixentity(m.group(1)),
                                value[1:-1]
                                )
                            attrib[key] = value
                target.start(tok[0], attrib)
                if close:
                    target.end(tok[0])
        elif typ == 2:
            # &something;
            target.data(_fixentity(tok))
        else:
            # something else
            target.data(tok)
    return ET.ElementTree(target.close())


if __name__ == "__main__":

    # performance & sanity check
    import time

    # time.clock() was removed in Python 3.8; use perf_counter when present.
    timer = getattr(time, "perf_counter", None) or time.clock

    t0 = timer()
    tree = parse("pytut.html")
    print(timer() - t0)
    tree.write("pytut.xml")

    # benchmark
    t0 = timer()
    tree = ET.parse("pytut.html")
    print(timer() - t0)

    from elementtree import SimpleXMLTreeBuilder
    t0 = timer()
    tree = ET.parse("pytut.html", SimpleXMLTreeBuilder.TreeBuilder())
    print(timer() - t0)