# $Id$
# silly xml-ish parser. this is good enough to parse the output from
# "tidy -n -asxml", but that's about all it's designed for...
##
# Simple and silly XML parser, for machine-generated XML that uses
# only a small subset of the full standard.
##
# (in case you wonder, I had to deploy some ET-based code on a server
# with Python 2.2, but no pyexpat. I could have used xmllib, but this
# was more fun... and the result is several times faster (within 10%
# of a pyexpat-based solution, in fact...) /F)
import re
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
# Replacement text for the five entities predefined by XML, keyed by
# entity name (the text between "&" and ";").
ENTITIES = {
    "amp": "&",
    "lt": "<",
    "gt": ">",
    "apos": "'",
    "quot": '"',
}
def _fixentity(tok):
    # Map the body of a reference (the text captured between "&" or
    # "&#" and ";") to the character it stands for.  Bodies that parse
    # as a number -- hexadecimal when prefixed with "x", decimal
    # otherwise -- are treated as character codes; anything else is
    # looked up in ENTITIES, so an unknown name raises KeyError.
    try:
        if tok[0] == "x":
            code = int(tok[1:], 16)
        else:
            code = int(tok)
        return unichr(code)
    except ValueError:
        # not a number (or not a valid character code): named entity
        return ENTITIES[tok]
# Tokenizer.  Every match sets exactly one group:
#   group 1: markup between "<" and ">" (tags, comments, declarations)
#   group 2: the body of an entity or character reference ("&name;",
#            "&#123;" or "&#x7b;"), without the "&", "#" and ";"
#   group 3: a run of character data
_parse_xml = re.compile(r"<([^>]+)>|&#?([^;]+);|([^<&]+)")
# name="value" or name='value' pairs inside a start tag; group 2 still
# includes the surrounding quotes.
_parse_attrib = re.compile(r"""(\w+)=("[^"]*"|'[^']*')""")
# Entity or character reference inside an attribute value; group 1 is
# the reference body, in the same form as group 2 of _parse_xml.
_parse_entity = re.compile(r"&#?(\w+);")
##
# Parses a subset of XML into an element tree. This parser assumes
# that the XML is well-formed, encoded using US-ASCII or UTF-8, and
# only uses decimal or hexadecimal character references, the five
# predefined entities, and default namespaces. If you're parsing XML
# from an arbitrary source, please use a real parser.
#
# @param file Source file or file name.
# @param target Optional target. If omitted, a standard TreeBuilder
# is used to build the element tree.
# @param encoding Optional encoding. If omitted, it's set to UTF-8.
# The parser does not look at the XML header.
# @return An ElementTree instance.
def parse(file, target=None, encoding="utf-8"):
    """Parse a small subset of XML and return an ElementTree.

    file -- a file name, or an object whose read() method returns the
        whole document as a byte string.
    target -- optional builder object providing start(tag, attrib),
        end(tag), data(text) and close(); an ET.TreeBuilder is created
        when this is None.
    encoding -- encoding used to decode the document.

    The input is assumed to be well-formed; no validation is done.
    Unknown named entities raise KeyError (see _fixentity).
    """
    opened_here = not hasattr(file, "read")
    if opened_here:
        file = open(file)
    try:
        text = unicode(file.read(), encoding)
    finally:
        # only close what we opened; a caller-supplied file object
        # remains the caller's responsibility
        if opened_here:
            file.close()
    if target is None:
        target = ET.TreeBuilder()
    for match in _parse_xml.finditer(text):
        # exactly one group matches, and its index identifies the
        # token type
        typ = match.lastindex
        tok = match.group(typ)
        if typ == 1:
            # <tag>
            if tok[0] == "/":
                # </tag>
                target.end(tok[1:])
            elif tok[0] in ("!", "?"):
                # <!-- comment -->, <!DOCTYPE ...> or <?xml ...?>:
                # none of these produce elements
                continue
            else:
                # <tag attr="value"> or <tag attr="value"/>
                close = tok[-1] == "/"
                parts = tok.split(None, 1)
                tag = parts[0]
                if close and len(parts) == 1:
                    # "<br/>": the slash is glued to the tag name
                    tag = tag[:-1]
                attrib = {}
                if len(parts) > 1:
                    for key, value in _parse_attrib.findall(parts[1]):
                        # default namespace declarations are dropped
                        if key != "xmlns":
                            # strip the quotes, then expand references
                            attrib[key] = _parse_entity.sub(
                                lambda m: _fixentity(m.group(1)), value[1:-1]
                            )
                target.start(tag, attrib)
                if close:
                    target.end(tag)
        elif typ == 2:
            # &something;
            target.data(_fixentity(tok))
        else:
            # something else
            target.data(tok)
    return ET.ElementTree(target.close())
if __name__ == "__main__":
# performance & sanity check
import time
t0 = time.clock()
tree = parse("pytut.html")
print time.clock() - t0
tree.write("pytut.xml")
# benchmark
t0 = time.clock()
tree = ET.parse("pytut.html")
print time.clock() - t0
from elementtree import SimpleXMLTreeBuilder
t0 = time.clock()
tree = ET.parse("pytut.html", SimpleXMLTreeBuilder.TreeBuilder())
print time.clock() - t0