# $Id$
# testbench for writer code.  currently requires Python 2.4 or later.

import re
import sys

try:
    import collections
    deque = collections.deque
except (ImportError, AttributeError):
    deque = list # marginally slower, for our use case

##
# Reference implementation of {@link traverse}: a recursive generator
# that yields 'start', 'data', and 'end' events for all elements in a
# subtree (similar to the XMLParser class).  Kept for documentation
# and cross-checking; the stack-based version below is the one
# exported as <b>traverse</b>.
#
# @param elem Root element of the subtree.
# @return A generator yielding (event, value) tuples, where value is
#     the element for 'start'/'end' events, and the text or tail
#     string for 'data' events.

def _traverse_recursive(elem):
    yield "start", elem
    if elem.text:
        yield "data", elem.text
    for e in elem:
        for i in _traverse_recursive(e):
            yield i
    yield "end", elem
    if elem.tail:
        yield "data", elem.tail

##
# Helper to traverse all elements in a subtree.  This is a generator
# that yields 'start', 'data', and 'end' events in document order.
# This is the ugly but efficient stack-based version; it uses two
# parallel stacks (pending actions and their elements) instead of
# recursion.
#
# @param elem Root element of the subtree.
# @return A generator yielding (event, value) tuples, where value is
#     the element for 'start'/'end' events, and the text or tail
#     string for 'data' events.  Empty text/tail values are skipped.

def traverse(elem):
    stack1 = deque() # pending actions
    stack2 = deque() # elements matching the pending actions
    # bind the stack methods to locals; this loop is the hot path
    pop1 = stack1.pop; append1 = stack1.append; extend1 = stack1.extend
    pop2 = stack2.pop; append2 = stack2.append; extend2 = stack2.extend
    append1("start"); append2(elem)
    while stack1:
        action = pop1(); elem = pop2()
        if action == "start":
            # schedule the end event first, so that it is processed
            # after all children have been handled
            append1("end"); append2(elem)
            if len(elem):
                # children are pushed in reverse order, so that the
                # first child is popped first
                extend1(["start"] * len(elem))
                extend2(reversed(elem))
            yield action, elem
            if elem.text:
                yield "data", elem.text
        else:
            yield action, elem
            if elem.tail:
                yield "data", elem.tail

##
# Standard XML output stream.
#
# @param output Output handler.  This object is called with 8-bit string
#     fragments.  To write to a file-like object, pass in a reference to
#     the write method.
# @param encoding Output encoding.

class xml_stream(object):

    def __init__(self, output, encoding):

        def encode(text):
            return text.encode(encoding)

        # encoded once, so that the output handler only sees 8-bit
        # strings
        quote = encode('"')

        def output_cdata(text):
            # escape and write character data.
            # it's worth avoiding do-nothing calls for strings that are
            # shorter than 500 character, or so.  assume that's, by far,
            # the most common case.
            if "&" in text:
                # must be done first, to avoid escaping the entities
                # generated below
                text = text.replace("&", "&amp;")
            if "<" in text:
                text = text.replace("<", "&lt;")
            if ">" in text:
                text = text.replace(">", "&gt;")
            output(text.encode(encoding, "xmlcharrefreplace"))

        def output_attrib(text):
            # escape and write a quoted attribute value
            if "&" in text:
                text = text.replace("&", "&amp;")
            if "<" in text:
                # a literal "<" is not allowed in attribute values
                text = text.replace("<", "&lt;")
            if '"' in text:
                text = text.replace('"', "&quot;")
            output(quote)
            output(text.encode(encoding, "xmlcharrefreplace"))
            output(quote)

        self.encode = encode
        self.output_raw = output
        self.output_cdata = output_cdata
        self.output_attrib = output_attrib

##
# Standard XML writer.  This writer is similar to the default writer
# in ElementTree, but handles namespaces a bit better, and is a bit
# faster.  It currently works with arbitrary ASCII-compatible
# encodings, but doesn't yet support QName attribute values or
# text values.
#
# @param tree Root element of the subtree to serialize.
# @param stream Output stream; an object providing <b>encode</b>,
#     <b>output_raw</b>, <b>output_cdata</b>, and <b>output_attrib</b>
#     callables (see {@link xml_stream}).
# @param namespace_map Optional prefix-to-uri mapping, given either as
#     a dictionary or as a sequence of (prefix, uri) pairs.
# @param default_namespace Optional uri of the default namespace.
#     Names in this namespace are written without a prefix.

def write_xml(tree, stream, namespace_map=None, default_namespace=None):

    # rough profile: for 1.6-second test case,
    # 0.2 seconds are spent on prescan
    # 0.6 seconds are spent on generating output
    # 0.6 seconds are spent on encoding/escaping it
    # (ET 1.3 needs 2.5 seconds for the same task)

    # stream api
    encode = stream.encode
    output = stream.output_raw
    output_cdata = stream.output_cdata
    output_attrib = stream.output_attrib

    # markup fragments.  these are encoded once, up front, so that
    # everything passed to the output handler is an 8-bit string
    tag_open = encode("<")
    end_tag_open = encode("</")
    tag_close = encode(">")
    empty_tag_close = encode(" />")
    space = encode(" ")
    equals = encode("=")
    xmlns = encode("xmlns")
    xmlns_prefix = encode("xmlns:")

    qnames = {} # maps qnames to *encoded* prefix:local names
    namespaces = {} # maps uri:s to (unencoded) prefixes

    if namespace_map:
        if isinstance(namespace_map, dict):
            namespace_map = namespace_map.items()
        for prefix, uri in namespace_map:
            namespaces[uri] = prefix

    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        if qname[:1] == "{":
            uri, tag = qname[1:].split("}", 1)
            try:
                prefix = namespaces[uri]
            except KeyError:
                # unknown namespace; make up a prefix
                namespaces[uri] = prefix = "ns%d" % len(namespaces)
            if prefix:
                qnames[qname] = encode("%s:%s" % (prefix, tag))
            else:
                # default namespace.
                # NOTE(review): this branch is also used for attribute
                # names, which are then written without a prefix
                qnames[qname] = encode(tag)
        else:
            qnames[qname] = encode(qname)

    # first pass: populate qname and namespaces table
    iterate = getattr(tree, "iter", None)
    if iterate is None:
        iterate = tree.getiterator # older ElementTree versions
    for elem in iterate():
        if elem.tag not in qnames:
            add_qname(elem.tag)
        for key in elem.keys():
            if key not in qnames:
                add_qname(key)

    # second pass: generate output
    declarations_pending = True # namespaces are declared on the root
    for event, elem in traverse(tree):
        if event == "data":
            # FIXME: handle qname objects (how?)
            output_cdata(elem)
        elif event == "start":
            output(tag_open)
            output(qnames[elem.tag])
            # attributes
            for key in sorted(elem.keys()):
                output(space)
                output(qnames[key])
                output(equals)
                # FIXME: handle qname objects (how?)
                output_attrib(elem.get(key))
            if declarations_pending:
                # XML namespace declarations; written only once, on
                # the first element
                declarations_pending = False
                for uri in sorted(namespaces):
                    output(space)
                    prefix = namespaces[uri]
                    if prefix:
                        output(xmlns_prefix)
                        output(encode(prefix))
                    else:
                        output(xmlns) # default namespace
                    output(equals)
                    output_attrib(uri)
            if len(elem) or elem.text:
                output(tag_close)
            else:
                output(empty_tag_close)
        elif event == "end":
            # elements without content were closed by the start tag
            if len(elem) or elem.text:
                output(end_tag_open)
                output(qnames[elem.tag])
                output(tag_close)

##
# Serializes an element tree as XML, using the standard XML output
# stream.
#
# @param tree An element, or an object with a <b>getroot</b> method
#     (such as an ElementTree instance).
# @param output Output handler.  This object is called with 8-bit
#     string fragments.
# @param encoding Output encoding.  Defaults to UTF-8.
# @param **options Additional options, passed on to {@link write_xml}
#     (<b>namespace_map</b>, <b>default_namespace</b>).

def write(tree, output, encoding="utf-8", **options):
    if not hasattr(tree, "tag"):
        # assume it's an ElementTree instance
        tree = tree.getroot()
    return write_xml(tree, xml_stream(output, encoding), **options)