#
# JavaXMLTreeBuilder
# $Id$
#
# a Jython SAX tree builder, based on the Java Simple API for
# XML Parsing (SAX) 2.0 API.  This builder fully supports namespaces.
#
# history:
#
# 2005-02-01 Anthony Tarlano created
#
# Copyright (c) 2005 by Anthony Tarlano.  All rights reserved.
#
# tarlano@users.sf.net
#
# Copyright (c) 1999-2004 by Fredrik Lundh.  All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2004 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# -------------------------------------------------------------------- import string, sys import ElementTree import cStringIO from java.io import StringReader from org.xml.sax import InputSource from org.xml.sax.helpers import DefaultHandler from javax.xml.parsers import SAXParserFactory ## # Java ElementTree builder for XML source data. # # class TreeBuilder(DefaultHandler): def __init__(self, *argv): self._saxpf = SAXParserFactory.newInstance() self._saxpf.validating = self._saxpf.namespaceAware = 1 self._parser = parser = self._saxpf.newSAXParser() self._target = target = ElementTree.TreeBuilder() self._names = {} # name memo cache encoding = "utf-8" # TODO: support parser passed encoding self._doctype = None self._source = cStringIO.StringIO() self.entity = {} def startElement(self, uri, localname, name, attrs): #name = localname if len(uri): name = uri + '}' + name # FIXME: is this correct? /F attrib_in = {} for each in range(attrs.length): attrib_in[unicode(attrs.getQName(each))] = unicode(attrs.getValue(each)) self._start(unicode(name), attrib_in) def characters(self, characters, start, length): characters = "".join(characters[start:start+length]) self._data(characters) def endElement(self, uri, localname, name): if len(uri): name = uri + '}' + name self._end(unicode(name)) def notationDecl(self, name, publicId, systemId): print 'name:\t', name print 'publicId:\t', publicId print 'systemId:\t', systemId def processingInstruction(self, target, data): print 'target:\t', target print 'data:\t', data def resolveEntity(self, publicId, systemId): print 'publicId:\t', publicId print 'systemId:\t', systemId def skippedEntity(self, name): print 'name:\t', name def startPrefixMapping(self, prefix, uri): if 0: print 'prefix:\t', prefix print 'uri:\t', uri def endPrefixMapping(self, prefix): pass def unparsedEntityDecl(self, name, publicId, systemId, notationName): print 'name:\t', name print 'publicId:\t', publicId print 'systemId:\t', systemId print 'notationName:\t', 
notationName def _fixtext(self, text, raw=0): # remove the nsid from the text if not raw: try: if ':' in text: if ('}' in text): (ns,ln) = text.split('}') if ':' in ln: text = ns +'}'+ ln.split(':')[1] else: text = text.split(':')[1] except: pass # convert text string to ascii, if possible try: return str(text) # what if the default encoding is changed? except UnicodeError: return text def _fixname(self, key): # expand qname, and convert name string to ascii, if possible try: name = self._names[key] except KeyError: name = key if "}" in name: name = "{" + name self._names[key] = name = self._fixtext(name) return name def _start(self, tag, attrib_in): fixname = self._fixname tag = fixname(tag) attrib = {} for key, value in attrib_in.items(): attrib[fixname(key)] = self._fixtext(value) return self._target.start(tag, attrib) def _data(self, text): return self._target.data(self._fixtext(text)) def _end(self, tag): return self._target.end(self._fixname(tag)) def _default(self, text): prefix = text[:1] if prefix == "&": # deal with undefined entities try: self._target.data(self.entity[text[1:-1]]) except KeyError: raise expat.error( "undefined entity %s: line %d, column %d" % (text, self._parser.ErrorLineNumber, self._parser.ErrorColumnNumber) ) elif prefix == "<" and text[:9] == "": self._doctype = None return text = string.strip(text) if not text: return self._doctype.append(text) n = len(self._doctype) if n > 2: type = self._doctype[1] if type == "PUBLIC" and n == 4: name, type, pubid, system = self._doctype elif type == "SYSTEM" and n == 3: name, type, system = self._doctype pubid = None else: return if pubid: pubid = pubid[1:-1] self.doctype(name, pubid, system[1:-1]) self._doctype = None ## # Handle doctype declaration. def doctype(self, name, pubid, system): pass ## # Feed data to the parser. # FIXME: Antony writes: Fredrik, the java parser wants the whole # InputSource, File or URI. 
In order to be compatible with the # __init__, parse, close interface and use the SAX parser # TreeBuilder the source file is reconstructed and parsed in close # FIXME: it would probably make sense to change ET's "parse" # method to look for a "parsefile" method (or something similar) # before falling back on the feed/close interface. cf. cET. /F def feed(self, data): self._source.write(data) ## # Finish feeding data to the parser. def close(self): self._source.seek(0) self._parser.parse(InputSource(StringReader( self._source.read())), self) tree = self._target.close() del self._target, self._parser # get rid of circular references return tree # ==================================================================== if __name__=='__main__': tb = TreeBuilder() tb.feed(sys.argv[1]) tb.close()