# $Id$
# XHTML/HTML loader

import os
import re
import subprocess

# slightly silly: try the fast C implementations first, then the
# pure-Python ones.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    try:
        import cElementTree as ET
    except ImportError:
        try:
            import elementtree.ElementTree as ET
        except ImportError:
            # xml.etree.cElementTree was removed in Python 3.9; the plain
            # ElementTree module is the remaining standard implementation.
            import xml.etree.ElementTree as ET

NS_XHTML = "{http://www.w3.org/1999/xhtml}"


##
# Opens a file in binary mode and runs the loader on it.  The file is
# always closed, whether or not the loader succeeds.
#
# @param file Name of the file to parse.
# @param loader Callable that takes an open file object.
# @return Whatever the loader returns.

def _parse(file, loader):
    with open(file, "rb") as f:
        return loader(f)


##
# Runs "tidy -qmn -asxml" on a file, converting it to XHTML in place.
# Tidy's standard output and standard error are written to "tidy.out"
# and "tidy.err" in the current directory.  This is best effort only:
# a missing tidy executable, unwritable log files, or a non-zero exit
# status are all ignored, and the caller finds out by parsing again.
#
# @param file Name of the file to convert.

def _tidy(file):
    try:
        with open("tidy.out", "wb") as out:
            with open("tidy.err", "wb") as err:
                # argument list, no shell: the file name is passed to
                # tidy verbatim and cannot be interpreted as shell syntax
                subprocess.call(
                    ["tidy", "-qmn", "-asxml", file],
                    stdout=out, stderr=err
                    )
    except (OSError, IOError):
        pass  # deliberate; see above


##
# Loads an XHTML or HTML file into an Element structure.  Note that
# HTML files are converted to XHTML in place, via tidy.
# <p>
# If the file cannot be parsed as is, it is first run through tidy.
# If it still cannot be parsed, it is passed to {@link
# #emergency_cleanup} and parsed one last time.  Both repair steps
# modify the file on disk.
#
# @param file Name of the file to load.
# @param loader Optional callable that takes an open binary file object
#     and returns a tree.  Defaults to ET.parse.
# @return The object returned by the loader (an ElementTree instance
#     for the default loader), with the XHTML namespace prefix removed
#     from all tags.
# @exception Exception Whatever the loader or open raises, if the file
#     still cannot be loaded after both repair attempts.

def load(file, loader=None):
    if not loader:
        loader = ET.parse
    try:
        elem = _parse(file, loader)
    except Exception:
        # FIXME: needs locking! (atomic rename should be good enough)
        _tidy(file)
        try:
            elem = _parse(file, loader)
        except Exception:
            emergency_cleanup(file)
            # if this fails, the file was too broken
            elem = _parse(file, loader)
    # clean up namespace.  getiterator() was removed in Python 3.9, and
    # iter() is missing from old ElementTree releases, so look for both.
    walk = getattr(elem, "iter", None)
    if walk is None:
        walk = elem.getiterator
    for node in walk():
        if node.tag.startswith(NS_XHTML):
            node.tag = node.tag[len(NS_XHTML):]
    return elem


##
# Gets the inner text from an element subtree (the actual text content,
# minus the tags, that is).  The tail of the element itself is not
# included.
#
# @param elem Root element of the subtree.
# @return The concatenated text, or an empty string if there is none.

def gettext(elem):
    parts = [elem.text or ""]
    for child in elem:
        parts.append(gettext(child))
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)


##
# Emergency HTML cleanup.  Rewrites the file in place, with all script
# elements (start tag, content, and end tag) removed.
#
# @param file Name of the file to clean up.

def emergency_cleanup(file):
    with open(file, "rb") as f:
        data = f.read()
    # nuke script tags.  the data is read in binary mode, so the pattern
    # must be a byte string too; tag names are matched case-insensitively.
    data = re.sub(br"(?is)<script\b.*?</script\s*>", b"", data)
    # ... add more stuff here ... (see e.g. getpages)
    with open(file, "wb") as f:
        f.write(data)