# $Id$
# XHTML/HTML loader
import os, re
# slightly silly
# Pick an ElementTree implementation: the historical C accelerators
# first (same order as before), then the pure-Python packages.
try:
    import xml.etree.cElementTree as ET
except ImportError:
    try:
        import cElementTree as ET
    except ImportError:
        try:
            import elementtree.ElementTree as ET
        except ImportError:
            # xml.etree.cElementTree no longer exists on Python 3.9+, and
            # the two standalone packages above are usually not installed
            # there; without this last fallback the module cannot be
            # imported at all on current interpreters.
            import xml.etree.ElementTree as ET
# Namespace prefix, in ElementTree "{uri}" form, that load() strips
# from element tags.
NS_XHTML = "{http://www.w3.org/1999/xhtml}"


def _parse_file(file, loader):
    # Open *file* in binary mode, hand the stream to *loader*, and close
    # the stream again whether or not the loader succeeds.
    with open(file, "rb") as f:
        return loader(f)


def _run_tidy(file):
    # Best-effort in-place conversion of *file* to XHTML with the external
    # "tidy" program.  Tidy's stdout/stderr are captured in tidy.out and
    # tidy.err in the current directory.
    import subprocess  # local import: only needed on this fallback path

    try:
        with open("tidy.out", "wb") as out:
            with open("tidy.err", "wb") as err:
                # An argument list (no shell) passes the file name
                # verbatim, so quotes, spaces or shell metacharacters in
                # the name cannot break or hijack the command.
                subprocess.call(
                    ["tidy", "-qmn", "-asxml", file],
                    stdout=out, stderr=err,
                )
    except (IOError, OSError):
        # Deliberately ignored: tidy may be missing or the log files may
        # not be writable.  The caller simply re-parses the file and moves
        # on to its next fallback if that still fails.
        pass


##
# Loads an XHTML or HTML file into an Element structure. Note that
# HTML files are converted to XHTML in place, via tidy.
def load(file, loader=None):
    """Parse *file* and return the result with XHTML namespaces removed.

    file -- path of the document to load.  If the first parse fails the
        file is rewritten in place: first by tidy, then, if it still does
        not parse, by emergency_cleanup().
    loader -- callable taking an open binary file object and returning
        the parsed document; defaults to ET.parse.

    Returns whatever the loader returned, after stripping the XHTML
    namespace prefix from every tag.  If the file cannot be parsed even
    after both repair steps, the exception from the final attempt
    propagates to the caller.
    """
    if not loader:
        loader = ET.parse
    try:
        elem = _parse_file(file, loader)
    except Exception:
        # "except Exception" rather than a bare except, so that
        # KeyboardInterrupt / SystemExit are not mistaken for bad markup.
        # FIXME: needs locking! (atomic rename should be good enough)
        _run_tidy(file)
        try:
            elem = _parse_file(file, loader)
        except Exception:
            emergency_cleanup(file)
            # if this fails, the file was too broken
            elem = _parse_file(file, loader)
    # clean up namespace
    # getiterator() was removed in Python 3.9; prefer iter() and fall back
    # to getiterator() for old ElementTree versions that lack iter().
    walk = getattr(elem, "iter", None) or elem.getiterator
    for node in walk():
        tag = node.tag
        # Nodes produced by a custom loader (e.g. comments) may carry a
        # non-string tag; leave those untouched instead of crashing.
        if hasattr(tag, "startswith") and tag.startswith(NS_XHTML):
            node.tag = tag[len(NS_XHTML):]
    return elem
##
# Gets the inner text from an element subtree (the actual text content,
# minus the tags, that is).
def gettext(elem):
    """Return the text content of *elem* and all its descendants.

    The pieces are collected in document order: the element's own text,
    then, for each child, the child's text content followed by the
    child's tail.  Tags are dropped; missing text or tail counts as "".
    """
    pieces = [elem.text or ""]
    for child in elem:
        pieces.append(gettext(child))
        pieces.append(child.tail or "")
    return "".join(pieces)
##
# Emergency HTML cleanup.
def emergency_cleanup(file):
    """Remove script elements from *file*, rewriting it in place.

    file -- path of the HTML file to clean.  Its contents are read and
        written back as raw bytes, so no decoding or re-encoding happens.

    Errors from opening, reading or writing the file propagate to the
    caller.
    """
    with open(file, "rb") as f:
        data = f.read()
    # nuke script tags
    # The previous pattern consisted of the "(?s)" flag alone, which only
    # matches empty strings, so nothing was ever removed.  The pattern is
    # bytes because the file is read in binary mode; (?i) covers <SCRIPT>
    # as well, and (?s) lets "." span newlines inside the script body.
    data = re.sub(br"(?is)<script\b.*?</script\s*>", b"", data)
    # ... add more stuff here ... (see e.g. getpages)
    with open(file, "wb") as f:
        f.write(data)