# $Id$
# XHTML/HTML loader
import os, re
# slightly silly
try:
# import xml.etree.cElementTree as ET # may crash in 2.5b2 !?
import xml.etree.ElementTree as ET
except ImportError:
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
import BeautifulSoup as BS
import htmlentitydefs, re
# matches a named entity reference such as "&amp;"; group 1 is the bare name
# (raw string, so the backslash in \w reaches the regex engine untouched)
pattern = re.compile(r"&(\w+);")

##
# Replaces named entity references in a string with the replacement
# text listed in htmlentitydefs.entitydefs.  References whose name is
# not in that table are left in the string exactly as written.
#
# @param string The text to process.
# @return The text with all known named entities replaced.

def unescape(string):
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m, defs=htmlentitydefs.entitydefs):
        # unknown entity names fall back to the matched text (use as is)
        return defs.get(m.group(1), m.group(0))
    return pattern.sub(unescape_entity, string)
##
# Loads an XHTML or HTML file into an Element structure, using Leonard's
# BeautifulSoup parser.
# <p>
# The file contents are first tried as UTF-8; if they do not decode,
# ISO-8859-1 is used instead.  The chosen encoding is handed to
# BeautifulSoup, and the resulting soup is replayed into an ElementTree
# TreeBuilder.
#
# @param file Name of the file to load (passed to open()).
# @return The result of the tree builder's close() call.

def load(file):
    def emit(node):
        # string nodes become character data; anything else is treated
        # as a tag whose children are emitted recursively
        if isinstance(node, BS.NavigableString):
            bob.data(unescape(node))
        else:
            bob.start(node.name, dict((k, unescape(v)) for k, v in node.attrs))
            for child in node:
                emit(child)
            bob.end(node.name)
    # read the raw contents; try/finally (rather than "with") makes sure
    # the handle is closed while staying usable on old interpreters
    fp = open(file)
    try:
        text = fp.read()
    finally:
        fp.close()
    # determine encoding (the document charset is not reliable)
    try:
        encoding = "utf-8"
        unicode(text, encoding)
    except UnicodeError:
        encoding = "iso-8859-1"
    soup = BS.BeautifulSoup(
        text, convertEntities="html", fromEncoding=encoding
    )
    # build the tree; emit() picks up "bob" from this scope when called
    bob = ET.TreeBuilder()
    for s in soup:
        emit(s)
    return bob.close()
##
# Collects the character data of an element subtree: the element's own
# text, followed by the text and tail of every descendant in document
# order, with all tags left out.  The tail of the element itself is
# not included.
#
# @param elem The element at the top of the subtree.
# @return The concatenated text, or an empty string if there is none.

def gettext(elem):
    pieces = [elem.text or ""]
    for child in elem:
        # the child's own subtree first, then whatever follows its end tag
        pieces.append(gettext(child))
        pieces.append(child.tail or "")
    return "".join(pieces)
##
# Download URL.
# <p>
# Fetches the given URL and returns the response headers together with
# the body.  If an output file name is given, the body is also written
# to that file: it goes to "outfile.tmp" first, which then replaces
# any existing file of that name.
#
# @param file The URL to fetch (passed to urllib.urlopen).
# @param outfile Optional name of a file to store the body in.
# @return A (headers, text) tuple.
# @exception SystemExit If the body contains the sourceforge
#     "ip blocked" marker.

def download(file, outfile=None):
    import urllib
    response = urllib.urlopen(file)
    try:
        # grab the headers before closing, so nothing is read from a
        # closed response object afterwards
        headers = response.headers
        text = response.read()
    finally:
        response.close()
    if "send-email-to-ipblocked-at-sourceforge-dot-net" in text:
        raise SystemExit(
            "*** temporarily blocked by sourceforge! (please try again a little later)"
        )
    if outfile:
        tmpfile = outfile + ".tmp"
        # binary mode: store the payload exactly as received, without
        # platform newline translation
        out = open(tmpfile, "wb")
        try:
            out.write(text)
        finally:
            out.close()
        # remove the old copy first, since rename may refuse to overwrite
        # an existing file on some platforms; a missing file is fine
        try:
            os.remove(outfile)
        except OSError:
            pass
        os.rename(tmpfile, outfile)
    return headers, text