# $Id$
# XHTML/HTML loader

import os
import re

# slightly silly
try:
    # import xml.etree.cElementTree as ET # may crash in 2.5b2 !?
    import xml.etree.ElementTree as ET
except ImportError:
    try:
        import cElementTree as ET
    except ImportError:
        import elementtree.ElementTree as ET

import BeautifulSoup as BS

import htmlentitydefs

# Matches a named entity reference such as "&amp;"; group 1 is the bare
# entity name.  (Raw string, so that "\w" reaches the regex engine as is.)
pattern = re.compile(r"&(\w+);")

##
# Replaces named entity references that are still present in a string
# with the replacement text listed in htmlentitydefs.entitydefs.
# References whose name is not in that table are left untouched.
#
# @param string Text that may contain "&name;" references.
# @return The text with all known references substituted.

def unescape(string):
    # work around oddities in BeautifulSoup's entity handling
    def unescape_entity(m, defs=htmlentitydefs.entitydefs):
        # unknown names fall back to the matched text, i.e. use as is
        return defs.get(m.group(1), m.group(0))
    return pattern.sub(unescape_entity, string)

##
# Loads an XHTML or HTML file into an Element structure, using Leonard's
# BeautifulSoup parser.
#
# @param file Name of the file to load.
# @return Whatever the TreeBuilder's close() method returns for the
#     events generated from the parsed document.

def load(file):
    # read the raw document, making sure the handle is closed even if
    # the read fails
    source = open(file)
    try:
        text = source.read()
    finally:
        source.close()

    # determine encoding (the document charset is not reliable): if the
    # data decodes as UTF-8 use that, otherwise fall back to Latin-1
    encoding = "utf-8"
    try:
        unicode(text, encoding)
    except UnicodeError:
        encoding = "iso-8859-1"

    soup = BS.BeautifulSoup(
        text, convertEntities="html", fromEncoding=encoding
    )

    # build the tree
    bob = ET.TreeBuilder()

    def emit(node):
        # NOTE(review): every NavigableString instance is emitted as
        # character data; presumably this also covers comments and
        # declarations -- confirm against the BeautifulSoup version used.
        if isinstance(node, BS.NavigableString):
            bob.data(unescape(node))
            return
        # node.attrs is iterated as (name, value) pairs
        attributes = dict((k, unescape(v)) for k, v in node.attrs)
        bob.start(node.name, attributes)
        for child in node:
            emit(child)
        bob.end(node.name)

    for item in soup:
        emit(item)
    return bob.close()

##
# Gets the inner text from an element subtree (the actual text content,
# minus the tags, that is).  The tail of the element itself is not
# included; the tails of its descendants are.
#
# @param elem Element to extract the text from.
# @return The concatenated text ("" if the subtree has no text).

def gettext(elem):
    parts = [elem.text or ""]
    for child in elem:
        parts.append(gettext(child))
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)

##
# Download URL.
#
# @param file URL to fetch.
# @param outfile Optional name of a file to store the downloaded data
#     in.  The data is first written to outfile + ".tmp", which is then
#     moved into place.
# @return A (headers, text) tuple: the headers attribute of the opened
#     URL object, and the data that was read from it.
# @exception SystemExit If the response contains the SourceForge
#     "ip blocked" marker.

def download(file, outfile=None):
    import urllib
    response = urllib.urlopen(file)
    try:
        # grab the headers before closing, so the return value does not
        # depend on the state of a closed handle
        headers = response.headers
        text = response.read()
    finally:
        response.close()
    if "send-email-to-ipblocked-at-sourceforge-dot-net" in text:
        raise SystemExit(
            "*** temporarily blocked by sourceforge! (please try again a little later)"
        )
    if outfile:
        tmpfile = outfile + ".tmp"
        # binary mode: store the data exactly as received (text mode
        # would rewrite newlines on Windows)
        out = open(tmpfile, "wb")
        try:
            out.write(text)
        finally:
            out.close()
        # remove any previous copy first, since os.rename refuses to
        # overwrite an existing file on Windows; a missing (or
        # unremovable) old file is not an error at this point
        try:
            os.remove(outfile)
        except OSError:
            pass
        os.rename(tmpfile, outfile)
    return headers, text