# $Id$
# tools to convert MHT document to individual HTML pages

import email
import StringIO
import urlparse

import ElementSoup # see http://effbot.org/zone/element-soup.htm

ET = ElementSoup.ET

##
# Gets inner text from an element.
#
# @param elem An element.
# @return The element's own text followed, for each child in document
#     order, by that child's inner text and the child's tail text.

def gettext(elem):
    parts = [elem.text or ""]
    for child in elem:
        parts.append(gettext(child))
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)

##
# Wraps an RFC 2557-style web archive file (e.g. an MHT file generated
# by Microsoft Word).
#
# @param file A filename, or an object with a <b>read</b> method.  A
#     file opened here from a filename is closed again before the
#     constructor returns; a caller-supplied file object is left open.
# @exception ValueError If the archive has no text/html part carrying
#     a Content-Location header.

class WebArchive:

    def __init__(self, file):
        opened_here = not hasattr(file, "read")
        if opened_here:
            file = open(file)
        try:
            # the whole message is parsed into memory here, so the
            # file is not needed afterwards
            msg = email.message_from_file(file)
        finally:
            if opened_here:
                file.close()

        master = None # first html resource
        resources = {} # other resources, keyed on content-location

        for part in msg.walk():
            location = part.get("content-location")
            if not location:
                continue # parts without a location cannot be referenced
            if master is None and part.get_content_type() == "text/html":
                master = part
            else:
                resources[location] = part

        if master is None:
            raise ValueError(
                "no text/html part with a Content-Location header found"
                )

        self.master = master
        self.resources = resources

        # parse
        data = master.get_payload(decode=True)
        data = ElementSoup.parse(StringIO.StringIO(data))

        # nuke the default namespace (which is a bogus HTML namespace);
        # tolerate documents where the attribute is not present
        data.attrib.pop("xmlns", None)

        # roundtrip the data to get proper XML
        self.tree = ET.XML(ET.tostring(data))

    ##
    # Gets the Content-Location of the master HTML part.
    #
    # @return The location string; relative image links are resolved
    #     against it.

    def get_base_uri(self):
        return self.master.get("content-location")

    ##
    # Gets the parsed master HTML document.
    #
    # @return The root element of the tree built by the constructor.

    def get_html(self):
        return self.tree

    ##
    # Finds all <b>img</b> elements whose source is stored in the archive.
    # Images whose resolved source has no matching resource are skipped.
    #
    # @return A generator yielding (element, name, content type, data)
    #     tuples, where name is the resource's Content-Location and
    #     data is the decoded payload.

    def get_image_links(self):
        base = self.get_base_uri()
        for elem in self.tree.findall(".//img"):
            src = urlparse.urljoin(base, elem.get("src", ""))
            part = self.resources.get(src)
            if part is None:
                continue # not stored in this archive
            name = part.get("content-location")
            content_type = part.get_content_type()
            data = part.get_payload(decode=True)
            # FIXME: use resource object instead of (name, type, data)?
            yield elem, name, content_type, data

    ##
    # Gets a stored resource by its Content-Location.
    #
    # @param name The resource location, exactly as stored in the archive.
    # @return A (content type, decoded data) tuple.
    # @exception KeyError If there is no resource with that location.

    def get_resource(self, name):
        part = self.resources[name]
        content_type = part.get_content_type()
        data = part.get_payload(decode=True)
        # FIXME: use resource object instead of (type, data)?
        return content_type, data

if __name__ == "__main__":

    import os

    archive = WebArchive("samples/test.mht")

    # fix up image links: save each image in the current directory and
    # point the img element at the local copy
    for elem, name, content_type, data in archive.get_image_links():
        f = open(os.path.basename(name), "wb")
        try:
            f.write(data)
        finally:
            f.close()
        elem.set("src", f.name)
        print("%s ok" % f.name)

    # extract html body
    html = archive.get_html()
    f = open("out.html", "wb")
    try:
        f.write(ET.tostring(html))
    finally:
        f.close()
    print("%s ok" % f.name)