# $Id$
# get text and xml versions of selected pages
#
# usage: infogami-backup user:pass@site [index]
#
# Walks the pages listed by the given index page (default
# "/_special/index") and, for each one, writes the rendered page to
# <name>.xml and the page source to <name>.txt.  The files go into a
# directory named after the first dot-separated component of the bot's
# site name (with a leading "http://" removed).
#
# Note: print() is always called with a single pre-formatted string, so
# it produces the same output as the Python 2 print statement.

import os
import sys
import infogamibot
import urllib2

ET = infogamibot.ET

# the first argument is required and must contain an "@"
# (user:pass@site); anything else gets the usage message
if len(sys.argv) < 2 or "@" not in sys.argv[1]:
    print("usage: infogami-backup user:pass@site [index]")
    sys.exit(1)
site = sys.argv[1]

# the second argument (index page) is optional
if len(sys.argv) > 2:
    index = sys.argv[2]
else:
    index = "/_special/index"

# use 5 minute caching to force if-modified-since behaviour and
# allow restarts
bot = infogamibot.Bot(site, cache=300)

# derive the output directory from the site name
backup_dir = bot.site
if backup_dir.startswith("http://"):
    backup_dir = backup_dir[7:]
backup_dir = backup_dir.split(".")[0]
if not os.path.isdir(backup_dir):
    os.makedirs(backup_dir)

indexes = bot.get_index(index)
# the index page itself is added to the work list; note that the filter
# below still skips it when its path contains "_special" (as the default
# index does)
indexes.append((index, "Index"))

for page, title in indexes:

    if page.startswith("http://"):
        continue

    # file name stem: page path without leading slashes or extension
    name = os.path.splitext(page.lstrip("/"))[0]

    parts = page.split("/")
    if "_comments" in parts or "_special" in parts:
        continue

    # get rendered page
    try:
        result = bot.get_body(page)
    except urllib2.HTTPError:
        # sys.exc_info() is used instead of "except ..., v" so the same
        # source is accepted by every Python 2 release
        if sys.exc_info()[1].code == 404:
            continue  # deleted pages are included in the main index
        raise

    # wrap title and body in a minimal <html><head/>...</html> tree
    html = ET.Element("html")
    head = ET.SubElement(html, "head")
    if result.title is not None:
        head.append(result.title)
    if result.body is not None:
        html.append(result.body)

    try:
        f = open(os.path.join(backup_dir, name + ".xml"), "w")
    except IOError:
        # e.g. the page name maps to a path that cannot be created;
        # report it and move on to the next page
        print("%s skipped: %s" % (page, sys.exc_info()[1]))
        continue
    try:
        ET.ElementTree(html).write(f)
    finally:
        f.close()  # always release the handle, even if write() fails
    print("%s -> %s ok" % (page, f.name))

    # get source
    result = bot.get_source(page)
    f = open(os.path.join(backup_dir, name + ".txt"), "w")
    try:
        # fall back to an empty title *before* encoding; the original
        # applied "or" after .encode(), which fails on a None title
        f.write((result.title or "").encode("utf-8") + "\n\n")
        if result.body:
            f.write(result.body.encode("utf-8"))
    finally:
        f.close()
    print("%s -> %s ok" % (page, f.name))