# $Id$ # generate comments page for an infogami site import sys, re, time import infogamibot import urlparse, urllib2 # -------------------------------------------------------------------- # configuration INDEX = "_special/index" # cache entries older than this are checked CACHE_AGE = 10 * 60 # 10 minutes # comments older than this are left out MAX_AGE = 10 * 24 * 3600 # 10 days # comments older than this are shown with titles only FRESH_AGE = 3 * 24 * 3600 # 1 day FRESH_COUNT = 5 # at least this many # -------------------------------------------------------------------- now = time.time() MAX_AGE = time.gmtime(now - MAX_AGE) FRESH_AGE = time.gmtime(now - FRESH_AGE) ET = infogamibot.ET def gettext(elem): text = elem.text or "" for e in elem: text += gettext(e) if e.tail: text += e.tail return text try: site = sys.argv[1] if "@" not in site: raise IndexError except IndexError: print "usage: infogami-comments user:pass@site" sys.exit(1) bot = infogamibot.Bot(site, cache=CACHE_AGE) # get list of comments comment_list = [] for page, title in bot.get_index(INDEX): if page.startswith("/%2F_"): continue # skip toplevel comments m = re.match(".*/_comments/(\w+)$", page) if not m: continue comment_id = m.group(1).zfill(10) comment_list.append((comment_id, page)) comment_list.sort() comment_list.reverse() # get comment information, starting with newest and going backwards comments = [] titles = {} for _, page in comment_list: # print "---", "fetch", page try: result = bot.get_html(page) # get comment timestamp except urllib2.HTTPError, v: if v.code == 404: continue print "===", "error reading", bot.site + page, "-", v continue except SyntaxError, v: print "===", "error parsing", bot.site + page, "-", v continue # check age mtime = result.headers.getdate("last-modified") if len(comments) > 10 and mtime < MAX_AGE: break # extract comment text text = author = "" for elem in result.getiterator(): if elem.get("class") != "page": continue for e in elem.getiterator(): if e.get("class") == "dateline" and not author: author = gettext(e).strip() e.clear() text = gettext(elem).strip() break # locate actual page page = page.split("/_comments")[0] comments.append((mtime, page, text, author)) if page not in titles: # print "---", "fetch", page, "title" try: result = bot.get_body(page) except urllib2.HTTPError, v: titles[page] = "(unknown)" else: if result.title is not None: titles[page] = infogamibot.escape(result.title.text) else: print "===", "unknown title for", page titles[page] = "(unknown)" comments.sort() comments.reverse() body = [ "This list is updated by an off-site script, at irregular intervals.\n", "\n", ] last_comment = None for index, (date, page, text, author) in enumerate(comments): if not date: continue timestamp = "%04d-%02d-%02d %02d:%02d" % date[:5] body.append("%s - [%s](%s#end) \n" % ( timestamp, titles[page], page )) if index < FRESH_COUNT or date >= FRESH_AGE: text = " ".join(text.split()[:20]) + "..." m = re.search("by\s(\w+)\s+#", author) if m: text = "**" + m.group(1) + ":** " + infogamibot.escape(text) if not last_comment: last_comment = text # most recent comment body.append("> " + text + "\n\n") if bot.update("comments", None, "".join(body)) is not None: print "---", bot.site + "/comments", "UPDATED!" print print last_comment