# $Id$
# tracker item data extractor

import os
import re

import htmlload
import mimetools

ET = htmlload.ET
gettext = htmlload.gettext

##
# Collapses every run of whitespace in a string into a single space,
# and removes leading and trailing whitespace.
#
# @param text Text string.
# @return The normalized string.

def normalize(text):
    return " ".join(text.split())

##
# Extracts a followup comment.  The complete (stripped) text is stored
# under "description".  The lines before the first empty line are also
# scanned for "Date:", "Sender:" and "user_id=" prefixes.
#
# @param elem Element holding the comment.
# @return A dictionary with a "description" key, plus "date", "sender"
#     and "sender_user_id" keys for those prefixes that were found.

def getcomment(elem):
    text = gettext(elem).strip()
    data = dict(description=text)
    for line in text.split("\n"):
        if not line:
            break # stop scanning at the first empty line
        if line.startswith("Date:"):
            # Only get the first date.  There's at least one issue in
            # the SF tracker that has a ^Date: somewhere in the content.
            if "date" not in data:
                data["date"] = line[5:].strip()
        elif line.startswith("Sender:"):
            data["sender"] = line[7:].strip()
        elif line.startswith("user_id="):
            data["sender_user_id"] = line[8:].strip()
    return data

##
# Extracts a change note from a row element.  The first four children
# of the row are read, in this order: field, old value, date, and
# changed-by.
#
# @param elem Row element with at least four children.
# @return A dictionary with "field", "oldvalue", "date" and
#     "change_by" keys.
# @exception IndexError If the element has fewer than four children.

def getchangenote(elem):
    return {
        'field': gettext(elem[0]),
        'oldvalue': gettext(elem[1]),
        'date': gettext(elem[2]),
        'change_by': gettext(elem[3]),
        }

# maps the labels used on the tracker page to result dictionary keys
KEYMAP = {
    "Assigned To:": "assigned_to",
    "Category:": "category",
    "Changed to Closed status by:": "closed_by",
    "Changed to Deleted status by:": "deleted_by",
    "Changed to Pending status by:": "pending_by",
    "Closed as of:": "date_closed",
    "Date Last Updated:": "date_last_updated",
    "Date Submitted:": "date_submitted",
    "Deleted as of:": "date_deleted",
    "Group:": "group",
    "Last Updated By:": "last_updated_by",
    "Number of Attachments:": "number_of_attachments",
    "Number of Comments:": "number_of_comments",
    "Pending as of:": "date_pending",
    "Private:": "private",
    "Priority:": "priority",
    "Resolution:": "resolution",
    "Status:": "status",
    "Submitted By:": "submitted_by",
    "Summary:": "summary",
}

##
# Extracts information for a tracker item, based on the contents of the
# 'page' file.
#
# @param pagefile Page file name.
# @return A dictionary containing extracted information, or None if
#     the page contains no item data.

def extract(pagefile):

    tree = htmlload.load(pagefile)

    # the item data lives in the div with id="innerframe"
    frame = None
    for div in tree.getiterator("div"):
        if div.get("id") == "innerframe":
            frame = div
            break
    if frame is None:
        print("--- no data found")
        return None

    result = {}

    # the heading has the form "[ <item id> ] <title>"
    title = normalize(frame.findtext("h2", ""))
    m = re.match(r"\[ (\d+) \] (.+)$", title)
    if m:
        result["item_id"], result["title"] = m.groups()
    else:
        # FIXME: extract issue_id from filename ?
        result["title"] = title

    table = frame.find("table")
    if table is None:
        return result # no property table; only the title is available

    # locate the description
    for tr in table:
        if len(tr) == 1 and tr[0].get("colspan") == "2":
            result["description"] = _getdescription(tr)
            tr.clear()
            break

    # dig out other properties.  cells are cleared once they have been
    # processed, so nested cells are not picked up a second time.
    for td in table.findall(".//td"):
        if not len(td):
            continue # no children; nothing to extract
        tag = td[0].tag
        if tag == "b":
            # simple "label: value" property
            key = normalize(gettext(td[0]))
            if key.endswith(" (?)"):
                key = key[:-4]
            td[0].clear() # drop the label, so only the value is left
            value = normalize(gettext(td))
            try:
                result[KEYMAP[key]] = value
            except KeyError:
                # unknown label; keep it under "extra"
                result.setdefault("extra", []).append((key, value))
            td.clear()
        elif tag == "h3":
            key = gettext(td[0]).strip()
            if key == "Followups:":
                # the first cell is skipped
                for cell in td.findall("table/tr/td")[1:]:
                    result.setdefault("comments", []).append(
                        getcomment(cell)
                        )
            elif key == "Changes:":
                # the first row is skipped
                for row in td.findall("table/tr")[1:]:
                    result.setdefault("changes", []).append(
                        getchangenote(row)
                        )
            # nuke table contents
            for e in td.findall(".//td"):
                e.clear()
            td.clear()
        elif tag == "h4":
            key = gettext(td[0]).strip()
            if key == "Attached Files:":
                # the first row is skipped
                for row in td.findall("table/tr")[1:]:
                    if len(row) == 1:
                        continue # no files attached
                    result.setdefault("attachments", []).append(
                        _getattachment(pagefile, row)
                        )
            # nuke table contents
            for e in td.findall(".//td"):
                e.clear()
            td.clear()

    return result

##
# (Helper) Gets the description text from the description row.  Each
# 'br' element becomes a newline, and each original newline becomes a
# space.  The 'br' elements in the row are modified in place.
#
# @param tr Description row element.
# @return The description text, with trailing whitespace removed.

def _getdescription(tr):
    # map <br> to newlines
    for br in tr.findall(".//br"):
        br.text = chr(0) # temporarily use NULL as line terminator
        if br.tail and br.tail.startswith("\n"):
            br.tail = br.tail[1:] # strip extra newlines
    text = gettext(tr)
    if text.startswith("\n\n\t\t\t"):
        text = text[5:]
    text = text.replace("\n", " ")
    text = text.replace(chr(0), "\n")
    return text.rstrip()

##
# (Helper) Builds the attachment record for one row of the "Attached
# Files:" table.  If the download link contains a file identifier, the
# headers stored in the corresponding data file are copied into the
# record as well.
#
# @param pagefile Page file name (used to locate the data file).
# @param row Table row element; the second and third cells are read as
#     title and description, and the first child of the fourth cell is
#     read as the download link.
# @return A dictionary with "title", "description" and "link" keys,
#     plus "file_id" and any non-empty header values if the link
#     could be parsed.

def _getattachment(pagefile, row):
    href = row[3][0].get("href")
    data = dict(
        title=normalize(gettext(row[1])),
        description=normalize(gettext(row[2])),
        link=href
        )
    # a link without a href attribute simply doesn't match
    m = re.match(r"/tracker/download\.php.*file_id=(\d+)", href or "")
    if m:
        data["file_id"] = m.group(1)
        head, handle = extractdata(pagefile, m.group(1))
        try:
            for name in ("content-disposition", "content-length",
                         "content-type", "etag"):
                value = head.get(name, "").strip()
                if value:
                    data[name.replace("-", "_")] = value
        finally:
            # only the headers are needed here; don't leak the handle
            handle.close()
    return data

##
# Loads data associated with a tracker item.
#
# @param pagefile Page file name.
# @param file_id Attachment identifier (from the "attachments" list).
# @return A 2-tuple containing a Message instance and the data file
#     handle, opened in binary mode.  To get the data, call the read
#     method on the file handle.  The caller is responsible for
#     closing the handle.

def extractdata(pagefile, file_id):
    # the data file name is derived from the page file name, by
    # replacing the last 8 characters ("page.xml") with
    # "data-<file_id>.dat"
    datafile = pagefile[:-8] + "data-" + file_id + ".dat"
    handle = open(datafile, "rb")
    try:
        message = mimetools.Message(handle)
    except Exception:
        handle.close() # don't leak the handle if parsing fails
        raise
    return message, handle

##
# Lists all page files in a given tracker directory, sorted by item
# number.
#
# @param tracker Tracker directory.
# @return A sorted list of page file names.

def getpagefiles(tracker):
    match = re.compile(r"item-(\d+)-page\.xml$").match
    pages = []
    for name in os.listdir(tracker):
        m = match(name)
        if m:
            # sort on the numeric item id, not on the file name
            pages.append((int(m.group(1)), name))
    pages.sort()
    return [os.path.join(tracker, name) for _, name in pages]

# --------------------------------------------------------------------

if __name__ == "__main__":
    # sanity check
    # (only the last assignment takes effect; the other names are
    # alternative sample pages)
    TESTFILE = "tracker-105470/item-749831-page.xml"
    TESTFILE = "tracker-105470/item-1076985-page.xml"
    TESTFILE = "tracker-100103/item-526519-page.xml"
    TESTFILE = "tracker-100103/item-975768-page.xml"
    # extract returns None if the page contains no item data
    info = extract(TESTFILE) or {}
    for key, value in sorted(info.items()):
        print("%s %s" % (key, repr(value)[:60]))
        if key == "attachments" and value:
            for attachment in value:
                print("attachment")
                for k, v in sorted(attachment.items()):
                    print("  %s %s" % (k, repr(v)[:60]))
                file_id = attachment.get("file_id")
                if file_id is None:
                    continue # link had no file id; no data file to show
                message, handle = extractdata(TESTFILE, file_id)
                try:
                    print(message)
                    data = handle.read()
                finally:
                    handle.close()
                print("%d bytes %s" % (len(data), repr(data[:20])))
        elif key == "comments":
            for comment in value:
                print("comment")
                for k, v in sorted(comment.items()):
                    print("  %s %s" % (k, repr(v)[:60]))
        elif key == "description":
            print(value)