# $Id$
# tracker item data extractor
import os, re
import htmlload
import mimetools
ET = htmlload.ET
##
# Collects the character data of an element and all of its descendants,
# in document order.  The element's own tail is not included; the tails
# of its descendants are.
#
# @param elem Element to read.
# @return The concatenated text ("" if the element holds no text).
def gettext(elem):
    pieces = [elem.text or ""]
    for child in elem:
        # the recursive call covers the child's text and subtree; the
        # text following the child's end tag has to be added separately
        pieces.append(gettext(child))
        if child.tail:
            pieces.append(child.tail)
    return "".join(pieces)
##
# Collapses every run of whitespace in a string to a single space and
# drops leading and trailing whitespace.
#
# @param text String to normalize.
# @return The normalized string.
def normalize(text):
    words = text.split()
    return " ".join(words)
##
# Extracts a followup comment from a table cell.
#
# The stripped text of the cell is stored as "description".  The lines
# before the first empty line are scanned for "Date:", "Sender:" and
# "user_id=" prefixes, which are stored as "date", "sender" and
# "sender_user_id" respectively.
#
# @param elem Element holding the comment.
# @return A dictionary with "description" and any header values found.
def getcomment(elem):
    text = gettext(elem).strip()
    data = dict(description=text)
    for line in text.split("\n"):
        if not line:
            break  # the first empty line ends the header scan
        if line.startswith("Date:") and "date" not in data:
            # Only get the first date.  There's at least one issue in
            # the SF tracker that has a ^Date: somewhere in the content.
            data["date"] = line[5:].strip()
        elif line.startswith("Sender:"):
            data["sender"] = line[7:].strip()
        elif line.startswith("user_id="):
            data["sender_user_id"] = line[8:].strip()
    return data
##
# Extracts a change note from a row of the "Changes:" table.
#
# @param elem Row element; its first four children are read, in order,
#     as the field name, the old value, the date and the author of the
#     change.
# @return A dictionary with the keys "field", "oldvalue", "date" and
#     "change_by".
def getchangenote(elem):
    # index the element directly; Element.getchildren() is deprecated
    return {
        'field': gettext(elem[0]),
        'oldvalue': gettext(elem[1]),
        'date': gettext(elem[2]),
        'change_by': gettext(elem[3]),
    }
# Maps the field labels found on a tracker item page (the text of the
# <b> element that starts a table cell) to the keys used in the
# dictionary returned by extract().  Labels that are not listed here
# are collected by extract() under the "extra" key.
KEYMAP = {
    "Assigned To:": "assigned_to",
    "Category:": "category",
    "Changed to Closed status by:": "closed_by",
    "Changed to Deleted status by:": "deleted_by",
    "Changed to Pending status by:": "pending_by",
    "Closed as of:": "date_closed",
    "Date Last Updated:": "date_last_updated",
    "Date Submitted:": "date_submitted",
    "Deleted as of:": "date_deleted",
    "Group:": "group",
    "Last Updated By:": "last_updated_by",
    "Number of Attachments:": "number_of_attachments",
    "Number of Comments:": "number_of_comments",
    "Pending as of:": "date_pending",
    "Priority:": "priority",
    "Resolution:": "resolution",
    "Status:": "status",
    "Submitted By:": "submitted_by",
    "Summary:": "summary",
    }
##
# Extracts information for a tracker item, based on the contents of the
# 'page' file.
#
# The result may contain "item_id", "title", "description", the keys
# listed in KEYMAP, "extra" (a list of (label, value) pairs for
# unknown labels), "comments", "changes" and "attachments".  For every
# attachment whose link carries a file_id, the matching data file is
# opened to copy a few of its headers.
#
# Note that the parsed tree is modified (cells are cleared) as the
# data is extracted.
#
# @param pagefile Page file name.
# @return A dictionary containing extracted information.
# @exception SystemExit If the page has no <div id="innerframe">.
def extract(pagefile):

    tree = htmlload.load(pagefile)

    for elem in tree.getiterator("div"):
        if elem.get("id") == "innerframe":
            break
    else:
        print("--- no data found")
        # a bare "exit" is a no-op expression; stop for real instead of
        # carrying on with whatever <div> happened to come last
        raise SystemExit(1)

    result = {}

    title = normalize(elem.findtext("h2", ""))

    m = re.match(r"\[ (\d+) \] (.+)$", title)
    if m:
        result["item_id"], result["title"] = m.groups()
    else:
        # FIXME: extract issue_id from filename ?
        result["title"] = title

    table = elem.find("table")

    # locate the description: it is the third row that consists of a
    # single cell spanning two columns
    i = 0
    for tr in table:
        if len(tr) == 1 and tr[0].get("colspan") == "2":
            if i == 2:
                # map <br> to newlines
                for br in tr.findall(".//br"):
                    br.text = chr(0) # temporarily use NULL as line terminator
                    if br.tail and br.tail.startswith("\n"):
                        br.tail = br.tail[1:] # strip extra newlines
                text = gettext(tr)
                if text.startswith("\n"):
                    text = text[1:]
                text = text.replace("\n", " ")
                text = text.replace(chr(0), "\n")
                text = text.rstrip()
                result["description"] = text
                tr.clear()
                break
            i += 1

    # dig out other properties
    for td in table.findall(".//td"):
        # cells cleared by an earlier iteration have no children left
        # and are skipped here (explicit len(); the truth value of an
        # Element is deprecated)
        if not len(td):
            continue
        tag = td[0].tag
        if tag == "b":
            key = normalize(gettext(td[0]))
            if key.endswith(" (?)"):
                key = key[:-4]
            # drop the label so that only the value is left in the cell
            td[0].clear()
            value = normalize(gettext(td))
            try:
                result[KEYMAP[key]] = value
            except KeyError:
                result.setdefault("extra", []).append((key, value))
            td.clear()
        elif tag == "h3":
            key = gettext(td[0]).strip()
            if key == "Followups:":
                # the first cell is skipped
                for cell in td.findall("table/tr/td")[1:]:
                    data = getcomment(cell)
                    result.setdefault("comments", []).append(data)
            elif key == "Changes:":
                # the first row is skipped
                for row in td.findall("table/tr")[1:]:
                    data = getchangenote(row)
                    result.setdefault("changes", []).append(data)
            # nuke table contents
            for e in td.findall(".//td"):
                e.clear()
            td.clear()
        elif tag == "h4":
            key = gettext(td[0]).strip()
            if key == "Attached Files:":
                for i, e in enumerate(td.findall("table/tr")):
                    if len(e) == 1:
                        continue # no files attached
                    if i:
                        href = e[3][0].get("href")
                        data = dict(
                            title=normalize(gettext(e[1])),
                            description=normalize(gettext(e[2])),
                            link=href
                            )
                        m = re.match(
                            r"/tracker/download\.php.*file_id=(\d+)", href
                            )
                        if m:
                            data["file_id"] = m.group(1)
                            head, datafile = extractdata(pagefile, m.group(1))
                            try:
                                # copy selected headers of the data file
                                for name in ("content-disposition",
                                             "content-length",
                                             "content-type",
                                             "etag"):
                                    value = head.get(name, "").strip()
                                    if value:
                                        data[name.replace("-", "_")] = value
                            finally:
                                # only the headers are needed here
                                datafile.close()
                        result.setdefault("attachments", []).append(data)
            # nuke table contents
            for e in td.findall(".//td"):
                e.clear()
            td.clear()

    return result
##
# Loads data associated with a tracker item.
#
# The data file name is derived from the page file name by replacing
# its last eight characters with "data-<file_id>.dat".
#
# @param pagefile Page file name.
# @param file_id Attachment identifier (from the "attachments" list).
# @return A 2-tuple containing a Message instance and the open data
#     file (binary mode).  To get the data, call the read method on
#     the file handle.  The caller is responsible for closing it.
def extractdata(pagefile, file_id):
    datafile = pagefile[:-8] + "data-" + file_id + ".dat"
    handle = open(datafile, "rb")
    try:
        # NOTE(review): the Message is built from the open file, so the
        # handle is presumably left positioned after the headers --
        # confirm against the mimetools documentation
        message = mimetools.Message(handle)
    except Exception:
        # don't leak the handle if the headers cannot be parsed
        handle.close()
        raise
    return message, handle
##
# Lists all page files in a given tracker directory, sorted by item
# number.
#
# Only names of the form "item-<digits>-page.xml" are included.
#
# @param tracker Tracker directory.
# @return A sorted list of page file names, each joined with the
#     tracker directory.
def getpagefiles(tracker):
    # raw string, and an escaped dot so that only a literal
    # "page.xml" suffix is accepted
    match = re.compile(r"item-(\d+)-page\.xml$").match
    numbered = []
    for name in os.listdir(tracker):
        m = match(name)
        if m:
            numbered.append((int(m.group(1)), name))
    # sort on the integer item number, not on the file name
    numbered.sort()
    return [os.path.join(tracker, name) for _, name in numbered]
# --------------------------------------------------------------------

if __name__ == "__main__":

    # sanity check: dump everything extracted from one sample page
    # (another sample: "tracker-105470/item-749831-page.xml")
    TESTFILE = "tracker-105470/item-1076985-page.xml"

    info = extract(TESTFILE)

    for key, value in sorted(info.items()):
        print("%s %s" % (key, repr(value)[:60]))
        if key == "attachments" and value:
            for attachment in value:
                print("attachment")
                for akey, avalue in sorted(attachment.items()):
                    print("  %s %s" % (akey, repr(avalue)[:60]))
                file_id = attachment.get("file_id")
                if file_id is None:
                    # extract() only sets "file_id" when the download
                    # link carries one; nothing to load otherwise
                    continue
                message, datafile = extractdata(TESTFILE, file_id)
                try:
                    print(message)
                    data = datafile.read()
                finally:
                    datafile.close()
                print("%d bytes %s" % (len(data), repr(data[:20])))
        elif key == "comments":
            for comment in value:
                print("comment")
                for ckey, cvalue in sorted(comment.items()):
                    print("  %s %s" % (ckey, repr(cvalue)[:60]))
        elif key == "description":
            print(value)