Index: extract.py =================================================================== --- extract.py (revision 349) +++ extract.py (working copy) @@ -127,6 +127,16 @@ if i: data = getcomment(e) result.setdefault("comments", []).append(data) + elif key == "Changes:": + # annotate attachment info with dates and senders + for i, e in enumerate(td.findall("table/tr")): + if i and gettext(e[0]).strip() == "File Added": + # the file ID is in the second cell + file_id = gettext(e[1]).strip().split(":")[0] + for data in result.get("attachments", []): + if data.get("file_id") == file_id: + data["date"] = gettext(e[2]).strip() + data["sender"] = gettext(e[3]).strip() # nuke table contents for e in td.findall(".//td"): e.clear() Index: htmlload.py =================================================================== --- htmlload.py (revision 349) +++ htmlload.py (working copy) @@ -31,7 +31,7 @@ f.close() except: # FIXME: needs locking! (atomic rename should be good enough) - os.system("tidy -qmn -asxml \"%s\" >tidy.out 2>tidy.err" % file) + os.system("tidy -qmn -asxml --force-output yes \"%s\" >tidy.out 2>tidy.err" % file) f = open(file, "rb") try: elem = loader(f) # if this fails, the file was too broken Index: getindex.py =================================================================== --- getindex.py (revision 349) +++ getindex.py (working copy) @@ -31,13 +31,13 @@ urllib.urlretrieve(URL % (group, tracker, offset), "temp.html") tree = htmlload.load("temp.html") for table in tree.getiterator("table"): - if table[0].get("bgcolor") == "#ffffff": + if len(table) > 0 and table[0].get("bgcolor") == "#FFFFFF": break else: return None # no index found data = [] for row in table: - if row.get("bgcolor") == "#ffffff": + if row.get("bgcolor") == "#FFFFFF": continue # print "id", row[0].text result = dict(