# $Id$ # get element list from HTML specification from urllib import urlopen from ElementSoup import ET, parse import keyword ELEMENTS = "http://www.w3.org/TR/html4/index/elements.html" ATTRIBUTES = "http://www.w3.org/TR/html4/index/attributes.html" def gettext(elem): text = elem.text or "" for e in elem: text += gettext(e) if e.tail: text += e.tail return text empty = [] print "from builder import E, ET" print print "# elements" for row in parse(urlopen(ELEMENTS)).findall(".//tr"): d = {} for cell in row: d[cell.get("title", "").strip()] = gettext(cell).strip() name = d.get("Name") if not name: continue name = name.lower().encode("ascii") description = " ".join(d.get("Description").split()) if keyword.iskeyword(name): print name.upper(), "=", "getattr(E, %r)" % name, else: print name.upper(), "=", "E." + name, print "#", description, if d.get("Depr."): print "(DEPRECATED)", print if d.get("Empty"): empty.append(name) print print "# attributes (only reserved words are included here)" print "ATTR = dict" last = None for row in parse(urlopen(ATTRIBUTES)).findall(".//tr"): d = {} for cell in row: d[cell.get("title", "").strip()] = gettext(cell).strip() name = d.get("Name") if not name: continue name = name.lower().encode("ascii") if name == last: continue if not keyword.iskeyword(name): continue print "def %s(v): return {%r: v}" % (name.upper(), name) last = name print print "# elements that don't have an end tag" print "empty", "=", empty