# $Id$
"""Parse HTML with BeautifulSoup and rebuild it as an ElementTree tree."""

import htmlentitydefs
import re

import BeautifulSoup as BS
import elementtree.ElementTree as ET

# Matches named entity references such as "&amp;" and captures the name.
# Numeric references ("&#38;") contain "#" and are deliberately not matched.
pattern = re.compile(r"&(\w+);")


def unescape(string):
    """Replace known named HTML entities in *string* with their characters.

    Works around oddities in BeautifulSoup's entity handling.  Entity names
    that are not in ``htmlentitydefs.name2codepoint`` are left untouched.

    The replacement characters are unicode, so *string* is expected to be
    unicode text (presumably what BeautifulSoup hands out -- verify for
    other callers).
    """
    def unescape_entity(m, codepoints=htmlentitydefs.name2codepoint):
        # name2codepoint is used instead of entitydefs: entitydefs holds
        # Latin-1 byte strings and, for characters outside Latin-1,
        # "&#NNNN;" references, neither of which is usable as unescaped
        # unicode text.
        try:
            return unichr(codepoints[m.group(1)])
        except KeyError:
            return m.group(0)  # unknown entity name: use as is
    return pattern.sub(unescape_entity, string)


def load(data):
    """Parse *data* with BeautifulSoup and return it as an ElementTree tree.

    Every node of the soup is replayed into an ``ET.TreeBuilder``: strings
    become character data, tags become elements carrying their (unescaped)
    attributes.  Returns the result of ``TreeBuilder.close()``.
    """
    def emit(soup):
        # Recursively feed one soup node, and its children, to the builder.
        if isinstance(soup, BS.NavigableString):
            bob.data(unescape(soup))
        else:
            bob.start(
                soup.name,
                dict((k, unescape(v)) for k, v in soup.attrs),
            )
            for s in soup:
                emit(s)
            bob.end(soup.name)

    bob = ET.TreeBuilder()
    for s in BS.BeautifulSoup(data, convertEntities="html"):
        emit(s)
    return bob.close()


if __name__ == "__main__":
    # NOTE(review): the sample document looks like it lost its markup in
    # this copy of the file; the visible text is preserved exactly as found.
    ET.dump(load("\n\ntext!&<stuff>\n\ntail"))