# $Id$
# text extraction demo
#
# Compares three ways of collecting the character data of an element
# subtree (an element's own text, plus the text and tails of everything
# nested inside it) into one string.

try:
    # Prefer the standalone C accelerator when it is installed.
    import cElementTree as ET
except ImportError:
    # Otherwise use the ElementTree implementation from the standard library.
    import xml.etree.ElementTree as ET
# import elementtree.ElementTree as ET


#
# original implementation

def flatten(elem, include_tail=0):
    """Return the text of *elem* and all of its descendants as one string.

    elem -- the element whose subtree is flattened.
    include_tail -- when true, the text following *elem*'s end tag
        (``elem.tail``) is appended as well.  The recursive calls enable
        this for every child, so tails inside the subtree are always kept;
        the default (0) only drops the tail of the outermost element.
    """
    text = elem.text or ""
    for e in elem:
        text += flatten(e, 1)
    if include_tail and elem.tail:
        text += elem.tail
    return text


#
# alternate implementation, from
# http://effbot.org/zone/element-bits-and-pieces.htm
# appears to be a little bit (10-20%) faster

# rename to inner_text instead ?

def gettext(elem):
    """Return the text of *elem* and all of its descendants as one string.

    The tail of each child is appended by the parent's loop, so the tail
    of *elem* itself is never part of the result.
    """
    text = elem.text or ""
    for e in elem:
        text += gettext(e)
        if e.tail:
            text += e.tail
    return text


#
# alternate implementation, using string.join pattern.  this is a lot
# slower (not sure why)

def gettext_join(elem):
    """Return the same string as gettext(), built with a list and join.

    Text fragments are appended to a shared list in document order by a
    nested recursive helper, and joined once at the end.
    """
    data = []
    append = data.append  # bound once; reused by every recursive call

    def process(elem):
        # Emit this element's own text, then each child's subtree
        # followed by that child's tail.
        if elem.text:
            append(elem.text)
        for e in elem:
            process(e)
            if e.tail:
                append(e.tail)

    process(elem)
    return "".join(data)


# --------------------------------------------------------------------
# demo

# NOTE(review): the element markup of this sample was missing (only the
# character data "text texttail" was left), which ET.XML cannot parse.
# The tags below are a minimal well-formed reconstruction with the same
# character data -- confirm against the original demo document.
DOC2 = """\
<doc>text <b>text</b>tail</doc>
"""

tree = ET.XML(DOC2)


def test1():
    """Run flatten() on the demo tree."""
    return flatten(tree)


def test2():
    """Run gettext() on the demo tree."""
    return gettext(tree)


def test3():
    """Run gettext_join() on the demo tree."""
    return gettext_join(tree)


# Single-argument call form: prints the same text under both the Python 2
# print statement and the Python 3 print function.
print(repr(test1()))
print(repr(test2()))
print(repr(test3()))