# $Id$
# get inner text from HTML element

try:
    # Legacy standalone accelerator package; preferred when it is installed.
    import cElementTree as ET
except ImportError:
    # Standard-library implementation exposing the same Element API.
    import xml.etree.ElementTree as ET

try:
    # Python 2: basestring covers both str and unicode text fragments.
    _STRING_TYPES = basestring
except NameError:
    # Python 3: basestring is gone and all text is str.
    _STRING_TYPES = str

##
# Extracts inner text from an HTML element.  This version attempts to
# handle newlines according to the standard HTML rule, where newlines
# just after a start tag, and just before an end tag, are ignored.
# <p>
# Only a single newline character is dropped at each such position;
# all other text (including the tails of child elements) is kept as is.
#
# @param elem HTML element
# @return The text of the element and all its descendants, joined
#     into a single string.

def html_inner_text(elem):
    # brute-force solution

    # Unique marker objects for tag boundaries.  They are compared by
    # identity, so they can never be mistaken for a text fragment.
    start = object()
    end = object()

    # 1) synthesize parser event stream
    def add_events(node, events):
        events.append(start)
        if node.text:
            events.append(node.text)
        for child in node:
            add_events(child, events)
            if child.tail:
                events.append(child.tail)
        events.append(end)

    events = []
    add_events(elem, events)

    # 2) clean up text fragments
    # The stream always begins with a start marker and finishes with an
    # end marker, and two text fragments are never adjacent, so every
    # fragment has a marker on both sides and i-1 / i+1 are always valid.
    fragments = []
    for i, event in enumerate(events):
        if not isinstance(event, _STRING_TYPES):
            continue
        text = event
        if events[i - 1] is start and text.startswith("\n"):
            text = text[1:]  # newline just after a start tag
        if events[i + 1] is end and text.endswith("\n"):
            text = text[:-1]  # newline just before an end tag
        fragments.append(text)

    # return inner text
    return "".join(fragments)

# --------------------------------------------------------------------
# demo

# ET.XML() needs well-formed markup with a single root element.  The
# newlines after <p> and before </p> are the ones the function drops.
HTML = """\
<p>
some <b>bold</b> text
</p>
"""

if __name__ == "__main__":
    # Run the demo only when executed as a script, so that importing this
    # module neither parses the sample nor prints anything.
    tree = ET.XML(HTML)

    def test1():
        return html_inner_text(tree)

    print(repr(test1()))