# $Id$
# stuff to manipulate (X)HTML block structures

try:
    # Preferred when installed: the original stand-alone C implementation.
    import cElementTree as ET
except ImportError:
    # Modern Pythons do not ship cElementTree; the standard library module
    # provides the same Element API used below.
    import xml.etree.ElementTree as ET

BLOCK_TAGS = (
    "p", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "pre", "dl", "div",
    "noscript", "blockquote", "form", "hr", "table", "fieldset", "address"
)

try:
    # A set gives constant-time membership tests in isblock().
    BLOCK_TAGS = set(BLOCK_TAGS)
except NameError:
    # Interpreters without a builtin set keep using the tuple.
    pass


##
# Checks if an element is a block-level element, i.e. if its tag is
# listed in BLOCK_TAGS.
#
# @param elem Element to check.
# @return A true value if the element tag is a block tag.

def isblock(elem):
    return elem.tag in BLOCK_TAGS


# Return a list snapshot of elem and all its descendants, in document order.
def _descendants(elem):
    # Element.getiterator() is missing from newer ElementTree versions and
    # Element.iter() is missing from very old ones, so use whichever exists.
    walker = getattr(elem, "iter", None) or elem.getiterator
    # Materialize the walk up front: fix_tail() rewrites child lists, and
    # that must not interfere with a lazy tree iterator.
    return list(walker())


##
# Fixes tail block texts.  This function looks for block elements with
# trailing text contents, and moves that text (and following non-block
# elements) to a new block element.  Tails that consist of whitespace
# only are left alone.  The tree is modified in place.
#
# @param elem Root of the (sub)tree to process.
# @param tag Tag name used for the inserted block elements.

def fix_tail(elem, tag="p"):
    for parent in _descendants(elem):
        children = []  # the new child list for this parent
        changed = False
        i = 0
        while i < len(parent):
            child = parent[i]
            children.append(child)
            i += 1
            if not (isblock(child) and child.tail and child.tail.strip()):
                continue
            # found block element with tail text; insert new element
            wrapper = ET.Element(tag)
            wrapper.text = child.tail
            child.tail = None
            children.append(wrapper)
            changed = True
            # move subsequent non-blocks to this element; they are left out
            # of the new child list, so they end up only inside the wrapper
            while i < len(parent) and not isblock(parent[i]):
                wrapper.append(parent[i])
                i += 1
        if changed:
            # only touch parents that actually got a new element
            parent[:] = children

# --------------------------------------------------------------------
#

##
# Demo helper.  Parses an XML string, applies fix_tail to it, and prints
# the serialized result.
#
# @param text XML source text.

def test(text):
    elem = ET.XML(text)
    fix_tail(elem)
    out = ET.tostring(elem)
    if not isinstance(out, str):
        # tostring() returned bytes (default us-ascii encoding); decode so
        # the markup itself is printed rather than a bytes repr
        out = out.decode("ascii")
    print(out)

# This is a demo helper, not a unit test; keep pytest/nose from trying to
# collect it because of its name.
test.__test__ = False

# NOTE(review): the module ends by calling test() on a sample document, but
# the string literal is cut off in this copy (only its tag-stripped text
# survives on the lines that follow), so the call is left disabled here
# until the original markup is restored.
# test("
one
twotwo
three