下面是使用 lxml(以及好用的 requests)的实现方法:
import requests
import lxml.html as lh
from BeautifulSoup import UnicodeDammit

URL = "http://en.wikipedia.org/w/index.php?title=data_mining&printable=yes"
HEADERS = {'User-agent': 'Mozilla/5.0'}


def lhget(*args, **kwargs):
    """Fetch a page and return it parsed as an lxml.html element tree.

    All positional and keyword arguments are forwarded unchanged to
    ``requests.get`` (e.g. the URL and ``headers=``).
    """
    r = requests.get(*args, **kwargs)
    # Hand the raw bytes to UnicodeDammit so it decides how to decode them,
    # then parse the resulting unicode text.
    html = UnicodeDammit(r.content).unicode
    return lh.fromstring(html)


def remove(el):
    """Detach ``el`` (and its children) from the tree it belongs to.

    Uses lxml.html's ``drop_tree()`` rather than ``getparent().remove(el)``:
    the latter also throws away the element's tail text, which would delete
    the sentence text that follows an inline element such as ``<sup>``.
    """
    el.drop_tree()


tree = lhget(URL, headers=HEADERS)

# NOTE(review): the attribute predicates were lost in the original text
# ("[@]" is not valid XPath). The class names used below are assumed to be
# the MediaWiki ones -- confirm against the live page markup.
el = tree.xpath("//div[@class='mw-content-ltr']/p")[0]

# The leading "." keeps the search relative to ``el``; a bare "//" would scan
# the whole document. ``xpath`` returns a list, so removing while looping is
# safe.
for ref in el.xpath(".//sup[@class='reference']"):
    remove(ref)

# Single-argument print(...) behaves the same as the print statement on
# Python 2 and also parses on Python 3.
print(lh.tostring(el, pretty_print=True))
print(el.text_content())


