from urllib2 import *
from lxml import etree
# Fetch the page source. `urlopen` comes from the file-level
# `from urllib2 import *`.
res = urlopen("http://www.baidu.com")
try:
    content = res.read()
finally:
    # Release the connection even if the read fails.
    res.close()

# Parse the markup and extract data with XPath.
html = etree.HTML(content)
# Alternative query that selects the page title text instead:
#   "/html/head/title/text()"
xpath = "//*[@id=\"s_tab\"]/div/a"
arr = html.xpath(xpath)

# For every matched anchor print three lines: its `.text`, its first direct
# text node, and its href. Single-argument print(...) produces the same
# output under the Python 2 print statement and also parses on Python 3.
for a in arr:
    print(a.text)

    # xpath() returns a list that is empty when the anchor has no direct text
    # node or no href attribute; indexing [0] blindly would raise IndexError,
    # so fall back to an empty string to keep three output lines per anchor.
    texts = a.xpath("text()")
    print(texts[0] if texts else "")

    hrefs = a.xpath("@href")
    print(hrefs[0] if hrefs else "")