XPath crawler
What is XPath
XPath is a language for locating content in XML documents.
HTML is structurally close to XML (strictly speaking it is not a subset of XML, but lxml smooths over the differences), so XPath works on HTML pages too.
Implementation
The work is mainly done by the etree module of the lxml library.
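etree exposes one entry point per usage shown below; a quick orientation sketch (all three functions are part of lxml's public API; the parse line assumes a b.html file exists on disk, as in usage 2):
from lxml import etree

tree = etree.XML("<book><id>1</id></book>")  # parse an XML string (strict, must be well-formed)
tree = etree.HTML("<p>hi</p>")               # parse an HTML string (tolerant of sloppy markup)
tree = etree.parse("b.html")                 # parse a file on disk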
Demo code
Usage 1: XML
from lxml import etree
xml = """
<book>
<id>1</id> <name> <nick id="10086">2112</nick> <nick id="10010">4555</nick> <nick class="joy">fgbgn</nick> <nick class="jolin">goog</nick> <dir> <nick class="ksj">gogofff</nick> <dir> <nick class="ksj">ggogo</nick> </dir> </dir> </name></book>
"""
tree = etree.XML(xml)
# rer = tree.xpath("/book")                          # / marks one level of the hierarchy
# rerr = tree.xpath("/book/id")                      # each extra / steps one level deeper
# result = tree.xpath("/book/id/text()")             # text() pulls the text inside the id node
# result = tree.xpath("/book/name/nick/text()")      # text of every nick directly under name
# result = tree.xpath("/book/name/dir/nick/text()")  # text of the nick nodes inside dir
# result = tree.xpath("/book/name//nick/text()")     # // means descendants: every nick anywhere under name
result = tree.xpath("/book/name/*/nick/text()")      # * matches any node, but only one level deep
print(result)
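The sample XML also carries id and class attributes, which the lines above never query; the predicate syntax deserves a quick sketch (these expressions are additions, run against the same tree built above):
# [@attr='value'] filters nodes by attribute; a trailing @attr extracts the attribute value itself
print(tree.xpath("/book/name/nick[@id='10086']/text()"))  # ['2112']
print(tree.xpath("//nick[@class='joy']/text()"))          # ['fgbgn']
print(tree.xpath("/book/name/nick/@id"))                  # ['10086', '10010']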
Usage 2: parse
from lxml import etree
tree = etree.parse("b.html")
result = tree.xpath("/html/a[@href='dabao']/text()")  # text of the <a> whose href equals 'dabao'
result = tree.xpath("/html/a/@href")                  # the href values themselves, e.g. 'dabao'
ol_li_list = tree.xpath("/html/a")
for li in ol_li_list:
    result = li.xpath("./o/text()")    # ./ is relative: the o nodes under this <a>, i.e. /html/a/o
    result2 = li.xpath("./o/@href")    # the href attribute of each of those /html/a/o nodes
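The notes never show what b.html contains; here is a hypothetical file consistent with the expressions above (the structure is an assumption, written out only so the demo can run):
# writes a made-up b.html matching the XPath expressions in usage 2
html = """<html>
    <a href="dabao">big cannon
        <o href="x1">one</o>
        <o href="x2">two</o>
    </a>
</html>"""
with open("b.html", "w", encoding="utf-8") as f:
    f.write(html)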
Usage 3: HTML
newstart
import requests
from lxml import etree
url ='http://eci-2zedc18yc0kv02b4wfp5.cloudeci1.ichunqiu.com/start'
url1='http://eci-2zedc18yc0kv02b4wfp5.cloudeci1.ichunqiu.com/submit'
s = requests.session()
a = s.get(url).content
tree = etree.HTML(a)
titles = tree.xpath('//p/text()')  # grab the text of every <p> on the page
result = "".join(titles)
data = {
    "user_input": result           # submit the joined text as one string
}
d = s.post(url1, data=data)
print(d.text)
Just a few lines and it's done; it is quicker to write than re or bs4, though probably not as flexible as re.
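For contrast, the same grab-every-<p> step written with re, over stand-in markup (a deliberately naive pattern, shown only to compare how much hand-tuning each approach needs):
import re

html = "<div><p>part1</p><p>part2</p></div>"  # stand-in for the page body
titles = re.findall(r"<p>(.*?)</p>", html)    # the //p/text() equivalent for this simple markup
print("".join(titles))                        # part1part2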
Practice: scraping blog info
The original plan was to scrape zhubajie.com, but it now has a second verification step, which makes it hard to scrape.
So a friend's blog is borrowed instead (permission was obtained; do not imitate).
import requests
from lxml import etree
url = 'https://noobxiaomeng.top/'
rspons = requests.get(url)
rel = rspons.content
tree = etree.HTML(rel)
rell = tree.xpath("//div/h6/text()")  # grab the personal-info lines
for i in rell:
    print(i)
relll = tree.xpath("//header/a[@class='post-title']/text()")  # article titles
rellll = tree.xpath("//header/a[@class='post-title']/@href")  # article links
for i, j in zip(relll, rellll):
    print(i + ":" + j + '\n')
Comparison with bs4: scraping an image gallery
bs4:
import requests
from bs4 import BeautifulSoup
url = "https://www.umei.cc/bizhitupian/weimeibizhi/"
resp = requests.get(url=url)
resp.encoding = 'utf-8'
re = BeautifulSoup(resp.text, "html.parser")
ac = re.find("div", class_="Clbc_r_cont").find_all("a")
for a in ac:
    href = a.get("href")
    print(href)
    url2 = "https://www.umei.cc/" + href
    page_re = requests.get(url2)
    page_re.encoding = 'utf-8'
    page_text = page_re.text
    page = BeautifulSoup(page_text, "html.parser")
    p = page.find("div", class_="big-pic")
    img = p.find("img")
    src = img.get("src")
    # download
    img_re = requests.get(src)
    img_name = src.split("/")[-1]
    with open(img_name, mode="wb") as f:
        f.write(img_re.content)  # save the image
    print("ob")
print("over")
import requests
from lxml import etree
url = 'https://www.umei.cc/katongdongman/dongmanbizhi/index_2.htm'  # page 1 has no index suffix, so numbering starts at index_2; looping over multiple listing pages is possible too (see the sketch at the end), not done here
re1 = requests.get(url).content
tree1 = etree.HTML(re1)
tll = tree1.xpath('//div[@class="btns"]/a/@href')  # links to the detail pages
for i in tll:
    url2 = "https://www.umei.cc" + i
    re2 = requests.get(url2).content
    tree2 = etree.HTML(re2)
    tlll = tree2.xpath('//div[@class="big-pic"]//img/@title')  # image title
    tllll = tree2.xpath('//div[@class="big-pic"]//img/@src')   # image link
    img_name = "".join(tlll) + '.jpg'
    url3 = "".join(tllll)
    img_re = requests.get(url=url3)
    with open(img_name, mode="wb") as f:
        f.write(img_re.content)  # save the image
    print("ob")
print('over')
As you can see, this version is simpler than the bs4 one.
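The comment in the code above mentions that multiple listing pages could also be traversed; a minimal sketch of how that might look, assuming the index_N.htm pattern holds (page 1 has no suffix, as noted; stopping at three pages is arbitrary):
import requests
from lxml import etree

base = 'https://www.umei.cc/katongdongman/dongmanbizhi/'
for page in range(1, 4):  # first three listing pages, as an example
    url = base if page == 1 else base + 'index_%d.htm' % page
    tree = etree.HTML(requests.get(url).content)
    links = tree.xpath('//div[@class="btns"]/a/@href')  # detail-page links on this listing page
    print(url, len(links))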