爬取鲜花网站数据
待爬取网页: https://www.haohua.com/xianhua/
代码:
import requests
from lxml import etree
import pandas as pd
from lxml import html
import xlwt
# Listing page that this script downloads.
url = "https://www.haohua.com/xianhua/"

# HTTP request headers sent with the page request.
#
# Two entries of the header set originally captured from a browser are
# deliberately changed:
#   * "cookie" is omitted. The captured value was a Bing session cookie
#     (MUID / SRCH* / USRLOC with latitude and longitude); sending it to an
#     unrelated host only leaks personal data.
#   * "accept-encoding" no longer advertises "br" and "zstd". requests only
#     decodes those when optional packages are installed, so advertising them
#     unconditionally can yield an undecodable response body.
#
# NOTE(review): "accept", "priority", "referer" and the "sec-fetch-*" values
# look like they were captured from an image sub-request rather than a page
# navigation -- confirm whether the site cares before changing them.
header = {
    "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "accept-encoding": "gzip, deflate",
    "accept-language": "zh-CN,zh;q=0.9",
    "ect": "4g",
    "priority": "i",
    "referer": "https://cn.bing.com/chrome/newtab",
    "sec-ch-ua": '"Not A(Brand";v="8", "Chromium";v="132", "Google Chrome";v="132"',
    "sec-ch-ua-arch": "x86",
    "sec-ch-ua-bitness": "64",
    "sec-ch-ua-full-version": "132.0.6834.111",
    "sec-ch-ua-full-version-list": '"Not A(Brand";v="8.0.0.0", "Chromium";v="132.0.6834.111", "Google Chrome";v="132.0.6834.111"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-model": "",
    "sec-ch-ua-platform": "Windows",
    "sec-ch-ua-platform-version": "15.0.0",
    "sec-fetch-dest": "image",
    "sec-fetch-mode": "no-cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
}
# Download the listing page.
# The timeout (seconds) keeps the script from hanging forever on a stalled
# connection; requests applies no timeout unless one is given.
response = requests.get(url=url, headers=header, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page and
# writing an empty spreadsheet.
response.raise_for_status()
# Force UTF-8 decoding of the body rather than relying on the guessed charset.
response.encoding = "utf-8"

# Parse the markup into an lxml element tree.
# NOTE: this rebinds the name `html`, which the import section also binds to
# the `lxml.html` module; from here on `html` is the parsed document root.
html = etree.HTML(response.text)
# Column accumulators. Position k in each filled list refers to the same
# product card, so the lists can be used directly as table columns.
fresh_flowers = []
popularity = []
original_price = []
now_price = []  # never filled in this section; kept so the name stays defined

# Walk the product cards one at a time instead of running separate queries
# for names and prices: every card then contributes exactly one entry to each
# column, so the columns cannot drift out of alignment or differ in length.
for card in html.xpath('//a[@class = "info imghover"]'):
    title = card.find("h5")
    # Use the first direct <p> child that has at least three children, so the
    # positional lookups below cannot raise IndexError on a shorter <p>.
    info = next((p for p in card.findall("p") if len(p) > 2), None)

    # A missing element is recorded as None so the row is still emitted.
    fresh_flowers.append(title.text if title is not None else None)
    # Second child of the <p> feeds the price column, third child the
    # popularity column.
    original_price.append(info[1].text if info is not None else None)
    popularity.append(info[2].text if info is not None else None)
# The three column lists gathered in column order.
datalist = [fresh_flowers, original_price, popularity]

# Map each column name to its list of values.
data = dict(zip(("fresh_flowers", "original_price", "popularity"), datalist))

# Build the table and save it as an Excel workbook, omitting the index column.
df = pd.DataFrame(data)
df.to_excel("xianhua_data.xlsx", index=False)
print("数据已成功写入Excel文件")
结果文件: xianhua_data.xlsx