【python】爬虫
下载与批量下载
import requests
# Third-party library — install it first if missing: pip install requests
# Download a single image with a crawler-style GET request.
res = requests.get("url")  # placeholder — replace "url" with the real image URL
print(res.content)  # res.content is the raw binary byte stream of the response
# Write the bytes to a file ("wb" = write binary mode).
with open("beauty.jpg", "wb") as f:
    f.write(res.content)
# Batch-download short videos from a list endpoint.
headers = {...}  # placeholder — copy the real request headers from browser devtools
params = {...}   # placeholder — copy the real query parameters
res = requests.get("url/post/", params=params, headers=headers)
print(res.json())
# Keep only the fields we need from the JSON payload.
data = res.json().get(key)  # TODO: `key` is undefined — set it to the JSON field that holds the item list
# Build one (title, url) pair per item (the original comprehension read from
# `data` instead of `item`, and two comma-separated expressions in a list
# comprehension are a syntax error — a tuple is needed).
# NOTE(review): 'url_list' looks like a LIST of candidate URLs — confirm, and
# pick one element before passing it to requests.get.
video_urls = [
    (item.get("desc"), item.get('video').get('play_addr').get('url_list'))
    for item in data
]
for title, url in video_urls:
    res = requests.get(url)
    with open(f"./document/{title}.mp4", "wb") as f:
        f.write(res.content)  # fixed typo: was res.contnt
    print('ok')
获取url的小技巧:
点击下载,查看网络中的请求GET的url拿过来即可
使用 cURL 转代码的在线转换网站,把复制出来的 cURL 命令转换成现成的爬虫代码
openpyxl库
读写操作excel文件(.xlsx格式)
pip install openpyxl
import openpyxl

# Open an existing workbook file (.xlsx).
wb = openpyxl.load_workbook('name.xlsx')
# Select a worksheet by its tab name.
ws = wb['Sheet']
# Read the value of a single cell.
cell_value = ws['A1'].value
# Write a value into a cell.
ws['C2'].value = '666'
# Persist the changes back to disk.
wb.save('name.xlsx')
# Create a brand-new workbook from scratch.
workbook = openpyxl.Workbook()
sheet = workbook.active  # grab the default worksheet
sheet.append([1, 2, 3, 4, 5, 7, 6, 8, 9])  # append one row of data
for i in range(100):  # append many rows (loop body must be indented)
    sheet.append([i, i + 1, 2, 3, 4, 5, 7, 6, 8, 9])
# Save the new workbook to disk.
workbook.save('new_name.xlsx')
实战:爬数据写入excel
检查->网络->创建一个请求->复制cURL
找一个返回 JSON 数据接口的目标网站
import requests
import openpyxl
# Placeholders — copy the real values from browser devtools ("copy as cURL").
cookies = {...}
headers = {...}
response = requests.get(
    'url?r=0&count=10&categories=%70....',
    cookies=cookies,
    headers=headers,
)
# The URL often carries count=10 — raise it to fetch more items per request.
print(response.text)
print(response.json())
data = response.json().get("items")
# Collect the fields we care about; the original loop computed title/count but
# discarded them, leaving nothing to write to Excel.
rows = []
for item in data:
    if item.get('type') == 'a':
        title = item.get("title")
        count = item.get("rating").get("count")
        rows.append([title, count])
# Save to Excel: create a new workbook.
workbook = openpyxl.Workbook()
sheet = workbook.active  # grab the default worksheet
for row in rows:  # write the collected data one row at a time
    sheet.append(row)
# Save the workbook to disk.
workbook.save('new_name.xlsx')