爬虫案例学习6
获取淘宝商品数据2024-12-18
参考学习:
大佬博客
视频教程
通过搜索发现,商品数据是页面通过接口请求动态加载的,并不存在于页面的静态源代码中
所以我们需要请求这个接口获取数据:比如标题,价格,图片等信息
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/
但是我们直接发请求,携带上参数,无法获取到数据,会返回非法请求的字样。
因为有个参数sign是加密的,我们需要逆向
逆向参数获取sign
sign参数:看起来是若干参数拼接后经过哈希算法生成的32位小写十六进制字符串。
具体的需要查看对应的js
点击main.js
搜索 sign: 相关的代码,进行分析
eT = eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
这一行就是生成sign
点击断点,可以查看变量的值
不过这里推荐打断点的时机,先鼠标滚动到下面的页码处,再接着打断点,点击下一页,此时进入js源码的参数才是正确的。
如果不这样做,鼠标滚轮下滑也进入了断点,ep.data的值不是我们需要的,需要放行很多次。
切换到控制台,输出这些值,等一下在python代码中需要使用,这里先记录一下
eE(em.token + "&" + eC + "&" + eS + "&" + ep.data) # 返回值是sign的值
em.token
eC
eS
ep.data
获得sign 8a3593958c55ff4115e359745dc9a665
它是由0-9、a-f组成的32位字符串,即MD5哈希值的十六进制表示
我们需要在代码中生成sign
构建字符串MD5加密
# Build the string to hash: str = em.token + "&" + eC + "&" + eS + "&" + ep.data
# eC is the timestamp.
# First half of the walkthrough version of getSign: the constants and the
# signParam dictionary. The rest of the function body follows in the next
# snippet of the tutorial.
def getSign(eC):
em = 'cbee62bc9b064d508514dd6eb1c6cebd' # em holds the token
# eS is the third "&"-joined component of the signed string; the full source
# later in this file sends the same value as the appKey query parameter.
eS = '12574478'
# signParam is the params field of ep.data (copied from the browser console,
# with JSON null rewritten as Python None).
signParam = {
"device": "HMA-AL00",
"isBeta": "false",
"grayHair": "false",
"from": "nt_history",
"brand": "HUAWEI",
"info": "wifi",
"index": "4",
"rainbow": "",
"schemaType": "auction",
"elderHome": "false",
"isEnterSrpSearch": "true",
"newSearch": "false",
"network": "wifi",
"subtype": "",
"hasPreposeFilter": "false",
"prepositionVersion": "v2",
"client_os": "Android",
"gpsEnabled": "false",
"searchDoorFrom": "srp",
"debug_rerankNewOpenCard": "false",
"homePageVersion": "v7",
"searchElderHomeOpen": "false",
"search_action": "initiative",
"sugg": "_4_1",
"sversion": "13.6",
"style": "list",
"ttid": "600000@taobao_pc_10.7.0",
"needTabs": "true",
"areaCode": "CN",
"vm": "nw",
"countryNum": "156",
"m": "pc",
"page": 2,
"n": 48,
"q": "%E8%A3%A4%E5%AD%90",
"qSource": "url",
"pageSource": "",
"tab": "all",
"pageSize": "48",
"totalPage": "100",
"totalResults": "137306",
"sourceS": "0",
"sort": "_coefp",
"bcoffset": "-13",
"ntoffset": "13",
"filterTag": "",
"service": "",
"prop": "",
"loc": "",
"start_price": None,
"end_price": None,
"startPrice": None,
"endPrice": None,
"categoryp": "",
"ha3Kvpairs": None,
"couponFilter": 0,
"myCNA": "4PjnHzPgIA0CARsm5jekDfQ+"
}
json在线格式化
复制到python的函数的signParam字典中,将null值修改为None
接着继续完善getSign函数的MD5加密工作
# Second half of the walkthrough version of getSign: serialize the payload,
# build the string to hash, and return the MD5 hex digest with the payload.
# NOTE(review): json is used below but is not imported in this snippet; the
# full source later in the file imports it at the top.
import hashlib
# Serialize signParam first: the outer payload carries it as a JSON *string*.
n = json.dumps(signParam)
# print(json.dumps(json.dumps(signParam)))
data = {
"appId": "34385",
"params": n
}
# print(data)
# Compact form of the payload: every space character is removed from the
# serialized JSON (this would also remove spaces inside values, if any).
n_data = json.dumps(data).replace(" ", "")
eC= "1734492057250" # hard-coded timestamp; overrides the eC argument (the full source omits this line)
# Same "&"-joined layout as the front-end expression; the last component is
# the same expression that was already stored in n_data.
# NOTE(review): the local name str shadows the builtin inside this function.
str = em + "&" + eC + "&" + eS + "&" + json.dumps(data).replace(" ","")
# print(str)
MD5 = hashlib.md5()
MD5.update(str.encode("utf-8"))
sign = MD5.hexdigest()
# Return both the signature and the exact payload text that was signed.
return sign,n_data
调用函数,获取签名sign。上面的时间戳是我写死的静态值,可以删除,改为动态生成;
稍后在完整源码中会改为动态获取当前时间戳
# Current Unix time in whole milliseconds, rendered as a decimal string.
date_time = f"{int(time.time() * 1000)}"
# getSign returns the signature together with the serialized payload.
sign, n = getSign(date_time)
# print(sign)
# f94586b665e0d865a20aa6d3acf708f3
有了sign,就可以发起请求,获取数据了,直接上完整源码
请求数据所在的api接口
https://h5api.m.taobao.com/h5/mtop.relationrecommend.wirelessrecommend.recommend/2.0/
完整源码
# 可以运行版
# 获取淘宝数据:https://s.taobao.com/
# 搜索键盘相关数据,会自动拦截登录页面(所以需要cookie)
import csv
import time
import requests
from pprint import pprint
import hashlib
import json
import re
# Endpoint that serves the search results.
url = (
    "https://h5api.m.taobao.com/h5/"
    "mtop.relationrecommend.wirelessrecommend.recommend/2.0/"
)

# Request headers; the cookie value is a placeholder to be replaced with the
# reader's own cookie.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    "Referer": "https://s.taobao.com/",
    "cookie": "自己的cookie",
}

# The bare string below is a note kept from the tutorial: it shows the
# response received when the request is sent without a valid sign.
"""
mtopjsonp6({"api":"mtop.relationrecommend.wirelessrecommend.recommend","data":{},"ret":["FAIL_SYS_ILLEGAL_ACCESS::非法请求"]
sign参数每次请求都会变化,导致请求不到数据(参数sign逆向)
"""
# Front-end expression that produces sign:
# eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)
def getSign(eC):
    """Build the request signature and the serialized ``data`` payload.

    Mirrors the front-end expression
    ``eE(em.token + "&" + eC + "&" + eS + "&" + ep.data)`` using MD5 as the
    hash: the token, the timestamp, the app key and the compact JSON payload
    are joined with ``&`` and hashed.

    Args:
        eC: Timestamp to sign. The caller in this file passes the current
            time in milliseconds as a string and sends the same value as the
            ``t`` query parameter. Non-string values are converted with
            ``str()``.

    Returns:
        A ``(sign, n_data)`` tuple: ``sign`` is the 32-character lowercase
        hex MD5 digest and ``n_data`` is the exact compact JSON text that was
        signed (to be sent as the ``data`` query parameter).
    """
    em = 'db1e1adce046132af55f1e37728ca39b'  # token component of the signed string
    eS = '12574478'  # same value the caller sends as appKey
    # Search parameters captured from the browser (ep.data -> params).
    signParam = {
        "device": "HMA-AL00",
        "isBeta": "false",
        "grayHair": "false",
        "from": "nt_history",
        "brand": "HUAWEI",
        "info": "wifi",
        "index": "4",
        "rainbow": "",
        "schemaType": "auction",
        "elderHome": "false",
        "isEnterSrpSearch": "true",
        "newSearch": "false",
        "network": "wifi",
        "subtype": "",
        "hasPreposeFilter": "false",
        "prepositionVersion": "v2",
        "client_os": "Android",
        "gpsEnabled": "false",
        "searchDoorFrom": "srp",
        "debug_rerankNewOpenCard": "false",
        "homePageVersion": "v7",
        "searchElderHomeOpen": "false",
        "search_action": "initiative",
        "sugg": "_4_1",
        "sversion": "13.6",
        "style": "list",
        "ttid": "600000@taobao_pc_10.7.0",
        "needTabs": "true",
        "areaCode": "CN",
        "vm": "nw",
        "countryNum": "156",
        "m": "pc",
        "page": 1,
        "n": 48,
        "q": "%E8%A3%A4%E5%AD%90",
        "qSource": "url",
        "pageSource": "",
        "tab": "all",
        "pageSize": "48",
        "totalPage": "100",
        "totalResults": "5000",
        "sourceS": "48",
        "sort": "_coefp",
        "bcoffset": "-26",
        "ntoffset": "0",
        "filterTag": "",
        "service": "",
        "prop": "",
        "loc": "",
        "start_price": None,
        "end_price": None,
        "startPrice": None,
        "endPrice": None,
        "categoryp": "",
        "ha3Kvpairs": None,
        "couponFilter": 0,
        "myCNA": "4PjnHzPgIA0CARsm5jekDfQ+",
    }

    # Emit compact JSON directly instead of deleting every space afterwards:
    # the output is the same for values without spaces, and values that do
    # contain spaces are no longer silently altered.
    compact = (",", ":")
    n = json.dumps(signParam, separators=compact)
    # The outer payload carries the parameters as a JSON *string*.
    data = {
        "appId": "34385",
        "params": n,
    }
    n_data = json.dumps(data, separators=compact)

    # Sign exactly the text that is returned, so the signature and the
    # payload sent with the request cannot drift apart.
    plain = "&".join((em, str(eC), eS, n_data))
    sign = hashlib.md5(plain.encode("utf-8")).hexdigest()
    return sign, n_data
def parse_jsonp(text):
    """Return the JSON object wrapped by an ``mtopjsonp<N>(...)`` callback.

    Only the callback wrapper is removed: the text between the opening ``(``
    that follows the callback name and the last ``)`` is decoded, so
    parentheses that occur inside string values are preserved.

    Raises:
        ValueError: if *text* contains no ``mtopjsonp<N>(...)`` wrapper or
            the wrapped payload is not valid JSON.
    """
    # Greedy match up to the final ")"; re.S lets the payload span lines.
    match = re.search(r'mtopjsonp\d+\((.*)\)', text, re.S)
    if match is None:
        raise ValueError(f"not an mtopjsonp response: {text[:200]!r}")
    return json.loads(match.group(1))


def build_record(item):
    """Pick the exported fields out of one entry of ``itemsArray``.

    The highlight markup around the search keyword is stripped from the
    title; the remaining fields are copied unchanged.
    """
    return {
        'title': item['title'].replace('<span class=H>', '').replace('</span>', ''),
        'img': item['pic_path'],
        'price': item['price'],
        'procity': item['procity'],
        'realSales': item['realSales'],
        'shopName': item['nick'],
    }


if __name__ == "__main__":
    # The timestamp is used twice: it is part of the signed string and it is
    # sent as the `t` query parameter, so both must be the same value.
    date_time = str(int(time.time() * 1000))
    sign, n = getSign(eC=date_time)
    print(sign)
    params = {
        'jsv': '2.7.4',
        'appKey': '12574478',
        't': date_time,
        'sign': sign,
        'api': 'mtop.relationrecommend.wirelessrecommend.recommend',
        'v': '2.0',
        'timeout': '10000',
        'type': 'jsonp',
        'dataType': 'jsonp',
        'callback': 'mtopjsonp6',
        'data': n,
    }
    # A client-side timeout keeps the script from hanging on a stalled
    # connection; HTTP errors are raised instead of being parsed as data.
    resp = requests.get(url, params=params, headers=headers, timeout=10)
    resp.raise_for_status()
    # print(resp.text)
    html = resp.text

    # Extract the data
    jsonData = parse_jsonp(html)
    # pprint(jsonData)
    # A rejected request comes back with an empty "data" object; report the
    # server's "ret" message before touching the output file.
    items = (jsonData.get('data') or {}).get('itemsArray')
    if items is None:
        raise SystemExit(f"no itemsArray in response, ret={jsonData.get('ret')}")

    # Write one CSV row per item
    with open('taobao.csv', mode="w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        # Header row
        head = ['标题', '图片链接', '价格', '地区', '销量', '店铺']
        writer.writerow(head)
        for item in items:
            dit = build_record(item)
            writer.writerow(dit.values())
            print(dit)
注:需要获取其他数据
修改源码几个参数
url:改为在浏览器中找到的对应数据接口地址
改Referer和cookie
重写getSign函数的em值,eS值,signParam值
data中的appId也要改
修改真正数据接口的参数:params
最后运行代码,即可获取数据