基于 Python 多线程多进程爬虫的 MAA 作业站技能使用分析
基于 Python 多线程多进程爬虫的 MAA 作业站技能使用分析
技能使用分析
多线程(8核)
import json
import multiprocessing
import requests
from multiprocessing.dummy import Pool
def maa(st):
    """Count operator skill usage over a contiguous range of copilot job IDs.

    Each ID in the range is requested from the copilot ``get`` endpoint one
    at a time. For every successful response the job's ``content`` field is
    parsed as JSON and each entry of its ``opers`` list is tallied.

    Args:
        st: Two-item sequence ``[first_id, count]``. IDs ``first_id`` up to
            ``first_id + count - 1`` are requested.

    Returns:
        dict mapping operator name to ``[n1, n2, n3]``, where ``n<k>`` is the
        number of operator entries seen with ``"skill": k``. An operator
        entry without a usable ``skill`` still gets a ``[0, 0, 0]`` row.
    """
    url = "https://prts.maa.plus/copilot/get/"
    first_id, count = st[0], st[1]
    out = {}
    for m, i in enumerate(range(first_id, first_id + count), start=1):
        print(i, "运行进度:" + str(m) + "/" + str(st[1]))

        # A single unreachable or malformed job must not abort the whole
        # range, so network and decode errors just skip this ID.
        try:
            skills = requests.get(url=url + str(i), timeout=30).json()
        except (requests.RequestException, ValueError):
            continue
        if not isinstance(skills, dict) or skills.get("status_code") != 200:
            continue

        try:
            opers = json.loads(skills["data"]["content"])["opers"]
        except (KeyError, TypeError, ValueError):
            # No content, content is not valid JSON, or the job has no opers.
            continue
        if not isinstance(opers, list):
            continue

        for j in opers:
            if not isinstance(j, dict) or "name" not in j:
                continue
            counts = out.setdefault(j["name"], [0, 0, 0])
            skill = j.get("skill")
            # Only 1..3 map onto the three counters. Without this check a
            # skill of 0 would index -1 and be counted as skill 3.
            if isinstance(skill, int) and 1 <= skill <= 3:
                counts[skill - 1] += 1
    print(out)
    return out
# Driver: split the ID range across a thread pool, crawl, merge, save.
start = 20000
end = 46625
pn = multiprocessing.cpu_count()  # number of pool workers (threads)

# Split the ID range start..end (end treated as the last ID to fetch) into
# pn contiguous [first_id, count] chunks. The remainder of the division is
# spread over the first chunks so that no trailing IDs are dropped.
total = end - start + 1
base, extra = divmod(total, pn)
print(base)
selist = []
first_id = start
for i in range(pn):
    count = base + (1 if i < extra else 0)
    selist.append([first_id, count])
    first_id += count
print(selist)

# Create the thread pool and run the crawler on every chunk.
print(multiprocessing.cpu_count())
pool = Pool(pn)
try:
    re = pool.map(maa, selist)
finally:
    # Shut the pool down even if a worker raised.
    pool.close()
    pool.join()
print(re)

# Merge the per-chunk {name: [n1, n2, n3]} dicts into one total.
addout = {}
for i in re:
    for j, counts in i.items():
        merged = addout.setdefault(j, [0, 0, 0])
        for k in range(3):
            merged[k] += counts[k]
print(addout)

with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(addout, f, ensure_ascii=False, indent=4)
JSON 处理
import json
# Load the per-operator counts written by the crawler: {name: [n1, n2, n3]}.
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
print(data)

# Flatten to one entry per (operator, skill): key is the operator name
# followed by "1技能" / "2技能" / "3技能", value is the matching count.
out = {}
for name in data:
    counts = data[name]
    for idx, suffix in enumerate(("1技能", "2技能", "3技能")):
        key = name + suffix
        out[key] = out.get(key, 0) + counts[idx]

# Rank by count, highest first, and print one (key, count) pair per line.
out = list(out.items())
out.sort(key=lambda pair: pair[1], reverse=True)
for entry in out:
    print(entry)
多线程模式(容易死机)
import json
import multiprocessing
import requests
from multiprocessing.dummy import Pool
import threading
def fetch_title(url, results, index):
    """Fetch one copilot job and store its operator skill counts.

    Args:
        url: Full URL of the job to request.
        results: Shared list; slot ``index`` receives this job's counts.
        index: Position in ``results`` written by this call.

    The stored value is a dict mapping operator name to ``[n1, n2, n3]``,
    where ``n<k>`` counts the job's operator entries with ``"skill": k``.
    A dict is always stored, and it is empty when the request fails, the
    job does not exist, or its content cannot be parsed. Callers therefore
    never find ``None`` in a slot this function was asked to fill.

    Reads the module-level ``lock`` and ``total_work`` and increments the
    module-level ``progress`` (under ``lock``) for each job that returned
    status 200.
    """
    global progress

    out = {}
    # Publish the dict up front; it is filled in place below, so every early
    # return still leaves a valid (possibly empty) result in the slot.
    results[index] = out

    try:
        skills = requests.get(url, timeout=30).json()
    except (requests.RequestException, ValueError):
        return
    if not isinstance(skills, dict) or skills.get("status_code") != 200:
        return

    try:
        data = skills["data"]
        print(data["id"])
        with lock:
            progress += 1
            print(f"Progress: {progress}/{total_work}")
        opers = json.loads(data["content"])["opers"]
    except (KeyError, TypeError, ValueError):
        # Missing fields, or content that is not valid JSON.
        return
    if not isinstance(opers, list):
        return

    for j in opers:
        if not isinstance(j, dict) or "name" not in j:
            continue
        counts = out.setdefault(j["name"], [0, 0, 0])
        skill = j.get("skill")
        # Only 1..3 map onto the three counters. Without this check a skill
        # of 0 would index -1 and be counted as skill 3.
        if isinstance(skill, int) and 1 <= skill <= 3:
            counts[skill - 1] += 1
def maa(st, max_workers=32):
    """Fetch a contiguous range of copilot jobs using worker threads.

    Args:
        st: Two-item sequence ``[first_id, count]``. IDs ``first_id`` up to
            ``first_id + count - 1`` are fetched.
        max_workers: Upper bound on threads alive at once inside this call.
            Starting one thread per job (thousands at a time) can exhaust
            the machine, so jobs are processed in batches of this size.

    Returns:
        list with one slot per job, in ID order, each filled in by
        ``fetch_title`` for the corresponding URL.
    """
    url = "https://prts.maa.plus/copilot/get/"
    first_id, count = st[0], st[1]
    urls = [url + str(i) for i in range(first_id, first_id + count)]
    results = [None] * len(urls)

    batch_size = max(1, max_workers)
    for batch_start in range(0, len(urls), batch_size):
        batch_end = min(batch_start + batch_size, len(urls))
        threads = []
        # One thread per URL, but only for the current batch.
        for i in range(batch_start, batch_end):
            t = threading.Thread(target=fetch_title, args=(urls[i], results, i))
            t.start()
            threads.append(t)
        # Wait for the whole batch before starting the next one.
        for t in threads:
            t.join()
    return results
# Shared state used by fetch_title for progress reporting.
progress = 0
lock = threading.Lock()

start = 20000
end = 46625
total_work = end - start + 1  # IDs start..end inclusive
pn = multiprocessing.cpu_count()  # number of pool workers (threads)

# Split exactly total_work IDs into pn contiguous [first_id, count] chunks.
# The remainder of the division is spread over the first chunks so that the
# partition covers the same range that total_work reports.
base, extra = divmod(total_work, pn)
print(base)
selist = []
first_id = start
for i in range(pn):
    count = base + (1 if i < extra else 0)
    selist.append([first_id, count])
    first_id += count
print(selist)

# Create the thread pool and run the crawler on every chunk.
print(pn)
pool = Pool(pn)
try:
    re = pool.map(maa, selist)
finally:
    # Shut the pool down even if a worker raised.
    pool.close()
    pool.join()
print(re)

# re holds one list per chunk, each with one {name: [n1, n2, n3]} dict per
# job. Merge every chunk, not only the first one.
addout = {}
for chunk in re:
    for i in chunk:
        if not i:
            # Slot left empty or None: the job failed or had no operators.
            continue
        for j, counts in i.items():
            merged = addout.setdefault(j, [0, 0, 0])
            for k in range(3):
                merged[k] += counts[k]
print(addout)

with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(addout, f, ensure_ascii=False, indent=4)
出场率分析(单作业)
出场率分析(结合访问量)
出场率分析代码
import requests
# Appearance-rate analysis: page through the copilot query endpoint (newest
# job first) and total, per operator name, either the number of jobs it
# appears in or the summed view count of those jobs.
import json  # needed by this section: job content is parsed as JSON below

first_num = 20000
limit = 50
url = "https://prts.maa.plus/copilot/query?desc=true&limit=%d&page=%d&order_by=id"
last_url = "https://prts.maa.plus/copilot/query?desc=true&limit=1&page=1&order_by=id"
result = {}
model = 0  # 0: count appearances, 1: weight each appearance by job views
txt_name = "result_num.txt" if model == 0 else "result_view.txt"

# The newest job's ID determines how many pages to request.
last_get = requests.get(last_url, timeout=30).json()
print(last_get)
last_num = last_get['data']['data'][0]['id']
print("maa最新ID:" + str(last_num))

# IDs first_num..last_num span at most this many pages (ceiling division).
# NOTE(review): first_num only bounds the number of pages requested; jobs are
# not filtered by ID, so older jobs may be included if IDs are sparse.
page_count = -(-(last_num - first_num + 1) // limit)

for i in range(1, page_count + 1):
    print("第%d次请求" % i)
    try:
        maa_json = requests.get(url % (limit, i), timeout=30).json()
    except (requests.RequestException, ValueError):
        # Stop paging but keep what has been collected so far.
        print(str(i) + "请求失败")
        break
    print(maa_json)
    if maa_json.get("status_code") != 200 or maa_json['data']['data'] == []:
        print(str(i) + "请求失败")
        break

    # Visit every job on the page, including the last one.
    for job in maa_json['data']['data']:
        # SECURITY: the content string comes from a remote service and was
        # previously passed to eval(), which executes arbitrary expressions.
        # It is parsed as JSON instead.
        try:
            content = json.loads(job['content'])
        except (KeyError, TypeError, ValueError):
            continue
        if not isinstance(content, dict):
            continue
        # Some jobs may have no 'opers' list; treat that as no operators.
        maa_opers = content.get('opers') or []
        weight = job['views'] if model == 1 else 1
        for m in maa_opers:
            result[m['name']] = result.get(m['name'], 0) + weight

    # Checkpoint once per page instead of once per job; name and total are
    # tab-separated so the two fields can be told apart.
    with open(txt_name, "w", encoding="utf-8") as f:
        for n in result:
            f.write(str(n) + "\t" + str(result[n]) + "\n")

# Final output: (name, total) pairs sorted by total, highest first.
result = sorted(result.items(), key=lambda x: x[1], reverse=True)
print(result)
with open(txt_name, "w", encoding="utf-8") as f:
    for n in result:
        f.write(str(n) + "\n")