
Skill Usage Analysis of the MAA Copilot Station Using Python Multi-threaded and Multi-process Crawlers

Skill Usage Analysis

The scripts below pull copilot jobs from the MAA copilot API (prts.maa.plus) and tally, for each operator, how many jobs select skill 1, 2, or 3.

[Figure: skill usage analysis results]

Multi-threaded (8 cores)

import json
import multiprocessing
import requests
from multiprocessing.dummy import Pool


def maa(st):
    """Fetch copilot jobs st[0] .. st[0]+st[1]-1 and tally skill choices per operator."""
    url = "https://prts.maa.plus/copilot/get/"
    m = 1
    out = {}
    for i in range(st[0], st[0] + st[1]):
        print(i, "progress: " + str(m) + "/" + str(st[1]))
        m += 1
        skills = requests.get(url + str(i), timeout=10).json()
        if skills["status_code"] == 200:
            try:
                content = skills["data"]["content"]
                opers = json.loads(content)["opers"]
                for j in opers:
                    if j["name"] not in out:
                        out[j["name"]] = [0, 0, 0]  # counts for skill 1 / 2 / 3
                    if "skill" in j:
                        out[j["name"]][j["skill"] - 1] += 1
            except Exception:
                pass  # malformed content or missing "opers": skip this job
        else:
            continue
        print(out)
    return out


start = 20000
end = 46625
pn = multiprocessing.cpu_count()  # number of worker threads (8 on this machine)
chunk = (end - start) // pn
selist = []
print(chunk)
for i in range(0, pn):
    size = chunk if i < pn - 1 else (end - start) - chunk * (pn - 1)  # last chunk absorbs the remainder
    selist.append([start + chunk * i, size])
print(selist)
# create the thread pool (multiprocessing.dummy.Pool is thread-based) and map the fetch function
print(multiprocessing.cpu_count())
pool = Pool(pn)
re = pool.map(maa, selist)
# close the pool and wait for all workers to finish
pool.close()
pool.join()
print(re)
# merge the per-chunk dictionaries
addout = {}
for i in re:
    for j in i:
        if j not in addout:
            addout[j] = [0, 0, 0]
        addout[j][0] += i[j][0]
        addout[j][1] += i[j][1]
        addout[j][2] += i[j][2]
print(addout)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(addout, f, ensure_ascii=False, indent=4)
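A side note on naming: despite the import path, multiprocessing.dummy.Pool is a thread pool, so the script above is multi-threaded rather than multi-process. For a genuinely multi-process run (the multi-process variant named in the title), multiprocessing.Pool can be dropped in with the same map call, as long as the work starts under the __main__ guard. The sketch below is only a minimal, self-contained illustration: its worker is a placeholder standing in for the maa() function above, and the chunking mirrors the script.

import multiprocessing


def maa(st):
    # placeholder worker: in the real script this would be the fetch-and-tally
    # maa() defined above; here it just echoes its chunk boundaries
    return {"chunk_start": st[0], "chunk_size": st[1]}


if __name__ == "__main__":  # required when the start method is "spawn" (Windows/macOS)
    start, end = 20000, 46625
    pn = multiprocessing.cpu_count()
    chunk = (end - start) // pn
    selist = [[start + chunk * i, chunk] for i in range(pn)]
    with multiprocessing.Pool(processes=pn) as pool:  # real processes, not threads
        results = pool.map(maa, selist)
    print(results)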

JSON post-processing

import json

with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
print(data)

# flatten {operator: [s1, s2, s3]} into "operator + N技能" entries ("N技能" = "skill N")
out = {}
for i in data:
    out[i + "1技能"] = data[i][0]
    out[i + "2技能"] = data[i][1]
    out[i + "3技能"] = data[i][2]

# sort by count, descending
out = sorted(out.items(), key=lambda x: x[1], reverse=True)
for i in out:
    print(i)
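If a percentage view reads better than raw counts, the same data.json written by the crawler can be normalized per operator. This is a small sketch under that assumption (the file layout, {operator: [skill-1, skill-2, skill-3] counts}, is the one produced above); the formatting is just one possible presentation.

import json

with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# convert each operator's [s1, s2, s3] counts into shares of that operator's
# total picks, and print the most-used operators first
for name, counts in sorted(data.items(), key=lambda kv: sum(kv[1]), reverse=True):
    total = sum(counts)
    if total == 0:
        continue  # operator appeared only without an explicit skill choice
    shares = [round(100 * c / total, 1) for c in counts]
    print(name, f"S1 {shares[0]}%  S2 {shares[1]}%  S3 {shares[2]}%  (n={total})")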

Multi-threaded mode, one thread per URL (prone to freezing the machine)

import json
import multiprocessing
import requests
from multiprocessing.dummy import Pool
import threading


def fetch_title(url, results, index):
    """Fetch one copilot job and store its per-operator skill tally in results[index]."""
    global progress
    out = {}
    try:
        skills = requests.get(url, timeout=10).json()
        if skills["status_code"] == 200:
            try:
                print(skills["data"]["id"])
                with lock:
                    progress += 1
                    print(f"Progress: {progress}/{total_work}")
                content = skills["data"]["content"]
                opers = json.loads(content)["opers"]
                for j in opers:
                    if j["name"] not in out:
                        out[j["name"]] = [0, 0, 0]  # counts for skill 1 / 2 / 3
                    if "skill" in j:
                        out[j["name"]][j["skill"] - 1] += 1
            except Exception:
                pass  # malformed content or missing "opers": leave the tally empty
        results[index] = out
    except Exception:
        pass  # network/JSON error: results[index] stays None


def maa(st):
    """Spawn one thread per job ID in the chunk and collect the per-job tallies."""
    url = "https://prts.maa.plus/copilot/get/"
    threads = []
    results = [None] * st[1]
    urls = [url + str(i) for i in range(st[0], st[0] + st[1])]
    # one thread per URL
    for i, u in enumerate(urls):
        t = threading.Thread(target=fetch_title, args=(u, results, i))
        t.start()
        threads.append(t)
    # wait for all threads to finish
    for t in threads:
        t.join()
    return results

progress = 0
lock = threading.Lock()
start = 20000
end = 46625
total_work = end - start
pn = multiprocessing.cpu_count()  # number of pool workers
chunk = (end - start) // pn
selist = []
print(chunk)
for i in range(0, pn):
    size = chunk if i < pn - 1 else (end - start) - chunk * (pn - 1)  # last chunk absorbs the remainder
    selist.append([start + chunk * i, size])
print(selist)
# create the (thread-based) pool and map the per-chunk worker
print(pn)
pool = Pool(pn)
re = pool.map(maa, selist)
# close the pool and wait for all workers
pool.close()
pool.join()
print(re)
# merge every chunk's per-job tallies, skipping slots left as None by failed requests
addout = {}
for chunk_result in re:
    for i in chunk_result:
        if i is None:
            continue
        for j in i:
            if j not in addout:
                addout[j] = [0, 0, 0]
            addout[j][0] += i[j][0]
            addout[j][1] += i[j][1]
            addout[j][2] += i[j][2]
print(addout)
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(addout, f, ensure_ascii=False, indent=4)
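The freezes most likely come from launching one thread per URL on top of the pool, i.e. thousands of threads alive at once. Capping concurrency with concurrent.futures.ThreadPoolExecutor keeps the thread count bounded; the sketch below assumes the same /copilot/get/ endpoint and response fields (status_code, data.content with an opers list) as the scripts above, and uses a deliberately small ID range as a test run.

import json
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests

URL = "https://prts.maa.plus/copilot/get/"


def fetch(job_id):
    """Fetch one copilot job and return {operator: [s1, s2, s3]} (empty on any failure)."""
    out = {}
    try:
        resp = requests.get(URL + str(job_id), timeout=10).json()
        if resp.get("status_code") == 200:
            for oper in json.loads(resp["data"]["content"]).get("opers", []):
                counts = out.setdefault(oper["name"], [0, 0, 0])
                if "skill" in oper:
                    counts[oper["skill"] - 1] += 1
    except Exception:
        pass
    return out


addout = {}
with ThreadPoolExecutor(max_workers=32) as pool:  # 32 threads total, never one per URL
    futures = [pool.submit(fetch, i) for i in range(20000, 20200)]  # small range for testing
    for done, fut in enumerate(as_completed(futures), 1):
        print(f"Progress: {done}/{len(futures)}")
        for name, counts in fut.result().items():
            acc = addout.setdefault(name, [0, 0, 0])
            for k in range(3):
                acc[k] += counts[k]
print(addout)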

Appearance-rate analysis (per-job counts)

[Figure: appearance-rate ranking by job count]

Appearance-rate analysis (weighted by view counts)

[Figure: appearance-rate ranking weighted by views]

Appearance-rate analysis code

import json

import requests

first_num = 20000
limit = 50
page = 1
url = "https://prts.maa.plus/copilot/query?desc=true&limit=%d&page=%d&order_by=id"
last_url = "https://prts.maa.plus/copilot/query?desc=true&limit=1&page=1&order_by=id"
result = {}
model = 0  # 0: count each job once, 1: weight by view count
if model == 0:
    txt_name = "result_num.txt"
else:
    txt_name = "result_view.txt"

# the newest job ID tells us how many pages to walk through
last_get = requests.get(last_url, timeout=10).json()
print(last_get)
last_num = last_get['data']['data'][0]['id']
print("latest MAA job ID: " + str(last_num))

for i in range(1, (last_num - first_num) // limit):
    print("request #%d" % i)
    maa_json = requests.get(url % (limit, i), timeout=10).json()
    print(maa_json)
    if maa_json["status_code"] == 200 and maa_json['data']['data'] != []:
        for j in range(0, limit):
            if j < len(maa_json['data']['data']):
                maa_id = str(maa_json['data']['data'][j]['id'])
                # print("maa://" + maa_id + " fetched")
                content = json.loads(maa_json['data']['data'][j]['content'])
                maa_opers = content.get('opers', [])

                for m in maa_opers:
                    # mode 1 adds the job's view count, mode 0 adds 1 per job
                    weight = maa_json['data']['data'][j]['views'] if model == 1 else 1
                    result[m['name']] = result.get(m['name'], 0) + weight
                # dump intermediate results so an interrupted run keeps its progress
                with open(txt_name, "w", encoding="utf-8") as f:
                    for n in result:
                        f.write(str(n) + " " + str(result[n]) + "\n")
                maa_groups = content  # would be content['groups'], but some jobs have no "groups" field
            else:
                break
    else:
        print("request #" + str(i) + " failed")
        break

result = sorted(result.items(), key=lambda x: x[1], reverse=True)
print(result)
with open(txt_name, "w", encoding="utf-8") as f:
    for n in result:
        f.write(str(n) + "\n")
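The query script above writes raw sums, so the "appearance rate" in the charts still needs a denominator: the number of jobs scanned in mode 0, or the total views in mode 1. A minimal sketch of that last step, using hypothetical example counts and a hypothetical jobs_scanned total (in the real script this would be a counter incremented once per job inside the loop above):

# appearance rate = jobs containing the operator / jobs scanned
result = {"干员A": 1234, "干员B": 987}  # example counts; the real dict comes from the loop above
jobs_scanned = 5000                     # hypothetical total; count it inside the crawl loop

rates = sorted(
    ((name, count / jobs_scanned) for name, count in result.items()),
    key=lambda x: x[1],
    reverse=True,
)
for name, rate in rates:
    print(f"{name}\t{rate:.2%}")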

