利用爬虫获取某学习软件的考试题库(带源码)
首先需要重新登录账号,以便获取 cookie。
按下 F12 打开开发者工具,在控制台(Console)中输入以下代码:
// 获取当前页面的 cookies
// Export the cookies visible to page scripts as a downloadable cookies.json.
// Note: document.cookie never exposes HttpOnly cookies, so the exported list
// may be incomplete compared with what the browser actually sends.
var cookies = document.cookie.split(";");

// Convert every "name=value" pair into a { name, value } object.
var cookieArray = [];
cookies.forEach(function (cookie) {
  var pair = cookie.trim();
  // An empty document.cookie splits into [""]; skip such blank entries.
  if (pair === "") {
    return;
  }
  // Split on the FIRST "=" only: cookie values may themselves contain "="
  // (e.g. base64 padding), which split("=") would silently truncate.
  var eq = pair.indexOf("=");
  var cookieName = eq === -1 ? pair : pair.slice(0, eq).trim();
  var cookieValue = eq === -1 ? "" : pair.slice(eq + 1).trim();
  cookieArray.push({ name: cookieName, value: cookieValue });
});

// Serialize the cookie list as pretty-printed JSON.
var cookieJson = JSON.stringify(cookieArray, null, 2);

// Wrap the JSON in a Blob so it can be offered as a file download.
var blob = new Blob([cookieJson], { type: "application/json" });

// Create a temporary link, click it to trigger the download, then clean up.
var a = document.createElement("a");
a.href = URL.createObjectURL(blob);
a.download = "cookies.json"; // name of the downloaded file
document.body.appendChild(a);
a.click();
document.body.removeChild(a);

// Release the object URL once the download has been started.
setTimeout(function () {
  URL.revokeObjectURL(a.href);
}, 0);
获取到的cookie如下:
爬虫代码:
import requests
from bs4 import BeautifulSoup
import time
# Request headers sent with every page fetch.
# The values look like they were captured from the mobile app's WebView
# (NOTE(review): presumably from the author's own session -- confirm).
# "Cookie" holds a placeholder and must be replaced with your own cookie
# string before running.
headers = {
    "Host": "mooc1-2.chaoxing.com",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Linux; Android 9; JSN-AL00a Build/HONORJSN-AL00a; wv) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 "
        "Chrome/74.0.3729.136 Mobile Safari/537.36 "
        "(schild:fb9eea5d0e85f1c0df8fc5a29fa45701) (device:JSN-AL00a) "
        "Language/zh_CN_#Hans "
        "com.chaoxing.mobile/ChaoXingStudy_3_6.4.4_android_phone_10830_262 "
        "(@Kalimdor)_ef54e909716f4179a342ae4dfc837d74"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3"
    ),
    "Referer": (
        "https://mooc1-2.chaoxing.com/exam-ans/exam/test/reVersionTestStartNew"
        "?keyboardDisplayRequiresUserAction=1&courseId=249366288"
        "&classId=113378271&source=0&imei=ef54e909716f4179a342ae4dfc837d74"
        "&tId=6300571&id=147285060&p=1&start=1&cpi=450898194&isphone=true"
        "&monitorStatus=360000&monitorOp=360000&remainTimeParam=6901"
        "&relationAnswerLastUpdateTime=1736168580574"
        "&enc=a409a1b9a921bbb17f047c2e1574b247"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    "Cookie": "填入自己的cookie数据",
    "X-Requested-With": "com.chaoxing.mobile",
}
# Target URL for the GET requests issued by the crawler.
url = "https://mooc1-2.chaoxing.com/exam-ans/exam/test/reVersionTestStartNew"

# Query parameters common to every request. Entries whose value is the
# placeholder text must be replaced with values from your own captured
# traffic. The "start" parameter is intentionally absent here; it is set
# separately for each request.
base_params = {
    # Exam / session identifiers (fill in from your own capture).
    "keyboardDisplayRequiresUserAction": "填入自己的抓包数据",
    "courseId": "填入自己的抓包数据",
    "classId": "填入自己的抓包数据",
    "source": "0",
    "imei": "填入自己的抓包数据",
    "tId": "填入自己的抓包数据",
    "id": "填入自己的抓包数据",
    "p": "填入自己的抓包数据",
    "cpi": "填入自己的抓包数据",
    # Fixed client-state values.
    "isphone": "true",
    "monitorStatus": "360000",
    "monitorOp": "-1",
    "remainTimeParam": "6406",
    # Remaining session-specific values (fill in from your own capture).
    "relationAnswerLastUpdateTime": "填入自己的抓包数据",
    "enc": "填入自己的抓包数据",
}
# Request the exam page once per value of "start" and append the text of
# every <div class="answerCon"> element found in the response to 1.txt,
# one entry per line.
#
# range(1, 35) has an exclusive upper bound, so start runs from 1 to 34.
# Adjust the bound to match the number of questions in your own exam.
# NOTE(review): the original note said the exam had 35 questions while the
# loop stops at 34 -- confirm whether the last question is being skipped.
with requests.Session() as session:  # reuse one connection for all requests
    for start in range(1, 35):
        # Build a fresh parameter dict so base_params is never mutated.
        params = {**base_params, "start": str(start)}
        delay = 1  # normal pause between consecutive requests (seconds)

        # Keep the try body limited to the network call itself.
        try:
            response = session.get(url, headers=headers, params=params, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Error occurred for start={start}: {e}")
            delay = 5  # wait longer after a network-level failure
        else:
            if response.status_code != 200:
                print(f"Failed to fetch data for start={start}: HTTP {response.status_code}")
            else:
                # Extract every <div class="answerCon"> from the response body.
                soup = BeautifulSoup(response.text, "html.parser")
                answer_con_divs = soup.find_all("div", class_="answerCon")

                # Append to 1.txt; reopening per page means each page's text
                # is flushed to disk before the next request is made.
                with open("1.txt", "a", encoding="utf-8") as file:
                    file.writelines(
                        div.get_text(strip=True) + "\n" for div in answer_con_divs
                    )
                print(f"Processed start={start}, found {len(answer_con_divs)} answers.")

        # Always pause, including after a non-200 response, so the server is
        # never hit with back-to-back requests.
        time.sleep(delay)
抓取效果图片: