Collecting Zhuge Divination Signs with Python
Project Overview
This is a Python project for scraping and browsing Zhuge divination signs (诸葛灵签). A web crawler collects the text and interpretation of each sign, and the project provides storage and lookup of the collected data.
Project Structure
zhuge/
├── zhuge_scraper.py       # Main scraper script
├── zhuge_pages/           # Data output directory
│   ├── all_signs.json     # Aggregated data
│   └── zhuge_sign_*.json  # Per-sign data files
└── zhuge.md               # Project documentation
Features
- Batch-scrapes all 384 Zhuge divination signs
- Automatically converts Chinese numerals to Arabic numerals
- Saves data in JSON format
- Retries failed requests; runs are resumable via the start/end range parameters
- Randomized delays to avoid overly frequent requests
- Dual save mechanism (individual files + an aggregate file)
Tech Stack
- Python 3.x
- requests: HTTP requests
- BeautifulSoup4: HTML parsing
- json: data serialization
- re: regular-expression processing
Core Modules
1. Chinese Numeral Conversion
Converts Chinese numerals such as "三百八十四" into Arabic numerals:
def chinese_to_arabic(chinese_num):
    # Convert a Chinese numeral (e.g. "三百八十四") to an Arabic numeral (384)
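For a quick sanity check, the function can be called directly (using the full implementation given under Project Source Code below):

print(chinese_to_arabic('三百八十四'))  # 384
print(chinese_to_arabic('十'))          # 10
print(chinese_to_arabic('二百零一'))    # 201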
2. Page Parsing
Parses the page HTML and extracts the sign number, sign text, and interpretation:
def parse_zhuge_page(soup):
    # Parse the page content and return a dict with the sign number, sign text, and interpretation
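As a rough illustration, the parser can be exercised against a hand-written fragment that mirrors the dt/dd markup the code relies on (an invented example run in the same module as the functions below, not the site's actual HTML):

from bs4 import BeautifulSoup

html = '''
<dl>
  <dt>诸葛测算第三签结果</dt>
  <dd><em>Sign text goes here</em></dd>
  <dd><p>First interpretation paragraph.</p><p>Second paragraph.</p></dd>
</dl>
'''
soup = BeautifulSoup(html, 'html.parser')
print(parse_zhuge_page(soup))
# {'sign_number': '3', 'sign_text': 'Sign text goes here',
#  'interpretation': 'First interpretation paragraph.\nSecond paragraph.'}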
3. Page Scraping
Fetches a single page and saves its data:
def scrape_zhuge_page(url):
    # Scrape a single page and save the data
4. Batch Processing
Drives the batch scraping loop and controls the request rate:
def scrape_zhuge_range(start=1, end=384):
    # Scrape the signs in the given range
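Before committing to a full run over all 384 pages, the range can be narrowed for a quick smoke test:

# Scrape only the first ten signs
scrape_zhuge_range(start=1, end=10)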
Data Storage Format
Data is stored as JSON with the following fields:
{
  "sign_number": "the sign's number",
  "sign_text": "the sign's text",
  "interpretation": "the detailed interpretation"
}
Usage
Environment Setup
pip install requests beautifulsoup4
Running
python zhuge_scraper.py
Data Output
- Individual sign: zhuge_pages/zhuge_sign_[number].json
- Aggregate file: zhuge_pages/all_signs.json (queryable as sketched below)
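Once a run finishes, the aggregate file can be loaded and queried; a minimal sketch (note that sign numbers are stored as strings):

import json

with open('zhuge_pages/all_signs.json', encoding='utf-8') as f:
    signs = json.load(f)

# Index records by sign number for lookup
by_number = {s['sign_number']: s for s in signs}
print(by_number['384']['sign_text'])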
Notes
- Keep the request rate reasonable
- Consider rotating IPs through a proxy pool, as sketched below
- The data is for learning and research only
- Watch out for the site's anti-scraping measures
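If a proxy pool is used, requests accepts a proxies mapping; a minimal sketch with a placeholder proxy address (not part of this project's code):

import requests

proxies = {
    'http': 'http://127.0.0.1:8080',   # placeholder proxy address
    'https': 'http://127.0.0.1:8080',  # placeholder proxy address
}
response = requests.get('https://www.chazidian.com/zhuge1/', proxies=proxies, timeout=15)
print(response.status_code)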
Planned Improvements
- Add proxy pool support
- Optimize the Chinese numeral conversion algorithm
- Add data validation
- More robust error handling
- Add logging
Project Source Code
import requests
from bs4 import BeautifulSoup
import os
import time
import random
import re
import json
def chinese_to_arabic(chinese_num):
    """Convert a Chinese numeral string (e.g. "三百八十四") to an Arabic integer."""
    cn_num = {
        '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
        '十': 10, '百': 100
    }
    temp_sum = 0
    temp_num = 0
    for char in chinese_num:
        curr_num = cn_num.get(char)
        if curr_num is None:  # skip characters outside the numeral table
            continue
        if curr_num == 100:  # hundreds place
            temp_sum += (temp_num if temp_num > 0 else 1) * curr_num
            temp_num = 0
        elif curr_num == 10:  # tens place
            temp_sum += (temp_num if temp_num > 0 else 1) * curr_num
            temp_num = 0
        else:  # units digit
            temp_num = curr_num
    result = temp_sum + temp_num
    return result

def parse_zhuge_page(soup):
    """
    Parse a Zhuge divination page and extract key information.
    """
    # Find the sign number (written as a Chinese numeral in the <dt> heading)
    sign_number_elem = soup.find('dt', string=re.compile(r'诸葛测算第[零一二三四五六七八九十百]+签结果'))
    if sign_number_elem:
        chinese_num = re.search(r'第([零一二三四五六七八九十百]+)签', sign_number_elem.text).group(1)
        sign_number = str(chinese_to_arabic(chinese_num))
    else:
        sign_number = None
    # Find the sign text (inside <em> in the first <dd>)
    first_dd = soup.find('dd')
    sign_text_elem = first_dd.find('em') if first_dd else None
    sign_text = sign_text_elem.text.strip() if sign_text_elem else None
    # Find the detailed interpretation (the <p> paragraphs in the second <dd>)
    dd_elems = soup.find_all('dd')
    if len(dd_elems) > 1:
        interpretation = '\n'.join(p.text.strip() for p in dd_elems[1].find_all('p'))
    else:
        interpretation = None
    return {
        'sign_number': sign_number,
        'sign_text': sign_text,
        'interpretation': interpretation
    }

def scrape_zhuge_page(url):
    try:
        # A fuller set of request headers, to look like a normal browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Referer': 'https://www.chazidian.com/',
            'Upgrade-Insecure-Requests': '1'
        }
        # Retry mechanism for transient network failures
        max_retries = 3
        retry_delay = 5
        for attempt in range(max_retries):
            try:
                response = requests.get(url, headers=headers, timeout=15)
                response.raise_for_status()
                break
            except requests.RequestException:
                if attempt == max_retries - 1:
                    raise
                print(f"Attempt {attempt + 1} failed, retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
        # Parse the page
        soup = BeautifulSoup(response.text, 'html.parser')
        page_data = parse_zhuge_page(soup)
        # Create the output directory
        os.makedirs('zhuge_pages', exist_ok=True)
        # Save to both the aggregate file and an individual per-sign file
        all_data_file = 'zhuge_pages/all_signs.json'
        # Load existing aggregate data, if any
        existing_data = []
        if os.path.exists(all_data_file):
            with open(all_data_file, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        # Append the new record
        existing_data.append(page_data)
        # Write the aggregate file back out
        with open(all_data_file, 'w', encoding='utf-8') as f:
            json.dump(existing_data, f, ensure_ascii=False, indent=2)
        # Also save a standalone file for this sign
        filename = f'zhuge_pages/zhuge_sign_{page_data["sign_number"]}.json'
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(page_data, f, ensure_ascii=False, indent=2)
        print(f"Successfully scraped and saved {url}")
        return page_data
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None

def scrape_zhuge_range(start=1, end=384):
    """
    Scrape a range of Zhuge divination pages.
    Args:
        start (int): Starting page number
        end (int): Ending page number
    """
    for page_num in range(start, end + 1):
        url = f'https://www.chazidian.com/zhuge{page_num}/'
        print(f"Scraping page {page_num}...")
        page_data = scrape_zhuge_page(url)
        if page_data:
            # Random delay to be nice to the server
            time.sleep(random.uniform(0.5, 2))
        # Take a longer pause every 50 pages
        if page_num % 50 == 0:
            print(f"Paused at page {page_num}. Waiting a bit...")
            time.sleep(random.uniform(3, 7))

def main():
    try:
        scrape_zhuge_range(1, 384)
        print("Scraping completed successfully!")
    except Exception as e:
        print(f"An error occurred during scraping: {e}")

if __name__ == '__main__':
    main()
Original article: https://blog.csdn.net/hzether/article/details/146269566