Web Scraping with Python in Practice: Fetching and Analyzing Annual Reports
1. Introduction
In this age of information overload, data has become a key input to corporate and personal decision-making. For investors, listed companies' annual reports are essential material for assessing a business's operations and financial health. Downloading and organizing these reports by hand, however, is slow and tedious. Fortunately, Python and its powerful scraping ecosystem let us automate the process and collect and analyze this valuable information efficiently. This article walks you through building a simple annual-report scraper in Python from scratch, the first step of a data-driven investing journey!
2. Fetching the Annual Reports
Sina Finance is a friendly site for practicing scraping, so we will use it as our source of listed-company annual reports. Let's start with a single company.
# Fetch the annual reports of a single listed company, using 600900 (长江电力, China Yangtze Power) as an example
import requests
from lxml import etree
import re
import pandas as pd
session = requests.session()
url = "https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/600900/page_type/ndbg.phtml"
headers = {
"authority":"vip.stock.finance.sina.com.cn",
"method":"GET",
"path":"/corp/view/vCB_AllBulletin.php?stockid=600900&Page=1",
"scheme":"https",
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding":"gzip, deflate, br, zstd",
"Accept-Language":"zh-CN,zh;q=0.9",
"Cache-Control": "max-age=0",
"Cookie":"UOR=www.baidu.com,finance.sina.com.cn,; SINAGLOBAL=125.34.83.118_1696236074.611955; U_TRS1=00000076.bd59125751.651a8233.befc8927; SCF=AnUCBLcVWTJXUBH-H38aCJVHeDe3DmK-cwG84dmct4Ns0at-eRnYNKcGL885mNEIO8IkFfrCxysbt1qogwN559Y.; FINA_V_S_2=sh600900,sh600036,sz000001,sz000601,sz000002; FIN_ALL_VISITED=sh600900; SR_SEL=1_511; close_leftanswer=1; SFA_version7.14.0=2024-04-08%2016%3A12; Apache=61.163.21.199_1712564150.380551; SFA_version7.14.0_click=2; hqEtagMode=0; U_TRS2=00000044.9a587f9f9.6613a7c0.4520ae56; rotatecount=3; ULV=1712564162257:32:4:2:61.163.21.199_1712564150.380551:1712564149632",
"Sec-Ch-Ua": '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
"Sec-Ch-Ua-Mobile":"?0",
"Sec-Ch-Ua-Platform":'"Windows"',
"Sec-Fetch-Dest":"document",
"Sec-Fetch-Mode":"navigate",
"Sec-Fetch-Site":"same-origin",
"Sec-Fetch-User":"?1",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}
html = session.get(url, headers = headers)
html.encoding = 'gb18030'  # Sina's corp pages are GBK-encoded; set this to avoid mojibake
tree = etree.HTML(html.text)
date_xpath = "//div[@class='datelist']/ul/text()"
title_xpath = "//ul/a/text()"
url_xpath = "//ul/a/@href"
date_list = tree.xpath(date_xpath)
date_list = [re.sub(r'\s', '', date) for date in date_list if re.sub(r'\s', '', date) != '']
title_list = tree.xpath(title_xpath)
url_list = tree.xpath(url_xpath)
# The code above collects three pieces of information per report: date, title, and link
for date, title, url in zip(date_list, title_list, url_list):
    print(date, title, url)  # print the scraped dates, titles, and links
Each printed line pairs a report's date and title with its link. Next we download the full text of each report; the code follows:
# Fetch the body text of each report
all_content = []
for u in url_list:
    content_url = "https://vip.stock.finance.sina.com.cn" + u
    content_html = session.get(content_url, headers = headers)
    content_html.encoding = 'gb18030'
    content_tree = etree.HTML(content_html.text)
    content_xpath = "//p/text() | //pre/text()"
    content_list = content_tree.xpath(content_xpath)
    content = ''.join(content_list)
    content = re.sub(r'\r\n[\s\d]+?\r\n', '', content)  # drop page-number lines
    content = re.sub(r'\s', '', content)                # strip remaining whitespace
    all_content.append(content)
# Export everything we scraped
file = r"E:\年报.xlsx"  # raw string so the backslash is not treated as an escape
df = pd.DataFrame(data = [date_list, title_list, url_list, all_content]).T
df.columns = ['日期', '标题', '链接', '年报内容']
df.to_excel(file, index = None)
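So far we only fetch the first page of the announcement list. The listing appears to be paginated through a Page query parameter (the request path in the headers above carries Page=1), so older reports can be collected by walking the pages until one comes back empty. A minimal sketch under that assumption, reusing the session, headers, and XPath expressions defined above:

# Hypothetical pagination sketch: the ?Page=N parameter is inferred from the
# "Page=1" seen in the copied request path and may need verification.
page = 1
all_dates, all_titles, all_urls = [], [], []
while True:
    page_url = ("https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin"
                "/stockid/600900/page_type/ndbg.phtml?Page=%d" % page)
    resp = session.get(page_url, headers = headers)
    resp.encoding = 'gb18030'
    page_tree = etree.HTML(resp.text)
    dates = [re.sub(r'\s', '', d) for d in page_tree.xpath(date_xpath) if re.sub(r'\s', '', d)]
    if not dates:  # an empty date list marks the last page
        break
    all_dates += dates
    all_titles += page_tree.xpath(title_xpath)
    all_urls += page_tree.xpath(url_xpath)
    page += 1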
Having scraped one company's reports, we now turn to all listed companies. A quick comparison of report pages shows that the URLs differ only in the stock code, so swapping in another code leads to the corresponding company's report listing. But how do we get every company's stock code? Here we use tushare:
# 获取股票代码
import requests
from lxml import etree
import tushare as ts
import re
import pandas as pd
session = requests.session()
# Get every listed company's stock code via tushare (register at tushare.pro first)
token = 'your token, issued after registering'
pro = ts.pro_api(token)
stock_basic = pro.stock_basic()
stock_basic  # a DataFrame of all listed companies and their codes
for stk in stock_basic.symbol:
    print(stk)
The codes print as follows:
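By default stock_basic returns every security it knows about. If you prefer to restrict the universe to currently listed companies and keep only a few columns, tushare's documented stock_basic parameters allow filtering, for example:

# Keep only currently listed companies (list_status='L') and a few columns;
# both parameters are part of tushare's stock_basic interface.
stock_basic = pro.stock_basic(exchange='', list_status='L',
                              fields='ts_code,symbol,name,list_date')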
Because processing every listed company would be slow, we randomly sample ten companies, scrape their reports, and process those.
stock_sample = stock_basic.sample(n=10, axis=0)  # randomly sample ten listed companies
Next, loop over the ten sampled companies and fetch their report details:
import requests
from lxml import etree
import re
import pandas as pd
import time
import random
session = requests.session()
date_xpath = "//div[@class='datelist']/ul/text()"
title_xpath = "//ul/a/text()"
url_xpath = "//ul/a/@href"
for stk in stock_sample.symbol:
    print("Fetching reports for company %s" % stk)
    url = "https://vip.stock.finance.sina.com.cn/corp/go.php/vCB_Bulletin/stockid/%s/page_type/ndbg.phtml" % stk
    # Rough rule: codes below 600000 are Shenzhen-listed (sz), the rest Shanghai (sh);
    # the Referer header needs the matching prefix
    if int(stk) < 600000:
        ref = 'sz%s' % stk
    else:
        ref = 'sh%s' % stk
    headers = {
        "authority":"vip.stock.finance.sina.com.cn",
        "method":"GET",
        "path":"/corp/go.php/vCI_CorpManager/stockid/%s/page_type/ndbg.phtml" % stk,
        "scheme":"https",
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding":"gzip, deflate, br",
        "Accept-Language":"zh-CN,zh;q=0.9",
        "Cache-Control":"max-age=0",
        "Referer":"https://finance.sina.com.cn/realstock/company/%s/nc.shtml" % ref,
        "Sec-Ch-Ua":'"Not/A)Brand";v="99", "Google Chrome";v="115", "Chromium";v="115"',
        "Sec-Ch-Ua-Mobile":"?0",
        "Sec-Ch-Ua-Platform":"macOS",
        "Sec-Fetch-Dest":"document",
        "Sec-Fetch-Mode":"navigate",
        "Sec-Fetch-Site":"same-site",
        "Sec-Fetch-User":"?1",
        "Upgrade-Insecure-Requests":"1",
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    # Retry on failures, but give up after a few attempts instead of looping forever
    for _ in range(5):
        try:
            html = session.get(url, headers = headers, timeout = 10)
            break
        except requests.RequestException:
            time.sleep(10)
    else:
        print("Skipping %s after repeated failures" % stk)
        continue
    html.encoding = 'gb18030'
    tree = etree.HTML(html.text)
    date_list = tree.xpath(date_xpath)
    date_list = [re.sub(r'\s', '', date) for date in date_list if re.sub(r'\s', '', date) != '']
    title_list = tree.xpath(title_xpath)
    url_list = tree.xpath(url_xpath)
    stkcd_list = [stk]*len(date_list)
    # Fetch the body of each report
    all_content = []
    for u in url_list:
        content_url = "https://vip.stock.finance.sina.com.cn" + u
        content_html = session.get(content_url, headers = headers)
        content_html.encoding = 'gb18030'
        content_tree = etree.HTML(content_html.text)
        content_xpath = "//p/text() | //pre/text()"
        content_list = content_tree.xpath(content_xpath)
        content = ''.join(content_list)
        content = re.sub(r'\r\n[\s\d]+?\r\n', '', content)
        content = re.sub(r'\s', '', content)
        all_content.append(content)
        time.sleep(random.randint(1, 5))  # short random pause between requests
    file = rf"E:\年报{stk}.xlsx"
    df = pd.DataFrame(data = [date_list, title_list, url_list, stkcd_list, all_content]).T
    df.columns = ['日期', '标题', '链接', '股票代码', '年报内容']
    df.to_excel(file, index = None)
    time.sleep(random.randint(1, 10))
The results look like this:
Finally, we merge the per-company files and do some light cleaning.
import pandas as pd
import os
# Move to the directory where the per-company files were saved
os.chdir('E:\\')
# Read the Excel files
file_list = ['年报688291.xlsx', '年报688386.xlsx', '年报002358.xlsx', '年报002956.xlsx', '年报603703.xlsx', '年报002205.xlsx', '年报600408.xlsx', '年报300272.xlsx', '年报002806.xlsx', '年报601369.xlsx']
dfs = [pd.read_excel(file) for file in file_list]
# Concatenate the ten sheets into one DataFrame
result = pd.concat(dfs, ignore_index=True)
# Drop the metadata columns, keeping only the report text, and remove empty rows
result = result.drop(['日期', '标题', '链接', '股票代码'], axis=1)
result = result.dropna(how='all')
# Save to a new Excel file
result.to_excel('十家公司年报数据.xlsx', index=False)
The merged result looks like this:
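Before moving on to visualization, a quick sanity check helps confirm that every report actually contains text; a suspiciously short entry usually means a failed fetch. A small sketch against the file saved above:

# Inspect the length distribution of the scraped report texts
import pandas as pd
merged = pd.read_excel('十家公司年报数据.xlsx')
merged['length'] = merged['年报内容'].astype(str).str.len()  # characters per report
print(merged['length'].describe())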
Next we run a simple word-frequency analysis on the scraped reports and visualize it as bar charts and a word cloud. The code is as follows:
# Using the annual reports of 600900 (China Yangtze Power) as an example
import os
import jieba
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
# Move to the working directory
os.chdir('E:\\')
# Read the concatenated report text
text = open(r"E:\多年年报.txt", encoding="utf-8").read()
def cutWord(text):
    words = jieba.cut(text)
    # Build a stop-word set from a local file, one word per line
    stopwords = {}.fromkeys([line.rstrip() for line in open(r'E:\停用词.txt', encoding='utf-8')])
    finalwords = []
    for word in words:
        if word not in stopwords:
            if word != "。" and word != ",":
                finalwords.append(word)
    return finalwords

def countWord(text):
    counts = {}
    for word in text:
        if len(word) == 1 or word == '\n':  # skip single characters and newlines
            continue
        else:
            if word not in counts.keys():
                counts[word] = 1
            else:
                counts[word] += 1
    return counts
def drawBar(countdict, RANGE, heng):
    # countdict: the word-frequency dictionary
    # RANGE: how many of the top entries to plot
    # heng=0 draws vertical bars; heng=1 draws horizontal bars, which are easier
    # to read with left-to-right labels along the axis
    by_value = sorted(countdict.items(), key=lambda item: item[1], reverse=True)
    print(by_value[:20])
    x = []
    y = []
    plt.figure(figsize=(9, 6))
    for d in by_value:
        x.append(d[0])
        y.append(d[1])
    if heng == 0:
        plt.bar(x[0:RANGE], y[0:RANGE])
        plt.show()
        return
    elif heng == 1:
        plt.barh(x[0:RANGE], y[0:RANGE])
        plt.show()
        return
    else:
        return "heng must be 0 or 1!"
# Tokenize the full text, drop stop words, and keep words of three or more characters
stopwords = {}.fromkeys([line.rstrip() for line in open(r'E:\停用词.txt', encoding='utf-8')])
words = [word for word in jieba.lcut(text) if word not in stopwords and len(word) >= 3]
# Load the shape mask for the cloud (Pillow + numpy, avoiding an extra imageio dependency)
mask = np.array(Image.open(r'E:\背景图.jpeg'))
# Build the word cloud from the tokenized text
wc = WordCloud(
    background_color='white',              # white background
    max_font_size=500,                     # largest allowed font size
    max_words=300,                         # show at most 300 words
    width=300,                             # canvas width
    height=300,                            # canvas height
    mask=mask,                             # shape mask constraining where words are drawn
    font_path='C:/Windows/Fonts/msyh.ttc'  # a font that can render Chinese
)
# Render the word cloud
wordcloud_img = wc.generate(' '.join(words))
# Display it as an image
plt.imshow(wordcloud_img, interpolation="bilinear")
# Hide the axes
plt.axis("off")
# Show the figure
plt.show()
# Save the word cloud to disk
wordcloud_img.to_file(r'E:\词云图.jpg')
# Main entry point
if __name__ == "__main__":
    with open('多年年报.txt', 'r', encoding='utf-8') as f:
        text = f.read()                 # read the raw text
    cutText = cutWord(text)             # tokenize with jieba
    countdict = countWord(cutText)      # build the word-frequency dictionary
    drawBar(countdict, 10, 0)           # vertical bar chart of the 10 most frequent words
    drawBar(countdict, 20, 1)           # horizontal bar chart of the 20 most frequent words
The bar charts and word cloud are shown below:
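As a complement to raw frequency counts, the jieba.analyse module imported above offers TF-IDF keyword extraction, which down-weights words that are common in any document. A minimal sketch reusing the text variable loaded earlier (extract_tags, topK, and withWeight are documented jieba parameters):

# Top 20 TF-IDF keywords with their weights
keywords = jieba.analyse.extract_tags(text, topK=20, withWeight=True)
for word, weight in keywords:
    print("%s\t%.4f" % (word, weight))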
With the steps above you can not only automate the collection of annual reports but also analyze the data inside them, giving your investment decisions solid support. Python scraping lets you navigate the ocean of information with ease!