爬虫学习案例8
爬取京东评论信息
采用DrissionPage自动化工具采集,感觉比Selenium工具好,真香。
安装第三方库
pip install DrissionPage
pip install pandas
pip install pyecharts
pip install jieba
pip install wordcloud
1.安装DrissionPage库
DrissionPage安装博客
2.爬取评论信息到csv文件
from DrissionPage import ChromiumPage
import csv
# Open the browser.
dp = ChromiumPage()
# Listen for network packets matching the JD comment API so that each page of
# comments can be read from the captured response body.
dp.listen.start('https://api.m.jd.com/?appid=item-v3&functionId=pc_club_productPageComments&client=pc')
dp.get("https://item.jd.com/100058720776.html#comment")

last_page = 20
fieldnames = ['昵称', '地区', '产品', '评论时间', '评论内容']

# Use a context manager so the CSV is flushed and closed even when the scrape
# fails part-way; a bare open() here would leave the handle open for the
# whole run and risk losing buffered rows.
with open('jd_comments.csv', 'w', encoding='utf-8-sig', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames)
    csv_writer.writeheader()
    # Collect the first 20 pages of comments.
    for page in range(1, last_page + 1):
        print(f"正在采集第{page}页评论数据")
        dp.scroll.to_bottom()
        # Block until the next matching packet has been captured.
        resp = dp.listen.wait()
        json_data = resp.response.body
        print(json_data)
        print("-------------------------")
        comments = json_data['comments']
        for comment_obj in comments:
            dit = {
                '昵称': comment_obj['nickname'],
                '地区': comment_obj['location'],
                '产品': comment_obj['productColor'],
                '评论时间': comment_obj['creationTime'],
                '评论内容': comment_obj['content'],
            }
            print(dit)
            csv_writer.writerow(dit)
        # Click the "next page" button. Skipped after the final page, where
        # the extra click would only trigger a request nobody waits for.
        if page < last_page:
            dp.ele('css:.ui-pager-next').click()
如需采集其他商品的评论,需要修改两处:dp.listen.start 的监听地址和 dp.get 的页面地址。
dp.listen.start:在浏览器中抓包,搜索对应的评论请求,把它的地址填进去。
dp.get:在商品页点击评论的下一页,然后复制浏览器地址栏的 url 即可。
3.制作词云图
效果png图片:
新建一个py文件
# 词云图
import jieba
import wordcloud
import pandas as pd
df = pd.read_csv('jd_comments.csv')
# Drop empty comments (pandas reads blank cells back as NaN, which str.join
# rejects) and separate comments with a newline so the last word of one
# comment cannot fuse with the first word of the next.
content = '\n'.join(df['评论内容'].dropna().astype(str))

# Segment with jieba and join the tokens with spaces. Joining with '' would
# simply rebuild the unsegmented text, leaving WordCloud with whole sentences
# instead of individual words.
segmented = ' '.join(jieba.lcut(content))

# Word-cloud configuration.
wc = wordcloud.WordCloud(
    background_color='white',
    width=1000,
    height=700,
    # NOTE(review): the font file must be resolvable on this machine and
    # contain Chinese glyphs -- confirm the path on non-Windows systems.
    font_path='msyhbd.ttc',
    stopwords={'了', '啊', '的', '都'},
)
# Feed the segmented text to the word cloud.
wc.generate(segmented)
# Write the image to disk.
wc.to_file('jd_wordcloud.png')
4.制作饼状图可视化
饼状图官网例子
新建一个py文件
# 饼状图可视化
from pyecharts import options as opts
from pyecharts.charts import Pie
import pandas as pd
df = pd.read_csv('jd_comments.csv')

# --- Pie chart 1: number of comments per region ---------------------------
region_counts = df['地区'].value_counts()
x = region_counts.index.to_list()
y = region_counts.to_list()
print(x)
print(y)

region_pie = Pie()
# Each data item is a [label, count] pair.
region_pie.add(
    "",
    [[label, count] for label, count in zip(x, y)],
    center=["40%", "50%"],
)
region_pie.set_global_opts(
    title_opts=opts.TitleOpts(title="京东-黑丝区域购买饼状图"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
region_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
c = region_pie.render("pie_scroll_legend.html")

# --- Pie chart 2: number of comments per product variant ------------------
product_counts = df['产品'].value_counts()
x = product_counts.index.to_list()
y = product_counts.to_list()
print(x)
print(y)

product_pie = Pie()
product_pie.add(
    "",
    [[label, count] for label, count in zip(x, y)],
    center=["40%", "50%"],
)
product_pie.set_global_opts(
    title_opts=opts.TitleOpts(title="京东-黑丝产品受欢迎饼状图"),
    legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", orient="vertical"),
)
product_pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
c = product_pie.render("京东-黑丝产品受欢迎饼状图.html")
分析后饼状图效果:性感黑丝最受欢迎,嘿嘿。
参考资料:
bilibili