import csv
import json
import os

import requests
# Ctrip mobile REST endpoint that the comment-list requests below are POSTed to.
postUrl = "https://m.ctrip.com/restapi/soa2/13444/json/getCommentCollapseList"
# Add the POIs to scrape here, one [poiId, name] pair per entry.
# The name is also used as the output CSV file name.
urls = [
["75487323","凤凰雾涧江景民宿"],
["11052220","古童临江客栈"],
["55918524","凤凰等你来三生三世艺术民宿"],
["15911807","凤凰虎耳草屋江景民宿"],
["42687808","凤凰素履莲花 璞树漫居江景度假民宿"],
["66065748","凤凰金水岸 慕名主题文创体验民宿"],
["64225492","凤凰雪晴集 人文半山民宿"],
["7078046","等你来 倾城轻奢民宿"],
["6842040","凤凰沱水人家精品民宿"],
["17541312","凤凰云桥自在江景精品民宿"],
]
# Alternative POI list (attractions in Dalian), currently disabled:
# urls = [
# # ['76865', '星海广场'],
# ['75628', '棒棰岛'],
# ['75633', '大连森林动物园'],
# ['60514877', '三寰牧场'],
# ['75635', '劳动公园'],
# ['23035466', '东港音乐喷泉广场'],
# ['79494', '海之韵广场'],
# ['87618', '金石滩度假区'],
# ['87748', '滨海路'],
# ['87647', '滨海国家地质公园'],
# ['24845945', '莲花山观景台'],
# ['92196', '白玉山景区'],
# ['13301914', '大连天门山国家森林公园'],
# ]
# Number of comments requested per page; also used to derive the page count.
PAGE_SIZE = 10
# Upper bound on pages fetched per POI (at most MAX_PAGES * PAGE_SIZE comments).
MAX_PAGES = 300
# Directory the per-POI CSV files are written to.
OUTPUT_DIR = './dalian/'
# Seconds to wait on a request before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def build_payload(poi_id, page_index, page_size=PAGE_SIZE):
    """Return the request body for one page of comments of a POI.

    Args:
        poi_id: The POI identifier sent as ``poiId``.
        page_index: 1-based page number sent as ``pageIndex``.
        page_size: Number of comments requested per page.

    Returns:
        A dict ready to be serialised with ``json.dumps``.
    """
    return {
        "arg": {
            "channelType": 2,
            "collapseType": 0,
            "commentTagId": 0,
            "pageIndex": page_index,
            "pageSize": page_size,
            "poiId": poi_id,
            "sourceType": 1,
            "sortType": 3,
            "starType": 0,
        },
        "head": {
            "cid": "09031069112760102754",
            "ctok": "",
            "cver": "1.0",
            "lang": "01",
            "sid": "8888",
            "syscode": "09",
            "auth": "",
            "xsid": "",
            "extension": [],
        },
    }


def count_pages(total_count, page_size=PAGE_SIZE, max_pages=MAX_PAGES):
    """Return how many pages to fetch for ``total_count`` comments.

    The count is rounded up so that a final, partially filled page is still
    fetched, and it is capped at ``max_pages``. A missing or non-positive
    ``total_count`` yields 0.
    """
    total = int(total_count or 0)
    if total <= 0:
        return 0
    # Ceiling division without floats: -(-a // b) == ceil(a / b).
    pages = -(-total // page_size)
    return min(pages, max_pages)


def extract_comments(result):
    """Return the comment texts contained in a response ``result`` dict.

    Tolerates a missing or null ``items`` entry (returns an empty list) and
    pages that hold fewer than ``PAGE_SIZE`` comments.
    """
    items = result.get('items') or []
    return [item.get('content', '') for item in items]


def fetch_result(poi_id, page_index):
    """POST one page request and return the ``result`` part of the response.

    Returns an empty dict when the response carries no ``result``.

    Raises:
        requests.RequestException: On network errors, timeouts, or a
            non-success HTTP status.
    """
    response = requests.post(
        postUrl,
        data=json.dumps(build_payload(poi_id, page_index)),
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json().get('result') or {}


def scrape_poi(poi_id, poi_name):
    """Fetch the comments of one POI and write them to ``<OUTPUT_DIR>/<name>.csv``.

    Each row is ``[running index, POI id, POI name, comment text]``; the
    running index starts at 0. Rows are also echoed to stdout.
    """
    print("正在爬取景点:", poi_name)

    # The first page tells us the total comment count, from which the number
    # of pages to fetch is derived.
    first_result = fetch_result(poi_id, 1)
    total_page = count_pages(first_result.get('totalCount'))
    print("总页数:", total_page, "爬取中")

    # Make sure the output directory exists before opening the file.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    path = os.path.join(OUTPUT_DIR, str(poi_name) + '.csv')

    xuhao = 0
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['序号', '景区ID', '景区名称', '评论'])
        for page in range(1, total_page + 1):
            # Page 1 was already downloaded above; do not request it twice.
            result = first_result if page == 1 else fetch_result(poi_id, page)
            comments = extract_comments(result)
            if not comments:
                # An empty page is taken to mean the server has no more
                # comments for this POI, so stop requesting further pages.
                break
            for content in comments:
                row = [xuhao, poi_id, poi_name, content]
                writer.writerow(row)
                print(row)
                xuhao += 1
    print(poi_name, "爬取完成")


def main():
    """Scrape every POI listed in ``urls``, one CSV file per POI."""
    for entry in urls:
        scrape_poi(entry[0], entry[1])


if __name__ == "__main__":
    main()