Data Analysis with Python: Lab 5
I. Objective
Use Python to solve practical web-scraping problems.
II. Requirements
Write and run the code independently, and prepare the lab report according to the template.
III. Procedure
1. Crawl the page of a book on dangdang.com and save it as an HTML file. (All three steps fetch pages with the same urllib pattern; see the sketch after this list.)
2. Crawl the first 50 short reviews of a book on douban.com and compute the average rating (self-study regular expressions).
3. Crawl second-hand housing listings for a residential complex in Changsha (using Mingdu Garden, 名都花园, as the example) from https://cs.lianjia.com/ and save them to an Excel file.
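All three steps fetch pages the same way: build a urllib Request carrying a browser-like User-Agent so the sites do not reject the default Python client, then read the response bytes. As a reference for the scripts below, here is that shared pattern as a small helper (the name fetch is mine, not part of the assignment):

from urllib import request

HEADER = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36',
}

def fetch(url):
    # the custom User-Agent makes the request look like a normal browser
    req = request.Request(url, headers=HEADER)
    return request.urlopen(req).read()  # raw bytes; decode as needed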
IV. Results
T1
"""
爬取并下载当当网某一本书的网页内容,并保存为html格式
"""
import os
from urllib import request
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
url = 'http://product.dangdang.com/24029955.html'
req = request.Request(url, headers=header)
html = str(request.urlopen(req).read)
is_exist = os.path.exists('DangDang.html')
if not is_exist:
with open('DangDang.html', 'w+') as f:
f.write(html)
else:
print('File already exsist')
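Saving the raw bytes above avoids having to know the page's character encoding. For comparison, here is a minimal sketch of the same download with the third-party requests library, which is not in the standard library and infers the encoding from the response headers; the output file name DangDang_requests.html is mine, not part of the assignment:

import requests

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
url = 'http://product.dangdang.com/24029955.html'

resp = requests.get(url, headers=header, timeout=10)
resp.raise_for_status()  # raise instead of silently saving an error page
# resp.text decodes the body with the encoding requests inferred
with open('DangDang_requests.html', 'w', encoding=resp.encoding or 'utf-8') as f:
    f.write(resp.text)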
T2
"""
在豆瓣网上爬取某本书的前50条短评内容并计算评分的平均值(自学正则表达式)
"""
import re
from urllib import request
from bs4 import BeautifulSoup
comments = []
list = []
def get_commment(comment):
count = 0
for i in comment:
count = count + 1
# print(count, i.string) # 也可以使用正则
comments.append(i.string)
def get_score(score):
pattern = re.compile('<span class="user-stars allstar(.*?) rating"')
res = re.findall(pattern, str(score))
for irr in res:
list.append(float(irr))
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
p = 0
for i in range(0, 3):
url = f'https://book.douban.com/subject/26912767/comments/?start={i * 20}&limit={(i + 1) * 20}&status=P&sort=new_score'
req = request.Request(url, headers=header)
html = request.urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')
# get_commment(html.find_all("span", class_="short"))
get_score(soup)
get_commment(soup.find_all("span", class_="short"))
for j in range(0, 50):
print(comments[j])
sum = 0.0
for j in range(0, 50):
sum = sum + float(list[j])
print(sum / 50 * 2 / 10)
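Since step 2 calls for self-study of regular expressions and the script above only uses one for the ratings, here is a sketch that extracts the review text itself with a regex instead of BeautifulSoup. It assumes Douban still serves the reviews as <span class="short">...</span> in UTF-8, which may change if the site updates its markup:

import re
from urllib import request

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
url = ('https://book.douban.com/subject/26912767/comments/'
       '?start=0&limit=20&status=P&sort=new_score')

req = request.Request(url, headers=header)
html = request.urlopen(req).read().decode('utf-8')
# non-greedy match of the text between the opening and closing tags;
# re.S lets '.' match newlines inside a review
for comment in re.findall(r'<span class="short">(.*?)</span>', html, re.S):
    print(comment.strip())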
T3
"""
从https://cs.lianjia.com/上爬取长沙某小区的二手房信息(以名都花园为例),并将其保存到EXCEL文件当中
"""
from urllib import request
import xlwt
from bs4 import BeautifulSoup
def getHouseList(url):
house = []
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
req = request.Request(url, headers = header)
html = request.urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')
housename_divs = soup.find_all('div', class_='title')
for housename_div in housename_divs:
housename_as = housename_div.find_all('a')
for housename_a in housename_as:
housename = []
housename.append(housename_a.get_text())
housename.append(housename_a.get('href'))
house.append(housename)
huseinfo_divs = soup.find_all('div', class_='houseInfo')
for i in range(len(huseinfo_divs)):
info = huseinfo_divs[i].get_text()
infos = info.split('|')
# 小区名称
house[i].append(infos[0])
# 户型
house[i].append(infos[1])
# 平米
house[i].append(infos[2])
# 查询总价
house_prices = soup.find_all('div', class_='totalPrice')
for i in range(len(house_prices)):
# 价格
price = house_prices[i].get_text()
house[i].append(price)
return house
# 爬取房屋详细信息:所在区域、套内面积
def houseinfo(url):
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
req = request.Request(url, headers=header)
html = request.urlopen(req).read()
soup = BeautifulSoup(html, 'html.parser')
msg = []
# 所在区域
areainfos = soup.find_all('span', class_='info')
for areainfo in areainfos:
area = areainfo.find('a')
if (not area):
continue
hrefStr = area['href']
if (hrefStr.startswith('javascript')):
continue
msg.append(area.get_text())
break
infolist = soup.find_all('div', id='infoList')
num = []
for info in infolist:
cols = info.find_all('div', class_='col')
for i in cols:
pingmi = i.get_text()
try:
a = float(pingmi[:-2])
num.append(a)
except ValueError:
continue
msg.append(sum(num))
return msg
def writeExcel(excelPath, houses):
workbook = xlwt.Workbook()
sheet = workbook.add_sheet('git')
row0 = ['标题', '链接地址', '户型', '面积', '朝向', '总价', '所属区域', '套内面积']
for i in range(0, len(row0)):
sheet.write(0, i, row0[i])
for i in range(0, len(houses)):
house = houses[i]
print(house)
for j in range(0, len(house)):
sheet.write(i + 1, j, house[j])
workbook.save(excelPath)
# 主函数
def main():
data = []
for i in range(1, 5):
print('-----分隔符', i, '-------')
if i == 1:
url = 'https://cs.lianjia.com/ershoufang/c3511059937033rs%E5%90%8D%E9%83%BD%E8%8A%B1%E5%9B%AD/'
else:
url = 'https://cs.lianjia.com/ershoufang/pg' + str(
i) + 'c3511059937033rs%E5%90%8D%E9%83%BD%E8%8A%B1%E5%9B%AD/'
houses = getHouseList(url)
for house in houses:
link = house[1]
if (not link or not link.startswith('http')):
continue
mianji = houseinfo(link)
house.extend(mianji)
data.extend(houses)
writeExcel('C:/Users/Lunatic/Desktop/cs.xls', data)
if __name__ == '__main__':
main()
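The xlwt package only writes the legacy .xls format and is no longer maintained. As an alternative, here is a minimal sketch of the same export built on pandas, assuming pandas and an xlsx engine such as openpyxl are installed; the placeholder row and the cs.xlsx file name are mine, not part of the assignment:

import pandas as pd

columns = ['Title', 'Link', 'Community', 'Layout', 'Area',
           'Total price', 'District', 'Inner area']
# placeholder row; in practice pass the `data` list that main() builds,
# noting that DataFrame expects every row to have all eight fields
rows = [['title', 'https://...', 'community', 'layout', 'area',
         'price', 'district', 0.0]]
df = pd.DataFrame(rows, columns=columns)
df.to_excel('cs.xlsx', index=False)  # needs openpyxl for .xlsx output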
V. Reflections
Web scraping is one of Python's important application areas. Using it well requires not only familiarity with the relevant Python libraries, but also careful analysis of the target pages to find the patterns to scrape against, which is what makes the automation possible in the first place.