Python web-scraping exercises
- 1. Scraping Beijing historical weather data from tianqi.com
- 1.1 Version 1: a spider written in object-oriented (OOP) style
- 1.2 Version 2: a spider written in procedural style

1. Scraping Beijing historical weather data from tianqi.com

1.1 Version 1: a spider written in object-oriented (OOP) style
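This version wraps the whole workflow in a `Spider` class: it downloads each monthly history page (the pages live at URLs like http://lishi.tianqi.com/beijing/202301.html), pulls the date and weather fields out with BeautifulSoup plus two regular expressions, and writes the rows to an Excel file with xlwt.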
```python
import re
import requests
from bs4 import BeautifulSoup
import xlwt


class Spider(object):
    """
    Spider for historical weather data.
    """

    # Each day's date sits in a div.th200; the four div.th140 cells that
    # follow hold max temperature, min temperature, weather, and wind.
    datatime_pattern = re.compile(r'<div class="th200">(.*?)</div>')
    wendu_pattern = re.compile(r'<div class="th140">(.*?)</div>')
    def __init__(self, url, headers, filepath):
        """
        Initialize the spider.
        :param url: base URL of the city's monthly history pages
        :param headers: HTTP request headers
        :param filepath: output file path
        """
        self.url = url
        self.headers = headers
        self.datalist = []  # dates
        self.mwen = []      # max temperatures
        self.iwen = []      # min temperatures
        self.tq = []        # weather descriptions
        self.fx = []        # wind directions
        self.filepath = filepath
    def download_page(self, url):
        """
        Download a page and return its content.
        :param url: URL of the page to download
        :return: page content, or None if the download failed
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Error downloading page: {e}")
            return None
    def parse_page(self, html):
        """
        Parse a page and collect the date and weather fields.
        :param html: page content
        """
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('ul', class_='thrui'):
            item_str = str(item)
            dates = re.findall(self.datatime_pattern, item_str)
            self.datalist.extend(dates)
            temperatures = re.findall(self.wendu_pattern, item_str)
            # The th140 cells repeat in groups of four, one group per day.
            for i in range(0, len(temperatures), 4):
                self.mwen.append(temperatures[i])
                self.iwen.append(temperatures[i + 1])
                self.tq.append(temperatures[i + 2])
                self.fx.append(temperatures[i + 3])
    def download_and_parse_all_pages(self):
        """
        Download and parse every monthly page.
        The ranges below cover only January 2023; widen them to fetch more.
        """
        for year in range(23, 24):
            for month in range(1, 2):
                page_url = f"{self.url}20{year:02d}{month:02d}.html"
                print(page_url)
                html = self.download_page(page_url)
                if html:
                    self.parse_page(html)
    def save_to_excel(self):
        """
        Save the scraped data to an Excel file.
        """
        workbook = xlwt.Workbook(encoding='utf-8', style_compression=0)
        worksheet = workbook.add_sheet('北京历史天气数据', cell_overwrite_ok=True)
        # Column headers: date, max temp, min temp, weather, wind direction.
        columns = ("日期", "最高温度", "最低温度", "天气", "风向")
        for i, col in enumerate(columns):
            worksheet.write(0, i, col)
        for i in range(len(self.datalist)):
            worksheet.write(i + 1, 0, self.datalist[i])
            worksheet.write(i + 1, 1, self.mwen[i])
            worksheet.write(i + 1, 2, self.iwen[i])
            worksheet.write(i + 1, 3, self.tq[i])
            worksheet.write(i + 1, 4, self.fx[i])
        workbook.save(self.filepath)
        print(f"Data saved to {self.filepath}")
    def run(self):
        self.download_and_parse_all_pages()
        self.save_to_excel()


if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    url_template = "http://lishi.tianqi.com/beijing/"
    filepath = "beijing_weather_data.xls"
    spider = Spider(url_template, headers, filepath)
    spider.run()
```
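Both versions lean on the same two regular expressions, so it is worth seeing what they actually capture: on a history page each day is one `li` inside `ul class="thrui"`, with the date in a `div class="th200"` followed by four `div class="th140"` cells. A minimal sketch against a hand-written snippet (the markup below is illustrative, reconstructed from the selectors the scrapers target, not copied from the live site):

```python
import re

# Illustrative markup mimicking one day's row on a history page.
sample = (
    '<ul class="thrui">'
    '<li><div class="th200">2023-01-01 周日</div>'
    '<div class="th140">2℃</div><div class="th140">-7℃</div>'
    '<div class="th140">晴</div><div class="th140">西北风 2级</div></li>'
    '</ul>'
)

datatime_pattern = re.compile(r'<div class="th200">(.*?)</div>')
wendu_pattern = re.compile(r'<div class="th140">(.*?)</div>')

print(datatime_pattern.findall(sample))  # ['2023-01-01 周日']
print(wendu_pattern.findall(sample))     # ['2℃', '-7℃', '晴', '西北风 2级']
```

The four th140 matches per day are what both scrapers slice into the max-temperature, min-temperature, weather, and wind lists.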
1.2 Version 2: a spider written in procedural style
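This version does the same job with plain functions: `down_page` fetches one page, `down_allpage` drives the month loop and extracts the fields, and `save_xls` writes the five columns to an Excel file.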
```python
import requests
from bs4 import BeautifulSoup
import re
import xlwt

# Same patterns as the OOP version: date in div.th200, the four weather
# fields in the div.th140 cells that follow it.
datatime = re.compile(r'<div class="th200">(.*?)</div>')
wendu = re.compile(r'<div class="th140">(.*?)</div>')
def down_allpage(url):
    datalist = []
    mwen = []
    iwen = []
    tq = []
    fx = []
    for year in range(23, 24):
        for month in range(1, 2):
            baseurl = url + '20{}{:0>2d}.html'.format(year, month)
            html = down_page(baseurl)
            soup = BeautifulSoup(html, 'html.parser')
            for item in soup.find_all('ul', class_='thrui'):
                item = str(item)
                riqi = re.findall(datatime, item)
                datalist.extend(riqi)
                zb_all = re.findall(wendu, item)
                # Four cells per day; iterating len(zb_all) // 4 groups avoids
                # an IndexError in months shorter than 31 days.
                for day in range(len(zb_all) // 4):
                    mwen.append(zb_all[day * 4 + 0])
                    iwen.append(zb_all[day * 4 + 1])
                    tq.append(zb_all[day * 4 + 2])
                    fx.append(zb_all[day * 4 + 3])
    return datalist, mwen, iwen, tq, fx
def save_xls(datalist, mwen, iwen, tq, fx):
    wb = xlwt.Workbook(encoding='utf-8', style_compression=0)
    ws = wb.add_sheet('天气数据', cell_overwrite_ok=True)
    col = ("日期", "最高温度", "最低温度", "天气", "风向")
    for i in range(len(col)):
        ws.write(0, i, col[i])
    for i in range(len(datalist)):
        ws.write(i + 1, 0, datalist[i])
    for i in range(len(mwen)):
        ws.write(i + 1, 1, mwen[i])
    for i in range(len(iwen)):
        ws.write(i + 1, 2, iwen[i])
    for i in range(len(tq)):
        ws.write(i + 1, 3, tq[i])
    for i in range(len(fx)):
        ws.write(i + 1, 4, fx[i])
    wb.save(r'D:\天气数据.xls')
def down_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)'
    }
    r = requests.get(url, headers=headers, timeout=10)
    return r.text
if __name__ == '__main__':
    url = 'http://lishi.tianqi.com/beijing/'
    datalist, mwen, iwen, tq, fx = down_allpage(url)
    print(datalist)
    save_xls(datalist, mwen, iwen, tq, fx)
```
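Either script's output can be sanity-checked by reading the workbook back. A minimal sketch with xlrd (an assumption that xlrd is installed; its 2.x releases read only legacy .xls files, which is exactly the format xlwt produces):

```python
import xlrd

# Path matches the OOP version's output file; for the procedural version
# use r'D:\天气数据.xls' instead.
book = xlrd.open_workbook('beijing_weather_data.xls')
sheet = book.sheet_by_index(0)
print(f'{sheet.nrows - 1} data rows')
for r in range(min(sheet.nrows, 4)):  # header plus the first few rows
    print(sheet.row_values(r))
```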