第R5周:天气预测
- 🍨 本文为🔗365天深度学习训练营 中的学习记录博客
- 🍖 原作者:K同学啊
文章目录
- 一、代码流程
- 1、导入包
- 2、导入数据
- 3、探索式数据分析(EDA)
- 4、数据预处理
- 5、构建数据集
- 6、预测是否会下雨
- 7、结果可视化
电脑环境:
语言环境:Python 3.8.0
一、代码流程
1、导入包
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
2、导入数据
# Read the weather CSV and keep an untouched copy of it in `df`.
data = pd.read_csv('./weatherAUS.csv')
df = data.copy()
data.head()
data.describe()

# Parse the Date strings, then split them into separate calendar columns.
data['Date'] = pd.to_datetime(data['Date'])
for part in ('year', 'month', 'day'):
    data[part] = getattr(data['Date'].dt, part)
data.head()

# The Date column is dropped once its parts have been extracted.
data.drop('Date', axis=1, inplace=True)
data.columns
Index([‘Location’, ‘MinTemp’, ‘MaxTemp’, ‘Rainfall’, ‘Evaporation’, ‘Sunshine’,
‘WindGustDir’, ‘WindGustSpeed’, ‘WindDir9am’, ‘WindDir3pm’,
‘WindSpeed9am’, ‘WindSpeed3pm’, ‘Humidity9am’, ‘Humidity3pm’,
‘Pressure9am’, ‘Pressure3pm’, ‘Cloud9am’, ‘Cloud3pm’, ‘Temp9am’,
‘Temp3pm’, ‘RainToday’, ‘RainTomorrow’, ‘year’, ‘month’, ‘day’],
dtype=‘object’)
3、探索式数据分析(EDA)
数据相关性分析
# Heatmap of the pairwise correlations between the numeric features.
plt.figure(figsize=(15, 13))
# Restrict to numeric columns first: the frame still holds string columns
# (Location, wind directions, RainToday, RainTomorrow), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
numeric_data = data.select_dtypes(include='number')
ax = sns.heatmap(numeric_data.corr(), square=True, annot=True, fmt='.2f')
# Rotate the x tick labels so the long column names stay readable.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()
是否会下雨
# Apply the seaborn style and colour palette.
sns.set(style="whitegrid", palette="Set2")

# One row, two columns: RainTomorrow on the left, RainToday on the right.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Title styling shared by both panels.
title_font = {'fontsize': 14, 'fontweight': 'bold', 'color': 'darkblue'}

# (column, panel title, x-axis label) for each panel, in axes order.
panels = [
    ('RainTomorrow', 'Rain Tomorrow', 'Will it Rain Tomorrow?'),
    ('RainToday', 'Rain Today', 'Did it Rain Today?'),
]
for ax, (column, title, xlabel) in zip(axes, panels):
    # Bars get a black outline so neighbouring categories are easy to tell apart.
    sns.countplot(x=column, data=data, ax=ax, edgecolor='black')
    ax.set_title(title, fontdict=title_font)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    # The keyword must be spelled `labelsize`; tick_params rejects
    # unrecognised keywords, so a misspelling aborts the whole figure.
    ax.tick_params(axis='x', labelsize=11)
    ax.tick_params(axis='y', labelsize=11)

sns.despine()       # remove the top and right spines
plt.tight_layout()  # keep the two panels from overlapping
plt.show()
# Contingency table: rows are RainTomorrow, columns are RainToday.
x = pd.crosstab(data['RainTomorrow'], data['RainToday'])
x
# Express every row as percentages of that row's total.
row_totals = x.sum(axis=1)
y = x.div(row_totals, axis=0) * 100
y
- 在明天会下雨的样本中,今天没有下雨的占 53.22%
- 在明天会下雨的样本中,今天下雨的占 46.78%
# Bar chart of the row-wise percentages, one colour per RainToday value.
y.plot.bar(figsize=(4, 3), color=['#006666', '#d279a6'])
地理位置与下雨的关系
# Cross-tabulate each location against whether it rained that day.
x = pd.crosstab(data['Location'], data['RainToday'])
# Percentage of rainy and non-rainy days for each location.
y = x.div(x.sum(axis=1), axis=0) * 100
# Order the locations by their share of rainy days, smallest first.
y = y.sort_values(by='Yes', ascending=True)
color = ['#cc6699', '#006699', '#006666', '#862d86', '#ff9966']
y['Yes'].plot.barh(figsize=(15, 20), color=color)
位置会影响下雨的频率:对于 Portland 来说,有 36% 的时间在下雨,而对于 Woomera 来说,只有 6% 的时间在下雨。
湿度和压力对下雨的影响
# Display the column labels of the working frame.
data.columns
Index([‘Location’, ‘MinTemp’, ‘MaxTemp’, ‘Rainfall’, ‘Evaporation’, ‘Sunshine’,
‘WindGustDir’, ‘WindGustSpeed’, ‘WindDir9am’, ‘WindDir3pm’,
‘WindSpeed9am’, ‘WindSpeed3pm’, ‘Humidity9am’, ‘Humidity3pm’,
‘Pressure9am’, ‘Pressure3pm’, ‘Cloud9am’, ‘Cloud3pm’, ‘Temp9am’,
‘Temp3pm’, ‘RainToday’, ‘RainTomorrow’, ‘year’, ‘month’, ‘day’],
dtype=‘object’)
# Morning-versus-afternoon scatter plots, coloured by RainTomorrow:
# first air pressure, then humidity, each in its own figure.
for morning_col, afternoon_col in (
    ('Pressure9am', 'Pressure3pm'),
    ('Humidity9am', 'Humidity3pm'),
):
    plt.figure(figsize=(10, 8))
    sns.scatterplot(data=data, x=morning_col, y=afternoon_col,
                    hue='RainTomorrow')
低压与高湿度会增加第二天下雨的概率,尤其是下午 3 点的空气湿度。
气温对下雨的影响
# Daily maximum against minimum temperature, coloured by RainTomorrow.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=data,
    x='MaxTemp',
    y='MinTemp',
    hue='RainTomorrow',
)
结论:当一天的最高气温和最低气温接近时,第二天下雨的概率会增加。
4、数据预处理
处理缺失值
# Percentage of missing entries in each column.
data.isna().sum() / len(data) * 100
Location 0.000000
MinTemp 1.020899
MaxTemp 0.866905
Rainfall 2.241853
Evaporation 43.166506
Sunshine 48.009762
WindGustDir 7.098859
WindGustSpeed 7.055548
WindDir9am 7.263853
WindDir3pm 2.906641
WindSpeed9am 1.214767
WindSpeed3pm 2.105046
Humidity9am 1.824557
Humidity3pm 3.098446
Pressure9am 10.356799
Pressure3pm 10.331363
Cloud9am 38.421559
Cloud3pm 40.807095
Temp9am 1.214767
Temp3pm 2.481094
RainToday 2.241853
RainTomorrow 2.245978
year 0.000000
month 0.000000
day 0.000000
dtype: float64
# These four columns are filled by sampling at random (with replacement)
# from the values actually observed in the same column.
lst = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
for col in lst:
    fill_list = data[col].dropna()
    # fillna aligns the filler on index labels, so the filler is built on
    # data.index explicitly; a bare pd.Series would only line up when the
    # frame happens to have a default 0..n-1 index.
    random_fill = pd.Series(
        np.random.choice(fill_list, size=len(data.index)),
        index=data.index,
    )
    data[col] = data[col].fillna(random_fill)

# Names of the columns whose dtype is `object`.
s = (data.dtypes == 'object')
object_cols = list(s[s].index)
object_cols
[‘Location’,
‘WindGustDir’,
‘WindDir9am’,
‘WindDir3pm’,
‘RainToday’,
‘RainTomorrow’]
# Fill each object-dtype column with its most frequent value;
# data[i].mode()[0] is the first mode of the column.
# The result is assigned back instead of calling fillna(inplace=True) on
# data[i]: that form mutates an intermediate Series and is not guaranteed
# to update `data` under pandas copy-on-write.
for i in object_cols:
    data[i] = data[i].fillna(data[i].mode()[0])

# Names of the columns whose dtype is float64.
t = (data.dtypes == 'float64')
num_cols = list(t[t].index)
num_cols
[‘MinTemp’,
‘MaxTemp’,
‘Rainfall’,
‘Evaporation’,
‘Sunshine’,
‘WindGustSpeed’,
‘WindSpeed9am’,
‘WindSpeed3pm’,
‘Humidity9am’,
‘Humidity3pm’,
‘Pressure9am’,
‘Pressure3pm’,
‘Cloud9am’,
‘Cloud3pm’,
‘Temp9am’,
‘Temp3pm’]
# Fill the remaining gaps in each float column with that column's median.
# The result is assigned back instead of calling fillna(inplace=True) on
# data[i]: that form mutates an intermediate Series and is not guaranteed
# to update `data` under pandas copy-on-write.
for i in num_cols:
    data[i] = data[i].fillna(data[i].median())

# Count the missing entries left in each column.
data.isnull().sum()
Location 0
MinTemp 0
MaxTemp 0
Rainfall 0
Evaporation 0
Sunshine 0
WindGustDir 0
WindGustSpeed 0
WindDir9am 0
WindDir3pm 0
WindSpeed9am 0
WindSpeed3pm 0
Humidity9am 0
Humidity3pm 0
Pressure9am 0
Pressure3pm 0
Cloud9am 0
Cloud3pm 0
Temp9am 0
Temp3pm 0
RainToday 0
RainTomorrow 0
year 0
month 0
day 0
dtype: int64
5、构建数据集
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype column with integer codes; the same encoder
# instance is re-fitted for each column in turn.
label_encode = LabelEncoder()
for col in object_cols:
    data[col] = label_encode.fit_transform(data[col])

# Features are every column except the target and `day`.
feature_frame = data.drop(columns=['RainTomorrow', 'day'])
X = feature_frame.values
y = data['RainTomorrow'].values

# Hold out 25% of the rows, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

# Fit the min-max scaler on the training rows only, then apply it to both sets.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
6、预测是否会下雨
搭建神经网络
from tensorflow.keras.optimizers import Adam

# Fully connected binary classifier: four tanh hidden layers with dropout
# after the third and fourth, ending in a single sigmoid unit.
model = Sequential([
    Dense(units=24, activation='tanh'),
    Dense(units=18, activation='tanh'),
    Dense(units=23, activation='tanh'),
    Dropout(0.5),
    Dense(units=12, activation='tanh'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid'),
])

optimizer = Adam(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Stop once val_loss has not improved by at least 0.001 for 25 epochs,
# and roll back to the best weights seen.
Early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.001,
    patience=25,
    mode='min',
    restore_best_weights=True,
    verbose=1,
)
模型训练
# Train for 10 epochs in batches of 32; the held-out split is passed as
# validation data so val_loss is available to the early-stopping callback.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
    callbacks=[Early_stop],
)
Epoch 1/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 12s 3ms/step - accuracy: 0.7578 - loss: 0.5259 - val_accuracy: 0.8282 - val_loss: 0.3941
Epoch 2/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 8s 2ms/step - accuracy: 0.8289 - loss: 0.4041 - val_accuracy: 0.8365 - val_loss: 0.3801
Epoch 3/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 12s 3ms/step - accuracy: 0.8385 - loss: 0.3843 - val_accuracy: 0.8385 - val_loss: 0.3763
Epoch 4/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 9s 3ms/step - accuracy: 0.8360 - loss: 0.3888 - val_accuracy: 0.8396 - val_loss: 0.3738
Epoch 5/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 7s 2ms/step - accuracy: 0.8387 - loss: 0.3828 - val_accuracy: 0.8399 - val_loss: 0.3725
Epoch 6/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 12s 3ms/step - accuracy: 0.8378 - loss: 0.3817 - val_accuracy: 0.8400 - val_loss: 0.3717
Epoch 7/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 11s 3ms/step - accuracy: 0.8395 - loss: 0.3800 - val_accuracy: 0.8392 - val_loss: 0.3723
Epoch 8/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 8s 2ms/step - accuracy: 0.8398 - loss: 0.3771 - val_accuracy: 0.8397 - val_loss: 0.3702
Epoch 9/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 11s 3ms/step - accuracy: 0.8388 - loss: 0.3786 - val_accuracy: 0.8402 - val_loss: 0.3705
Epoch 10/10
3410/3410 ━━━━━━━━━━━━━━━━━━━━ 10s 3ms/step - accuracy: 0.8394 - loss: 0.3782 - val_accuracy: 0.8403 - val_loss: 0.3694
7、结果可视化
# Per-epoch metrics recorded by the most recent model.fit() call.
acc = model.history.history['accuracy']
val_acc = model.history.history['val_accuracy']
loss = model.history.history['loss']
val_loss = model.history.history['val_loss']

# Take the x-axis length from the recorded history rather than hard-coding
# the epoch count: if `epochs` is changed or early stopping ends training
# sooner, a fixed range(10) no longer matches and plt.plot raises.
epochs_range = range(len(acc))

plt.figure(figsize=(15, 5))

# Left panel: training versus validation accuracy.
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: training versus validation loss.
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()