异常检测作业
题目1:检测异常服务器
代码:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
def get_means_and_variance(X, Variance):
    """Estimate the Gaussian parameters of the samples in ``X``.

    Args:
        X: Sample matrix; statistics are taken along axis 0 (one row per
            sample).
        Variance: Despite the name, a truthy value selects the full
            covariance matrix; a falsy value selects the per-feature
            variances.

    Returns:
        A ``(means, sigma2)`` tuple. ``means`` is the column-wise mean.
        ``sigma2`` is either the covariance matrix (normalised by the number
        of samples, not by ``m - 1``) or the vector of per-feature variances.
    """
    mu = np.mean(X, axis=0)
    if not Variance:
        # Independent-feature model: one variance per column.
        return mu, np.var(X, axis=0)
    # Full model: maximum-likelihood covariance of the centred samples.
    centered = X - mu
    return mu, centered.T @ centered / len(X)
def gaussian(X, means, sigma2):
    """Evaluate the multivariate Gaussian density at every row of ``X``.

    Args:
        X: Array of shape (m, n); each row is one sample.
        means: Length-n mean vector, subtracted from every row.
        sigma2: Either a 1-D vector of per-feature variances (treated as a
            diagonal covariance matrix) or a full (n, n) covariance matrix.

    Returns:
        Array of shape (m, 1) holding the density of each sample.
    """
    if np.ndim(sigma2) == 1:
        # Per-feature variances: promote to a diagonal covariance matrix so
        # a single code path handles both parameterisations.
        sigma2 = np.diag(sigma2)
    centered = X - means
    n = centered.shape[1]
    norm_const = np.power(2 * np.pi, -n / 2) * (np.linalg.det(sigma2) ** (-0.5))
    # Row-wise quadratic form x^T Sigma^-1 x.  Computing
    # diag(X @ Sigma^-1 @ X.T) would materialise an (m, m) matrix just to
    # keep its diagonal; the element-wise product needs only O(m * n).
    quad_form = np.sum((centered @ np.linalg.inv(sigma2)) * centered, axis=1)
    return (norm_const * np.exp(-0.5 * quad_form)).reshape(-1, 1)
def plot_gaussian(X, means, sigma2):
    """Plot the samples and overlay contour lines of the fitted density.

    The density is evaluated with ``gaussian`` on a fixed grid covering
    [0, 30) in steps of 0.5 along both axes. Nothing is returned and the
    figure is not shown; the caller decides when to call ``plt.show()``.

    Args:
        X: Sample matrix; columns 0 and 1 are used as the x and y
            coordinates of the plotted points.
        means: Mean vector forwarded to ``gaussian``.
        sigma2: Variance vector or covariance matrix forwarded to
            ``gaussian``.
    """
    axis_ticks = np.arange(0, 30, 0.5)
    grid_x, grid_y = np.meshgrid(axis_ticks, axis_ticks)
    # Flatten the grid into an (N, 2) list of points, evaluate the density,
    # then fold the result back into the grid's shape for contouring.
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    density = gaussian(grid_points, means, sigma2).reshape(grid_x.shape)
    plt.plot(X[:, 0], X[:, 1], 'bx')
    # Contour levels 1e-20, 1e-17, ..., 1e-2.
    levels = [10 ** exponent for exponent in range(-20, 0, 3)]
    plt.contour(grid_x, grid_y, density, levels)
def select_epsilon(y_val, p):
    """Choose the density threshold that maximises F1 on a validation set.

    A sample is flagged as an anomaly when its density is strictly below the
    threshold. 1000 evenly spaced candidate thresholds between the smallest
    and largest value of ``p`` are tried, and the first one reaching the
    highest F1 score wins.

    Args:
        y_val: Ground-truth labels (1 = anomaly, 0 = normal), any shape with
            one entry per sample.
        p: Densities of the same samples, any shape with one entry per
            sample.

    Returns:
        ``(best_epsilon, best_f1)`` as plain floats; ``(0.0, 0.0)`` when no
        candidate achieves a positive F1.

    Raises:
        ValueError: If ``y_val`` and ``p`` do not hold the same number of
            entries.
    """
    # Flatten both inputs so that a (m,) label vector and a (m, 1) density
    # column are compared element-wise instead of broadcasting to (m, m).
    labels = np.asarray(y_val).ravel()
    probs = np.asarray(p).ravel()
    if labels.shape != probs.shape:
        raise ValueError(
            f"y_val and p must have the same number of entries, "
            f"got {labels.size} and {probs.size}"
        )

    is_anomaly = labels == 1
    is_normal = labels == 0
    best_epsilon = 0.0
    best_f1 = 0.0
    for eps in np.linspace(probs.min(), probs.max(), 1000):
        flagged = probs < eps
        tp = np.sum(is_anomaly & flagged)   # anomalies correctly flagged
        fp = np.sum(is_normal & flagged)    # normal samples wrongly flagged
        fn = np.sum(is_anomaly & ~flagged)  # anomalies that were missed
        # Guard every ratio against an empty denominator.
        precision = tp / (tp + fp) if (tp + fp) else 0
        recall = tp / (tp + fn) if (tp + fn) else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
        if f1 > best_f1:
            best_f1 = float(f1)
            best_epsilon = float(eps)
    return best_epsilon, best_f1
# Exercise 1: load the training samples and the labelled validation set.
mat = sio.loadmat('./data/ex8data1.mat')
print(mat.keys())
X = mat['X']
X_val, y_val = mat['Xval'], mat['yval']
print(X.shape, X_val.shape, y_val.shape)

# Scatter plot of the raw training data.
plt.plot(X[:, 0], X[:, 1], 'bx')
plt.show()

# Fit the model with per-feature variances (Variance=False) and draw its
# density contours over the data.
means, sigma2 = get_means_and_variance(X, Variance=False)
plot_gaussian(X, means, sigma2)
plt.show()

# Tune the anomaly threshold on the validation set.
pval = gaussian(X_val, means, sigma2)
bestEpsilon, bestF1 = select_epsilon(y_val, pval)
print(bestEpsilon, bestF1)

# Flag training samples whose density falls below the threshold.  A boolean
# mask keeps the result two-dimensional even when nothing is flagged, so the
# column indexing below cannot fail on an empty selection.
p = gaussian(X, means, sigma2)
anoms = X[np.ravel(p) < bestEpsilon]
plot_gaussian(X, means, sigma2)
plt.scatter(anoms[:, 0], anoms[:, 1], c='r', marker='o')
plt.show()
输出:
dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])
(307, 2) (307, 2) (307, 1)
[8.99985263e-05] 0.8750000000000001
原始数据散点图
数据的高斯分布等高线图
将异常值标记出来
题目2:高维数据的异常检测
代码:
# Exercise 2: the same pipeline on the higher-dimensional data set.
mat2 = sio.loadmat('./data/ex8data2.mat')
print(mat2.keys())
X2 = mat2['X']
X2_val, y2_val = mat2['Xval'], mat2['yval']
print(X2.shape, X2_val.shape, y2_val.shape)

# Fit with per-feature variances, then tune the threshold on validation data.
means_2, sigma2_2 = get_means_and_variance(X2, Variance=False)
pval_2 = gaussian(X2_val, means_2, sigma2_2)
bestEpsilon_2, bestF1_2 = select_epsilon(y2_val, pval_2)

# Select the training samples whose density is below the threshold with a
# boolean mask instead of a Python-level index loop.
p2 = gaussian(X2, means_2, sigma2_2)
anoms2 = X2[np.ravel(p2) < bestEpsilon_2]
print(len(anoms2))  # 122 when the full covariance matrix is used instead
输出:
dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])
(1000, 11) (100, 11) (100, 1)
117
小结:在本算法中,使用协方差矩阵还是各维度的方差,对精确率和召回率的影响都不大;但在本数据集上,前者检测到的异常点(122 个)多于后者(117 个)。我认为这是因为协方差矩阵除了各维度自身的偏差之外,还考虑了不同特征维度之间的相关性。
作业批改参考:https://www.bilibili.com/video/BV124411A75S?spm_id_from=333.788.videopod.episodes&vd_source=867b8ecbd62561f6cb9b4a83a368f691&p=13