层次聚类(Hierarchical Cluster)—无监督学习方法、非概率模型、判别模型、线性模型、非参数化模型、批量学习
定义
聚合聚类算法
输入:n 个样本组成的样本集合及样本之间的距离;
输出:对样本集合的一个层次化聚类。
(1)计算 n 个样本两两之间的欧式距离 $\{ d_{ij} \}$,记作矩阵 $D = \big[ d_{ij} \big]_{n \times n}$;
(2)构造 n 个类,每个类只包含一个样本;
(3)合并类间距离最小的两个类,其中最短距离为类间距离,构建一个新类;
(4)计算新类与当前各类的距离。若类的个数为1,终止计算,否则回到步骤(3)。
输入空间
$T=\left\{x_1,x_2,\dots,x_N\right\}$
import numpy as np
import math
import time
from scipy.special import comb
def load_data(file):
    '''
    Load the iris dataset from a comma-separated text file.
    Download: https://download.csdn.net/download/nanxiaotao/89743712

    :param file: path of the file to load
    :return: (Xarray, Ylist) where Xarray is an (n, d) float array of
             features and Ylist is the list of class-label strings
    '''
    Xlist = []  # one feature row per sample
    Ylist = []  # one class label per sample
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(file) as fr:
        for line in fr:
            # strip() removes the trailing newline that the original left
            # glued onto the label, which broke label grouping downstream.
            cur = line.strip().split(',')
            if len(cur) < 2:
                continue  # skip blank/malformed lines (iris.data ends with one)
            Xlist.append([float(x) for x in cur[:-1]])
            Ylist.append(cur[-1])
    Xarray = np.array(Xlist)  # array form for vectorized operations later
    return Xarray, Ylist
# Load the iris dataset from disk and inspect the feature-array shape.
Xarray, Ylist = load_data('iris.data')
np.shape(Xarray)
def Normalize(Xarray):
    '''
    Min-max scale each feature column to [0, 1], in place.

    INPUT:
    Xarray - (array) feature data array, float dtype, shape (n, d)

    OUTPUT:
    Xarray - (array) the same array, scaled column-wise
    '''
    mins = Xarray.min(axis=0)
    spans = Xarray.max(axis=0) - mins
    # A constant column would divide by zero (the original emitted
    # NaN/inf with a runtime warning); map such columns to 0 instead.
    spans[spans == 0] = 1.0
    # Vectorized in-place update replaces the original per-element loop.
    Xarray -= mins
    Xarray /= spans
    return Xarray
Xarray = Normalize(Xarray) # min-max scale the feature data to [0, 1]
np.shape(Xarray)  # shape is unchanged by scaling
特征空间(Feature Space)
Xarray[0][0:4]  # peek at the four scaled features of the first sample
统计学习方法
模型
$y = f(x_1,x_2,x_3,x_4),\quad x_1,x_2,x_3,x_4 \in \chi \subseteq R^n,\quad y \in \{ \text{Iris-setosa}, \text{Iris-versicolor}, \text{Iris-virginica} \}$
策略
$D_{pq} = \min \{ d_{ij} \mid x_i \in G_p, x_j \in G_q \},\quad d_{ij} = \bigg( \sum_{k=1}^m \big| x_{ki} - x_{kj} \big|^2 \bigg)^{\frac{1}{2}}$
算法
k = 3 # number of clusters to produce (iris has three species)
$d_{ij} = \bigg( \sum_{k=1}^m \big| x_{ki} - x_{kj} \big|^2 \bigg)^{\frac{1}{2}}$
def cal_distance(xi, xj):
    '''
    Euclidean distance between two feature vectors.

    INPUT:
    xi - (array) the i-th feature row
    xj - (array) the j-th feature row

    OUTPUT:
    dist - (float) Euclidean distance between xi and xj
    '''
    # math.dist (Python 3.8+) replaces the hand-rolled
    # square/accumulate/sqrt loop with one C-speed call.
    return math.dist(xi, xj)
def Distances(Xarray):
    '''
    Pairwise Euclidean distance matrix over all samples.

    INPUT:
    Xarray - (array) feature data array, shape (n, d)

    OUTPUT:
    dists - (array) symmetric (n, n) array of Euclidean distances,
            zeros on the diagonal
    '''
    # Broadcasting computes every pairwise difference in one numpy
    # expression, replacing the original O(n^2) Python double loop.
    diffs = Xarray[:, None, :] - Xarray[None, :, :]
    dists = np.sqrt((diffs ** 2).sum(axis=2))
    return dists
dists = Distances(Xarray) # precompute the pairwise distance matrix once
$D_{pq} = \min \{ d_{ij} \mid x_i \in G_p, x_j \in G_q \}$
def cal_groupdist(g1, g2, group_dict, dists):
    '''
    Single-linkage distance between two clusters: the smallest pairwise
    sample distance across the two clusters.

    INPUT:
    g1 - (int) label of the first cluster
    g2 - (int) label of the second cluster
    group_dict - (dict) cluster label -> list of member sample indices
    dists - (array) pairwise Euclidean distance array

    OUTPUT:
    (float) shortest distance between the two clusters
    '''
    # One pass over the cross product of members; min() consumes the
    # generator directly instead of collecting distances into a list.
    return min(dists[a][b]
               for a in group_dict[g1]
               for b in group_dict[g2]
               if a != b)
def Clustering(Xarray, k, dists):
    '''
    Agglomerative (single-linkage) hierarchical clustering.

    Starts with one cluster per sample and repeatedly merges the two
    closest clusters until only k clusters remain.

    INPUT:
    Xarray - (array) feature data array, shape (n, d)
    k - (int) desired number of clusters
    dists - (array) pairwise Euclidean distance array, shape (n, n)

    OUTPUT:
    group_dict - (dict) cluster label -> list of member sample indices
    '''
    # Each sample starts in its own cluster, labeled by its index.
    group_dict = {n: [n] for n in range(Xarray.shape[0])}
    newgroup = Xarray.shape[0]  # fresh label for every merged cluster
    while len(group_dict) > k:
        # Scan each unordered cluster pair exactly once and track the
        # minimum directly. The original keyed a dict by distance value
        # (equal distances silently collided) and evaluated every pair
        # in both orders.
        labels = list(group_dict)
        best_d = None
        best_pair = None
        for i, g1 in enumerate(labels):
            for g2 in labels[i + 1:]:
                # Single-linkage: shortest cross-cluster sample distance.
                d = min(dists[a][b]
                        for a in group_dict[g1]
                        for b in group_dict[g2])
                if best_d is None or d < best_d:
                    best_d, best_pair = d, (g1, g2)
        # Merge the two closest clusters under a brand-new label.
        merged = group_dict.pop(best_pair[0]) + group_dict.pop(best_pair[1])
        group_dict[newgroup] = merged
        newgroup += 1
    return group_dict
group_dict = Clustering(Xarray, k, dists) # run hierarchical clustering down to k clusters
模型评估
训练误差
$ARI = \dfrac{RI - E[RI]}{\max(RI) - E[RI]},\quad ARI \in [-1,1]$,其中 $RI=\dfrac{a+b}{\binom{n}{2}},\ RI \in [0,1]$,$n$ 为实例总数,$\binom{n}{2}=C_n^2=\dfrac{n(n-1)}{2}$
def Adjusted_Rand_Index(group_dict, Ylist, k):
    '''
    Adjusted Rand Index (ARI) between a clustering and the true labels.

    ARI = (RI - E[RI]) / (max(RI) - E[RI]) in [-1, 1]; 1 means the
    clustering matches the labels exactly, 0 is chance level.

    INPUT:
    group_dict - (dict) cluster label -> list of member sample indices
    Ylist - (list) true class label for each sample
    k - (int) number of clusters (assumed equal to the number of classes)

    OUTPUT:
    (float) adjusted Rand index
    '''
    # Index sets of the true classes, keyed by label.
    y_dict = {}
    for i, label in enumerate(Ylist):
        y_dict.setdefault(label, []).append(i)
    # Contingency table: group_array[i][j] = |cluster_i ∩ class_j|,
    # built via set intersections instead of the original
    # O(k^2 * n * |cluster|) per-sample membership scans.
    groups = [set(members) for members in group_dict.values()]
    classes = [set(members) for members in y_dict.values()]
    group_array = np.zeros((k, k))
    for i in range(k):
        for j in range(k):
            group_array[i][j] = len(groups[i] & classes[j])
    # Rand index numerator: sum of C(n_ij, 2) over all cells.
    # scipy's comb(x, 2) is already 0 for x < 2, so no guard is needed.
    RI = sum(comb(group_array[i][j], 2)
             for i in range(k) for j in range(k))
    sum_i = group_array.sum(axis=1)  # cluster sizes
    sum_j = group_array.sum(axis=0)  # class sizes
    ci = sum(comb(s, 2) for s in sum_i)
    cj = sum(comb(s, 2) for s in sum_j)
    E_RI = ci * cj / comb(len(Ylist), 2)  # expected index under chance
    max_RI = (ci + cj) / 2                # maximum attainable index
    return (RI - E_RI) / (max_RI - E_RI)
ARI = Adjusted_Rand_Index(group_dict, Ylist, k) # evaluate the clustering against the true labels
print('Adjusted Rand Index:', ARI)