# 指定聚类中心的聚类算法实现 (clustering with user-specified initial centers)
# 在k-means聚类算法中,聚类中心是根据数据样本分布自动计算得到的;如果想要自己指定聚类中心,就需要使用改进的k-medoids算法。
from numpy import *
import time
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
"""
参考:https://www.cnblogs.com/yifanrensheng/p/12354910.html#_label2_3
"""
# euclDistance computes the Euclidean distance between two vectors.
def euclDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between two 1-D vectors.

    Uses np.sum explicitly rather than the bare name `sum`, which is
    ambiguous here because the file does `from numpy import *`.
    """
    return np.sqrt(np.sum(np.power(vector2 - vector1, 2)))
# initCentroids returns the user-specified initial centers.
# dataSet: data set, k: user-chosen number of clusters
def initCentroids(dataSet, k):
    """Return a (k, dim) array of user-specified initial cluster centers.

    The centers are fixed at the four corners of a 2x2 grid:
    (0.5, 0.5), (0.5, 2.5), (2.5, 0.5), (2.5, 2.5).

    Args:
        dataSet: (numSamples, dim) array; only its column count is used.
        k: number of clusters. The first min(k, 4) rows get the fixed
           centers; any extra rows stay zero (matching the original code).

    Raises:
        ValueError: if the data is not 2-dimensional, since the fixed
            centers are 2-D points.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        raise ValueError("initCentroids requires 2-D data, got dim=%d" % dim)
    fixed = np.array([[0.5, 0.5],
                      [0.5, 2.5],
                      [2.5, 0.5],
                      [2.5, 2.5]])
    centroids = np.zeros((k, dim))  # k rows of dim zeros
    n = min(k, fixed.shape[0])
    centroids[:n, :] = fixed[:n, :]
    return centroids
# Total distance from one point to every point in a set; used in step 4
# when updating each cluster's medoid.
def costsum(vector1, matrix1):
    """Return the sum of Euclidean distances from vector1 to each row of matrix1.

    Vectorized with NumPy instead of a per-row Python loop; returns 0.0 for
    an empty matrix (same as the original loop). Avoids shadowing the
    builtin `sum` that the original local variable did.
    """
    diffs = np.asarray(matrix1) - np.asarray(vector1)
    return float(np.sqrt(np.power(diffs, 2).sum(axis=1)).sum())
# kMediod: main k-medoids clustering routine.
# Input:  dataSet - data set, k - user-chosen number of clusters
# Output: centroids - the k cluster medoids, clusterAssment - per-sample assignments
def kMediod(dataSet, k):
    """Cluster dataSet into k groups using k-medoids with fixed initial centers.

    Args:
        dataSet: (numSamples, dim) array of samples.
        k: number of clusters (initCentroids supplies the starting centers).

    Returns:
        centroids: (k, dim) array; each row is a cluster medoid.
        clusterAssment: (numSamples, 2) matrix; column 0 holds the assigned
            cluster index, column 1 the squared distance to that medoid.
    """
    numSamples = dataSet.shape[0]
    # column 0: index of the cluster the sample currently belongs to
    # column 1: squared distance between the sample and its medoid
    clusterAssment = np.mat(np.zeros((numSamples, 2)))
    clusterChanged = True  # loop flag: keep iterating while assignments move
    ## step 1: initialize the medoids
    centroids = initCentroids(dataSet, k)
    while clusterChanged:
        clusterChanged = False
        ## visit every sample
        for i in range(numSamples):
            # float('inf') instead of an arbitrary large constant, so the
            # first comparison always wins regardless of the data's scale
            minDist = float('inf')
            minIndex = 0  # index of the nearest medoid
            ## step 2: find the nearest medoid
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    # track the running minimum over the k medoids
                    minDist = distance
                    minIndex = j
            ## step 3: update the sample's cluster membership
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2  # record index and squared distance
        ## step 4: move each medoid to the member point that minimizes
        ## the total within-cluster distance
        for j in range(k):
            pointsInCluster = dataSet[np.nonzero(clusterAssment[:, 0] == j)[0]]  # members of cluster j
            mincostsum = costsum(centroids[j, :], pointsInCluster)
            for point in range(pointsInCluster.shape[0]):
                cost = costsum(pointsInCluster[point, :], pointsInCluster)
                if cost < mincostsum:
                    mincostsum = cost
                    centroids[j, :] = pointsInCluster[point, :]
    print('Congratulations, cluster complete!')
    return centroids, clusterAssment
# showCluster draws the clustering result with pyplot (2-D data only).
# Input: dataSet - data set, k - number of clusters,
#        centroids - cluster medoids, clusterAssment - per-sample assignments
def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot each sample colored by cluster, then overlay the medoids."""
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry, the dimension of your data is not 2!")
        return 1
    sample_marks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(sample_marks):
        return 1
    # plot every sample with its cluster's marker style
    for idx in range(numSamples):
        cluster = int(clusterAssment[idx, 0])
        plt.plot(dataSet[idx, 0], dataSet[idx, 1], sample_marks[cluster])
    # mark each cluster's medoid with a larger symbol
    medoid_marks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for idx in range(k):
        plt.plot(centroids[idx, 0], centroids[idx, 1], medoid_marks[idx], markersize=12)
    plt.show()
## step 1: build the data — four 12-point clusters in the unit squares
## anchored at (0,0), (2,0), (0,2) and (2,2)
matrix1 = np.random.random((12, 2))
matrix2 = np.random.random((12, 2))
matrix3 = np.random.random((12, 2))
matrix4 = np.random.random((12, 2))
# vectorized shifts (replaces the original per-row Python loop, same result)
matrix2[:, 0] += 2
matrix3[:, 1] += 2
matrix4 += 2
dataSet = np.vstack((matrix1, matrix2, matrix3, matrix4))
## step 2: run the clustering
k = 4
centroids, clusterAssment = kMediod(dataSet, k)
## step 3: display the clustering result
showCluster(dataSet, k, centroids, clusterAssment)
# compute the silhouette coefficient; labels are column 0 of clusterAssment
# (fixes the misspelled `lable` and the convoluted triple unwrapping)
labels = np.asarray(clusterAssment[:, 0]).ravel().astype(int)
silhouette_avg = silhouette_score(dataSet, labels)
print("轮廓系数:", silhouette_avg)