Kmeans聚类算法的大概流程是:
①从样本中随机找出K个样本作为中心点;
②求所有样本到这K个中心点的距离,把每个样本归入距离最近的中心点所在的聚类;
③求每个聚类中的样本的元素的平均值,作为新的中心点;
④重复②,③,直到所有样本所属的聚类都不再发生变化,就算完成。
# encoding: utf-8
"""K-means clustering helpers plus a 2-D scatter plot of the result.

Created on 2017-12-11
"""
import time

# The wildcard import is kept only so that names this module has always
# re-exported stay available; the code below uses the explicit ``np`` prefix.
from numpy import *  # noqa: F401,F403
import numpy as np
from scipy.cluster.hierarchy import centroid  # noqa: F401

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: only showCluster needs matplotlib.
    plt = None


def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two equally sized vectors."""
    diff = np.asarray(vector2, dtype=float) - np.asarray(vector1, dtype=float)
    return np.sqrt(np.sum(np.power(diff, 2)))


def initCentroids(dataSet, k):
    """Pick k rows of dataSet at random as the initial centroids.

    Args:
        dataSet: 2-D array-like (ndarray, matrix or nested list) with one
            sample per row.
        k: number of centroids to draw.

    Returns:
        A float ndarray of shape (k, dim) holding copies of the chosen rows.

    Raises:
        ValueError: if dataSet is not 2-D, or is empty while k > 0.
    """
    data = np.asarray(dataSet, dtype=float)
    if data.ndim != 2:
        raise ValueError("dataSet must be 2-D (one sample per row)")
    numSamples = data.shape[0]
    if k > 0 and numSamples == 0:
        raise ValueError("cannot pick centroids from an empty data set")
    # Draw distinct rows whenever there are enough samples: two identical
    # starting centroids would leave one cluster empty from the first pass.
    indices = np.random.choice(numSamples, size=k, replace=k > numSamples)
    return data[indices, :]


def kmeans(dataSet, k):
    """Cluster the rows of dataSet into k groups with Lloyd's algorithm.

    Args:
        dataSet: 2-D array-like with one sample per row.
        k: number of clusters, at least 1.

    Returns:
        A tuple (centroids, clusterAssment):
        centroids is a float ndarray of shape (k, dim);
        clusterAssment is a numpy matrix of shape (numSamples, 2) whose first
        column is the cluster index of each sample and whose second column is
        the squared distance from the sample to its centroid.

    Raises:
        ValueError: if k < 1, or dataSet is empty or not 2-D.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    data = np.asarray(dataSet, dtype=float)

    # Step 1: initialise the centroids.
    centroids = initCentroids(data, k)
    numSamples = data.shape[0]

    # -1 means "not assigned yet", so the first pass always counts as a
    # change and the centroids are moved to the cluster means at least once.
    labels = np.full(numSamples, -1, dtype=int)

    while True:
        # Step 2: squared distance from every sample to every centroid,
        # shape (numSamples, k); argmin keeps the first centroid on ties.
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        allSqDists = np.sum(diffs ** 2, axis=2)
        newLabels = np.argmin(allSqDists, axis=1)
        # Refreshed on every pass so the returned distances always refer to
        # the returned centroids.
        sqDists = allSqDists[np.arange(numSamples), newLabels]

        # Converged once no sample switches cluster.
        if np.array_equal(newLabels, labels):
            break
        labels = newLabels

        # Step 3: move each centroid to the mean of its members.
        for j in range(k):
            members = data[labels == j]
            # An empty cluster keeps its previous centroid; averaging zero
            # rows would turn it into NaN.
            if len(members):
                centroids[j, :] = members.mean(axis=0)

    print('Congratulations, cluster complete!')
    # Returned as a matrix, as before, so callers using ``.A`` keep working.
    clusterAssment = np.asmatrix(np.column_stack((labels, sqDists)))
    return centroids, clusterAssment


def showCluster(dataSet, k, centroids, clusterAssment):
    """Scatter-plot 2-D samples by cluster and overlay the centroids.

    Args:
        dataSet: 2-D array-like of shape (numSamples, 2).
        k: number of clusters (at most 10, the number of marker styles).
        centroids: array-like of shape (k, 2).
        clusterAssment: array-like whose first column is each sample's
            cluster index.

    Returns:
        1 if the data is not two-dimensional or k exceeds the number of
        marker styles (a message is printed); None after showing the plot.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    points = np.asarray(dataSet)
    numSamples, dim = points.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    # One marker style per cluster for the samples.
    sampleMarks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(sampleMarks):
        print("Sorry! Your k is too large!please contact Zouxy")
        return 1

    if plt is None:
        raise ImportError("matplotlib is required to draw the clusters")

    # One plot call per cluster instead of one per sample.
    labels = np.asarray(clusterAssment)[:, 0].astype(int)
    for label in np.unique(labels):
        members = points[labels == label]
        plt.plot(members[:, 0], members[:, 1], sampleMarks[label])

    # Centroids get their own marker styles so they stand out.
    centroidMarks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    centers = np.asarray(centroids)
    for i in range(k):
        plt.plot(centers[i, 0], centers[i, 1], centroidMarks[i], markersize=6)
    plt.show()
# encoding: utf-8
"""Load two columns of a CSV file, cluster them with the local k-means
module and plot the result.

Created on 2017-12-11
"""
from numpy import *  # noqa: F401,F403
import numpy as np
import time
import matplotlib.pyplot as plt
import kmeans.Kmeans as kean

# Raw string: in a normal literal "\U" starts a unicode escape, which is a
# SyntaxError on Python 3, and other backslash pairs are fragile.
DATA_PATH = r'D:\Users\zhangjie116\Downloads\Wholesale customers data.csv'

# 0-based indices of the two CSV columns used as the x/y features.
FEATURE_COLUMNS = (4, 5)


def loadColumns(path, columns):
    """Read the given columns of a comma-separated file as rows of floats.

    The first line is treated as a header and skipped; blank lines are
    ignored. The file is closed even if parsing fails.
    """
    rows = []
    with open(path) as fileIn:
        next(fileIn, None)  # skip the header line
        for line in fileIn:
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            rows.append([float(fields[c]) for c in columns])
    return rows


# Step 1: load the data.
print('step 1:load data...')
dataSet = loadColumns(DATA_PATH, FEATURE_COLUMNS)

# Step 2: cluster.
print("step 2:clustering")
# np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
dataSet = np.asmatrix(dataSet)
k = 4
centroids, clusterAssment = kean.kmeans(dataSet, k)

# Step 3: show the result.
print("step 3:show the result...")
kean.showCluster(dataSet, k, centroids, clusterAssment)
# encoding: utf-8
"""Choose k for KMeans by the silhouette coefficient and plot the best result.

Created on 2017-12-13
"""
import numpy as np
from sklearn import cluster
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn import metrics

# Raw string: in a normal literal "\U" starts a unicode escape, which is a
# SyntaxError on Python 3.
DATA_PATH = r'D:\Users\zhangjie116\Downloads\Wholesale customers data.csv'


def loadDataSet(path):
    """Read columns 4 and 5 (0-based) of a CSV file as a float array.

    The first line is treated as a header and skipped; blank lines are
    ignored. The file is closed even if parsing fails.
    """
    rows = []
    with open(path) as fileIn:
        next(fileIn, None)  # skip the header line
        for line in fileIn:
            line = line.strip()
            if not line:
                continue
            fields = line.split(',')
            rows.append([float(fields[4]), float(fields[5])])
    return np.array(rows)


def makeDataSet():
    """Generate three groups of 10 uniformly distributed 2-D points each."""
    cluster1 = np.random.uniform(0.5, 1.5, (2, 10))
    cluster2 = np.random.uniform(3.5, 4.5, (2, 10))
    cluster3 = np.random.uniform(7.5, 8.5, (2, 10))
    return np.hstack((cluster1, cluster2, cluster3)).T


# The script previously read the CSV and then immediately overwrote it with
# generated data, so only the generated data was ever clustered. To cluster
# the CSV instead, use: dataSet = loadDataSet(DATA_PATH)
dataSet = makeDataSet()
numSamples = len(dataSet)

# The silhouette coefficient lies in [-1, 1], so start below that range;
# starting at 0 would leave max_k at 0 whenever every score is <= 0.
max_silhouette_coefficient = -np.inf
max_k = 0
max_centroids = []
max_labels_ = []

for k in range(2, 10):
    # Fit KMeans for this k.
    clf = KMeans(n_clusters=k)
    clf.fit(dataSet)
    centroids = clf.cluster_centers_
    labels_ = clf.labels_

    # Silhouette coefficient over all samples.
    silhouette_coefficient = metrics.silhouette_score(
        dataSet, labels_, metric='euclidean')
    print('k:%d ==== silhouette_coefficient:%f' % (k, silhouette_coefficient))

    # Keep the k with the largest silhouette coefficient.
    if max_silhouette_coefficient < silhouette_coefficient:
        max_silhouette_coefficient = silhouette_coefficient
        max_k = k
        max_centroids = centroids
        max_labels_ = labels_

    # Inertia reported by the fitted model.
    print('k:%d ==== inertia_:%f' % (k, clf.inertia_))

print('max_k:%d ==== max_silhouette_coefficient:%f' % (max_k, max_silhouette_coefficient))

# Plot every sample; samples in the same cluster share a marker style.
mark1 = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
for label in range(max_k):
    members = dataSet[max_labels_ == label]
    plt.plot(members[:, 0], members[:, 1], mark1[label])

# Plot the centroids with distinct, larger markers.
mark2 = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
for i in range(max_k):
    plt.plot(max_centroids[i, 0], max_centroids[i, 1], mark2[i], markersize=12)
plt.show()
运行结果:
k:2 ==== silhouette_coefficient:0.749433 k:2 ==== inertia_:93.451706 k:3 ==== silhouette_coefficient:0.887454 k:3 ==== inertia_:4.294235 k:4 ==== silhouette_coefficient:0.746558 k:4 ==== inertia_:3.366372 k:5 ==== silhouette_coefficient:0.579589 k:5 ==== inertia_:2.555257 k:6 ==== silhouette_coefficient:0.622791 k:6 ==== inertia_:2.088957 k:7 ==== silhouette_coefficient:0.452267 k:7 ==== inertia_:1.602857 k:8 ==== silhouette_coefficient:0.447269 k:8 ==== inertia_:1.265230 k:9 ==== silhouette_coefficient:0.454158 k:9 ==== inertia_:0.976325 max_k:3 ==== max_silhouette_coefficient:0.887454