|
| 1 | +#coding=utf-8 |
| 2 | + |
| 3 | +''' |
| 4 | +@author: wepon, http://2hwp.com |
| 5 | +Reference: |
| 6 | + Book: <<Machine Learning in Action>> |
| 7 | + Software: sklearn.cluster.KMeans |
| 8 | +
|
| 9 | +''' |
| 10 | +import numpy as np |
| 11 | + |
class KMeans(object):
    """Plain k-means clustering.

    Parameters
    ----------
    n_clusters:
        Number of clusters (k). Overridden by the number of rows of
        ``initCent`` when explicit initial centroids are supplied.
    initCent:
        Centroid initialisation. Either the string ``"random"`` (default,
        centroids drawn uniformly inside the bounding box of the data) or an
        array-like of shape (k, n_features) holding the initial centroids.
    max_iter:
        Maximum number of assignment/update iterations.

    Attributes (populated by ``fit``)
    ---------------------------------
    centroids:
        (k, n_features) float array of cluster centres.
    clusterAssment:
        (m, 2) float array; column 0 is the index of the cluster each sample
        belongs to, column 1 the squared distance to that cluster's centroid.
    labels:
        Column 0 of ``clusterAssment``.
    sse:
        Sum of column 1 of ``clusterAssment``.
    """

    def __init__(self, n_clusters=5, initCent='random', max_iter=300):
        if hasattr(initCent, '__array__') or isinstance(initCent, (list, tuple)):
            # np.array copies, so fit() never mutates the caller's array in
            # place; the builtin float replaces the removed np.float alias.
            self.centroids = np.array(initCent, dtype=float)
            n_clusters = self.centroids.shape[0]
        else:
            self.centroids = None

        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.initCent = initCent
        self.clusterAssment = None
        self.labels = None
        self.sse = None

    @staticmethod
    def _checkArray(X):
        """Return X as a 2-D ndarray, raising on input that cannot be one."""
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        return X

    def _distEclud(self, vecA, vecB):
        """Return the Euclidean distance between two points."""
        return np.linalg.norm(vecA - vecB)

    def _randCent(self, X, k):
        """Draw k random centroids inside the per-feature bounds of X."""
        n = X.shape[1]  # number of features
        centroids = np.empty((k, n))
        # Initialise one feature (column) at a time.
        for j in range(n):
            column = X[:, j]
            minJ = column.min()
            rangeJ = float(column.max() - minJ)
            centroids[:, j] = minJ + rangeJ * np.random.rand(k)
        return centroids

    def _assign(self, X):
        """Return (nearest centroid index, distance to it) for every row of X."""
        diffs = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        dists = np.linalg.norm(diffs, axis=2)
        # argmin keeps the first minimum, matching a strict '<' scan.
        nearest = dists.argmin(axis=1)
        return nearest, dists[np.arange(X.shape[0]), nearest]

    def fit(self, X):
        """Cluster the samples in X.

        Populates ``centroids``, ``clusterAssment``, ``labels`` and ``sse``.

        Raises
        ------
        TypeError
            If X cannot be converted to an ndarray.
        ValueError
            If X is not 2-D, is empty, or does not match the centroids.
        """
        X = self._checkArray(X)
        m = X.shape[0]  # number of samples
        if m == 0:
            raise ValueError("X must contain at least one sample")

        # Guard with isinstance: comparing an ndarray with a string is
        # evaluated elementwise and cannot be used as a condition.
        if isinstance(self.initCent, str) and self.initCent == 'random':
            self.centroids = self._randCent(X, self.n_clusters)
        elif self.centroids is None:
            raise ValueError("initCent must be 'random' or an array of centroids")
        if self.centroids.ndim != 2 or self.centroids.shape[1] != X.shape[1]:
            raise ValueError("centroids and X must have the same number of features")

        # -1 is never a valid cluster index, so the first pass always counts
        # as a change instead of depending on uninitialised memory.
        self.clusterAssment = np.full((m, 2), -1.0)

        for _ in range(self.max_iter):
            # Assign every sample to the cluster of its nearest centroid.
            nearest, dist = self._assign(X)
            clusterChanged = bool(np.any(self.clusterAssment[:, 0] != nearest))
            self.clusterAssment[:, 0] = nearest
            self.clusterAssment[:, 1] = dist ** 2

            # No sample changed cluster: converged.
            if not clusterChanged:
                break

            # Move each centroid to the mean of the points assigned to it.
            for i in range(self.n_clusters):
                ptsInClust = X[nearest == i]
                # An empty cluster keeps its previous centroid; the mean of
                # zero points would be NaN and poison later distances.
                if ptsInClust.shape[0] > 0:
                    self.centroids[i, :] = ptsInClust.mean(axis=0)

        self.labels = self.clusterAssment[:, 0]
        self.sse = self.clusterAssment[:, 1].sum()

    def predict(self, X):
        """Return, for each row of X, the index of the nearest centroid.

        The result is a float array of shape (n_samples,).

        Raises
        ------
        TypeError
            If X cannot be converted to an ndarray.
        ValueError
            If X is not 2-D or the model has no centroids yet.
        """
        if self.centroids is None:
            raise ValueError("fit must be called before predict")
        X = self._checkArray(X)
        nearest, _ = self._assign(X)
        return nearest.astype(float)
| 105 | + |
| 106 | + |
class biKMeans(object):
    """Bisecting k-means clustering.

    Starts with a single cluster holding every sample and repeatedly splits
    one cluster in two with ``KMeans(n_clusters=2)``, each time choosing the
    split that yields the lowest total squared error, until ``n_clusters``
    clusters exist.

    Parameters
    ----------
    n_clusters:
        Target number of clusters.

    Attributes (populated by ``fit``)
    ---------------------------------
    centroids:
        Array with one row per cluster centre.
    clusterAssment:
        (m, 2) float array; column 0 is the index of the cluster each sample
        belongs to, column 1 the squared distance to that cluster's centroid.
    labels:
        Column 0 of ``clusterAssment``.
    sse:
        Sum of column 1 of ``clusterAssment``.
    """

    def __init__(self, n_clusters=5):
        self.n_clusters = n_clusters
        self.centroids = None
        self.clusterAssment = None
        self.labels = None
        self.sse = None

    @staticmethod
    def _checkArray(X):
        """Return X as a 2-D ndarray, raising on input that cannot be one."""
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        return X

    def _distEclud(self, vecA, vecB):
        """Return the Euclidean distance between two points."""
        return np.linalg.norm(vecA - vecB)

    def fit(self, X):
        """Cluster the samples in X by repeated bisection.

        Populates ``centroids``, ``clusterAssment``, ``labels`` and ``sse``.
        If no remaining cluster holds at least two samples, splitting stops
        early and fewer than ``n_clusters`` centroids are produced.

        Raises
        ------
        TypeError
            If X cannot be converted to an ndarray.
        ValueError
            If X is not 2-D or is empty.
        """
        X = self._checkArray(X)
        m = X.shape[0]
        if m == 0:
            raise ValueError("X must contain at least one sample")

        self.clusterAssment = np.zeros((m, 2))
        centroid0 = np.mean(X, axis=0)
        centList = [centroid0.tolist()]
        # Initial squared error of every sample w.r.t. the single centroid.
        self.clusterAssment[:, 1] = np.linalg.norm(X - centroid0, axis=1) ** 2

        while len(centList) < self.n_clusters:
            lowestSSE = np.inf
            bestCentToSplit = None
            bestNewCents = None
            bestClustAss = None
            # Try splitting every cluster; keep the split with the lowest
            # total error.
            for i in range(len(centList)):
                inCluster = self.clusterAssment[:, 0] == i
                ptsInCurrCluster = X[inCluster]
                # Fewer than two points cannot be bisected, and an empty
                # array would make the 2-means fit fail.
                if ptsInCurrCluster.shape[0] < 2:
                    continue
                clf = KMeans(n_clusters=2)
                clf.fit(ptsInCurrCluster)
                # Error inside the split cluster plus error of all the others.
                sseSplit = clf.clusterAssment[:, 1].sum()
                sseNotSplit = self.clusterAssment[~inCluster, 1].sum()
                if (sseSplit + sseNotSplit) < lowestSSE:
                    bestCentToSplit = i
                    bestNewCents = clf.centroids
                    bestClustAss = clf.clusterAssment.copy()
                    lowestSSE = sseSplit + sseNotSplit

            if bestCentToSplit is None:
                # Nothing left that can be split.
                break

            # One child keeps the parent's index; the other takes the next
            # free index, len(centList).
            bestClustAss[:, 0] = np.where(
                bestClustAss[:, 0] == 1, len(centList), bestCentToSplit
            )
            # Select the parent's rows before centList grows and before any
            # label is overwritten.
            parentRows = self.clusterAssment[:, 0] == bestCentToSplit
            centList[bestCentToSplit] = bestNewCents[0, :].tolist()
            centList.append(bestNewCents[1, :].tolist())
            self.clusterAssment[parentRows, :] = bestClustAss

        self.labels = self.clusterAssment[:, 0]
        self.sse = self.clusterAssment[:, 1].sum()
        self.centroids = np.asarray(centList)

    def predict(self, X):
        """Return, for each row of X, the index of the nearest centroid.

        The result is a float array of shape (n_samples,).

        Raises
        ------
        TypeError
            If X cannot be converted to an ndarray.
        ValueError
            If X is not 2-D or the model has no centroids yet.
        """
        if self.centroids is None:
            raise ValueError("fit must be called before predict")
        X = self._checkArray(X)
        diffs = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        dists = np.linalg.norm(diffs, axis=2)
        # argmin keeps the first minimum, matching a strict '<' scan.
        return dists.argmin(axis=1).astype(float)
0 commit comments