Skip to content

Commit 0786c3b

Browse files
committed
KMeans, bisecting KMeans
Python implementation, based on numpy and matplotlib
1 parent ef4337a commit 0786c3b

File tree

4 files changed

+266
-9
lines changed

4 files changed

+266
-9
lines changed

KMeans/data.pkl

Lines changed: 46 additions & 0 deletions
Large diffs are not rendered by default.

KMeans/kmeans.py

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
#coding=utf-8
2+
3+
'''
4+
@author: wepon, http://2hwp.com
5+
Reference:
6+
Book: <<Machine Learning in Action>>
7+
Software: sklearn.cluster.KMeans
8+
9+
'''
10+
import numpy as np
11+
12+
class KMeans(object):
    """Standard K-Means clustering (Lloyd's algorithm).

    Parameters
    ----------
    n_clusters : int
        Number of clusters, i.e. k.
    initCent : 'random' or array-like of shape (k, n_features)
        Centroid initialization. 'random' draws each centroid uniformly
        within the per-dimension bounds of the data; an explicit array is
        used as-is and its row count overrides ``n_clusters``.
    max_iter : int
        Maximum number of assignment/update iterations.

    Attributes set by ``fit``: ``centroids``, ``labels``, ``sse``,
    ``clusterAssment`` (m x 2: [cluster index, squared error]).
    """

    def __init__(self, n_clusters=5, initCent='random', max_iter=300):
        if hasattr(initCent, '__array__'):
            # An explicit centroid array fixes k to its row count.
            n_clusters = initCent.shape[0]
            # NOTE: np.float was removed in NumPy 1.20+; use builtin float.
            self.centroids = np.asarray(initCent, dtype=float)
        else:
            self.centroids = None

        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.initCent = initCent
        self.clusterAssment = None  # (m, 2): [cluster index, squared error]
        self.labels = None
        self.sse = None

    def _distEclud(self, vecA, vecB):
        """Euclidean distance between two points."""
        return np.linalg.norm(vecA - vecB)

    def _randCent(self, X, k):
        """Draw k random centroids, each dimension within the data bounds."""
        n = X.shape[1]  # number of features
        centroids = np.empty((k, n))  # k x n matrix holding the centroids
        for j in range(n):  # initialize one dimension at a time
            minJ = min(X[:, j])
            rangeJ = float(max(X[:, j]) - minJ)
            centroids[:, j] = (minJ + rangeJ * np.random.rand(k, 1)).flatten()
        return centroids

    def fit(self, X):
        """Cluster X; sets ``centroids``, ``labels`` and ``sse``.

        Raises TypeError if X cannot be converted to an ndarray.
        """
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        # Column 0: assigned cluster index; column 1: squared error to its
        # centroid. Indices start at -1 so the first pass always registers a
        # change (np.empty alone leaves garbage that could spuriously equal
        # a valid cluster index and suppress the first update).
        self.clusterAssment = np.empty((m, 2))
        self.clusterAssment[:, 0] = -1
        # isinstance guard: when initCent is an ndarray, `== 'random'` would
        # be an elementwise array/str comparison rather than a plain bool.
        if isinstance(self.initCent, str) and self.initCent == 'random':
            self.centroids = self._randCent(X, self.n_clusters)

        for _ in range(self.max_iter):
            clusterChanged = False
            for i in range(m):  # assign each sample to the nearest centroid
                minDist = np.inf
                minIndex = -1
                for j in range(self.n_clusters):
                    distJI = self._distEclud(self.centroids[j, :], X[i, :])
                    if distJI < minDist:
                        minDist = distJI
                        minIndex = j
                if self.clusterAssment[i, 0] != minIndex:
                    clusterChanged = True
                # Always refresh the squared error: centroids move between
                # iterations even when the assigned index stays the same.
                self.clusterAssment[i, :] = minIndex, minDist ** 2

            if not clusterChanged:  # converged: no assignment changed
                break
            for i in range(self.n_clusters):  # move centroids to cluster means
                ptsInClust = X[np.nonzero(self.clusterAssment[:, 0] == i)[0]]
                if len(ptsInClust) > 0:  # skip empty clusters: mean would be NaN
                    self.centroids[i, :] = np.mean(ptsInClust, axis=0)

        self.labels = self.clusterAssment[:, 0]
        self.sse = sum(self.clusterAssment[:, 1])

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of X."""
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        preds = np.empty((m,))
        for i in range(m):
            minDist = np.inf
            for j in range(self.n_clusters):
                distJI = self._distEclud(self.centroids[j, :], X[i, :])
                if distJI < minDist:
                    minDist = distJI
                    preds[i] = j  # must stay under the guard, else preds[i]
                                  # always ends up n_clusters - 1
        return preds
105+
106+
107+
class biKMeans(object):
    """Bisecting K-Means clustering.

    Starts with one cluster (the global mean) and repeatedly splits, via
    2-means, the cluster whose split yields the lowest total SSE, until
    ``n_clusters`` clusters exist.

    Attributes set by ``fit``: ``centroids``, ``labels``, ``sse``,
    ``clusterAssment`` (m x 2: [cluster index, squared error]).
    """

    def __init__(self, n_clusters=5):
        self.n_clusters = n_clusters
        self.centroids = None
        self.clusterAssment = None  # (m, 2): [cluster index, squared error]
        self.labels = None
        self.sse = None

    def _distEclud(self, vecA, vecB):
        """Euclidean distance between two points."""
        return np.linalg.norm(vecA - vecB)

    def fit(self, X):
        """Cluster X; sets ``centroids``, ``labels`` and ``sse``.

        Raises TypeError if X cannot be converted to an ndarray.
        """
        # Type check for consistency with KMeans.fit / predict.
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]
        self.clusterAssment = np.zeros((m, 2))
        # Start from a single cluster whose centroid is the global mean.
        centroid0 = np.mean(X, axis=0).tolist()
        centList = [centroid0]
        for j in range(m):  # initial squared error of every sample
            self.clusterAssment[j, 1] = self._distEclud(np.asarray(centroid0), X[j, :]) ** 2

        while len(centList) < self.n_clusters:
            lowestSSE = np.inf
            # Tentatively split every cluster; keep the split that minimizes
            # the total SSE (split part + untouched part).
            for i in range(len(centList)):
                ptsInCurrCluster = X[np.nonzero(self.clusterAssment[:, 0] == i)[0], :]
                clf = KMeans(n_clusters=2)
                clf.fit(ptsInCurrCluster)
                # Centroids and per-sample [index, sq-error] of the 2-way split.
                centroidMat, splitClustAss = clf.centroids, clf.clusterAssment
                sseSplit = sum(splitClustAss[:, 1])
                sseNotSplit = sum(self.clusterAssment[np.nonzero(self.clusterAssment[:, 0] != i)[0], 1])
                if (sseSplit + sseNotSplit) < lowestSSE:
                    bestCentToSplit = i
                    bestNewCents = centroidMat
                    bestClustAss = splitClustAss.copy()
                    lowestSSE = sseSplit + sseNotSplit
            # Relabel the winning split: sub-cluster 1 gets the next fresh
            # index len(centList); sub-cluster 0 keeps the split cluster's
            # original index. Order matters: rewrite the 1s first, because a
            # 0 rewritten to bestCentToSplit could otherwise collide.
            bestClustAss[np.nonzero(bestClustAss[:, 0] == 1)[0], 0] = len(centList)
            bestClustAss[np.nonzero(bestClustAss[:, 0] == 0)[0], 0] = bestCentToSplit
            centList[bestCentToSplit] = bestNewCents[0, :].tolist()
            centList.append(bestNewCents[1, :].tolist())
            self.clusterAssment[np.nonzero(self.clusterAssment[:, 0] == bestCentToSplit)[0], :] = bestClustAss

        self.labels = self.clusterAssment[:, 0]
        self.sse = sum(self.clusterAssment[:, 1])
        self.centroids = np.asarray(centList)

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of X."""
        if not isinstance(X, np.ndarray):
            try:
                X = np.asarray(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        m = X.shape[0]  # number of samples
        preds = np.empty((m,))
        for i in range(m):
            minDist = np.inf
            for j in range(self.n_clusters):
                distJI = self._distEclud(self.centroids[j, :], X[i, :])
                if distJI < minDist:
                    minDist = distJI
                    preds[i] = j  # must stay under the guard, else preds[i]
                                  # always ends up n_clusters - 1
        return preds

KMeans/test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# coding=utf-8
"""Visualize KMeans convergence: re-fit with max_iter = 0..5 and plot
each intermediate clustering (one color per cluster, 'x' = centroid)."""
import pickle  # cPickle is Python-2-only; pickle is its Python-3 name

import matplotlib.pyplot as plt
import numpy as np

from kmeans import KMeans, biKMeans

if __name__ == "__main__":
    # Load (X, y). Pickles are binary: open with 'rb', not 'r', and use a
    # context manager so the handle is always closed.
    with open('data.pkl', 'rb') as f:
        X, y = pickle.load(f)

    # Plot the clustering after 1, 2, 3, ... iterations in turn.
    for max_iter in range(6):
        # Parameters: fixed initial centroids so every run starts identically.
        n_clusters = 10
        initCent = X[50:60]  # use X[50:60] as the initial centroids
        # Fit the model.
        clf = KMeans(n_clusters, initCent, max_iter)
        clf.fit(X)
        cents = clf.centroids
        labels = clf.labels
        sse = clf.sse
        # Draw the clustering result, one color per cluster.
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']
        for i in range(n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = X[index, 0]
            x1 = X[index, 1]
            y_i = y[index]
            for j in range(len(x0)):
                # Plot each sample as its true digit label, colored by cluster.
                plt.text(x0[j], x1[j], str(int(y_i[j])), color=colors[i],
                         fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x', color=colors[i], linewidths=12)
        plt.title("SSE={:.2f}".format(sse))
        plt.axis([-30, 30, -30, 30])
        # plt.savefig("{}.png".format(max_iter))
        # plt.close()
        plt.show()

README.md

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,11 @@
11
MachineLearning
22
====================
33

4-
This project contain some machine learning algrithm demo.Maybe the code is also useful to you.
54

6-
这个仓库包含一些常用的机器学习算法的实现代码,代码中也有会一些具体的小应用。
7-
8-
此外,每个算法我都会写一篇文章来地介绍它们,同时详细地解读代码。文章发表在我的CSDN专栏以及个人网站上:
95

10-
CSDN:[wepon的专栏](http://blog.csdn.net/u012162613)
6+
这个仓库包含一些常用的机器学习算法的实现代码,代码中也有会一些具体的小应用。
117

12-
个人网站:[Wepon's blog](http://2hwp.com)
8+
此外,每个算法我都会写一篇文章来地介绍它们,同时详细地解读代码。文章发表在我的CSDN专栏以及个人网站上。欢迎所有的机器学习爱好者参与进来,并请保证文章和代码高质量。
139

1410

1511
##目录介绍
@@ -58,9 +54,17 @@ CSDN:[wepon的专栏](http://blog.csdn.net/u012162613)
5854

5955
- **DecisionTree**
6056

61-
Python、Numpy、Matplotlib实现的ID3、C4.5,其中C4.5有待完善,后续加入CART。文章待总结
57+
Python、Numpy、Matplotlib实现的ID3、C4.5,其中C4.5有待完善,后续加入CART。文章待总结。[代码](https://github.com/wepe/MachineLearning/tree/master/DecisionTree)
58+
59+
- **KMeans**
60+
61+
介绍了聚类分析中最常用的KMeans算法(及二分KMeans算法),基于NumPy的算法实现,以及基于Matplotlib的聚类过程可视化。[文章链接]()
62+
63+
##Contributor
64+
65+
- [wepon](https://github.com/wepe)
66+
- [Gogary](https://github.com/enjoyhot)
6267

63-
##Contributing
6468

65-
欢迎加入本项目,任何机器学习/深度学习的demo都可以push进来,并且最好有相应的博文介绍代码
69+
欢迎加入本项目,任何机器学习/深度学习的demo都可以push进来,辅以详细的博文介绍
6670

0 commit comments

Comments
 (0)