@@ -15,7 +15,7 @@ def loadDataSet(fileName):
15
15
:return:
16
16
'''
17
17
# 初始化一个空列表
18
- dataMat = []
18
+ dataSet = []
19
19
# 读取文件
20
20
fr = open (fileName )
21
21
# 循环遍历文件所有行
@@ -26,9 +26,9 @@ def loadDataSet(fileName):
26
26
# fltLine = [float(x) for x in curLine]
27
27
# 将数据追加到dataSet
28
28
fltLine = list (map (float ,curLine )) # 映射所有的元素为 float(浮点数)类型
29
- dataMat .append (fltLine )
29
+ dataSet .append (fltLine )
30
30
# 返回dataSet
31
- return dataMat
31
+ return dataSet
32
32
33
33
34
34
def distEclud (vecA , vecB ):
@@ -41,48 +41,48 @@ def distEclud(vecA, vecB):
41
41
return sqrt (sum (power (vecA - vecB , 2 )))
42
42
43
43
44
def randCent(dataMat, k):
    """Build k random centroids that lie inside the bounding box of the data.

    For every feature (column) the minimum and the value range are computed,
    and k values are drawn uniformly from [min, min + range), so each
    centroid coordinate stays within the limits of the data set.

    :param dataMat: 2-D data set with one sample per row and one feature per
        column; a numpy matrix, a 2-D ndarray or a nested list.
    :param k: number of centroids to generate.
    :return: (k, n) numpy matrix of centroids, n being the number of columns.
    """
    # mat() does not copy matrix/ndarray input, and it lets a plain nested
    # list be column-sliced below.
    data = mat(dataMat)
    n = data.shape[1]
    # Start from a (k, n) zero matrix and fill it one feature at a time.
    centroids = mat(zeros((k, n)))
    for j in range(n):
        column = data[:, j]
        col_min = column.min()
        col_range = float(column.max() - col_min)
        # One (k, 1) draw per column keeps the order in which random numbers
        # are consumed, so seeded runs stay reproducible.
        centroids[:, j] = col_min + col_range * random.rand(k, 1)
    return centroids
67
67
68
68
69
- def kMeans (dataSet , k , distMeas = distEclud , createCent = randCent ):
69
+ def kMeans (dataMat , k , distMeas = distEclud , createCent = randCent ):
70
70
'''
71
71
创建K个质心,然后将每个点分配到最近的质心,再重新计算质心。
72
72
这个过程重复数次,直到数据点的簇分配结果不再改变为止
73
- :param dataSet : 数据集
73
+ :param dataMat : 数据集
74
74
:param k: 簇的数目
75
75
:param distMeas: 计算距离
76
76
:param createCent: 创建初始质心
77
77
:return:
78
78
'''
79
79
# 获取样本数和特征数
80
- m , n = shape (dataSet )
80
+ m , n = shape (dataMat )
81
81
# 初始化一个矩阵来存储每个点的簇分配结果
82
82
# clusterAssment包含两个列:一列记录簇索引值,第二列存储误差(误差是指当前点到簇质心的距离,后面会使用该误差来评价聚类的效果)
83
83
clusterAssment = mat (zeros ((m , 2 )))
84
84
# 创建质心,随机K个质心
85
- centroids = createCent (dataSet , k )
85
+ centroids = createCent (dataMat , k )
86
86
# 初始化标志变量,用于判断迭代是否继续,如果True,则继续迭代
87
87
clusterChanged = True
88
88
while clusterChanged :
@@ -95,7 +95,7 @@ def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
95
95
for j in range (k ):
96
96
# 计算数据点到质心的距离
97
97
# 计算距离是使用distMeas参数给出的距离公式,默认距离函数是distEclud
98
- distJI = distMeas (centroids [j , :], dataSet [i , :])
98
+ distJI = distMeas (centroids [j , :], dataMat [i , :])
99
99
# 如果距离比minDist(最小距离)还小,更新minDist(最小距离)和最小质心的index(索引)
100
100
if distJI < minDist :
101
101
minDist = distJI
@@ -108,38 +108,38 @@ def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
108
108
# 遍历所有质心并更新它们的取值
109
109
for cent in range (k ):
110
110
# 通过数据过滤来获得给定簇的所有点
111
- ptsInClust = dataSet [nonzero (clusterAssment [:, 0 ].A == cent )[0 ]]
111
+ ptsInClust = dataMat [nonzero (clusterAssment [:, 0 ].A == cent )[0 ]]
112
112
# 计算所有点的均值,axis=0表示沿矩阵的列方向进行均值计算
113
113
centroids [cent , :] = mean (ptsInClust , axis = 0 )
114
114
# 返回所有的类质心与点分配结果
115
115
return centroids , clusterAssment
116
116
117
117
118
- def biKmeans (dataSet , k , distMeas = distEclud ):
118
+ def biKmeans (dataMat , k , distMeas = distEclud ):
119
119
'''
120
120
在给定数据集,所期望的簇数目和距离计算方法的条件下,函数返回聚类结果
121
- :param dataSet :
121
+ :param dataMat :
122
122
:param k:
123
123
:param distMeas:
124
124
:return:
125
125
'''
126
- m , n = shape (dataSet )
126
+ m , n = shape (dataMat )
127
127
# 创建一个矩阵来存储数据集中每个点的簇分配结果及平方误差
128
128
clusterAssment = mat (zeros ((m , 2 )))
129
129
# 计算整个数据集的质心,并使用一个列表来保留所有的质心
130
- centroid0 = mean (dataSet , axis = 0 ).tolist ()[0 ]
130
+ centroid0 = mean (dataMat , axis = 0 ).tolist ()[0 ]
131
131
centList = [centroid0 ]
132
132
# 遍历数据集中所有点来计算每个点到质心的误差值
133
133
for j in range (m ):
134
- clusterAssment [j , 1 ] = distMeas (mat (centroid0 ), dataSet [j , :]) ** 2
134
+ clusterAssment [j , 1 ] = distMeas (mat (centroid0 ), dataMat [j , :]) ** 2
135
135
# 对簇不停的进行划分,直到得到想要的簇数目为止
136
136
while (len (centList ) < k ):
137
137
# 初始化最小SSE为无穷大,用于比较划分前后的SSE
138
138
lowestSSE = inf
139
139
# 通过考察簇列表中的值来获得当前簇的数目,遍历所有的簇来决定最佳的簇进行划分
140
140
for i in range (len (centList )):
141
141
# 对每一个簇,将该簇中的所有点看成一个小的数据集
142
- ptsInCurrCluster = dataSet [nonzero (clusterAssment [:, 0 ].A == i )[0 ], :]
142
+ ptsInCurrCluster = dataMat [nonzero (clusterAssment [:, 0 ].A == i )[0 ], :]
143
143
# 将ptsInCurrCluster输入到函数kMeans中进行处理,k=2,
144
144
# kMeans会生成两个质心(簇),同时给出每个簇的误差值
145
145
centroidMat , splitClustAss = kMeans (ptsInCurrCluster , 2 , distMeas )
0 commit comments