import sys

import numpy as np
from util import *     # course helpers, e.g. plotDatasetClusters
from pylab import *    # provides show() for interactive plotting
def kmeans(X, mu0, doPlot=True):
    '''
    X is an N*D matrix of N data points in D dimensions.

    mu0 is a K*D matrix of initial cluster centers; K is the desired
    number of clusters.

    This function returns a tuple (mu, z, obj) where mu is the final
    cluster centers, z is the assignment of data points to clusters,
    and obj[i] is the kmeans objective function:
      (1/N) sum_n || x_n - mu_{z_n} ||^2
    at iteration [i].

    mu[k,:] is the mean of cluster k
    z[n] is the assignment (number in 0...K-1) of data point n

    It runs at *most* 100 iterations, but may stop earlier if the
    algorithm has converged.
    '''

    mu = mu0.copy()    # for safety
    N, D = X.shape
    K = mu.shape[0]

    # initialize assignments and objective list
    z = np.zeros((N,), dtype=int)
    obj = []

    # run at most 100 iterations
    for it in range(100):
        # store the old value of z so we can check convergence
        z_old = z.copy()

        # recompute the assignment of points to centers: each point
        # goes to its nearest center in Euclidean distance
        for n in range(N):
            bestK = -1
            bestDist = np.inf
            for k in range(K):
                d = np.linalg.norm(X[n,:] - mu[k,:])
                if d < bestDist:
                    bestK = k
                    bestDist = d
            z[n] = bestK

        # recompute the means; leave a center unchanged if its
        # cluster is empty (otherwise the mean would be NaN)
        for k in range(K):
            if np.any(z == k):
                mu[k,:] = np.mean(X[z == k, :], axis=0)

        # compute the objective
        currentObjective = 0
        for n in range(N):
            currentObjective += np.linalg.norm(X[n,:] - mu[z[n],:]) ** 2 / N
        obj.append(currentObjective)

        print('Iteration %d, objective=%g' % (it, currentObjective))
        if doPlot:
            plotDatasetClusters(X, mu, z)
            show(block=False)
            x = input("Press enter to continue...")
            if x == "q":
                doPlot = False

        # check to see if we've converged
        if np.all(z == z_old):
            break

    if doPlot and D == 2:
        plotDatasetClusters(X, mu, z)
        show(block=False)

    # return the required values
    return (mu, z, np.array(obj))
def initialize_clusters(X, K, method):
    '''
    X is an N*D matrix of data.
    K is the desired number of clusters (>= 1).
    method is one of:
      determ: initialize deterministically (for comparative reasons)
      random: just initialize randomly
      ffh   : use the furthest-first heuristic
      km++  : use the kmeans++ method

    Returns a K*D matrix of initial means.
    You may assume K <= N.
    '''

    N, D = X.shape
    mu = np.zeros((K, D))

    if method == 'determ':
        # just use the first K points as centers
        mu = X[0:K, :].copy()    # be sure to copy, otherwise bad things happen!!!
    elif method == 'random':
        # pick K random centers
        dataPoints = np.random.permutation(N)
        mu = X[dataPoints[0:K], :].copy()    # ditto above
    elif method == 'ffh':
        # pick the first center randomly and each subsequent center
        # according to the furthest-first heuristic

        # pick the first center totally randomly
        mu[0,:] = X[np.random.randint(N), :].copy()    # be sure to copy!

        # pick each subsequent center by the furthest-first heuristic
        for k in range(1, K):
            # find the point that is best as the next mean and set
            # mu[k,:] to it
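            # A sketch of one reasonable implementation (this fills
            # in the original TODO stub; it assumes Euclidean
            # distance, matching the kmeans loop above): take the
            # point whose distance to its nearest already-chosen
            # center is largest.
            distToNearest = np.array(
                [min(np.linalg.norm(X[n,:] - mu[j,:]) for j in range(k))
                 for n in range(N)])
            mu[k,:] = X[np.argmax(distToNearest), :].copy()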
    elif method == 'km++':
        # pick the first center randomly and each subsequent center
        # according to the kmeans++ method
        # HINT: see numpy.random.multinomial
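        # A sketch of one reasonable implementation (fills in the
        # original TODO stub, using numpy.random.multinomial as
        # hinted): the first center is chosen uniformly at random;
        # each later center is drawn with probability proportional to
        # its squared distance to the nearest already-chosen center.
        mu[0,:] = X[np.random.randint(N), :].copy()
        for k in range(1, K):
            distSq = np.array(
                [min(np.linalg.norm(X[n,:] - mu[j,:]) ** 2 for j in range(k))
                 for n in range(N)])
            probs = distSq / distSq.sum()
            n = np.argmax(np.random.multinomial(1, probs))
            mu[k,:] = X[n,:].copy()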
    else:
        print("Initialization method not implemented")
        sys.exit(1)

    return mu
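

# A minimal usage sketch, not part of the original assignment code:
# it runs kmeans on a small synthetic two-blob dataset made up here
# for illustration (assumes only numpy, no plotting).
if __name__ == '__main__':
    np.random.seed(0)
    X = np.vstack([np.random.randn(50, 2) + [ 3,  3],
                   np.random.randn(50, 2) + [-3, -3]])
    mu0 = initialize_clusters(X, 2, 'ffh')
    (mu, z, obj) = kmeans(X, mu0, doPlot=False)
    print('final centers:\n%s' % mu)
    print('final objective: %g' % obj[-1])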