-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclustering.py
97 lines (91 loc) · 4.05 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.cluster  # `import sklearn` alone does not load the `cluster` submodule
import tensorflow as tf
from tqdm import tqdm
# Apply a local matplotlib style sheet; assumes a file named 'matplotlibrc'
# is resolvable from the working directory -- TODO confirm it ships with the repo.
plt.style.use('matplotlibrc')
## ALGORITHM: CLUSTERING
## TEST STATISTIC CALCULATION
# NOTE(review): dead code -- an earlier TensorFlow-only float64 version of
# computeStat, disabled by wrapping it in a module-level string literal (no
# runtime effect). Superseded by the computeStat definition below; kept as
# reference only.
'''
def computeStat(h1, h2, eigvecsa, device='/CPU:0'):
with tf.device(device):
h1t = tf.convert_to_tensor(h1, np.float64)
h2t = tf.convert_to_tensor(h2, np.float64)
eigvecsat = tf.convert_to_tensor(eigvecsa, np.float64)
proj1t = tf.squeeze(h1t[..., None,:] @ eigvecsat[None,...])
proj2t = tf.squeeze(h2t[..., None,:] @ eigvecsat[None,...])
statmns = tf.reduce_max(
tf.reduce_sum(
(proj1t[None,...] - proj1t[:,None,...]) *
(proj2t[None,...] - proj2t[:,None,...]),
axis=-1),
axis=(2,3)).numpy()
return statmns
'''
def computeStat(hs, eigvecsa, numpy=True, smalldata=True, device='/CPU:0', proj=True):
    """Compute the (m, m) matrix of pairwise clustering test statistics.

    statmns[i, j] = max over (s, a) of
        sum_d (proj[0, j, s, a] - proj[0, i, s, a]) * (proj[1, j, s, a] - proj[1, i, s, a]),
    where proj is `hs` optionally projected onto `eigvecsa` along the last axis.

    Parameters
    ----------
    hs : array, assumed shape (2, m, S, A, d): two data halves stacked on
        axis 0 -- TODO confirm against callers.
    eigvecsa : projection matrix applied on the last axis when proj=True;
        ignored when proj=False.
    numpy : if True use NumPy; otherwise use TensorFlow on `device`.
    smalldata : if True, vectorize over all (s, a) at once (memory-hungry);
        if False, loop over (s, a) and keep a running elementwise maximum.
    device : TensorFlow device string (only used when numpy=False).
    proj : if False, `hs` is treated as already projected.

    Returns
    -------
    (m, m) ndarray of statistics (float32 on the TensorFlow path).
    """
    if numpy:
        if proj:
            # NOTE(review): .squeeze() drops *all* singleton axes, which would
            # also collapse a legitimate S == 1 or A == 1 axis -- confirm shapes.
            projs = (hs[..., None, :] @ eigvecsa[None, ...]).squeeze()
        else:
            projs = hs
        if smalldata:
            statmns = np.max(
                np.nansum(
                    (projs[0, None, ...] - projs[0, :, None, ...]) *
                    (projs[1, None, ...] - projs[1, :, None, ...]),
                    axis=-1),
                axis=(2, 3))
        else:
            # BUG FIX: was np.zeros(...), which silently floored negative
            # statistics at 0 and disagreed with the smalldata branch above.
            statmns = np.full((projs.shape[1], projs.shape[1]), -np.inf)
            for s in tqdm(range(projs.shape[2])):
                for a in range(projs.shape[3]):
                    newstats = np.nansum(
                        (projs[0, None, :, s, a, :] - projs[0, :, None, s, a, :]) *
                        (projs[1, None, :, s, a, :] - projs[1, :, None, s, a, :]), -1)
                    statmns = np.maximum(statmns, newstats)
        return statmns
    else:
        with tf.device(device):
            hs = tf.convert_to_tensor(hs, np.float32)
            eigvecsa = tf.convert_to_tensor(eigvecsa, np.float32)
            if proj:
                projs = tf.squeeze(hs[..., None, :] @ eigvecsa[None, ...])
            else:
                projs = hs
            # NOTE(review): this path uses reduce_sum while the NumPy path uses
            # nansum -- results differ if NaNs are present in the projections.
            if smalldata:
                statmns = tf.reduce_max(
                    tf.reduce_sum(
                        (projs[0, None, ...] - projs[0, :, None, ...]) *
                        (projs[1, None, ...] - projs[1, :, None, ...]),
                        axis=-1),
                    axis=(2, 3)).numpy()
            else:
                # BUG FIX: was tf.zeros(...); same negative-statistic flooring
                # problem as the NumPy loop branch.
                statmns = tf.fill((projs.shape[1], projs.shape[1]),
                                  np.float32(-np.inf))
                for s in tqdm(range(projs.shape[2])):
                    for a in range(projs.shape[3]):
                        newstats = tf.reduce_sum(
                            (projs[0, None, :, s, a, :] - projs[0, :, None, s, a, :]) *
                            (projs[1, None, :, s, a, :] - projs[1, :, None, s, a, :]), -1)
                        statmns = tf.math.maximum(statmns, newstats)
                statmns = statmns.numpy()
        return statmns
## OBTAINING CLUSTERS
def getClusters(statmns, thresh, K, method='kmeans'):
    """Threshold the statistic matrix into a binary affinity and spectrally cluster.

    Parameters
    ----------
    statmns : (m, m) matrix of pairwise test statistics.
    thresh : pairs with statistic strictly below `thresh` are connected (affinity 1).
    K : number of clusters.
    method : label-assignment strategy passed to spectral_clustering
        ('kmeans', 'discretize', or 'cluster_qr').

    Returns
    -------
    Array of m integer cluster labels.
    """
    # `import sklearn` alone does not expose the `cluster` submodule; import it
    # explicitly so this function works regardless of what else was loaded.
    from sklearn.cluster import spectral_clustering
    affinity = (statmns < thresh).astype(int)
    # BUG FIX: `method` was accepted but ignored (assign_labels was hard-coded
    # to 'kmeans'); pass it through so clusterDiagnostics' method= works.
    return spectral_clustering(affinity, n_clusters=K, assign_labels=method)
## DIAGNOSTICS
def clusterDiagnostics(statmns, K, labels, lo, hi, step, method='kmeans', figsize=(16,9)):
    """Sweep the affinity threshold and plot clustering accuracy and balance.

    For each threshold tau in arange(lo, hi, step), clusters via
    getClusters(statmns, tau, K, method) and records:
      * accuracy against `labels`, taking the better of the two binary label
        assignments (meaningful for two clusters), and
      * the weight of the larger of clusters 0 and 1.
    Both curves are drawn (in percent) on a new matplotlib figure.
    """
    thresholds = np.arange(lo, hi, step)
    accuracies = []
    max_weights = []
    for tau in tqdm(thresholds):
        preds = getClusters(statmns, thresh=tau, K=K, method=method)
        match_rate = np.mean(preds == labels)
        mismatch_rate = np.mean(preds != labels)
        # Cluster indices are arbitrary, so the flipped labeling counts too.
        accuracies.append(max(match_rate, mismatch_rate))
        max_weights.append(max(np.mean(preds == 1), np.mean(preds == 0)))
    plt.figure(figsize=figsize)
    plt.plot(thresholds, 100*np.array(accuracies), label='Accuracies (%)')
    plt.plot(thresholds, 100*np.array(max_weights), label='Max. Cluster Weight (%)')
    plt.xlabel('Threshold')
    plt.ylabel('Clustering Accuracy')
    plt.title('Accuracy Against Thresholds')
    plt.legend()