Skip to content

Commit 1823fa7

Browse files
committed
change
1 parent bedcb68 commit 1823fa7

File tree

2 files changed

+294
-0
lines changed

2 files changed

+294
-0
lines changed

Tensor/Tensor_evaluation.py

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#! /usr/bin/python
2+
3+
from model import *
4+
5+
6+
def load_file(path):
    """Load a pickled sparse matrix from *path* as a CSR matrix in floatX.

    :param path: path of a pickle file holding a scipy sparse matrix.
    :return: scipy.sparse.csr_matrix with dtype theano.config.floatX.
    """
    # Fix: the original handed an anonymous open() handle to cPickle.load
    # and never closed it; 'with' closes the file deterministically.
    with open(path) as f:
        data = cPickle.load(f)
    return scipy.sparse.csr_matrix(data, dtype=theano.config.floatX)
9+
10+
11+
def convert2idx(spmat):
    """Return, for each column of *spmat*, the row index of its nonzero.

    Assumes one-hot columns (a single nonzero per column); the returned
    row indices are ordered by ascending column index.
    """
    row_idx, col_idx = spmat.nonzero()
    by_column = np.argsort(col_idx)
    return row_idx[by_column]
14+
15+
16+
def compute_prauc(pred, lab):
    """Compute the area under the precision-recall curve.

    :param pred: sequence of prediction scores (higher = more positive).
    :param lab: sequence of binary {0, 1} ground-truth labels, same length.
    :return: trapezoidal estimate of the precision-recall AUC (float).

    NOTE(review): assumes at least one positive label; np.sum(lab) == 0
    raises ZeroDivisionError, unchanged from the original behavior.
    Duplicate scores collapse onto one dict key, also as before.
    """
    pred = np.asarray(pred)
    lab = np.asarray(lab)

    order = np.argsort(pred)
    lab_ordered = lab[order]
    pred_ordered = pred[order]

    precision = {}
    recall = {}
    # Sentinel threshold below every score: all examples are classified 1.
    precision[np.min(pred_ordered) - 1.0] = (np.sum(lab_ordered) /
                                             float(len(lab)))
    recall[np.min(pred_ordered) - 1.0] = 1.
    for i in range(len(lab)):
        if len(lab) - i - 1 == 0:
            # Threshold at the top score: no examples are classified 1.
            precision[pred_ordered[i]] = 1
        else:
            precision[pred_ordered[i]] = (np.sum(lab_ordered[i + 1:]) /
                                          float(len(lab) - i - 1))
        recall[pred_ordered[i]] = (np.sum(lab_ordered[i + 1:]) /
                                   float(np.sum(lab_ordered)))

    # Precision-Recall curve points, by decreasing threshold (= increasing
    # recall). Fix: sorted() works on both Python 2 key lists and Python 3
    # dict views, unlike np.sort(precision.keys()).
    points = []
    for i in sorted(precision, reverse=True):
        points += [(float(recall[i]), float(precision[i]))]
    # Trapezoidal integration over consecutive curve points.
    auc = sum((y0 + y1) / 2. * (x1 - x0) for (x0, y0), (x1, y1) in
              zip(points[:-1], points[1:]))
    return auc
48+
49+
50+
def PRAUCEval(datapath='../data/', dataset='umls-test',
51+
loadmodel='best_valid_model.pkl', fold=0):
52+
53+
# Load model
54+
f = open(loadmodel)
55+
embeddings = cPickle.load(f)
56+
leftop = cPickle.load(f)
57+
rightop = cPickle.load(f)
58+
simfn = cPickle.load(f)
59+
f.close()
60+
61+
# Load data
62+
l = load_file(datapath + dataset + '-lhs-fold%s.pkl' % fold)
63+
r = load_file(datapath + dataset + '-rhs-fold%s.pkl' % fold)
64+
o = load_file(datapath + dataset + '-rel-fold%s.pkl' % fold)
65+
if type(embeddings) is list:
66+
o = o[-embeddings[1].N:, :]
67+
out = cPickle.load(open(datapath + '%s-targets-fold%s.pkl' %
68+
(dataset, fold)))
69+
70+
func = SimFn(simfn, embeddings, leftop, rightop)
71+
sim = func(l, r, o)[0]
72+
73+
AUC = compute_prauc(list(sim), list(out))
74+
print "### Prediction Recall AUC:", AUC
75+
76+
return AUC
77+
78+
79+
if __name__ == '__main__':
    # Script entry point: evaluate with the default dataset/model paths.
    PRAUCEval()

Tensor/Tensor_parse.py

+214
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
import os
2+
import cPickle
3+
4+
import numpy
5+
import scipy.sparse
6+
7+
# Number of cross-validation folds to generate.
K = 10
# Path to the directory holding the raw <dataset>.pkl files.
# NOTE(review): deliberately left as None so the assert below forces the
# user to fill in a real path before running this script.
datapath = None
assert datapath is not None

# Ensure the ../data output directory exists.
if 'data' not in os.listdir('../'):
    os.mkdir('../data')
14+
15+
# Build K cross-validation folds for each dataset and pickle the one-hot
# sparse inputs. This is Python 2 code: '/' below is floor division on
# the int list lengths.
for dataset in ['kinships', 'umls', 'nations']:
    f = open(datapath + dataset + '.pkl')
    dictdata = cPickle.load(f)
    tensordata = dictdata['tensor']
    # Fix: close the dataset file (the original leaked this handle).
    f.close()

    # List non-zeros (positive triplets)
    lnz = []
    # List zeros (negative triplets)
    lz = []
    # Lists of feature triplets (nations only): entries with an index
    # >= 14 on either entity axis are kept apart from the entity pairs.
    if dataset == 'nations':
        lzfeat = []
        lnzfeat = []
    # Fill the lists by scanning the whole 3-way tensor.
    for i in range(tensordata.shape[0]):
        for j in range(tensordata.shape[1]):
            for k in range(tensordata.shape[2]):
                # Separates features triplets for nations
                if dataset == 'nations' and (i >= 14 or j >= 14):
                    if tensordata[i, j, k] == 0:
                        lzfeat += [(i, j, k)]
                    elif tensordata[i, j, k] == 1:
                        lnzfeat += [(i, j, k)]
                else:
                    if tensordata[i, j, k] == 0:
                        lz += [(i, j, k)]
                    elif tensordata[i, j, k] == 1:
                        lnz += [(i, j, k)]

    # Pad the feature triplets lists (same for all training folds).
    # Fix: the original tested dataset == 'nation' (typo), so this block
    # never ran and the feature lists were silently truncated by the
    # zip() in the training section below.
    if dataset == 'nations':
        if len(lzfeat) < len(lnzfeat):
            while len(lzfeat) < len(lnzfeat):
                lzfeat += lzfeat[:len(lnzfeat) - len(lzfeat)]
        else:
            while len(lnzfeat) < len(lzfeat):
                lnzfeat += lnzfeat[:len(lzfeat) - len(lnzfeat)]

    # Pre-generated random permutations of the positive/negative indices.
    f = open(datapath + dataset + '_permutations.pkl')
    idxnz = cPickle.load(f)
    idxz = cPickle.load(f)
    f.close()

    # For each fold: chunk k is test, chunk k+1 (wrapping around) is
    # validation, the remaining K-2 chunks are training.
    for k in range(K):
        if k != K - 1:
            tmpidxnz = (idxnz[:k * len(idxnz) / K] +
                        idxnz[(k + 2) * len(idxnz) / K:])
            tmpidxz = (idxz[:k * len(idxz) / K] +
                       idxz[(k + 2) * len(idxz) / K:])
            tmpidxtestnz = idxnz[k * len(idxnz) / K:(k + 1) * len(idxnz) / K]
            tmpidxtestz = idxz[k * len(idxz) / K:(k + 1) * len(idxz) / K]
            tmpidxvalnz = idxnz[(k + 1) * len(idxnz) / K:
                                (k + 2) * len(idxnz) / K]
            tmpidxvalz = idxz[(k + 1) * len(idxz) / K:(k + 2) * len(idxz) / K]
        else:
            # Last fold: validation wraps around to the first chunk.
            tmpidxnz = idxnz[len(idxnz) / K:k * len(idxnz) / K]
            tmpidxz = idxz[len(idxz) / K:k * len(idxz) / K]
            tmpidxtestnz = idxnz[k * len(idxnz) / K:(k + 1) * len(idxnz) / K]
            tmpidxtestz = idxz[k * len(idxz) / K:(k + 1) * len(idxz) / K]
            tmpidxvalnz = idxnz[:len(idxnz) / K]
            tmpidxvalz = idxz[:len(idxz) / K]

        # Test data files: one one-hot column per triplet for the left
        # entity (lhs), right entity (rhs) and relation (rel), plus a
        # parallel 0/1 target vector.
        testl = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], len(tmpidxtestnz) + len(tmpidxtestz)))
        testr = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], len(tmpidxtestnz) + len(tmpidxtestz)))
        testo = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], len(tmpidxtestnz) + len(tmpidxtestz)))
        outtest = []
        ct = 0
        for j in tmpidxtestnz:
            i = lnz[j]
            testl[i[0], ct] = 1
            testr[i[1], ct] = 1
            # Relation rows live after the entity rows in the one-hot space.
            testo[i[2] + tensordata.shape[1], ct] = 1
            outtest += [1]
            ct += 1
        for j in tmpidxtestz:
            i = lz[j]
            testl[i[0], ct] = 1
            testr[i[1], ct] = 1
            testo[i[2] + tensordata.shape[1], ct] = 1
            outtest += [0]
            ct += 1
        f = open('../data/%s-test-lhs-fold%s.pkl' % (dataset, k), 'w')
        g = open('../data/%s-test-rhs-fold%s.pkl' % (dataset, k), 'w')
        h = open('../data/%s-test-rel-fold%s.pkl' % (dataset, k), 'w')
        l = open('../data/%s-test-targets-fold%s.pkl' % (dataset, k), 'w')
        cPickle.dump(testl.tocsr(), f, -1)
        cPickle.dump(testr.tocsr(), g, -1)
        cPickle.dump(testo.tocsr(), h, -1)
        cPickle.dump(numpy.asarray(outtest), l, -1)
        f.close()
        g.close()
        h.close()
        l.close()

        # Valid data files (same layout as the test files).
        validl = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], len(tmpidxvalnz) + len(tmpidxvalz)))
        validr = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], len(tmpidxvalnz) + len(tmpidxvalz)))
        valido = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], len(tmpidxvalnz) + len(tmpidxvalz)))
        outvalid = []
        ct = 0
        for j in tmpidxvalnz:
            i = lnz[j]
            validl[i[0], ct] = 1
            validr[i[1], ct] = 1
            valido[i[2] + tensordata.shape[1], ct] = 1
            outvalid += [1]
            ct += 1
        for j in tmpidxvalz:
            i = lz[j]
            validl[i[0], ct] = 1
            validr[i[1], ct] = 1
            valido[i[2] + tensordata.shape[1], ct] = 1
            outvalid += [0]
            ct += 1
        f = open('../data/%s-valid-lhs-fold%s.pkl' % (dataset, k), 'w')
        g = open('../data/%s-valid-rhs-fold%s.pkl' % (dataset, k), 'w')
        h = open('../data/%s-valid-rel-fold%s.pkl' % (dataset, k), 'w')
        l = open('../data/%s-valid-targets-fold%s.pkl' % (dataset, k), 'w')
        cPickle.dump(validl.tocsr(), f, -1)
        cPickle.dump(validr.tocsr(), g, -1)
        cPickle.dump(valido.tocsr(), h, -1)
        cPickle.dump(numpy.asarray(outvalid), l, -1)
        f.close()
        g.close()
        h.close()
        l.close()

        # Train data files
        # Pad the shorter list so positives and negatives pair up 1:1.
        if len(tmpidxz) < len(tmpidxnz):
            while len(tmpidxz) < len(tmpidxnz):
                tmpidxz += tmpidxz[:len(tmpidxnz) - len(tmpidxz)]
        else:
            while len(tmpidxnz) < len(tmpidxz):
                tmpidxnz += tmpidxnz[:len(tmpidxz) - len(tmpidxnz)]

        ct = len(tmpidxz)
        if dataset == 'nations':
            # Feature triplets are appended to every training fold.
            ct += len(lzfeat)
        trainposl = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], ct))
        trainnegl = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], ct))
        trainposr = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], ct))
        trainnegr = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], ct))
        trainposo = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], ct))
        trainnego = scipy.sparse.lil_matrix((tensordata.shape[1] +
                tensordata.shape[2], ct))
        ct = 0
        # Each training column pairs one positive triplet with one negative.
        for u, v in zip(tmpidxnz, tmpidxz):
            ipos = lnz[u]
            ineg = lz[v]
            trainposl[ipos[0], ct] = 1
            trainnegl[ineg[0], ct] = 1
            trainposr[ipos[1], ct] = 1
            trainnegr[ineg[1], ct] = 1
            trainposo[ipos[2] + tensordata.shape[1], ct] = 1
            trainnego[ineg[2] + tensordata.shape[1], ct] = 1
            ct += 1
        # Add all the feature triplets to each fold's training set.
        if dataset == 'nations':
            for u, v in zip(lnzfeat, lzfeat):
                ipos = u
                ineg = v
                trainposl[ipos[0], ct] = 1
                trainnegl[ineg[0], ct] = 1
                trainposr[ipos[1], ct] = 1
                trainnegr[ineg[1], ct] = 1
                trainposo[ipos[2] + tensordata.shape[1], ct] = 1
                trainnego[ineg[2] + tensordata.shape[1], ct] = 1
                ct += 1
        f = open('../data/%s-train-pos-lhs-fold%s.pkl' % (dataset, k), 'w')
        g = open('../data/%s-train-pos-rhs-fold%s.pkl' % (dataset, k), 'w')
        h = open('../data/%s-train-pos-rel-fold%s.pkl' % (dataset, k), 'w')
        l = open('../data/%s-train-neg-lhs-fold%s.pkl' % (dataset, k), 'w')
        m = open('../data/%s-train-neg-rhs-fold%s.pkl' % (dataset, k), 'w')
        n = open('../data/%s-train-neg-rel-fold%s.pkl' % (dataset, k), 'w')
        cPickle.dump(trainposl.tocsr(), f, -1)
        cPickle.dump(trainposr.tocsr(), g, -1)
        cPickle.dump(trainposo.tocsr(), h, -1)
        cPickle.dump(trainnegl.tocsr(), l, -1)
        cPickle.dump(trainnegr.tocsr(), m, -1)
        cPickle.dump(trainnego.tocsr(), n, -1)
        f.close()
        g.close()
        h.close()
        l.close()
        m.close()
        n.close()

0 commit comments

Comments
 (0)