|
"""Build K-fold train/valid/test pickle files for relational tensors.

For each of the 'kinships', 'umls' and 'nations' datasets the script loads
the pickled dict ``<datapath><dataset>.pkl`` (its 'tensor' entry is used) and
the two index lists stored in ``<datapath><dataset>_permutations.pkl``
(non-zero indices first, zero indices second).  For every fold it writes
sparse one-hot encodings of the selected (lhs, rhs, rel) triplets to
``../data/<dataset>-<split>-<part>-fold<k>.pkl``.

Set ``datapath`` below before running.  It is used as a plain string prefix,
so it must end with a path separator.

NOTE: pickle can execute arbitrary code while loading; only point
``datapath`` at files you trust.
"""
import os
try:
    import cPickle  # Python 2
except ImportError:
    # Python 3 has no cPickle module; pickle offers the same load/dump API.
    # NOTE(review): pickles written under Python 2 may need
    # encoding='latin1' when loaded under Python 3 -- confirm if porting.
    import pickle as cPickle

import numpy
import scipy.sparse

# Number of folds
K = 10
# Prefix of the input pickle files; must be set by the user.
datapath = None

# For 'nations', triplets whose first or second index is >= this value are
# treated as feature triplets and added to every training fold.
NATIONS_FEATURE_START = 14


def list_triplets(tensordata, feature_start=None):
    """Enumerate the (i, j, k) positions of a 3-way tensor by cell value.

    Cells are visited in row-major order.  Cells equal to 1 go to the
    non-zero lists, cells equal to 0 go to the zero lists, and any other
    value is skipped.

    :param tensordata: object exposing ``shape`` and ``[i, j, k]`` indexing.
    :param feature_start: when not None, positions with ``i >= feature_start``
        or ``j >= feature_start`` are collected in the separate feature lists.
    :returns: ``(lnz, lz, lnzfeat, lzfeat)`` lists of index tuples; the two
        feature lists are empty when ``feature_start`` is None.
    """
    lnz, lz, lnzfeat, lzfeat = [], [], [], []
    for i in range(tensordata.shape[0]):
        for j in range(tensordata.shape[1]):
            is_feat = (feature_start is not None and
                       (i >= feature_start or j >= feature_start))
            for k in range(tensordata.shape[2]):
                value = tensordata[i, j, k]
                if value == 0:
                    (lzfeat if is_feat else lz).append((i, j, k))
                elif value == 1:
                    (lnzfeat if is_feat else lnz).append((i, j, k))
    return lnz, lz, lnzfeat, lzfeat


def pad_to_same_length(first, second):
    """Extend the shorter of two lists in place until both have equal length.

    The shorter list is grown by re-appending its own leading elements, so it
    ends up as a cyclic repetition of its initial content.  Lists of equal
    length are left untouched.

    :raises ValueError: if exactly one list is empty, since there is nothing
        to repeat (the unguarded loop would never terminate).
    """
    if len(first) < len(second):
        shorter, longer = first, second
    else:
        shorter, longer = second, first
    if longer and not shorter:
        raise ValueError('cannot pad an empty list to length %d' % len(longer))
    while len(shorter) < len(longer):
        shorter.extend(shorter[:len(longer) - len(shorter)])


def split_fold(idx, k, nfolds):
    """Split an index sequence into train/valid/test parts for fold ``k``.

    Chunk ``k`` is the test part and the following chunk is the validation
    part; for the last fold the validation part wraps around to the first
    chunk.  Everything else is the training part.

    :returns: ``(train, valid, test)`` as new lists.
    """
    idx = list(idx)
    n = len(idx)
    # Floor division keeps the bounds integral on both Python 2 and 3.
    test = idx[k * n // nfolds:(k + 1) * n // nfolds]
    if k != nfolds - 1:
        valid = idx[(k + 1) * n // nfolds:(k + 2) * n // nfolds]
        train = idx[:k * n // nfolds] + idx[(k + 2) * n // nfolds:]
    else:
        valid = idx[:n // nfolds]
        train = idx[n // nfolds:k * n // nfolds]
    return train, valid, test


def encode_triplets(triplets, nrows, rel_offset):
    """One-hot encode triplets as three sparse matrices, one column each.

    Column ``c`` of the lhs / rhs / rel matrix has a single 1 at row ``i`` /
    ``j`` / ``k + rel_offset`` for ``triplets[c] == (i, j, k)``.

    :returns: ``(lhs, rhs, rel)`` CSR matrices of shape
        ``(nrows, len(triplets))``.
    """
    shape = (nrows, len(triplets))
    lhs = scipy.sparse.lil_matrix(shape)
    rhs = scipy.sparse.lil_matrix(shape)
    rel = scipy.sparse.lil_matrix(shape)
    for col, (i, j, k) in enumerate(triplets):
        lhs[i, col] = 1
        rhs[j, col] = 1
        rel[k + rel_offset, col] = 1
    return lhs.tocsr(), rhs.tocsr(), rel.tocsr()


def dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` with the highest protocol.

    The file is opened in binary mode because protocol -1 is a binary format.
    """
    with open(path, 'wb') as f:
        cPickle.dump(obj, f, -1)


def save_encoded(triplets, nrows, rel_offset, dataset, prefix, fold):
    """Encode ``triplets`` and write the lhs/rhs/rel matrices for one fold."""
    lhs, rhs, rel = encode_triplets(triplets, nrows, rel_offset)
    for part, matrix in (('lhs', lhs), ('rhs', rhs), ('rel', rel)):
        dump_pickle(matrix, '../data/%s-%s-%s-fold%s.pkl'
                    % (dataset, prefix, part, fold))


def main():
    """Generate the fold files for every dataset."""
    if datapath is None:
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        raise ValueError('set `datapath` to the input data prefix first')

    if not os.path.isdir('../data'):
        os.mkdir('../data')

    for dataset in ['kinships', 'umls', 'nations']:
        with open(datapath + dataset + '.pkl', 'rb') as f:
            dictdata = cPickle.load(f)
        tensordata = dictdata['tensor']

        feature_start = NATIONS_FEATURE_START if dataset == 'nations' else None
        lnz, lz, lnzfeat, lzfeat = list_triplets(tensordata, feature_start)

        # Pad the feature triplets lists (same for all training folds).
        # Both lists are empty for datasets without feature triplets.
        pad_to_same_length(lzfeat, lnzfeat)

        with open(datapath + dataset + '_permutations.pkl', 'rb') as f:
            idxnz = cPickle.load(f)
            idxz = cPickle.load(f)

        # Rows cover the second tensor axis followed by the third one;
        # relation rows are therefore shifted by shape[1].
        nrows = tensordata.shape[1] + tensordata.shape[2]
        rel_offset = tensordata.shape[1]

        # For each fold
        for k in range(K):
            trainnz, validnz, testnz = split_fold(idxnz, k, K)
            trainz, validz, testz = split_fold(idxz, k, K)

            # Test and valid data files: positives first, then negatives.
            for split, selnz, selz in (('test', testnz, testz),
                                       ('valid', validnz, validz)):
                triplets = [lnz[j] for j in selnz] + [lz[j] for j in selz]
                targets = [1] * len(selnz) + [0] * len(selz)
                save_encoded(triplets, nrows, rel_offset, dataset, split, k)
                dump_pickle(numpy.asarray(targets),
                            '../data/%s-%s-targets-fold%s.pkl'
                            % (dataset, split, k))

            # Train data files: pad the shorter index list so every positive
            # column is paired with a negative column.
            pad_to_same_length(trainz, trainnz)
            # Add all the feature triplets to each fold.
            pos = [lnz[u] for u in trainnz] + lnzfeat
            neg = [lz[v] for v in trainz] + lzfeat
            save_encoded(pos, nrows, rel_offset, dataset, 'train-pos', k)
            save_encoded(neg, nrows, rel_offset, dataset, 'train-neg', k)


if __name__ == '__main__':
    main()
0 commit comments