
Commit

add debugging stuff
npow committed Aug 16, 2015
1 parent bf72c43 commit 64ab445
Showing 1 changed file with 29 additions and 7 deletions.
36 changes: 29 additions & 7 deletions main.py
@@ -128,7 +128,7 @@ class Model:
     def __init__(self, train_file, test_file, batch_size=32, embedding_size=20, max_norm=40, lr=0.01, num_hops=3, adj_weight_tying=True, **kwargs):
         train_lines, test_lines = self.get_lines(train_file), self.get_lines(test_file)
         lines = np.concatenate([train_lines, test_lines], axis=0)
-        vocab, word_to_idx, max_seqlen, max_sentlen = self.get_vocab(lines)
+        vocab, word_to_idx, idx_to_word, max_seqlen, max_sentlen = self.get_vocab(lines)

         self.data = { 'train': {}, 'test': {} }
         S_train, self.data['train']['C'], self.data['train']['Q'], self.data['train']['Y'] = self.process_dataset(train_lines, word_to_idx, max_sentlen, offset=0)
@@ -137,7 +137,6 @@ def __init__(self, train_file, test_file, batch_size=32, embedding_size=20, max_
         for i in range(10):
            for k in ['C', 'Q', 'Y']:
                print k, self.data['test'][k][i]
-
        print 'batch_size:', batch_size, 'max_seqlen:', max_seqlen, 'max_sentlen:', max_sentlen
        print 'sentences:', S.shape
        print 'vocab:', len(vocab), vocab
@@ -159,7 +158,8 @@ def __init__(self, train_file, test_file, batch_size=32, embedding_size=20, max_
         self.lb = lb
         self.init_lr = lr
         self.lr = self.init_lr
-        self.questions = np.array([x for x in lines if x['type']=='q'])
+        self.S = S
+        self.idx_to_word = idx_to_word

         c = T.imatrix()
         q = T.ivector()
@@ -238,7 +238,11 @@ def compute_f1(self, dataset):
         y_true = [self.vocab.index(y) for y in dataset['Y'][:len(y_pred)]]
         print metrics.confusion_matrix(y_true, y_pred)
         print metrics.classification_report(y_true, y_pred)
-        return metrics.f1_score(y_true, y_pred, average='weighted', pos_label=None)
+        errors = []
+        for i,(t,p) in enumerate(zip(y_true,y_pred)):
+            if t != p:
+                errors.append((i,self.lb.classes_[p]))
+        return metrics.f1_score(y_true, y_pred, average='weighted', pos_label=None), errors

     def train(self, n_epochs=100, shuffle_batch=False):
         epoch = 0
@@ -267,13 +271,27 @@ def train(self, n_epochs=100, shuffle_batch=False):
             print 'epoch:', epoch, 'cost:', (total_cost / len(indices)), ' took: %d(s)' % (end_time - start_time)

             print 'TRAIN', '=' * 40
-            train_f1 = self.compute_f1(self.data['train'])
+            train_f1, train_errors = self.compute_f1(self.data['train'])
+            for i,pred in train_errors[:10]:
+                print 'context: ', self.to_words(self.data['train']['C'][i])
+                print 'question: ', self.to_words([self.data['train']['Q'][i]])
+                print 'correct answer: ', self.data['train']['Y'][i]
+                print 'predicted answer: ', pred
+                print '---' * 20
+
             print 'TRAIN ERROR:', 1-train_f1

             print 'TEST', '=' * 40
-            test_f1 = self.compute_f1(self.data['test'])
+            test_f1, test_errors = self.compute_f1(self.data['test'])
             print '*** TEST ERROR:', 1-test_f1

+    def to_words(self, indices):
+        sents = []
+        for idx in indices:
+            words = ' '.join([self.idx_to_word[idx] for idx in self.S[idx] if idx > 0])
+            sents.append(words)
+        return ' '.join(sents)
+
     def shuffle_sync(self, dataset):
         p = np.random.permutation(len(dataset['Y']))
         for k in ['C', 'Q', 'Y']:
@@ -310,14 +328,18 @@ def get_vocab(self, lines):
         for w in vocab:
             word_to_idx[w] = len(word_to_idx) + 1

+        idx_to_word = {}
+        for w,idx in word_to_idx.iteritems():
+            idx_to_word[idx] = w
+
         max_seqlen = 0
         for i,line in enumerate(lines):
             if line['type'] == 'q':
                 id = line['id']-1
                 indices = [idx for idx in range(i-id, i) if lines[idx]['type'] == 's']
                 max_seqlen = max(len(indices), max_seqlen)

-        return vocab, word_to_idx, max_seqlen, max_sentlen
+        return vocab, word_to_idx, idx_to_word, max_seqlen, max_sentlen

     def process_dataset(self, lines, word_to_idx, max_sentlen, offset):
         S, C, Q, Y = [], [], [], []
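
The heart of the new debugging path is the idx_to_word reverse lookup plus to_words, which decode rows of the padded sentence matrix S back into readable text for the printed error reports. Below is a minimal standalone sketch of that decoding pattern; the names mirror the diff, but the vocabulary and sentence matrix are made up for illustration.

    import numpy as np

    # Hypothetical toy vocabulary; as in get_vocab(), real word indices
    # start at 1, so index 0 can serve as padding.
    word_to_idx = {'john': 1, 'went': 2, 'to': 3, 'the': 4, 'kitchen': 5}
    idx_to_word = {idx: w for w, idx in word_to_idx.items()}

    # One padded sentence per row, like the self.S matrix stored in __init__.
    S = np.array([[1, 2, 3, 4, 5, 0],
                  [3, 4, 5, 0, 0, 0]], dtype=np.int32)

    def to_words(S, idx_to_word, indices):
        # Decode each referenced row of S, skipping the 0 padding entries,
        # then join the decoded sentences into one string -- the same logic
        # as the to_words method added in this commit.
        sents = []
        for row in indices:
            sents.append(' '.join(idx_to_word[i] for i in S[row] if i > 0))
        return ' '.join(sents)

    print(to_words(S, idx_to_word, [0, 1]))
    # -> john went to the kitchen to the kitchen

In the commit itself, train() calls this on the context rows self.data['train']['C'][i] of each misclassified example, so a wrong answer is printed together with its full story and question.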
