New changes BB working with CNN
Ubuntu committed May 3, 2018
1 parent 9f70264 commit 5da67a8
Showing 34 changed files with 1,241 additions and 221 deletions.
98 changes: 47 additions & 51 deletions active_cls.py
@@ -57,7 +57,7 @@
elif opt.usemodel == 'BiLSTM' and opt.dataset == 'mareview':
parameters['dpout'] = 0.5
parameters['wldim'] = 200
parameters['nepch'] = 5
parameters['nepch'] = 10

parameters['lrate'] = 0.001
parameters['batch_size'] = 50
@@ -77,7 +77,7 @@
elif opt.usemodel == 'CNN' and opt.dataset == 'mareview':
parameters['dpout'] = 0.5
parameters['wlchl'] = 100
parameters['nepch'] = 5
parameters['nepch'] = 10

parameters['lrate'] = 0.001
parameters['batch_size'] = 50
@@ -97,7 +97,7 @@
elif opt.usemodel == 'BiLSTM_MC' and opt.dataset == 'mareview':
parameters['dpout'] = 0.5
parameters['wldim'] = 200
parameters['nepch'] = 5
parameters['nepch'] = 10

parameters['lrate'] = 0.001
parameters['batch_size'] = 50
@@ -117,7 +117,7 @@
elif opt.usemodel == 'CNN_MC' and opt.dataset == 'mareview':
parameters['dpout'] = 0.5
parameters['wlchl'] = 100
parameters['nepch'] = 5
parameters['nepch'] = 10

parameters['lrate'] = 0.001
parameters['batch_size'] = 50
@@ -169,59 +169,56 @@
total_sentences = len(train_data)
avail_budget = total_sentences

print('Building Model............................................................................')
if (model_name == 'BiLSTM'):
print ('BiLSTM')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_hidden_dim = parameters['wldim']
output_size = parameters['opsiz']

model = BiLSTM(word_vocab_size, word_embedding_dim, word_hidden_dim,
output_size, pretrained = word_embeds)

elif (model_name == 'CNN'):
print ('CNN')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_out_channels = parameters['wlchl']
output_size = parameters['opsiz']

model = CNN(word_vocab_size, word_embedding_dim, word_out_channels,
output_size, pretrained = word_embeds)

elif (model_name == 'BiLSTM_MC'):
print ('BiLSTM_MC')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_hidden_dim = parameters['wldim']
output_size = parameters['opsiz']

model = BiLSTM_MC(word_vocab_size, word_embedding_dim, word_hidden_dim,
output_size, pretrained = word_embeds)

elif (model_name == 'CNN_MC'):
print ('CNN_MC')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_out_channels = parameters['wlchl']
output_size = parameters['opsiz']

model = CNN_MC(word_vocab_size, word_embedding_dim, word_out_channels,
output_size, pretrained = word_embeds)

if model_load:
print ('Loading Saved Weights....................................................................')
model_path = os.path.join(result_path, model_name, 'active_checkpoint', acquire_method,
checkpoint, 'modelweights')
model=torch.load(model_path)

acquisition_path = os.path.join(result_path, model_name, 'active_checkpoint', acquire_method,
checkpoint, 'acquisition2.p')
acquisition_function = pkl.load(open(acquisition_path,'rb'))

else:
print('Building Model............................................................................')
if (model_name == 'BiLSTM'):
print ('BiLSTM')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_hidden_dim = parameters['wldim']
output_size = parameters['opsiz']

model = BiLSTM(word_vocab_size, word_embedding_dim, word_hidden_dim,
output_size, pretrained = word_embeds)

elif (model_name == 'CNN'):
print ('CNN')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_out_channels = parameters['wlchl']
output_size = parameters['opsiz']

model = CNN(word_vocab_size, word_embedding_dim, word_out_channels,
output_size, pretrained = word_embeds)

elif (model_name == 'BiLSTM_MC'):
print ('BiLSTM_MC')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_hidden_dim = parameters['wldim']
output_size = parameters['opsiz']

model = BiLSTM_MC(word_vocab_size, word_embedding_dim, word_hidden_dim,
output_size, pretrained = word_embeds)

elif (model_name == 'CNN_MC'):
print ('CNN_MC')
word_vocab_size = len(word_to_id)
word_embedding_dim = parameters['wrdim']
word_out_channels = parameters['wlchl']
output_size = parameters['opsiz']

model = CNN_MC(word_vocab_size, word_embedding_dim, word_out_channels,
output_size, pretrained = word_embeds)

acquisition_function = Acquisition_CLS(train_data, init_percent=init_percent, seed=9,
acquisition_function = Acquisition_CLS(train_data, init_percent=init_percent, seed=0,
acq_mode = parameters['acqmd'])

model.cuda()
@@ -230,8 +227,6 @@
print('Initial learning rate is: %s' %(learning_rate))
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

trainer = Trainer(model, optimizer, result_path, model_name, usedataset=opt.dataset)

active_train_data = [train_data[i] for i in acquisition_function.train_index]
sentences_acquired = len(acquisition_function.train_index)

@@ -247,6 +242,7 @@
os.makedirs(checkpoint_path)

acq_plot_every = max(len(acquisition_function.train_index)/(5*parameters['batch_size']),1)
trainer = Trainer(model, optimizer, result_path, model_name, usedataset=opt.dataset)
losses, all_F = trainer.train_model(num_epochs, active_train_data, test_data, learning_rate,
batch_size = parameters['batch_size'], checkpoint_folder = checkpoint_folder,
plot_every = acq_plot_every)
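The model_load branch restores both the serialized model ('modelweights', read back with torch.load) and the pickled acquisition state ('acquisition2.p'), and the Trainer is now built only after this setup, so a resumed run reuses the already-acquired indices. Below is a minimal sketch of the save side those paths imply; save_active_checkpoint is a hypothetical helper, and the torch.save/pkl.dump calls are assumptions inferred from the loading code, not part of this commit.

```python
import os
import pickle as pkl
import torch

def save_active_checkpoint(model, acquisition_function, result_path, model_name,
                           acquire_method, checkpoint):
    # Hypothetical counterpart to the model_load branch: write the two artifacts
    # it reads back, the whole serialized model and the pickled acquisition state.
    checkpoint_dir = os.path.join(result_path, model_name, 'active_checkpoint',
                                  acquire_method, checkpoint)
    os.makedirs(checkpoint_dir, exist_ok=True)
    # The loader calls torch.load on 'modelweights', so the full model object
    # (not just a state_dict) is assumed to be saved here.
    torch.save(model, os.path.join(checkpoint_dir, 'modelweights'))
    with open(os.path.join(checkpoint_dir, 'acquisition2.p'), 'wb') as f:
        pkl.dump(acquisition_function, f)
```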
3 changes: 1 addition & 2 deletions active_learning/__init__.py
@@ -1,2 +1 @@
from .acquisition import Acquisition
from .acquisition_cls import Acquisition_CLS
from .acquisition_srl import Acquisition_SRL
6 changes: 3 additions & 3 deletions active_learning/acquisition.py
@@ -30,7 +30,7 @@ def get_random(self, data, num_tokens):
i+=1
self.train_index.update(cur_indices)

def get_mnlp(self, dataset, model_path, decoder, num_tokens, batch_size = 32):
def get_mnlp(self, dataset, model_path, decoder, num_tokens, batch_size = 50):

model = torch.load(model_path)
model.train(False)
@@ -87,7 +87,7 @@ def get_mnlp(self, dataset, model_path, decoder, num_tokens, batch_size = 32):

print ('D Acquisition took %d seconds:' %(time.time()-tm))

def get_mnlp_mc(self, dataset, model_path, decoder, num_tokens, nsamp=150, batch_size = 32):
def get_mnlp_mc(self, dataset, model_path, decoder, num_tokens, nsamp=100, batch_size = 50):

model = torch.load(model_path)
model.train(True)
@@ -167,7 +167,7 @@ def get_mnlp_mc(self, dataset, model_path, decoder, num_tokens, nsamp=150, batch
print ('MC Acquisition took %d seconds:' %(time.time()-tm))
print ('*'*80)

def obtain_data(self, data, model_path=None, model_name=None, acquire=2, method='random', num_samples=150):
def obtain_data(self, data, model_path=None, model_name=None, acquire=2, method='random', num_samples=100):

num_tokens = (acquire*self.tokenlen)/100

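The get_mnlp and get_mnlp_mc defaults change here (batch_size 32 to 50, nsamp and num_samples 150 to 100), matching the new acquisition_srl.py below. The selection these methods implement, made explicit in that new file, ranks unlabeled sentences by their length-normalized decoder score and acquires the lowest-scoring ones until the token budget is spent. A minimal standalone sketch of that ranking under assumed inputs; rank_by_mnlp and its arguments are illustrative names, not part of the module.

```python
import numpy as np

def rank_by_mnlp(scores, lengths, num_tokens):
    """Illustrative restatement of the MNLP selection rule: sort sentences by
    decoder score normalized by length (least confident first) and take
    sentences until roughly num_tokens tokens are covered."""
    norm_scores = np.asarray(scores, dtype=float) / np.asarray(lengths, dtype=float)
    order = np.argsort(norm_scores)  # lowest normalized score first
    picked, covered = [], 0
    for idx in order:
        if covered >= num_tokens:
            break
        picked.append(int(idx))
        covered += int(lengths[idx])
    return picked

# Example: rank_by_mnlp(scores=[-3.2, -9.1, -1.0], lengths=[8, 12, 5], num_tokens=10)
# picks only index 1 (lowest normalized score, -9.1/12), whose 12 tokens
# already cover the budget.
```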
190 changes: 190 additions & 0 deletions active_learning/acquisition_srl.py
@@ -0,0 +1,190 @@
import torch
torch.manual_seed(0)
from torch.autograd import Variable
import numpy as np
from collections import Counter
import time
from scipy import stats
from neural_srl.util.utils import *
import pandas as pd

class Acquisition_SRL(object):

def __init__(self, train_data, acq_mode='d', init_percent=2, seed=0, usecuda = True):
self.tokenlen = sum([len(x['words']) for x in train_data])
self.train_index = set()
self.npr = np.random.RandomState(seed)
self.obtain_data(train_data, acquire = init_percent)
self.acq_mode = acq_mode
self.usecuda = usecuda

def get_random(self, data, num_tokens):
test_indices = self.npr.permutation(len(data))
cur_tokens=0
cur_indices = set()
i = 0
while cur_tokens<num_tokens:
if test_indices[i] not in self.train_index:
cur_indices.add(test_indices[i])
cur_tokens += len(data[test_indices[i]]['words'])
i+=1
self.train_index.update(cur_indices)

def get_mnlp(self, dataset, model_path, decoder, num_tokens, batch_size = 50):

model = torch.load(model_path)
model.train(False)
tm = time.time()
probs = np.ones(len(dataset))*float('Inf')

new_dataset = [datapoint for j,datapoint in enumerate(dataset) if j not in self.train_index]
new_datapoints = [j for j in range(len(dataset)) if j not in self.train_index]

data_batches = create_batches(new_dataset, batch_size = batch_size, str_words = True,
tag_padded = False)
probscores = []
for data in data_batches:

words = data['words']
verbs = data['verbs']
caps = data['caps']
mask = data['tagsmask']

if self.usecuda:
words = Variable(torch.LongTensor(words)).cuda()
verbs = Variable(torch.LongTensor(verbs)).cuda()
caps = Variable(torch.LongTensor(caps)).cuda()
mask = Variable(torch.LongTensor(mask)).cuda()
else:
words = Variable(torch.LongTensor(words))
verbs = Variable(torch.LongTensor(verbs))
caps = Variable(torch.LongTensor(caps))
mask = Variable(torch.LongTensor(mask))

wordslen = data['wordslen']
sort_info = data['sort_info']

score = model.decode(words, verbs, caps, wordslen, mask, usecuda = self.usecuda,
score_only = True)

norm_scores = score/np.array(wordslen)
assert len(norm_scores) == len(words)
probscores.extend(list(norm_scores[np.array(sort_info)]))

assert len(new_datapoints) == len(probscores)
probs[new_datapoints] = np.array(probscores)

test_indices = np.argsort(probs)
cur_tokens=0
cur_indices = set()
i = 0
while cur_tokens<num_tokens:
cur_indices.add(test_indices[i])
cur_tokens += len(dataset[test_indices[i]]['words'])
i+=1
self.train_index.update(cur_indices)

print ('D Acquisition took %d seconds:' %(time.time()-tm))

def get_mnlp_mc(self, dataset, model_path, decoder, num_tokens, nsamp=100, batch_size = 50):

model = torch.load(model_path)
model.train(True)
tm = time.time()

probs = np.ones((len(dataset),nsamp))*float('Inf')
varsc = np.ones(len(dataset))*float('Inf')

new_dataset = [datapoint for j,datapoint in enumerate(dataset) if j not in self.train_index]
new_datapoints = [j for j in range(len(dataset)) if j not in self.train_index]

data_batches = create_batches(new_dataset, batch_size = batch_size, str_words = True,
tag_padded = False)

varsc_outer_list = []
probs_outer_list = []
for data in data_batches:

words = data['words']
verbs = data['verbs']
caps = data['caps']
mask = data['tagsmask']

if self.usecuda:
words = Variable(torch.LongTensor(words)).cuda()
verbs = Variable(torch.LongTensor(verbs)).cuda()
caps = Variable(torch.LongTensor(caps)).cuda()
mask = Variable(torch.LongTensor(mask)).cuda()
else:
words = Variable(torch.LongTensor(words))
verbs = Variable(torch.LongTensor(verbs))
caps = Variable(torch.LongTensor(caps))
mask = Variable(torch.LongTensor(mask))

wordslen = data['wordslen']
sort_info = data['sort_info']

tag_seq_list = []
probs_list = []
for itr in range(nsamp):
score, tag_seq = model.decode(words, verbs, caps, wordslen, mask, usecuda = self.usecuda,
score_only = False)
tag_seq = [[str(tg) for tg in one_tag_seq] for one_tag_seq in tag_seq]
tag_seq = np.array(['_'.join(one_tag_seq) for one_tag_seq in tag_seq])
tag_seq_new = tag_seq[np.array(sort_info)]
assert len(tag_seq_new) == len(words)
tag_seq_list.append(tag_seq_new)
norm_scores = score/np.array(wordslen)
probs_list.append(norm_scores[np.array(sort_info)])

tag_seq_list = np.array(tag_seq_list)
probs_list = np.array(probs_list).transpose()
_, tag_seq_count = stats.mode(tag_seq_list)
tag_seq_count = tag_seq_count.squeeze(0)
assert len(tag_seq_count) == len(words)
varsc_outer_list.extend(list(tag_seq_count))
probs_outer_list.extend(list(probs_list))

assert len(new_datapoints) == len(varsc_outer_list)
varsc[new_datapoints] = np.array(varsc_outer_list)
assert len(new_datapoints) == len(probs_outer_list)
probs[new_datapoints,:] = np.array(probs_outer_list)
probsmean = np.mean(probs, axis = 1)
test_indices = np.lexsort((probsmean, varsc))

cur_tokens=0
cur_indices = set()
i = 0
while cur_tokens<num_tokens:
cur_indices.add(test_indices[i])
cur_tokens += len(dataset[test_indices[i]]['words'])
i+=1
self.train_index.update(cur_indices)

print ('*'*80)
print ('MC Acquisition took %d seconds:' %(time.time()-tm))
print ('*'*80)

def obtain_data(self, data, model_path=None, model_name=None, acquire=2, method='random', num_samples=100):

num_tokens = (acquire*self.tokenlen)/100

if model_path is None or model_name is None:
method = 'random'

if method=='random':
self.get_random(data, num_tokens)
else:
decoder = None
if self.acq_mode == 'd':
if method=='mnlp':
self.get_mnlp(data, model_path, decoder, num_tokens)
else:
raise NotImplementedError()
elif self.acq_mode == 'm':
if method=='mnlp':
self.get_mnlp_mc(data, model_path, decoder, num_tokens, nsamp = num_samples)
else:
raise NotImplementedError()
else:
raise NotImplementedError()
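The new Acquisition_SRL class mirrors the existing acquisition modules but batches SRL-specific inputs (words, verbs, caps, tag masks). A minimal usage sketch under assumed inputs: the toy train_data below only carries the 'words' key that the constructor and get_random need, and the commented MNLP round uses placeholder path and model_name values, none of which come from this commit.

```python
from active_learning.acquisition_srl import Acquisition_SRL

# Toy corpus: the constructor and get_random only need a 'words' list per sentence.
train_data = [{'words': ['w%d' % k for k in range(n)]} for n in (5, 8, 12, 20, 30)]

# ~20% of tokens are picked at random in __init__ to seed the labeled pool.
acq = Acquisition_SRL(train_data, acq_mode='d', init_percent=20, seed=0)
print(sorted(acq.train_index))

# A later MNLP round would need full SRL datapoints (verbs, caps, masks) and a
# checkpoint saved with torch.save; the path and model_name here are placeholders.
# acq.obtain_data(train_data, model_path='results/SRL/modelweights',
#                 model_name='SRL', acquire=2, method='mnlp')

active_train_data = [train_data[i] for i in acq.train_index]
```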
