Skip to content

Commit

Permalink
모델 세이브 기능과 evaluation 추가
Browse files Browse the repository at this point in the history
  • Loading branch information
ramel0915 authored and kimjson committed Dec 22, 2021
1 parent 3fec89e commit 562eae6
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 13 deletions.
Binary file modified text-classifier/__pycache__/preprocessor.cpython-37.pyc
Binary file not shown.
39 changes: 35 additions & 4 deletions text-classifier/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ def __init__(self, file_name, root, dataset_name, tokenizer, taxo_manager):
if dataset_name.startswith('amazon'):
self.id_name = "asin"
self.text_name = "reviewText"
self.core_name = "core_classes"
self.core_name = "coreclasses"
else:
self.id_name = "index"
self.text_name = "text"
Expand All @@ -302,6 +302,9 @@ def get_tokens (self, id) :
def get_output_label(self, id):
return (self.id2pos[id], self.id2nonneg[id])

def get_categories(self, id):
    """Return the category entry stored for document ``id``.

    The value is read straight from ``self.id2category``; any lookup error
    raised by that mapping for an unknown ``id`` propagates to the caller.
    """
    category_by_id = self.id2category
    return category_by_id[id]

def load_from_raw (self):
with open(self.root + self.file_name, "r") as fin:
for i, line in enumerate(fin,0):
Expand Down Expand Up @@ -338,7 +341,15 @@ def load_from_raw (self):

self.id2tokens[id] = token_list
self.id2core[id] = core
self.id2category[id] = category
parent = 0
categories = []
for node in category:
childs = self.taxo_manager.child_from_parent(parent)
label_list = self.taxo_manager.id_from_label(node)
label_id = ( list(set(childs) & set(label_list)) )[0]
parent = label_id
categories = categories + [label_id]
self.id2category[id] = categories
if i%5000 == 4999:
print("%dth data preprocessed!"%(i+1))
self.save_tokens()
Expand All @@ -359,12 +370,16 @@ def load_tokens (self):

def load_dicts (self):
with open(self.root + self.file_name, "r") as fin:
sum = 0
for line in fin:
data = json.loads(line)
id = data[self.id_name]
core = data[self.core_name]
category = data["categories"]

if category[0] == '[':
category = category.replace("'", "\"")
category = json.loads(category)
print(category[0])
#find positive, nonnegative set
self.id2pos[id] = []
self.id2nonneg[id] = []
Expand All @@ -384,7 +399,23 @@ def load_dicts (self):
self.id2nonneg[id] = self.id2nonneg[id] + [parent] + [self.taxo_manager.parent_from_child(parent)] + self.taxo_manager.child_from_parent(parent)

self.id2core[id] = core
self.id2category[id] = category

parent = 0
categories = [-1, -1, -1]
for index, node in enumerate(category,0):
childs = self.taxo_manager.child_from_parent(parent)
label_list = self.taxo_manager.id_from_label(node)
if label_list == None:
print(node)
label_id = -1
sum += 1
break
else:
label_id = ( list(set(childs) & set(label_list)) )[0]
parent = label_id
categories[index] = label_id
self.id2category[id] = categories
print("total %d error cases.."%sum)

def save_tokens (self):
with open(self.root + self.dataset_name + '_tokens.jsonl', "w") as fout:
Expand Down
104 changes: 95 additions & 9 deletions text-classifier/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,19 @@ def __init__(self, dir, train_file, taxonomy_file, data_name,
self.activation = activation
self.rescaling = rescaling

def F1_evaluation(self, true_labels, prediction, threshold):
    """Return the SUM of per-sample F1 scores over one batch.

    Args:
        true_labels: tensor whose first dimension is the batch; every other
            dimension is flattened into a list of gold label ids per sample.
            Negative ids are treated as padding and ignored (the
            preprocessor pads category paths with -1).
        prediction: tensor whose first dimension is the batch; after
            flattening, position ``k`` holds the score of label id ``k``.
        threshold: a label counts as predicted when its score is
            ``>= threshold``.

    Returns:
        float: sum over samples of ``2 * |gold & predicted| /
        (|gold| + |predicted|)``. A sample with no gold and no predicted
        labels contributes 0. Callers divide by the sample count to get
        the mean.
    """
    batch_size = true_labels.shape[0]
    if batch_size == 0:
        # reshape(0, -1) is ambiguous for torch; an empty batch scores 0.
        return 0.0

    true_labels = torch.reshape(true_labels, (batch_size, -1))
    prediction = torch.reshape(prediction, (batch_size, -1))

    total_f1 = 0.0
    for gold_row, score_row in zip(true_labels.tolist(), prediction):
        # Distinct, non-padding gold ids only: counting the padded width
        # would inflate the denominator and deflate the score.
        gold = {int(label) for label in gold_row if label >= 0}
        predicted = set(torch.nonzero(score_row >= threshold).flatten().tolist())
        denominator = len(gold) + len(predicted)
        if denominator:
            total_f1 += 2 * len(gold & predicted) / denominator

    return total_f1

def prepare_train(self):

Expand Down Expand Up @@ -120,8 +133,11 @@ def prepare_train(self):
tokens = torch.tensor( self.train_dm.get_tokens(document_id) ,dtype = torch.int32)
tokens = torch.reshape(tokens, (-1, 1))
pos, nonneg = self.train_dm.get_output_label(document_id)
output = torch.zeros(self.L,1)
output = torch.zeros(self.L + 3,1)
mask = torch.ones(self.L,1, dtype = torch.int32)
categories = self.train_dm.get_categories(document_id)
for num, category in enumerate(categories,0):
output[self.L + num] = category

for j in nonneg:
if j in pos:
Expand All @@ -137,16 +153,24 @@ def prepare_train(self):
train_y = torch.cat((train_y, output),0)

train_x = torch.reshape(train_x, ( -1, self.L + self.T ))
train_y = torch.reshape(train_y, ( -1, self.L, 1 ))

train_y = torch.reshape(train_y, ( -1, self.L + 3, 1 ))

self.data_size = train_x.shape[0]
train_dataset = TensorDataset(train_x, train_y)


self.train_dataloader = DataLoader(train_dataset, batch_size=self.B, shuffle=True)


def train(self):
def save_model(self):
    """Save the classifier's state dict to ``<self.dir>trained/text-classifier.pt``.

    The target directory is created first when it does not exist, so the
    first checkpoint of a fresh run does not fail inside ``torch.save``.
    Returns ``None``.
    """
    import os  # local import: the file's import block is not visible from here

    checkpoint_path = self.dir + "trained/text-classifier.pt"
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
    torch.save(self.text_classifier.state_dict(), checkpoint_path)

def load_pretrained_model(self):
    """Restore saved weights into ``self.text_classifier`` and put it in eval mode.

    The state dict is read from ``<self.dir>trained/text-classifier.pt``.
    Returns ``None``.
    """
    classifier = self.text_classifier
    checkpoint_path = self.dir + "trained/text-classifier.pt"
    saved_state = torch.load(checkpoint_path)
    classifier.load_state_dict(saved_state)
    classifier.eval()

def self_train(self):
print("Start training! bert learning rate: %f, other learning rate: %f, epoch: %d, batch size: %d"
%(self.bert_lr, self.others_lr, self.epoch, self.B))
self.text_classifier.cuda()
Expand All @@ -161,8 +185,9 @@ def train(self):
for i, train_data in enumerate(self.train_dataloader):



inputs, outputs = train_data
true_labels = outputs[:, self.L:, :]
outputs = outputs[:, :self.L, :]
predicted = self.text_classifier(inputs.cuda())
loss = self.loss_fun(predicted, outputs.cuda())
batch_loss += loss.item()
Expand Down Expand Up @@ -201,6 +226,66 @@ def train(self):
print('elapsed time : %f'%(time.time()-start))

print('Finished Training')

def train(self, patience, threshold):
    """Train with gradient accumulation and early stopping on training F1.

    Every mini-batch is run through ``self.text_classifier``; gradients are
    accumulated and the optimizer steps once every 8 mini-batches, plus once
    more at the end of the epoch for any leftover mini-batches so their
    gradients are not thrown away. After each epoch the summed F1 (from
    ``self.F1_evaluation``) is compared with the best so far: an improvement
    saves the model via ``self.save_model()``, otherwise a patience counter
    is incremented.

    Args:
        patience: number of consecutive non-improving epochs after which
            training stops early.
        threshold: score threshold forwarded to ``self.F1_evaluation``.

    Returns:
        None.
    """
    accumulation_steps = 8  # mini-batches accumulated per optimizer step

    print("Start training! bert learning rate: %f, other learning rate: %f, epoch: %d, batch size: %d"
          %(self.bert_lr, self.others_lr, self.epoch, self.B))
    self.text_classifier.cuda()
    self.text_classifier.train()
    max_accuracy = 0.0
    patience_count = 0

    for epoch in range(self.epoch):
        start = time.time()
        running_loss = 0.0
        running_accuracy = 0.0

        # Totals for the current accumulation window (used for the progress line).
        batch_loss = 0.0
        batch_accuracy = 0.0
        batch_samples = 0
        pending = 0

        self.optimizer.zero_grad()
        for i, train_data in enumerate(self.train_dataloader):
            inputs, outputs = train_data
            # The tail of the target tensor carries the category label ids;
            # the first self.L rows are the actual training targets.
            true_labels = outputs[:, self.L:, :]
            outputs = outputs[:, :self.L, :]

            predicted = self.text_classifier(inputs.cuda())
            loss = self.loss_fun(predicted, outputs.cuda())
            loss.backward()

            accuracy = self.F1_evaluation(true_labels, predicted, threshold)
            loss_value = loss.item()

            running_accuracy += accuracy
            running_loss += loss_value
            batch_accuracy += accuracy
            batch_loss += loss_value
            batch_samples += predicted.shape[0]
            pending += 1

            if pending == accumulation_steps:
                self.optimizer.step()
                self.optimizer.zero_grad()
                print('[%d, %5d] batch loss: %.3f accuracy : %.3f%%' %
                      (epoch + 1, i + 1, batch_loss, batch_accuracy * 100 / batch_samples))
                batch_loss = 0.0
                batch_accuracy = 0.0
                batch_samples = 0
                pending = 0

        if pending:
            # Leftover mini-batches (batch count not a multiple of 8): apply
            # their gradients instead of discarding them at the next zero_grad.
            self.optimizer.step()
            self.optimizer.zero_grad()
            print('[%d, %5d] batch loss: %.3f accuracy : %.3f%%' %
                  (epoch + 1, i + 1, batch_loss, batch_accuracy * 100 / batch_samples))

        # Exactly three values for the three placeholders; guard an empty dataset.
        epoch_accuracy = running_accuracy * 100 / self.data_size if self.data_size else 0.0
        print('[%d] total loss: %.3f, accuracy: %.3f%%' %
              (epoch + 1, running_loss, epoch_accuracy))
        print('[%d] elapsed time : %f'%(epoch+1, time.time()-start))

        if running_accuracy > max_accuracy:
            self.save_model()
            max_accuracy = running_accuracy
            patience_count = 0
        else:
            patience_count += 1

        if patience_count >= patience:
            print('Finished Training')
            return

    print('Finished Training')



Expand All @@ -222,17 +307,18 @@ def train(self):

"""
#DBPedia dataset
train_file = 'DBPEDIA_30000_coreclass.jsonl'
train_file = 'DBPEDIA-coreclass-45000.jsonl'
taxonomy_file = 'taxonomy.json'
data_name = 'DBPEDIA'
"""


#amazon dataset
train_file = 'train-with-core-class-1000.jsonl'
train_file = 'amazon-coreclass-1-10000.jsonl'
taxonomy_file = 'taxonomy.json'
data_name = 'amazon'


bert_lr = 5e-5
others_lr = 4e-3
token_length = 500
Expand All @@ -251,4 +337,4 @@ def train(self):
batch_size, epoch, activation, rescaling)

trainer.prepare_train()
trainer.train()
trainer.train(patience = 3, threshold = 0.3)

0 comments on commit 562eae6

Please sign in to comment.