모델 세이브 기능과 evaluation 추가

team-corefinder · Dec 22, 2021 · 562eae6 · 562eae6
1 parent 3fec89e
commit 562eae6
Show file tree

Hide file tree

Showing 3 changed files with 130 additions and 13 deletions.
diff --git a/text-classifier/__pycache__/preprocessor.cpython-37.pyc b/text-classifier/__pycache__/preprocessor.cpython-37.pyc
diff --git a/text-classifier/preprocessor.py b/text-classifier/preprocessor.py
@@ -287,7 +287,7 @@ def __init__(self, file_name, root, dataset_name, tokenizer, taxo_manager):
     if dataset_name.startswith('amazon'):
       self.id_name = "asin"
       self.text_name = "reviewText"
-      self.core_name = "core_classes"
+      self.core_name = "coreclasses"
     else:
       self.id_name = "index"
       self.text_name = "text"
@@ -302,6 +302,9 @@ def get_tokens (self, id) :
   def get_output_label(self, id):
     return (self.id2pos[id], self.id2nonneg[id])
 
+  def get_categories(self, id):
+    """Return the category entry stored in ``self.id2category`` for ``id``."""
+    category_ids = self.id2category[id]
+    return category_ids
+
   def load_from_raw (self):
     with open(self.root + self.file_name, "r") as fin:
       for i, line in enumerate(fin,0):
@@ -338,7 +341,15 @@ def load_from_raw (self):
 
         self.id2tokens[id] = token_list
         self.id2core[id] = core
-        self.id2category[id] = category
+        parent = 0
+        categories = []
+        for node in category:
+            childs = self.taxo_manager.child_from_parent(parent)
+            label_list = self.taxo_manager.id_from_label(node)
+            label_id = ( list(set(childs) & set(label_list)) )[0]
+            parent = label_id
+            categories = categories + [label_id]
+        self.id2category[id] = categories
         if i%5000 == 4999:
           print("%dth data preprocessed!"%(i+1))
       self.save_tokens()
@@ -359,12 +370,16 @@ def load_tokens (self):
 
   def load_dicts (self):
     with open(self.root + self.file_name, "r") as fin:
+      sum = 0
       for line in fin:
         data = json.loads(line)
         id = data[self.id_name]
         core = data[self.core_name]
         category = data["categories"]
-
+        if category[0] == '[':
+          category = category.replace("'", "\"")
+          category = json.loads(category)
+          print(category[0])
         #find positive, nonnegative set
         self.id2pos[id] = []
         self.id2nonneg[id] = []
@@ -384,7 +399,23 @@ def load_dicts (self):
           self.id2nonneg[id] = self.id2nonneg[id] + [parent] + [self.taxo_manager.parent_from_child(parent)] + self.taxo_manager.child_from_parent(parent)
 
         self.id2core[id] = core
-        self.id2category[id] = category
+
+        parent = 0
+        categories = [-1, -1, -1]
+        for index, node in enumerate(category,0):
+            childs = self.taxo_manager.child_from_parent(parent)
+            label_list = self.taxo_manager.id_from_label(node)
+            if label_list == None:
+              print(node)
+              label_id = -1
+              sum += 1
+              break
+            else:
+              label_id = ( list(set(childs) & set(label_list)) )[0]
+            parent = label_id
+            categories[index] = label_id
+        self.id2category[id] = categories
+      print("total %d error cases.."%sum)
 
   def save_tokens (self):
     with open(self.root + self.dataset_name + '_tokens.jsonl', "w") as fout:

diff --git a/text-classifier/trainer.py b/text-classifier/trainer.py
@@ -44,6 +44,19 @@ def __init__(self, dir, train_file, taxonomy_file, data_name,
                 self.activation = activation
                 self.rescaling = rescaling
 
+        def F1_evaluation(self, true_labels, prediction, threshold):
+                """Sum the per-sample F1 scores of thresholded predictions.
+
+                Args:
+                        true_labels: tensor whose first dimension is the batch; each
+                                row is flattened into a collection of true label ids.
+                                Negative ids are treated as padding and ignored
+                                (the data loader fills missing category levels with -1).
+                        prediction: tensor of scores with the same batch size; after
+                                flattening, the position of a score is its label id.
+                        threshold: a label counts as predicted when score >= threshold.
+
+                Returns:
+                        The SUM (not the mean) over the batch of
+                        2 * |true & pred| / (|true| + |pred|). A sample with no true
+                        and no predicted labels contributes 0. Callers divide by the
+                        number of samples themselves.
+                """
+                N = true_labels.shape[0]
+                if N == 0:
+                        # reshape(..., (0, -1)) is ambiguous for empty tensors.
+                        return 0.0
+                true_labels = torch.reshape(true_labels, (N, -1))
+                prediction = torch.reshape(prediction, (N, -1))
+                total_f1 = 0.0
+                for i in range(N):
+                        # Drop padding ids so they inflate neither the true-label
+                        # count nor the denominator.
+                        true_set = {label for label in true_labels[i].tolist() if label >= 0}
+                        mask = prediction[i] >= threshold
+                        pred_set = set(torch.flatten(torch.nonzero(mask)).tolist())
+                        denominator = len(true_set) + len(pred_set)
+                        if denominator == 0:
+                                continue
+                        total_f1 += 2 * len(true_set & pred_set) / denominator
+
+                return total_f1
 
         def prepare_train(self):
 
@@ -120,8 +133,11 @@ def prepare_train(self):
                         tokens = torch.tensor( self.train_dm.get_tokens(document_id) ,dtype = torch.int32)
                         tokens = torch.reshape(tokens, (-1, 1))
                         pos, nonneg = self.train_dm.get_output_label(document_id)
-                        output = torch.zeros(self.L,1)
+                        output = torch.zeros(self.L + 3,1)
                         mask = torch.ones(self.L,1, dtype = torch.int32)
+                        categories = self.train_dm.get_categories(document_id)
+                        for num, category in enumerate(categories,0):
+                                output[self.L + num] = category
 
                         for j in nonneg:
                                 if j in pos:
@@ -137,16 +153,24 @@ def prepare_train(self):
                                 train_y = torch.cat((train_y, output),0)
 
                 train_x = torch.reshape(train_x, ( -1, self.L + self.T ))
-                train_y = torch.reshape(train_y, ( -1, self.L, 1 ))
-
+                train_y = torch.reshape(train_y, ( -1, self.L + 3, 1 ))
 
+                self.data_size = train_x.shape[0]
                 train_dataset = TensorDataset(train_x, train_y)
 
 
                 self.train_dataloader = DataLoader(train_dataset, batch_size=self.B, shuffle=True)
 
-
-        def train(self):
+        def save_model(self):
+                """Write the classifier's state_dict to ``<self.dir>trained/text-classifier.pt``.
+
+                The ``trained/`` directory is created if it does not exist yet.
+                """
+                import os
+
+                save_path = self.dir + "trained/text-classifier.pt"
+                # torch.save opens the target file directly and does not create
+                # missing parent directories.
+                os.makedirs(os.path.dirname(save_path), exist_ok=True)
+                torch.save(self.text_classifier.state_dict(), save_path)
+                return
+
+        def load_pretrained_model(self):
+                """Load weights from ``<self.dir>trained/text-classifier.pt`` and switch to eval mode."""
+                checkpoint_path = self.dir + "trained/text-classifier.pt"
+                saved_state = torch.load(checkpoint_path)
+                classifier = self.text_classifier
+                classifier.load_state_dict(saved_state)
+                # eval() turns off training-only behaviour such as dropout.
+                classifier.eval()
+
+        def self_train(self):
                 print("Start training! bert learning rate: %f, other learning rate: %f, epoch: %d, batch size: %d"
                         %(self.bert_lr, self.others_lr, self.epoch, self.B))
                 self.text_classifier.cuda()
@@ -161,8 +185,9 @@ def train(self):
                         for i, train_data in enumerate(self.train_dataloader):
 
 
-
                                 inputs, outputs = train_data
+                                true_labels = outputs[:, self.L:, :]
+                                outputs = outputs[:, :self.L, :]
                                 predicted = self.text_classifier(inputs.cuda())
                                 loss = self.loss_fun(predicted, outputs.cuda())
                                 batch_loss += loss.item()
@@ -201,6 +226,66 @@ def train(self):
                         print('elapsed time : %f'%(time.time()-start))
 
                 print('Finished Training')
+
+        def train(self, patience, threshold):
+                """Train with gradient accumulation, F1 tracking and early stopping.
+
+                Each batch from ``self.train_dataloader`` carries a target tensor
+                whose first ``self.L`` rows are the training targets and whose
+                remaining rows are the true label ids used only for evaluation.
+                Gradients are accumulated over 8 batches per optimizer step.
+                After every epoch the summed F1 (from ``self.F1_evaluation``) is
+                compared with the best seen so far: an improvement saves the model
+                via ``self.save_model``; otherwise a patience counter is increased.
+
+                Args:
+                        patience: number of consecutive non-improving epochs after
+                                which training stops early.
+                        threshold: score threshold forwarded to ``F1_evaluation``.
+                """
+                print("Start training! bert learning rate: %f, other learning rate: %f, epoch: %d, batch size: %d"
+                        %(self.bert_lr, self.others_lr, self.epoch, self.B))
+                self.text_classifier.cuda()
+                self.text_classifier.train()
+                max_accuracy = 0.0
+                patience_count = 0
+                accumulation_steps = 8
+
+                for epoch in range(self.epoch):
+                        start = time.time()
+                        running_loss = 0.0
+                        batch_loss = 0.0
+
+                        batch_accuracy = 0.0
+                        running_accuracy = 0.0
+
+                        # Samples / batches accumulated since the last optimizer step.
+                        batch_samples = 0
+                        pending_steps = 0
+
+                        self.optimizer.zero_grad()
+                        for i, train_data in enumerate(self.train_dataloader):
+                                inputs, outputs = train_data
+                                # Split evaluation labels from the training targets.
+                                true_labels = outputs[:, self.L:, :]
+                                outputs = outputs[:, :self.L, :]
+
+                                predicted = self.text_classifier(inputs.cuda())
+                                loss = self.loss_fun(predicted, outputs.cuda())
+                                loss.backward()
+
+                                accuracy = self.F1_evaluation(true_labels, predicted, threshold)
+                                batch_accuracy += accuracy
+
+                                batch_loss += loss.item()
+                                batch_samples += predicted.shape[0]
+                                pending_steps += 1
+
+                                if pending_steps == accumulation_steps:
+                                        self.optimizer.step()
+                                        self.optimizer.zero_grad()
+                                        print('[%d, %5d] batch loss: %.3f accuracy : %.3f%%' %
+                                                (epoch + 1, i + 1, batch_loss, batch_accuracy * 100 / batch_samples))
+                                        running_accuracy += batch_accuracy
+                                        batch_accuracy = 0.0
+                                        running_loss += batch_loss
+                                        batch_loss = 0.0
+                                        batch_samples = 0
+                                        pending_steps = 0
+
+                        if pending_steps > 0:
+                                # Batches after the last full group of 8 would otherwise
+                                # be discarded by the next zero_grad() and left out of
+                                # the totals that are divided by self.data_size below.
+                                self.optimizer.step()
+                                self.optimizer.zero_grad()
+                                running_accuracy += batch_accuracy
+                                running_loss += batch_loss
+
+                        # Three format specifiers, three arguments: passing the raw
+                        # accuracy sum as an extra argument raised TypeError here.
+                        print('[%d] total loss: %.3f, accuracy: %.3f%%' %
+                                (epoch + 1, running_loss, running_accuracy * 100 / self.data_size))
+                        print('[%d] elapsed time : %f'%(epoch+1, time.time()-start))
+
+                        if (running_accuracy > max_accuracy):
+                                self.save_model()
+                                max_accuracy = running_accuracy
+                                patience_count = 0
+                        else :
+                                patience_count += 1
+
+                        if patience_count >=patience :
+                                print('Finished Training')
+                                return
+
+                print('Finished Training')
 
 
 
@@ -222,17 +307,18 @@ def train(self):
 
         """
         #DBPedia dataset
-        train_file = 'DBPEDIA_30000_coreclass.jsonl'
+        train_file = 'DBPEDIA-coreclass-45000.jsonl'
         taxonomy_file = 'taxonomy.json'
         data_name = 'DBPEDIA'
         """
 
 
         #amazon dataset
-        train_file = 'train-with-core-class-1000.jsonl'
+        train_file = 'amazon-coreclass-1-10000.jsonl'
         taxonomy_file = 'taxonomy.json'
         data_name = 'amazon'
 
+
         bert_lr = 5e-5
         others_lr = 4e-3
         token_length = 500
@@ -251,4 +337,4 @@ def train(self):
                         batch_size, epoch, activation, rescaling)
 
         trainer.prepare_train()
-        trainer.train()
+        trainer.train(patience = 3, threshold = 0.3)