# sec-bert-num-v2.py

# -*- coding: utf-8 -*-
"""sec-bert-num v2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1JvS2hm4JACsxX-PHKBGCNGvvOhpbc3z2
"""

# pip install transformers seqeval[gpu] datasets

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, AutoTokenizer, AutoModel

from torch import cuda
# Run on the GPU when torch reports one as available, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

import datasets
import re
import csv
import pickle as pkl

# finer_train = datasets.load_dataset("nlpaueb/finer-139", split="train")
# finer_val = datasets.load_dataset("nlpaueb/finer-139", split="validation")
# finer_test = datasets.load_dataset("nlpaueb/finer-139", split="test")
# print(finer_train)

# Locations of the sentence-level FiNER CSV splits (train / dev / test).
trainFile = '/home/soumyasharma/HULK/Financial-GS/Financial-Numerical-Extreme-Classification/data/FiNER/sentence-train.csv'
valFile = '/home/soumyasharma/HULK/Financial-GS/Financial-Numerical-Extreme-Classification/data/FiNER/sentence-dev.csv'
testFile = '/home/soumyasharma/HULK/Financial-GS/Financial-Numerical-Extreme-Classification/data/FiNER/sentence-test.csv'

# Load every split into its own DataFrame.
finnum_train, finnum_val, finnum_test = (
    pd.read_csv(csv_path) for csv_path in (trainFile, valFile, testFile)
)

num_token = "[NUM]"

# Compiled once instead of being looked up for every token. Matches a token
# made of digits with optional ',' / '.' characters after the first digit
# (e.g. "1,234.5"), or a single ',' / '.' followed by digits (e.g. ".5").
_NUMERIC_TOKEN_RE = re.compile(r"(\d+[\d,.]*)|([,.]\d+)")

def get_finer_dataframe(finer):
  """Return a DataFrame of masked sentences and their labels.

  Args:
    finer: Mapping-like object whose ``'tokens'`` entry is an iterable of
      token lists and whose ``'ner_tags'`` entry is an iterable of label
      lists, one per sentence.

  Returns:
    A DataFrame with columns ``sentence`` (token lists in which every token
    fully matching the numeric pattern is replaced by ``num_token``) and
    ``word_labels`` (the ``'ner_tags'`` entries, unchanged).
  """
  # Build new token lists rather than overwriting entries of the caller's
  # lists, so the input data is left untouched.
  tokens = [
      [num_token if _NUMERIC_TOKEN_RE.fullmatch(token) else token
       for token in sentence]
      for sentence in finer['tokens']
  ]
  labels = finer['ner_tags']
  return pd.DataFrame(list(zip(tokens, labels)), columns=['sentence', 'word_labels'])

def get_dataframe(finer):
    """Build the sentence/label frame from a raw sentence CSV frame.

    Args:
        finer: DataFrame with string columns ``masked_sentence`` and
            ``ner-tags``; each cell holds the textual form of a Python
            object (presumably a list -- confirm against the CSV files).

    Returns:
        A new DataFrame with columns ``sentence`` and ``word_labels`` whose
        cells are the evaluated Python objects. ``finer`` is not modified.
    """
    df = finer[['masked_sentence', 'ner-tags']].rename(
        columns={'masked_sentence': 'sentence', 'ner-tags': 'word_labels'}
    )
    # NOTE(review): eval() executes whatever expression the cell contains.
    # That is only acceptable for trusted data files; consider
    # ast.literal_eval if every cell is a plain literal.
    for column in ('sentence', 'word_labels'):
        # Series.map parses one column cell by cell and, unlike the row-wise
        # DataFrame.apply(axis=1) used before, yields a Series even when the
        # frame has no rows.
        df[column] = df[column].map(lambda cell: eval(cell))
    return df

# finer_train_dataset = get_finer_dataframe(finer_train)
# finer_val_dataset = get_finer_dataframe(finer_val)
# finer_test_dataset = get_finer_dataframe(finer_test)

# Parse each raw split (train, validation, test) into a sentence/label frame.
finnum_train_dataset, finnum_val_dataset, finnum_test_dataset = map(
    get_dataframe, (finnum_train, finnum_val, finnum_test)
)

# print(train_dataset.head())

# def save_data_distributions(df, split):
#   df['word_labels'].apply(np.count_nonzero).value_counts().to_csv('Our Data - Number of labels vs number of sentences - ' + split + '.csv', index_label = 'Number of Labels', header=['Number of sentences'])
#   pd.Series(np.concatenate(df['word_labels'].values).flat).value_counts().to_csv('Our Data - Labels vs Counts - ' + split + '.csv', index_label = 'Labels', header=['Counts'])

# save_data_distributions(finer_train_dataset, 'train')
# save_data_distributions(finer_val_dataset, 'validation')
# save_data_distributions(finer_test_dataset, 'test')

def iob_to_labels(label):
  """Return the text after the last '-' in ``label`` (all of it if there is none)."""
  _head, _dash, tail = label.rpartition('-')
  return tail

# iob_feature_names = finer_train.features["ner_tags"].feature.names
# feature_names = list(map(iob_to_labels, iob_feature_names))

def labelid_to_label(labelid):
  """Look up ``labelid`` in the module-level ``feature_names`` sequence."""
  # NOTE(review): the only assignment of `feature_names` visible in this file
  # is commented out; it must be defined before this function is called.
  label_name = feature_names[labelid]
  return label_name

def save_data_distributions(df, split):
  """Write two label-distribution CSV files for one data split.

  Args:
    df: DataFrame with a ``word_labels`` column of per-sentence label
      sequences.
    split: Text appended to both output file names.
  """
  # How many sentences have each number of non-zero labels.
  nonzero_per_sentence = df['word_labels'].apply(np.count_nonzero)
  sentence_histogram = nonzero_per_sentence.value_counts()
  sentence_histogram.to_csv('Number of labels vs number of sentences - ' + split + '.csv', index_label = 'Number of Labels', header=['Number of sentences'])

  # How often each label name occurs over all sentences together.
  every_label_id = pd.Series(np.concatenate(df['word_labels'].values).flat)
  label_histogram = every_label_id.apply(labelid_to_label).value_counts()
  label_histogram.to_csv('Labels vs Counts - ' + split + '.csv', index_label = 'Labels', header=['Counts'])
   

# save_data_distributions(train_dataset, 'train')
# save_data_distributions(val_dataset, 'validation')
# save_data_distributions(test_dataset, 'test')

# Sequence length and batch sizes.
MAX_LEN = 512
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 8

# Optimisation settings.
EPOCHS = 1
LEARNING_RATE = 1e-5
MAX_GRAD_NORM = 10

# Load the pretrained tokenizer and register the numeric placeholder as an
# additional special token.
tokenizer = AutoTokenizer.from_pretrained('nlpaueb/sec-bert-num')
extra_special_tokens = {'additional_special_tokens': [num_token]}
tokenizer.add_special_tokens(extra_special_tokens)

class dataset(Dataset):
    """Token-classification dataset over a frame of pre-split sentences.

    Each item is the tokenizer output for one sentence, converted to tensors,
    plus a ``labels`` tensor holding one label per word (placed on the word's
    first sub-token) and -100 at every other position.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the data and tokenization settings.

        Args:
            dataframe: DataFrame with ``sentence`` (list of words) and
                ``word_labels`` (list of integer labels) columns.
            tokenizer: Callable tokenizer that supports
                ``is_split_into_words`` and ``return_offsets_mapping``.
            max_len: Length every sequence is padded or truncated to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence at position ``index`` as a dict of tensors."""
        # step 1: get the sentence and word labels by position, so the frame
        # does not need a 0..n-1 index
        sentence = self.data['sentence'].iloc[index]
        labels = self.data['word_labels'].iloc[index]
        # step 2: encode the sentence. max_length is passed explicitly so that
        # padding and truncation use the configured max_len instead of
        # whatever default the tokenizer carries.
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # step 3: start from an array of -100, one entry per encoded position
        offsets = encoding["offset_mapping"]
        encoded_labels = np.full(len(offsets), -100, dtype=int)
        # Only positions whose offset starts at 0 and has a non-zero end get a
        # label; labels are consumed in order, one per such position.
        word_idx = 0
        for position, (start, end) in enumerate(offsets):
            if start == 0 and end != 0:
                encoded_labels[position] = labels[word_idx]
                word_idx += 1
        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        return item

    def __len__(self):
        """Return the number of rows the dataset was built from."""
        return self.len

# Wrap each split in a torch Dataset using the shared tokenizer and length.
training_set, validating_set, testing_set = (
    dataset(frame, tokenizer, MAX_LEN)
    for frame in (finnum_train_dataset, finnum_val_dataset, finnum_test_dataset)
)

# Print the first training example token by token next to its label.
preview = training_set[0]
preview_tokens = tokenizer.convert_ids_to_tokens(preview["input_ids"])
for token, label in zip(preview_tokens, preview["labels"]):
  print('{0:10}  {1}'.format(token, label))

# Training batches are reshuffled on every pass.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Validation batches keep a fixed order: the validation loss and accuracy are
# averaged per batch, so shuffling would make them vary from run to run.
val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

"""#### **Defining the model**"""

# Pretrained encoder with a token-classification head of 170 output labels.
model = BertForTokenClassification.from_pretrained(
    'nlpaueb/sec-bert-num',
    num_labels=170,
)
# Make the embedding table as large as the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

"""#### **Training the model**"""

# inputs = training_set[2]
# input_ids = inputs["input_ids"].unsqueeze(0)
# attention_mask = inputs["attention_mask"].unsqueeze(0)
# labels = inputs["labels"].unsqueeze(0)
#
# input_ids = input_ids.to(device)
# attention_mask = attention_mask.to(device)
# labels = labels.to(device)
#
# outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
# initial_loss = outputs[0]
# print(initial_loss)
#
# tr_logits = outputs[1]
# print(tr_logits.shape)

# Adam over every model parameter, using the configured learning rate.
trainable_params = model.parameters()
optimizer = torch.optim.Adam(trainable_params, lr=LEARNING_RATE)

from seqeval.metrics import classification_report

# finer_tag_names = finer_train.features["ner_tags"].feature.names
# Label names, indexed by label id. The file is opened in a `with` block so
# the handle is closed once the names are loaded.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/home/soumyasharma/HULK/Financial-GS/Financial-Numerical-Extreme-Classification/data/FiNER/labels.pkl', 'rb') as labels_file:
    finer_tag_names = pkl.load(labels_file)

def valid(model, testing_loader):
    """Evaluate ``model`` on every batch of ``testing_loader``.

    Puts the model in evaluation mode (it is left in that mode), runs it
    without gradients, and prints the loss and accuracy averaged over batches.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output is indexed as ``[0]`` for the loss and
            ``[1]`` for the logits, and it exposes ``num_labels``.
        testing_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        ``(labels, predictions)``: two flat lists of names taken from
        ``finer_tag_names``, covering only positions whose label is not -100.
    """
    # put model in evaluation mode
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []
    with torch.no_grad():
        for batch in tqdm(testing_loader):
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, eval_logits = outputs[0], outputs[1]
            eval_loss += loss.item()
            nb_eval_steps += 1
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # only score positions that carry a real label (not -100)
            active_mask = flattened_targets != -100 # shape (batch_size * seq_len,)
            # Move each batch to the CPU once and keep plain ints, instead of
            # storing one device tensor per token and calling .item() on each.
            batch_labels = torch.masked_select(flattened_targets, active_mask).cpu()
            batch_predictions = torch.masked_select(flattened_predictions, active_mask).cpu()
            eval_labels.extend(batch_labels.tolist())
            eval_preds.extend(batch_predictions.tolist())
            eval_accuracy += accuracy_score(batch_labels.numpy(), batch_predictions.numpy())
    labels = [finer_tag_names[label_id] for label_id in eval_labels]
    predictions = [finer_tag_names[label_id] for label_id in eval_preds]
    # Guard the averages so an empty loader does not divide by zero.
    steps = max(nb_eval_steps, 1)
    eval_loss = eval_loss / steps
    eval_accuracy = eval_accuracy / steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    return labels, predictions

# Training function: one pass over `training_loader`, with periodic validation.
def train(epoch):
    """Train the module-level ``model`` for one pass over ``training_loader``.

    Prints the running mean loss every 1000 steps, runs validation with a
    classification report every 10000 steps (including step 0), and prints the
    mean loss and mean per-batch accuracy at the end.

    Args:
        epoch: Index of the current epoch. It is not used inside the function.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()
    for idx, batch in tqdm(enumerate(training_loader)):
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)
        outputs = model(ids, attention_mask=mask, labels=labels)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()
        nb_tr_steps += 1
        if idx%1000==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 1000 training steps: {loss_step}")
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # only score positions that carry a real label (not -100)
        active_mask = flattened_targets != -100 # shape (batch_size * seq_len,)
        active_labels = torch.masked_select(flattened_targets, active_mask)
        active_predictions = torch.masked_select(flattened_predictions, active_mask)
        # Per-token tensors are not kept across steps: nothing reads them and
        # they would grow for the whole epoch.
        tr_accuracy += accuracy_score(active_labels.cpu().numpy(), active_predictions.cpu().numpy())
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        # Gradient clipping has to run after backward() (so it sees this
        # step's gradients) and before step() (so the update uses them).
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        optimizer.step()
        if idx%10000==0:
          val_labels, val_predictions = valid(model, validating_loader)
          print(classification_report([val_labels], [val_predictions]))
          # valid() leaves the model in evaluation mode; switch back so the
          # remaining steps train with training-mode behaviour.
          model.train()
    # Guard the averages so an empty loader does not divide by zero.
    steps = max(nb_tr_steps, 1)
    epoch_loss = tr_loss / steps
    tr_accuracy = tr_accuracy / steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

# Train for the configured number of epochs, validating after each one.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    # Full validation pass followed by its classification report.
    epoch_validation = valid(model, validating_loader)
    val_labels, val_predictions = epoch_validation
    epoch_report = classification_report([val_labels], [val_predictions])
    print(epoch_report)
    print()

"""#### **Test and Analysis**"""

# Filled by test(): for each number of non-zero labels in a sentence, the
# number of labelled tokens seen and the number predicted correctly.
total_count = dict()
correct_count = dict()

def test(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` and collect per-sentence statistics.

    Puts the model in evaluation mode, runs it without gradients, prints the
    loss and accuracy averaged over batches, and updates the module-level
    ``total_count`` / ``correct_count`` dictionaries.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output is indexed as ``[0]`` for the loss and
            ``[1]`` for the logits.
        testing_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors; ``labels`` is read as
            one row per sentence.

    Returns:
        ``(labels, predictions)``: two flat lists of names taken from
        ``finer_tag_names``, covering only positions whose label is not -100.
    """
    # Evaluation mode must not depend on what ran before this call.
    model.eval()

    test_loss, test_accuracy = 0, 0
    nb_test_steps = 0
    test_preds, test_labels = [], []

    with torch.no_grad():
        for batch in testing_loader:

            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            loss, test_logits = outputs[0], outputs[1]

            test_loss += loss.item()
            nb_test_steps += 1

            # Move the batch to the CPU once; rows are sentences.
            label_grid = labels.cpu().numpy()                              # (batch_size, seq_len)
            pred_grid = torch.argmax(test_logits, dim=-1).cpu().numpy()    # (batch_size, seq_len)
            # only score positions that carry a real label (not -100)
            active_grid = label_grid != -100

            # Bucket by the number of non-zero labels in each sentence. The
            # buckets were previously computed over the whole batch, which
            # does not match the per-sentence plot drawn from them.
            for row_labels, row_preds, row_active in zip(label_grid, pred_grid, active_grid):
                sentence_labels = row_labels[row_active]
                sentence_preds = row_preds[row_active]
                count = int(np.count_nonzero(sentence_labels))
                total_count[count] = total_count.get(count, 0) + len(sentence_labels)
                correct_count[count] = correct_count.get(count, 0) + int(np.sum(sentence_labels == sentence_preds))

            # Boolean indexing flattens row by row, the same order as
            # masking the flattened tensors.
            batch_labels = label_grid[active_grid]
            batch_predictions = pred_grid[active_grid]
            test_labels.extend(batch_labels.tolist())
            test_preds.extend(batch_predictions.tolist())

            test_accuracy += accuracy_score(batch_labels, batch_predictions)

    labels = [finer_tag_names[label_id] for label_id in test_labels]
    predictions = [finer_tag_names[label_id] for label_id in test_preds]

    # Guard the averages so an empty loader does not divide by zero.
    steps = max(nb_test_steps, 1)
    test_loss = test_loss / steps
    test_accuracy = test_accuracy / steps
    print(f"Test Loss: {test_loss}")
    print(f"Test Accuracy: {test_accuracy}")

    return labels, predictions

from sklearn.metrics import confusion_matrix
from seqeval.metrics import classification_report

# Run the test pass and print the flat list of gold label names.
labels, predictions = test(model, testing_loader)
print(labels)

# Per-class accuracy in percent: the confusion-matrix diagonal over row sums.
conf_matrix = confusion_matrix(labels, predictions, labels = finer_tag_names)
per_class_hits = conf_matrix.diagonal()*100
per_class_support = conf_matrix.sum(axis=1)
acc_col = per_class_hits/per_class_support

# Join the per-class counts found in the test labels with the per-class
# accuracy, keeping one row for every name in finer_tag_names.
# NOTE(review): the output file name mentions "train", but count_sample is
# computed from the test labels above -- confirm which was intended.
test_cols = np.unique(labels, return_counts=True)
class_count_rows = list(zip(test_cols[0], test_cols[1]))
test_df1 = pd.DataFrame(class_count_rows, columns=['class', 'count_sample'])
class_accuracy_rows = list(zip(finer_tag_names, acc_col))
test_df2 = pd.DataFrame(class_accuracy_rows, columns=['class', 'accuracy'])
final_result_df = pd.merge(test_df1, test_df2, on='class', how='right')
final_result_df.to_csv('Performance according to the number of data points in train per label.csv')

print(classification_report([labels], [predictions]))

import matplotlib.pyplot as plt

# One point per key of total_count, in ascending key order: the percentage of
# correctly predicted tokens recorded under that key.
x = sorted(total_count.keys())
y = [(100*correct_count[key])/total_count[key] for key in x]

plt.plot(x, y)
plt.xlabel('Number of labels in a sentence')
plt.ylabel('accuracy')

plt.title(' Performance as number of labels per sentence increases')
plt.show()

"""#### **Saving the model for future use**"""

import os

directory = "./ourmodel-sec-bert-num"

# exist_ok makes this a single call with no separate existence check.
os.makedirs(directory, exist_ok=True)

# Save the full tokenizer rather than only its vocabulary file, so the special
# token added after loading is stored next to the resized model and the two
# still agree when reloaded.
tokenizer.save_pretrained(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')