# -*- coding: utf-8 -*-
"""
Created on Wed Apr 1 21:21:46 2020
@author: 代码医生工作室 (Code Doctor Studio)
@WeChat official account: xiangyuejiqiren (more articles and learning materials there)
@source: companion code for <PyTorch深度学习和图神经网络(卷2)——开发应用>
@technical support for the companion code: bbs.aianaconda.com
"""
import os
import torch
from transformers import (
    get_linear_schedule_with_warmup, BertTokenizer,
    AdamW,
    AutoModelForSequenceClassification,
    AutoConfig
)
from torch.utils.data import DataLoader, dataset
import time
import numpy as np
from sklearn import metrics
from datetime import timedelta
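
# Note (assumption): this script targets an older transformers release (roughly 2.x/3.x).
# In newer versions, transformers.AdamW is deprecated in favor of torch.optim.AdamW, and
# pad_to_max_length=True is replaced by the tokenizer's padding argument; the calls below
# assume the older API.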

data_dir = './THUCNews/data'

def read_file(path):
    with open(path, 'r', encoding="UTF-8") as file:
        docus = file.readlines()
        newDocus = []
        for data in docus:
            newDocus.append(data)
    return newDocus

# Build the dataset
class Label_Dataset(dataset.Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):              # return the number of samples
        return len(self.data)
    def __getitem__(self, ind):
        onetext = self.data[ind]
        content, label = onetext.split('\t')
        label = torch.LongTensor([int(label)])
        return content, label
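
# Assumption based on the parsing in __getitem__ above: each line of train.txt / test.txt
# holds the raw text and an integer class index separated by a tab, e.g. "<news text>\t3".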

trainContent = read_file(os.path.join(data_dir, "train.txt"))
testContent = read_file(os.path.join(data_dir, "test.txt"))
traindataset = Label_Dataset(trainContent)
testdataset = Label_Dataset(testContent)

testdataloder = DataLoader(testdataset, batch_size=1, shuffle=False)
batch_size = 8
traindataloder = DataLoader(traindataset, batch_size=batch_size, shuffle=True)

class_list = [x.strip() for x in open(
    os.path.join(data_dir, "class.txt")).readlines()]
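# class.txt is expected to list one class name per line; its length sets num_labels below.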

pretrained_weights = 'bert-base-chinese'  # build the model
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
config = AutoConfig.from_pretrained(pretrained_weights, num_labels=len(class_list))
# Pass an explicit config so the number of classes can be specified on it
nlp_classif = AutoModelForSequenceClassification.from_pretrained(pretrained_weights,
                                                                 config=config)
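# num_labels sizes the classification head; bert-base-chinese ships without one, so the
# head's weights are newly initialized here and are learned during fine-tuning.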

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
nlp_classif = nlp_classif.to(device)

time_start = time.time()            # start time
epochs = 2
gradient_accumulation_steps = 1
max_grad_norm = 0.1                 # gradient-clipping threshold
require_improvement = 1000          # stop early if no improvement after 1000 batches
savedir = './myfinetun-bert_chinese/'
os.makedirs(savedir, exist_ok=True)

def get_time_dif(start_time):
    """Return the elapsed time since start_time."""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

def train(model, traindataloder, testdataloder):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    # apply weight decay to all parameters except biases and LayerNorm weights
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                    num_warmup_steps=0, num_training_steps=len(traindataloder) * epochs)

    total_batch = 0               # how many batches have been processed
    dev_best_loss = float('inf')
    last_improve = 0              # batch at which the validation loss last improved
    flag = False                  # whether training has gone too long without improvement
    for epoch in range(epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, epochs))
        for i, (sku_name, labels) in enumerate(traindataloder):
            model.train()
            ids = tokenizer.batch_encode_plus(sku_name,
                    # max_length=model.config.max_position_embeddings,  # 512 in the model config; longer inputs raise an error
                    pad_to_max_length=True, return_tensors='pt')  # without return_tensors a plain Python list is returned
            labels = labels.squeeze().to(device)
            outputs = model(ids["input_ids"].to(device), labels=labels,
                            attention_mask=ids["attention_mask"].to(device))
            loss, logits = outputs[:2]
            if gradient_accumulation_steps > 1:
                # scale the loss so accumulated batches together amount to one optimizer step
                loss = loss / gradient_accumulation_steps
            loss.backward()
            if (i + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()  # update the learning-rate schedule
                model.zero_grad()
            if total_batch % 100 == 0:
                # every 100 batches, report accuracy on the training batch and the validation set
                truelabel = labels.data.cpu()
                predic = torch.argmax(logits, axis=1).data.cpu()
                # predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(truelabel, predic)
                dev_acc, dev_loss = evaluate(model, testdataloder)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    model.save_pretrained(savedir)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > require_improvement:
                # the validation loss has not improved for 1000 batches: stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break

def evaluate(model, testdataloder):
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for sku_name, labels in testdataloder:
            ids = tokenizer.batch_encode_plus(sku_name,
                    # max_length=model.config.max_position_embeddings,  # 512 in the model config; longer inputs raise an error
                    pad_to_max_length=True, return_tensors='pt')  # without return_tensors a plain Python list is returned
            labels = labels.squeeze().to(device)
            outputs = model(ids["input_ids"].to(device), labels=labels,
                            attention_mask=ids["attention_mask"].to(device))
            loss, logits = outputs[:2]
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predic = torch.argmax(logits, axis=1).data.cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(testdataloder)

train(nlp_classif, traindataloder, testdataloder)
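
# A minimal inference sketch (not part of the original script): it assumes training above
# has saved fine-tuned weights to savedir, and it reuses the tokenizer, device and
# class_list defined earlier. The function name predict_one is hypothetical.
def predict_one(text):
    finetuned = AutoModelForSequenceClassification.from_pretrained(savedir).to(device)
    finetuned.eval()
    ids = tokenizer.batch_encode_plus([text], pad_to_max_length=True, return_tensors='pt')
    with torch.no_grad():
        logits = finetuned(ids["input_ids"].to(device),
                           attention_mask=ids["attention_mask"].to(device))[0]
    # map the highest-scoring class index back to its name from class.txt
    return class_list[int(torch.argmax(logits, axis=1))]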