Skip to content

Commit

Permalink
feat: save train metrics and val loss
Browse files Browse the repository at this point in the history
  • Loading branch information
Fafa-DL committed Dec 2, 2023
1 parent 216b5a9 commit e97dd70
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 60 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ train_pipeline = [

## 更新日志

**`2023.12.02`**
- 新增Issue中多人提及的输出**Train Acc**与**Val loss**
- `metrics_outputs.csv`保存每周期`train_loss, train_acc, train_precision, train_recall, train_f1-score, val_loss, val_acc, val_precision, val_recall, val_f1-score`方便各位绘图
- 终端由原先仅输出**Val**相关metrics升级为Train与Val都输出

![](https://raw.githubusercontent.com/Fafa-DL/readme-data/main/backbones/terminal.jpg)

**`2023.08.05`**
- 新增**TinyViT**(预训练权重不匹配)、**DeiT3**、**EdgeNeXt**、**RevVisionTransformer**

Expand Down
23 changes: 13 additions & 10 deletions models/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,22 +120,25 @@ def extract_feat(self, img, stage='neck'):
if stage == 'neck':
return x

def forward(self, x,return_loss=True,**kwargs):
if return_loss:
return self.forward_train(x,**kwargs)
else:
return self.forward_test(x,**kwargs)

def forward_train(self,x,targets,**kwargs):
def forward(self, x, return_loss=True, train_statu=False, **kwargs):
    """Run the backbone once, then dispatch to the train and/or test path.

    Args:
        x: Input batch.
        return_loss (bool): When ``train_statu`` is False, True selects the
            loss-computing training path and False selects inference.
        train_statu (bool): When True, return both inference output and the
            loss dict for the same batch (used to log train-time metrics).
            NOTE(review): parameter name kept as-is ("statu") — it is part of
            the public call signature.

    Returns:
        A loss dict, a prediction output, or a ``(predictions, losses)``
        tuple depending on the flags.
    """
    feats = self.extract_feat(x)
    if train_statu:
        # Both paths consume the same extracted features, so the reported
        # train metrics correspond exactly to the batch that produced the loss.
        return self.forward_test(feats), self.forward_train(feats, **kwargs)
    if return_loss:
        return self.forward_train(feats, **kwargs)
    return self.forward_test(feats, **kwargs)

def forward_train(self, x, targets, **kwargs):
    """Compute training losses for already-extracted features.

    Args:
        x: Features produced by ``extract_feat``.
        targets: Ground-truth labels for the batch.

    Returns:
        dict: The loss terms reported by the head.
    """
    # The head returns a mapping of named loss terms; hand back a fresh
    # dict copy of it (equivalent to building a dict and update()-ing it).
    return dict(self.head.forward_train(x, targets, **kwargs))

def forward_test(self, x,**kwargs):
x = self.extract_feat(x)
def forward_test(self, x, **kwargs):
    """Run inference on already-extracted features.

    Args:
        x: Features produced by ``extract_feat``.

    Returns:
        The head's ``simple_test`` predictions.
    """
    return self.head.simple_test(x, **kwargs)
Expand Down
4 changes: 2 additions & 2 deletions tools/batch_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main():
parser.add_argument(
'--classes-map', default='datas/annotations.txt', help='classes map of datasets')
parser.add_argument(
'--device', default='cuda', help='Device used for inference')
'--device', default='cpu', help='Device used for inference')
parser.add_argument(
'--save-path',
help='The path to save prediction image, default not to save.')
Expand All @@ -26,7 +26,7 @@ def main():

classes_names, label_names = get_info(args.classes_map)
# build the model from a config file and a checkpoint file
model_cfg,train_pipeline,val_pipeline,data_cfg,lr_config,optimizer_cfg = file2dict(args.config)
model_cfg, train_pipeline, val_pipeline, data_cfg, lr_config, optimizer_cfg = file2dict(args.config)
if args.device is not None:
device = torch.device(args.device)
else:
Expand Down
4 changes: 2 additions & 2 deletions tools/single_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ def main():
parser.add_argument(
'--classes-map', default='datas/annotations.txt', help='classes map of datasets')
parser.add_argument(
'--device', default='cuda', help='Device used for inference')
'--device', default='cpu', help='Device used for inference')
parser.add_argument(
'--save-path',
help='The path to save prediction image, default not to save.')
args = parser.parse_args()

classes_names, label_names = get_info(args.classes_map)
# build the model from a config file and a checkpoint file
model_cfg,train_pipeline,val_pipeline,data_cfg,lr_config,optimizer_cfg = file2dict(args.config)
model_cfg, train_pipeline, val_pipeline,data_cfg, lr_config, optimizer_cfg = file2dict(args.config)
if args.device is not None:
device = torch.device(args.device)
else:
Expand Down
14 changes: 7 additions & 7 deletions tools/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,13 @@ def parse_args():
def main():
# 读取配置文件获取关键字段
args = parse_args()
model_cfg,train_pipeline,val_pipeline,data_cfg,lr_config,optimizer_cfg = file2dict(args.config)
model_cfg, train_pipeline, val_pipeline, data_cfg, lr_config, optimizer_cfg = file2dict(args.config)
print_info(model_cfg)

# 初始化
meta = dict()
dirname = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
save_dir = os.path.join('logs',model_cfg.get('backbone').get('type'),dirname)
save_dir = os.path.join('logs', model_cfg.get('backbone').get('type'), dirname)
meta['save_dir'] = save_dir

# 设置随机数种子
Expand Down Expand Up @@ -108,10 +108,10 @@ def main():
model.freeze_layers(data_cfg.get('train').get('freeze_layers'))

if device != torch.device('cpu'):
model = DataParallel(model,device_ids=[args.gpu_id])
model = DataParallel(model, device_ids=[args.gpu_id])

# 初始化优化器
optimizer = eval('optim.' + optimizer_cfg.pop('type'))(params=model.parameters(),**optimizer_cfg)
optimizer = eval('optim.' + optimizer_cfg.pop('type'))(params=model.parameters(), **optimizer_cfg)

# 初始化学习率更新策略
lr_update_func = eval(lr_config.pop('type'))(**lr_config)
Expand Down Expand Up @@ -146,7 +146,7 @@ def main():

# 是否从中断处恢复训练
if args.resume_from:
model,runner,meta = resume_model(model,runner,args.resume_from,meta)
model, runner,meta = resume_model(model, runner, args.resume_from, meta)
else:
os.makedirs(save_dir)
shutil.copyfile(args.config,os.path.join(save_dir,os.path.split(args.config)[1]))
Expand All @@ -161,8 +161,8 @@ def main():
# 训练
for epoch in range(runner.get('epoch'),runner.get('max_epochs')):
lr_update_func.before_train_epoch(runner)
train(model,runner, lr_update_func, device, epoch, data_cfg.get('train').get('epoches'), meta)
validation(model,runner, data_cfg.get('test'), device, epoch, data_cfg.get('train').get('epoches'), meta)
train(model, runner, lr_update_func, device, epoch, data_cfg.get('train').get('epoches'), data_cfg.get('test'), meta)
validation(model, runner, data_cfg.get('test'), device, epoch, data_cfg.get('train').get('epoches'), meta)

train_history.after_epoch(meta)

Expand Down
4 changes: 2 additions & 2 deletions tools/video_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main():
parser.add_argument(
'--classes-map', default='datas/annotations.txt', help='classes map of datasets')
parser.add_argument(
'--device', default='cuda', help='Device used for inference')
'--device', default='cpu', help='Device used for inference')
parser.add_argument(
'--save-path',
help='The path to save prediction image, default not to save.')
Expand All @@ -26,7 +26,7 @@ def main():

classes_names,label_names = get_info(args.classes_map)
# build the model from a config file and a checkpoint file
model_cfg,train_pipeline,val_pipeline,data_cfg,lr_config,optimizer_cfg = file2dict(args.config)
model_cfg, train_pipeline, val_pipeline,data_cfg, lr_config, optimizer_cfg = file2dict(args.config)
if args.device is not None:
device = torch.device(args.device)
else:
Expand Down
56 changes: 33 additions & 23 deletions utils/history.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,54 +8,64 @@

class History():
def __init__(self, dir):
    """Initialize the metric history rooted at output directory *dir*.

    NOTE(review): parameter name ``dir`` shadows the builtin; kept for
    API compatibility with existing callers.
    """
    self.dir = dir
    # One CSV row of metrics per epoch.
    self.csv_dir = os.path.join(dir,'metrics_outputs.csv')
    # NOTE(review): looks like a legacy combined-plot path; after_epoch now
    # writes train_loss-acc.png / val_loss-acc.png under self.dir — confirm
    # this attribute is still needed.
    self.pic_dir = os.path.join(dir,'loss-acc.png')
    self.losses_epoch = []  # per-epoch train losses appended by update()
    self.acc_epoch = []  # per-epoch top-1 accuracies appended by update()
    # NOTE(review): this header matches an older, narrower CSV schema than
    # the one after_epoch writes — possibly a stale line from the diff.
    self.epoch_outputs = [['Epoch', 'Train Loss', 'Val Acc', 'Precision', 'Recall', 'F1 Score']]
    self.temp_data = []  # scratch row buffer filled by update()

def update(self,data,mode):
def update(self, data, mode):
    """Record one metric sample.

    Args:
        data: A train loss value when ``mode == 'train'``; an eval-results
            dict (keys like 'accuracy_top-1', 'precision', ...) when
            ``mode == 'test'`` — assumed from the reads below, confirm
            against the caller.
        mode: Either ``'train'`` or ``'test'``.
    """
    if mode == 'train':
        # NOTE(review): both appends store the same value; the temp_data
        # line looks like a leftover pre-refactor duplicate of the
        # losses_epoch line — confirm which is intended.
        self.temp_data.append(data)
        self.losses_epoch.append(data)
    elif mode == 'test':
        # Flatten top-1 accuracy plus mean precision/recall/F1 into the
        # scratch row buffer.
        self.temp_data.extend([data.get('accuracy_top-1'),mean(data.get('precision',0.0)),mean(data.get('recall',0.0)),mean(data.get('f1_score',0.0))])
        self.acc_epoch.append(data.get('accuracy_top-1'))

def draw_loss_acc(self, loss, acc, save_path):
    """Plot loss and accuracy curves on twin y-axes and save the figure.

    Args:
        loss: Sequence of per-epoch loss values (red curve, left axis).
        acc: Sequence of per-epoch accuracy values (blue curve, right axis).
        save_path: Destination image file path.
    """
    total_epoch = range(1, len(loss) + 1)

    fig, ax1 = plt.subplots()
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.plot(total_epoch, loss, 'red', linewidth = 2, label='loss')
    ax1.grid(True)

    # Second y-axis shares the epoch x-axis so both curves sit on one figure.
    # (Removed two `color = 'tab:...'` locals that were assigned but never
    # used — the plot calls pass color names directly.)
    ax2 = ax1.twinx()
    ax2.set_ylabel('Acc')
    ax2.plot(total_epoch, acc, 'blue', linewidth = 2, label='acc')
    fig.legend()
    fig.tight_layout()
    plt.savefig(save_path)
    # Close all figures so repeated epochs do not accumulate memory.
    plt.close("all")


def after_epoch(self,meta):
def after_epoch(self, meta):
    """Write per-epoch train/val metrics to CSV and redraw the curves.

    Expects ``meta['train_info']`` to hold parallel per-epoch lists
    'train_loss', 'train_acc', 'val_loss', 'val_acc', where the *_acc
    entries are eval-result dicts — assumed from the reads below; confirm
    against the training loop that fills meta.
    """
    # NOTE(review): this body appears to mix pre- and post-refactor lines
    # (duplicate accumulators, a dead first temp_data row, and legacy inline
    # plotting after the draw_loss_acc calls); each spot is flagged below.
    acc_epoch = []  # NOTE(review): legacy accumulator, superseded by val_acc_epoch
    epoch_outputs = []  # NOTE(review): immediately overwritten below — likely stale
    val_acc_epoch = []
    train_acc_epoch = []
    epoch_outputs = [['index', 'train_loss', 'train_acc', 'train_precision', 'train_recall', 'train_f1-score', 'val_loss', 'val_acc', 'val_precision', 'val_recall', 'val_f1-score']]
    with open(self.csv_dir, 'w', newline='') as f:
        writer = csv.writer(f)
        for i in range(len(meta['train_info']['train_loss'])):
            # NOTE(review): this first, narrow-schema temp_data row is dead —
            # it is overwritten by the wide-schema row two lines down.
            temp_data = [i+1, meta['train_info']['train_loss'][i], meta['train_info']['val_acc'][i].get('accuracy_top-1'),mean(meta['train_info']['val_acc'][i].get('precision',0.0)),mean(meta['train_info']['val_acc'][i].get('recall',0.0)),mean(meta['train_info']['val_acc'][i].get('f1_score',0.0))]
            acc_epoch.append(meta['train_info']['val_acc'][i].get('accuracy_top-1'))
            # One CSV row: index, train loss/acc/precision/recall/F1,
            # then val loss/acc/precision/recall/F1.
            temp_data = [i+1, meta['train_info']['train_loss'][i], meta['train_info']['train_acc'][i].get('accuracy_top-1'),mean(meta['train_info']['train_acc'][i].get('precision',0.0)),mean(meta['train_info']['train_acc'][i].get('recall',0.0)),mean(meta['train_info']['train_acc'][i].get('f1_score',0.0)), meta['train_info']['val_loss'][i], meta['train_info']['val_acc'][i].get('accuracy_top-1'),mean(meta['train_info']['val_acc'][i].get('precision',0.0)),mean(meta['train_info']['val_acc'][i].get('recall',0.0)),mean(meta['train_info']['val_acc'][i].get('f1_score',0.0))]
            val_acc_epoch.append(meta['train_info']['val_acc'][i].get('accuracy_top-1'))
            train_acc_epoch.append(meta['train_info']['train_acc'][i].get('accuracy_top-1'))
            epoch_outputs.append(temp_data)
        writer.writerows(epoch_outputs)

    # Draw per-epoch Train/Val loss-accuracy curves.
    '''
    绘制每周期Train|Val Loss-Accuracy
    '''
    # NOTE(review): the inline plotting below (total_epoch/ax1/ax2/self.pic_dir)
    # duplicates what the two draw_loss_acc calls already produce and looks
    # like leftover pre-refactor code — confirm whether it should remain.
    total_epoch = range(1,len(meta['train_info']['train_loss'])+1)

    fig, ax1 = plt.subplots()
    color = 'tab:red'
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.plot(total_epoch, meta['train_info']['train_loss'], 'red', linewidth = 2, label='Train loss')
    ax1.grid(True)
    # Train-side plot: train loss vs. train accuracy.
    train_loss_acc_pic = os.path.join(self.dir, 'train_loss-acc.png')
    self.draw_loss_acc(meta['train_info']['train_loss'], train_acc_epoch, train_loss_acc_pic)

    ax2 = ax1.twinx()
    color = 'tab:blue'
    ax2.set_ylabel('Acc')
    ax2.plot(total_epoch, acc_epoch, 'blue', linewidth = 2, label='Val acc')
    fig.legend()
    fig.tight_layout()
    plt.savefig(self.pic_dir)
    plt.close("all")
    # Val-side plot: val loss vs. val accuracy.
    val_loss_acc_pic = os.path.join(self.dir, 'val_loss-acc.png')
    self.draw_loss_acc(meta['train_info']['val_loss'], val_acc_epoch, val_loss_acc_pic)

48 changes: 34 additions & 14 deletions utils/train_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,25 +197,28 @@ def resume_model(model, runner, checkpoint, meta, resume_optimizer=True, map_loc
'''
训练
'''
def train(model, runner, lr_update_func, device, epoch, epoches, meta):
def train(model, runner, lr_update_func, device, epoch, epoches, test_cfg, meta):
train_loss = 0
pred_list, target_list = [], []
runner['epoch'] = epoch + 1
meta['epoch'] = runner['epoch']

model.train()
with tqdm(total=len(runner.get('train_loader')),desc=f'Train: Epoch {epoch + 1}/{epoches}',postfix=dict,mininterval=0.3) as pbar:
with tqdm(total=len(runner.get('train_loader')),desc=f'Train: Epoch {epoch + 1}/{epoches}', postfix=dict, mininterval=0.3) as pbar:
for iter, batch in enumerate(runner.get('train_loader')):
images, targets, _ = batch
with torch.no_grad():
images = images.to(device)
targets = targets.to(device)

target_list.append(targets)

runner.get('optimizer').zero_grad()
lr_update_func.before_train_iter(runner)
losses = model(images,targets=targets,return_loss=True)
preds, losses = model(images, targets=targets, return_loss=True, train_statu=True)
losses.get('loss').backward()
runner.get('optimizer').step()

pred_list.append(preds)
train_loss += losses.get('loss').item()
pbar.set_postfix(**{'Loss': train_loss / (iter + 1),
'Lr' : get_lr(runner.get('optimizer'))
Expand All @@ -224,7 +227,10 @@ def train(model, runner, lr_update_func, device, epoch, epoches, meta):
meta['iter'] = runner['iter']
pbar.update(1)

eval_results = evaluate(torch.cat(pred_list), torch.cat(target_list), test_cfg.get('metrics'), test_cfg.get('metric_options'))

meta['train_info']['train_loss'].append(train_loss / (iter + 1))
meta['train_info']['train_acc'].append(eval_results)

if train_loss/len(runner.get('train_loader')) < runner.get('best_train_loss') :
runner['best_train_loss'] = train_loss/len(runner.get('train_loader'))
Expand All @@ -233,31 +239,45 @@ def train(model, runner, lr_update_func, device, epoch, epoches, meta):
os.remove(runner['best_train_weight'])
runner['best_train_weight'] = os.path.join(meta['save_dir'],'Train_Epoch{:03}-Loss{:.3f}.pth'.format(epoch+1,train_loss / len(runner.get('train_loader'))))
meta['best_train_weight'] = runner['best_train_weight']
save_checkpoint(model,runner.get('best_train_weight'),runner.get('optimizer'),meta)
save_checkpoint(model,runner.get('best_train_weight'),runner.get('optimizer'), meta)

TITLE = 'Train Results'
TABLE_DATA = (
('Top-1 Acc', 'Top-5 Acc', 'Mean Precision', 'Mean Recall', 'Mean F1 Score'),
('{:.2f}'.format(eval_results.get('accuracy_top-1',0.0)), '{:.2f}'.format(eval_results.get('accuracy_top-5',100.0)), '{:.2f}'.format(mean(eval_results.get('precision',0.0))),'{:.2f}'.format(mean(eval_results.get('recall',0.0))),'{:.2f}'.format(mean(eval_results.get('f1_score',0.0)))),
)
table_instance = AsciiTable(TABLE_DATA,TITLE)
#table_instance.justify_columns[2] = 'right'
print()
print(table_instance.table)
print()


def validation(model, runner, cfg, device, epoch, epoches, meta):
preds,targets = [],[]
pred_list, target_list = [], []
val_loss = 0.0
model.eval()
with torch.no_grad():
with tqdm(total=len(runner.get('val_loader')), desc=f'Test : Epoch {epoch + 1}/{epoches}',mininterval=0.3) as pbar:
with tqdm(total=len(runner.get('val_loader')), desc=f'Test : Epoch {epoch + 1}/{epoches}', postfix=dict, mininterval=0.3) as pbar:
for iter, batch in enumerate(runner.get('val_loader')):
images, target, _ = batch
outputs = model(images.to(device),return_loss=False)
preds.append(outputs)
targets.append(target.to(device))
images, targets, _ = batch
preds, losses = model(images.to(device), targets = targets.to(device), return_loss=True, train_statu=True)
pred_list.append(preds)
target_list.append(targets.to(device))
val_loss += losses.get('loss').item()
pbar.set_postfix(**{'Loss': val_loss / (iter + 1)})
pbar.update(1)

eval_results = evaluate(torch.cat(preds),torch.cat(targets),cfg.get('metrics'),cfg.get('metric_options'))
eval_results = evaluate(torch.cat(pred_list),torch.cat(target_list),cfg.get('metrics'),cfg.get('metric_options'))

meta['train_info']['val_acc'].append(eval_results)
meta['train_info']['val_loss'].append(val_loss / (iter + 1))

TITLE = 'Validation Results'
TABLE_DATA = (
('Top-1 Acc', 'Top-5 Acc', 'Mean Precision', 'Mean Recall', 'Mean F1 Score'),
('{:.2f}'.format(eval_results.get('accuracy_top-1',0.0)), '{:.2f}'.format(eval_results.get('accuracy_top-5',100.0)), '{:.2f}'.format(mean(eval_results.get('precision',0.0))),'{:.2f}'.format(mean(eval_results.get('recall',0.0))),'{:.2f}'.format(mean(eval_results.get('f1_score',0.0)))),

)
)
table_instance = AsciiTable(TABLE_DATA,TITLE)
#table_instance.justify_columns[2] = 'right'
print()
Expand Down

0 comments on commit e97dd70

Please sign in to comment.