Skip to content

Commit

Permalink
Log updates/sec at each checkpoint. Start measuring time after checkp…
Browse files Browse the repository at this point in the history
…ointing and validation data eval (awslabs#279)
  • Loading branch information
fhieber authored Jan 23, 2018
1 parent 061f678 commit dbeb4fa
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions sockeye/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,10 +489,10 @@ def _fit(self,
memory_data=utils.get_gpu_memory_usage(self.context))

toc = time.time()
logger.info("Checkpoint [%d]\tUpdates=%d Epoch=%d Samples=%d Time-cost=%.3f",
time_cost = toc - tic
logger.info("Checkpoint [%d]\tUpdates=%d Epoch=%d Samples=%d Time-cost=%.3f Updates/sec=%.3f",
train_state.checkpoint, train_state.updates, train_state.epoch,
train_state.samples, (toc - tic))
tic = time.time()
train_state.samples, time_cost, checkpoint_frequency/time_cost)

for name, val in metric_train.get_name_value():
logger.info('Checkpoint [%d]\tTrain-%s=%f', train_state.checkpoint, name, val)
Expand Down Expand Up @@ -560,6 +560,8 @@ def _fit(self,

self._checkpoint(train_state, output_folder, train_iter)

tic = time.time()

cleanup_params_files(output_folder, max_params_files_to_keep,
train_state.checkpoint, self.training_monitor.get_best_checkpoint())

Expand Down

0 comments on commit dbeb4fa

Please sign in to comment.