Skip to content

Commit

Permalink
Merge: [GNMT/PyT] Added synchronization before collecting timers, switched to correct averaging when reporting avg throughput
Browse files Browse the repository at this point in the history
  • Loading branch information
nv-kkudrynski committed Feb 15, 2023
2 parents 7ddd062 + 327898a commit ed28348
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 10 deletions.
6 changes: 5 additions & 1 deletion PyTorch/Translation/GNMT/seq2seq/inference/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,8 @@ def evaluate(self, loader, epoch=0, iteration=0, warmup=0, summary=False):
output = []

for i, (src, indices) in enumerate(loader):
if device.type == 'cuda':
torch.cuda.synchronize()
translate_timer = time.time()
src, src_length = src
stats['total_enc_len'] = int(src_length.sum())
Expand All @@ -207,12 +209,14 @@ def evaluate(self, loader, epoch=0, iteration=0, warmup=0, summary=False):
detok = self.tokenizer.detokenize(pred)
output.append(detok)

if device.type == 'cuda':
torch.cuda.synchronize()
elapsed = time.time() - translate_timer
batch_time.update(elapsed, batch_size)

total_tokens = stats['total_dec_len'] + stats['total_enc_len']
ttps = total_tokens / elapsed
tot_tok_per_sec.update(ttps, batch_size)
tot_tok_per_sec.update(ttps, elapsed)

iterations.update(stats['iters'])
enc_seq_len.update(stats['total_enc_len'] / batch_size, batch_size)
Expand Down
12 changes: 9 additions & 3 deletions PyTorch/Translation/GNMT/seq2seq/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ def feed_data(self, data_loader, training=True):

batch_size = data_loader.batch_size

if self.device.type == 'cuda':
torch.cuda.synchronize()
end = time.time()
for i, (src, tgt) in enumerate(data_loader):
self.save_counter += 1
Expand All @@ -241,12 +243,14 @@ def feed_data(self, data_loader, training=True):
losses_per_sentence.update(loss_per_sentence, batch_size)

# measure elapsed time
if self.device.type == 'cuda':
torch.cuda.synchronize()
elapsed = time.time() - end
batch_time.update(elapsed)
src_tok_time.update(num_toks['src'] / elapsed)
tgt_tok_time.update(num_toks['tgt'] / elapsed)
src_tok_time.update(num_toks['src'] / elapsed, elapsed)
tgt_tok_time.update(num_toks['tgt'] / elapsed, elapsed)
tot_num_toks = num_toks['tgt'] + num_toks['src']
tot_tok_time.update(tot_num_toks / elapsed)
tot_tok_time.update(tot_num_toks / elapsed, elapsed)
self.loss = losses_per_token.avg

if training and i in eval_iters:
Expand Down Expand Up @@ -298,6 +302,8 @@ def feed_data(self, data_loader, training=True):
if rank == 0:
self.save(identifier=identifier)

if self.device.type == 'cuda':
torch.cuda.synchronize()
end = time.time()

tot_tok_time.reduce('sum')
Expand Down
5 changes: 4 additions & 1 deletion PyTorch/Translation/GNMT/seq2seq/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,13 @@ def setup_seeds(master_seed, epochs, device):

def barrier():
    """
    Synchronize all workers and any outstanding GPU work.

    Calls torch.distributed.barrier() if distributed is in use, else calls
    torch.cuda.synchronize() if CUDA is initialized.

    No-op when neither distributed nor CUDA is active, so it is always safe
    to call (e.g. on CPU-only single-process runs).
    """
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        # Multi-process run: block until every rank reaches this point.
        torch.distributed.barrier()
    elif torch.cuda.is_available() and torch.cuda.is_initialized():
        # Single-process CUDA run: wait for all queued kernels to finish
        # so host-side timers measure completed GPU work.
        torch.cuda.synchronize()


def get_rank():
Expand Down
2 changes: 1 addition & 1 deletion PyTorch/Translation/GNMT/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ def main():
logging.info(f'Total training time {training_time:.0f} s')

table = TrainingTable()
avg_training_perf = sum(training_perf) / len(training_perf)
avg_training_perf = len(training_perf) / sum(1 / v for v in training_perf)
table.add(utils.get_world_size(), args.train_batch_size, test_bleu,
avg_training_perf, training_time)
if utils.get_rank() == 0:
Expand Down
6 changes: 2 additions & 4 deletions PyTorch/Translation/GNMT/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,12 +352,10 @@ def main():
latency_table.write('Inference latency', 'fp16',
relative=relative, reverse_speedup=True)

avg_throughput = np.array(stats['throughputs']).mean()
avg_latency = np.array(stats['runtimes']).mean()
summary = {
'eval_throughput': avg_throughput,
'eval_throughput': stats['tokens_per_sec'],
'eval_bleu': stats['bleu'],
'eval_avg_latency': avg_latency,
'eval_avg_latency': np.array(stats['runtimes']).mean(),
}
for p in args.percentiles:
summary[f'eval_{p}%_latency'] = np.percentile(stats['runtimes'], p)
Expand Down

0 comments on commit ed28348

Please sign in to comment.