From 2f976aae0acb68941e45bb168d2d7a5ede91dd7d Mon Sep 17 00:00:00 2001
From: Sergey Edunov
Date: Tue, 27 Feb 2018 11:41:20 -0800
Subject: [PATCH] Making our code compatible with the latest pytorch (#223)

* Making our code compatible with the latest pytorch

* revert

* torch.nn.utils.clip_grad_norm now returns tensor
---
 fairseq/criterions/cross_entropy.py                | 4 ++--
 fairseq/criterions/label_smoothed_cross_entropy.py | 4 ++--
 fairseq/distributed_utils.py                       | 2 +-
 fairseq/trainer.py                                 | 2 +-
 fairseq/utils.py                                   | 7 +++++++
 5 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/fairseq/criterions/cross_entropy.py b/fairseq/criterions/cross_entropy.py
index 85e0195c62..5d35d7f0ba 100644
--- a/fairseq/criterions/cross_entropy.py
+++ b/fairseq/criterions/cross_entropy.py
@@ -9,7 +9,7 @@
 import torch.nn.functional as F
 
 from . import FairseqCriterion, register_criterion
-
+from fairseq import utils
 
 @register_criterion('cross_entropy')
 class CrossEntropyCriterion(FairseqCriterion):
@@ -33,7 +33,7 @@ def forward(self, model, sample, reduce=True):
                                reduce=reduce)
         sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
         logging_output = {
-            'loss': loss.data[0] if reduce else loss.data,
+            'loss': utils.item(loss.data) if reduce else loss.data,
             'ntokens': sample['ntokens'],
             'sample_size': sample_size,
         }
diff --git a/fairseq/criterions/label_smoothed_cross_entropy.py b/fairseq/criterions/label_smoothed_cross_entropy.py
index e36c29624f..ebe698f050 100644
--- a/fairseq/criterions/label_smoothed_cross_entropy.py
+++ b/fairseq/criterions/label_smoothed_cross_entropy.py
@@ -79,8 +79,8 @@ def forward(self, model, sample, reduce=True):
         nll_loss = F.nll_loss(lprobs, target, size_average=False, ignore_index=self.padding_idx, reduce=reduce)
         sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens']
         logging_output = {
-            'loss': loss.data[0] if reduce else loss.data,
-            'nll_loss': nll_loss.data[0] if reduce else loss.data,
+            'loss': utils.item(loss.data) if reduce else loss.data,
+            'nll_loss': utils.item(nll_loss.data) if reduce else loss.data,
             'ntokens': sample['ntokens'],
             'sample_size': sample_size,
         }
diff --git a/fairseq/distributed_utils.py b/fairseq/distributed_utils.py
index 75aec3f8b4..8d8b279486 100644
--- a/fairseq/distributed_utils.py
+++ b/fairseq/distributed_utils.py
@@ -116,7 +116,7 @@ def all_gather_list(data, max_size=4096):
     if len(enc) >= max_size:
         raise ValueError('encoded data exceeds max_size: {}'.format(len(enc)))
     in_buffer[0] = len(enc)
-    in_buffer[1:len(enc)+1] = torch.ByteTensor(enc)
+    in_buffer[1:len(enc)+1] = torch.ByteTensor(list(enc))
 
     torch.distributed.all_gather(out_buffers, in_buffer.cuda())
 
diff --git a/fairseq/trainer.py b/fairseq/trainer.py
index b4be2400dc..a68f2473a9 100644
--- a/fairseq/trainer.py
+++ b/fairseq/trainer.py
@@ -190,7 +190,7 @@ def _backward_and_opt(self, loss, grad_denom):
 
         # clip grads
         if self.args.clip_norm > 0:
-            grad_norm = torch.nn.utils.clip_grad_norm(self.model.parameters(), self.args.clip_norm)
+            grad_norm = utils.item(torch.nn.utils.clip_grad_norm(self.model.parameters(), self.args.clip_norm))
         else:
             grad_norm = math.sqrt(sum(p.grad.data.norm()**2 for p in self.model.parameters()))
 
diff --git a/fairseq/utils.py b/fairseq/utils.py
index 8215d6cb72..edda6a1c33 100644
--- a/fairseq/utils.py
+++ b/fairseq/utils.py
@@ -304,3 +304,10 @@ def convert_padding_direction(
     else:
         index = torch.remainder(range + num_pads, max_len)
     return src_tokens.gather(1, index)
+
+def item(tensor):
+    if hasattr(tensor, 'item'):
+        return tensor.item()
+    if hasattr(tensor, '__getitem__'):
+        return tensor[0]
+    return tensor
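
Note (not part of the patch): the new utils.item() helper exists because recent PyTorch returns zero-dimensional tensors from reduced losses and from torch.nn.utils.clip_grad_norm, whereas older releases returned values that had to be indexed with [0]. Below is a minimal standalone sketch of how the helper behaves, assuming a reasonably recent PyTorch install; the sample loss is illustrative only, not code from the repository.

    import torch
    import torch.nn.functional as F

    def item(tensor):
        # PyTorch >= 0.4: 0-dim tensors expose .item() to extract a Python scalar
        if hasattr(tensor, 'item'):
            return tensor.item()
        # older releases: 1-element tensors/Variables are indexed with [0]
        if hasattr(tensor, '__getitem__'):
            return tensor[0]
        # plain Python numbers pass through unchanged
        return tensor

    loss = F.mse_loss(torch.randn(3), torch.zeros(3))  # reduced loss: a 0-dim tensor
    print(item(loss))   # plain Python float, safe to store in logging_output
    print(item(2.5))    # non-tensor values are returned as-is

Wrapping values this way keeps the logging_output dictionaries and the reported grad_norm as plain Python numbers regardless of which PyTorch version is installed.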