Commit 4d0ea3d
Cuda rng_state_all is used when saving in distributed mode so same should also be used when loading (huggingface#23045)

CUDA RNG state should be restored with set_rng_state_all in distributed mode, because the states of all devices were saved with get_rng_state_all.
ShivamShrirao authored Apr 28, 2023
1 parent 521a8ff commit 4d0ea3d
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/transformers/trainer.py
@@ -2327,10 +2327,10 @@ def _load_rng_state(self, checkpoint):
         torch.random.set_rng_state(checkpoint_rng_state["cpu"])
         if torch.cuda.is_available():
             if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
-                torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
+                torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
             else:
                 try:
-                    torch.cuda.random.set_rng_state_all(checkpoint_rng_state["cuda"])
+                    torch.cuda.random.set_rng_state(checkpoint_rng_state["cuda"])
                 except Exception as e:
                     logger.info(
                         f"Didn't manage to set back the RNG states of the GPU because of the following error:\n {e}"
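
For context, a minimal standalone sketch of the save/load symmetry this commit restores. This is an illustration, not the Trainer's actual checkpoint code: is_distributed and cuda_state are hypothetical names standing in for the Trainer's ParallelMode.DISTRIBUTED check and the checkpoint_rng_state["cuda"] entry.

    import torch

    # Hypothetical flag standing in for:
    #   self.args.parallel_mode == ParallelMode.DISTRIBUTED
    is_distributed = torch.distributed.is_available() and torch.distributed.is_initialized()

    if torch.cuda.is_available():
        # Saving: in distributed mode the RNG state of every visible GPU is
        # captured, so the checkpoint entry is a list of per-device tensors.
        if is_distributed:
            cuda_state = torch.cuda.random.get_rng_state_all()  # list of ByteTensors
        else:
            cuda_state = torch.cuda.random.get_rng_state()      # single ByteTensor

        # Loading: the setter must mirror the getter that produced the state.
        # set_rng_state_all expects the list; set_rng_state expects one tensor.
        if is_distributed:
            torch.cuda.random.set_rng_state_all(cuda_state)
        else:
            torch.cuda.random.set_rng_state(cuda_state)

Because the distributed save path stores a list of per-device states, restoring it with the singular set_rng_state cannot work as intended; the fix makes the loader mirror the saver in both branches.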
