diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 13ca29c9fceb..3f43e865fa72 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1575,7 +1575,7 @@ def set_none_gradients_to_zero(self, i, partition_id): for param_id in self.is_grad_computed[i][partition_id]: param = self.param_dict[param_id] if param.grad is None: - param.grad = torch.zero_like(param) + param.grad = torch.zeros_like(param) ######################Reduction Related Methods############################## diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 225c085f6f2b..3d5ff5e6b43e 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1474,7 +1474,7 @@ def set_none_gradients_to_zero(self, i, partition_id): for param_id in self.is_grad_computed[i][partition_id]: param = self.param_dict[param_id] if param.grad is None: - param.grad = torch.zero_like(param) + param.grad = torch.zeros_like(param) ######################Reduction Related Methods############################## def allreduce_bucket(self, bucket, rank=None, log=None, divide=True, process_group=None):