Merge pull request karpathy#71 from cchan/patch-1
Zero-grad more aggressively to save memory
karpathy authored Jan 20, 2023
2 parents 1f77d03 + 6716607 commit 3611338
Showing 1 changed file with 1 addition and 1 deletion.
train.py
@@ -259,7 +259,6 @@ def get_lr(iter):
         break
 
     # forward backward update, with optional gradient accumulation to simulate larger batch size
-    optimizer.zero_grad(set_to_none=True)
     for micro_step in range(gradient_accumulation_steps):
         X, Y = get_batch('train')
         if ddp:
@@ -272,6 +271,7 @@ def get_lr(iter):
             logits, loss = model(X, Y)
         loss.backward()
     optimizer.step()
+    optimizer.zero_grad(set_to_none=True)
 
     # timing and logging
     t1 = time.time()
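The change moves optimizer.zero_grad(set_to_none=True) from the top of the iteration to immediately after optimizer.step(). With set_to_none=True, PyTorch sets each param.grad to None rather than filling the existing tensors with zeros, so, per the commit message, gradient memory is released as soon as the update is done instead of staying allocated through the timing/logging code until the next forward/backward pass. Below is a minimal sketch of the resulting loop structure; the toy model, optimizer settings, and random-batch helper are illustrative stand-ins for nanoGPT's actual setup, and only the placement of zero_grad mirrors the diff.

```python
import torch
import torch.nn as nn

# Toy stand-ins for nanoGPT's model and data (illustrative only).
model = nn.Linear(16, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
gradient_accumulation_steps = 4

def get_batch():
    X = torch.randn(8, 16)
    Y = torch.randint(0, 2, (8,))
    return X, Y

for iter_num in range(10):
    # forward/backward with gradient accumulation: each micro-step's
    # gradients sum into param.grad across the inner loop
    for micro_step in range(gradient_accumulation_steps):
        X, Y = get_batch()
        logits = model(X)
        loss = nn.functional.cross_entropy(logits, Y)
        loss.backward()
    optimizer.step()
    # zero-grad immediately after the step: set_to_none=True sets each
    # param.grad to None, releasing the gradient buffers now rather
    # than keeping them alive until the top of the next iteration
    optimizer.zero_grad(set_to_none=True)
```

One consequence of this ordering is that gradients are never left populated between iterations, so whatever runs between step() and the next backward pass (timing, logging, evaluation) no longer holds on to gradient memory.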
