
Commit

[Hetero] Fix bug during loading ckpt if enabling CPU communication (FlagOpen#248)

Fix bug during loading ckpt if enabling CPU communication

Co-authored-by: lizhiyu <[email protected]>
heavyrain-lzy and lizhiyu authored Nov 1, 2024
1 parent de75f52 commit 9e5b3cb
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion examples/llama/conf/train/train_llama3_8b.yaml
@@ -37,7 +37,7 @@ model:
normalization: RMSNorm
rotary_interleaved_patch: False
position_embedding_type: rope
-rotary_base: 500000.0
+rotary_base: 500000
untie_embeddings_and_output_weights: True
init_method_std: 0.02
attention_dropout: 0.0
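The YAML hunk above swaps a float literal for an integer one. With a YAML loader such as PyYAML, the two literals deserialize to different Python types, which can matter when the value later feeds integer arithmetic. A minimal illustration, assuming PyYAML is installed (`before`/`after` are illustrative names, not part of the patch):

```python
import yaml  # PyYAML

# The same key parsed from the old and the new literal.
before = yaml.safe_load("rotary_base: 500000.0")
after = yaml.safe_load("rotary_base: 500000")

print(type(before["rotary_base"]).__name__)  # float
print(type(after["rotary_base"]).__name__)   # int
```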
2 changes: 1 addition & 1 deletion megatron/megatron/training/checkpointing.py
@@ -244,7 +244,7 @@ def read_metadata(tracker_filename):

# Get the max iteration retrieved across the ranks.
if torch.distributed.is_initialized():
-iters_cuda = torch.tensor([iteration], dtype=torch.long, device='cuda')
+iters_cuda = torch.tensor([iteration], dtype=torch.long, device='cuda' if 'nccl' in torch.distributed.get_backend() else 'cpu')
torch.distributed.all_reduce(iters_cuda, op=torch.distributed.ReduceOp.MAX)
max_iter = iters_cuda[0].item()

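The one-line fix keys the tensor's device off the active process-group backend: NCCL only reduces CUDA tensors, while CPU backends such as gloo need CPU tensors. The selection logic can be isolated as a small sketch (`pick_reduce_device` is a hypothetical helper name, not part of the patch):

```python
# Sketch of the device-selection logic in the fix. NCCL requires the
# all_reduce tensor to live on a CUDA device; CPU backends (e.g. gloo)
# require a CPU tensor.
def pick_reduce_device(backend: str) -> str:
    """Return the device string for collective tensors given the backend name."""
    return 'cuda' if 'nccl' in backend else 'cpu'

print(pick_reduce_device('gloo'))  # cpu
```

Substring matching rather than equality appears deliberate: `torch.distributed.get_backend()` can return a composite string such as `cpu:gloo,cuda:nccl` when per-device backends are configured.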
