#!/bin/bash
# pyrun124M.sh (from a fork of karpathy/llm.c)
# the same as run124M.sh but with PyTorch
# current restrictions:
# - does not write checkpoints, only logs of the train/val losses
# - does not evaluate hellaswag accuracy
# - cannot "resume training" (i.e. no equivalent of the `-y 1` flag)
# if you wish to train on just a single GPU, simply skip the torchrun part, i.e.
# python train_gpt2.py ... (all the other arguments the same; a full commented-out
# example is at the end of this file)
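# note on the batch math (added for clarity): each of the 4 GPUs sees
# 32 * 1024 = 32,768 tokens per micro-batch, i.e. 131,072 tokens per micro-step
# across ranks, so reaching --total_batch_size 524288 implies 4 gradient
# accumulation steps per iteration (assuming train_gpt2.py derives accumulation
# as total_batch_size / (batch_size * sequence_length * num_gpus), as in llm.c).
# 18865 iterations * 524288 tokens ~= 9.89B tokens, roughly one pass over FineWeb10B.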
torchrun --standalone --nproc_per_node=4 train_gpt2.py \
--input_bin "dev/data/fineweb10B/fineweb_train_*.bin" \
--input_val_bin "dev/data/fineweb10B/fineweb_val_*.bin" \
--val_loss_every 250 \
--sample_every 0 \
--output_dir pylog124M \
--write_tensors 0 \
--model d12 \
--batch_size 32 \
--sequence_length 1024 \
--total_batch_size 524288 \
--dtype bfloat16 \
--compile 1 \
--tensorcores 1 \
--flash 1 \
--num_iterations 18865 \
--weight_decay 0.1 \
--zero_stage 1 \
--learning_rate 0.0006 \
--warmup_iters 700 \
--learning_rate_decay_frac 0.0 \
--overfit_single_batch 0
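
# for reference, the single-GPU invocation described in the comments above would
# look like the following (a sketch with the same arguments, left commented out;
# note that with one GPU, 524288 / (32 * 1024) = 16 gradient accumulation steps):
# python train_gpt2.py \
#     --input_bin "dev/data/fineweb10B/fineweb_train_*.bin" \
#     --input_val_bin "dev/data/fineweb10B/fineweb_val_*.bin" \
#     --val_loss_every 250 \
#     --sample_every 0 \
#     --output_dir pylog124M \
#     --write_tensors 0 \
#     --model d12 \
#     --batch_size 32 \
#     --sequence_length 1024 \
#     --total_batch_size 524288 \
#     --dtype bfloat16 \
#     --compile 1 \
#     --tensorcores 1 \
#     --flash 1 \
#     --num_iterations 18865 \
#     --weight_decay 0.1 \
#     --zero_stage 1 \
#     --learning_rate 0.0006 \
#     --warmup_iters 700 \
#     --learning_rate_decay_frac 0.0 \
#     --overfit_single_batch 0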