#!/bin/bash
# pretrain_codegeex_deepspeed_zero2.sh (forked from THUDM/CodeGeeX)
SCRIPT_PATH=$(realpath "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
MAIN_DIR=$(dirname "$SCRIPT_DIR")
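# NOTE (assumption): this script is expected to sit one directory below the
# CodeGeeX checkout, so MAIN_DIR resolves to the repository root containing
# codegeex/ and pt_data/.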
# ====== Environment ======
# - NCCL & IB
# export NCCL_DEBUG=info
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
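# NCCL_IB_GID_INDEX=3 typically selects the RoCE v2 GID on Mellanox NICs;
# this is cluster-specific, so adjust it if your IB/RoCE fabric differs.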
HOSTFILE=${SCRIPT_DIR}/my_hostfile
MASTER_IP=$(head -n 1 "$HOSTFILE")
awk '{print $1 " slots=8"}' "$HOSTFILE" > "$SCRIPT_DIR/hostfile"
echo "MASTER_IP=$MASTER_IP"
# ====== Parameters ======
TP=4
PP=1
DATA_PATH=${MAIN_DIR}/pt_data/my_data
CKPT_PATH=${SCRIPT_DIR}/mp${TP}_parallel_weights/
DS_CONFIG=${SCRIPT_DIR}/ds_config.json
# - 13b
NLAYERS=39
HIDDEN=5120
NATTN_HEAD=40
EMBED_VOCAB=52224
GLOBAL_BATCH=8
MICRO_BATCH=2
NTRAIN_ITERS=25
EVAL_INT=10
SAVE_INT=100
TRIAL_TAG="13b-test"
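# Batch-size arithmetic (sketch, assuming a single 8-GPU node): data-parallel
# size = world_size / (TP * PP) = 8 / (4 * 1) = 2, so gradient accumulation
# = GLOBAL_BATCH / (MICRO_BATCH * DP) = 8 / (2 * 2) = 2.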
# - trial
TRIAL_NAME="pretrain-codegeex"
# - zero stage
ZERO_STAGE=2
# - logging & output
NOW=$(date +"%Y%m%d_%H%M%S")
OUTPUT_DIR="/data2/csw/$TRIAL_NAME-$TRIAL_TAG"
TB_DIR=$OUTPUT_DIR/tb$NOW
mkdir -p $OUTPUT_DIR
mkdir -p $TB_DIR
# Deepspeed config
cat <<EOT > $DS_CONFIG
{
"train_batch_size" : $GLOBAL_BATCH,
"train_micro_batch_size_per_gpu": $MICRO_BATCH,
"steps_per_print": 5,
"zero_optimization": {
"stage": $ZERO_STAGE,
"reduce_bucket_size": 50000000,
"allgather_bucket_size": 50000000,
"overlap_comm": true,
"contiguous_gradients": false
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"wall_clock_breakdown" : true
}
EOT
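# Optional sanity check that the generated config is valid JSON (assumes
# python3 is available on the launch node).
if command -v python3 >/dev/null 2>&1; then
    python3 -m json.tool "$DS_CONFIG" > /dev/null || { echo "Invalid JSON in $DS_CONFIG"; exit 1; }
fi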
ds_args=""
ds_args=" --deepspeed ${ds_args}"
ds_args=" --no-pipeline-parallel ${ds_args}"
ds_args=" --deepspeed_config=$DS_CONFIG ${ds_args}"
ds_args=" --zero-stage=$ZERO_STAGE ${ds_args}"
ds_args=" --deepspeed-activation-checkpointing ${ds_args}"
echo "Launching deepspeed"
deepspeed \
--hostfile $SCRIPT_DIR/hostfile \
--master_addr $MASTER_IP \
--master_port 29501 \
$MAIN_DIR/codegeex/megatron/tools/pretrain_codegeex.py \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--no-pipeline-parallel \
--num-layers $NLAYERS \
--hidden-size $HIDDEN \
--make-vocab-size-divisible-by $EMBED_VOCAB \
--num-attention-heads $NATTN_HEAD \
--seq-length 512 \
--loss-scale 12 \
--max-position-embeddings 2048 \
--micro-batch-size $MICRO_BATCH \
--global-batch-size $GLOBAL_BATCH \
--train-iters $NTRAIN_ITERS \
--lr 2e-4 \
--min-lr 1e-7 \
--lr-decay-iters 100000 \
--lr-decay-style cosine \
--lr-warmup-iters 1500 \
--log-interval 1 \
--eval-iters 10 \
--eval-interval $EVAL_INT \
--data-path $DATA_PATH \
--data-impl mmap \
--vocab-file $MAIN_DIR/codegeex/tokenizer/vocab.json \
--merge-file $MAIN_DIR/codegeex/tokenizer/merges.txt \
--save-interval $SAVE_INT \
--save $OUTPUT_DIR \
--load-state $CKPT_PATH \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--fp16 \
--ln-fp16 \
--attention-softmax-in-fp32 \
--checkpoint-activations \
--override-lr-scheduler \
--tensorboard-dir $TB_DIR \
$ds_args |& tee ${OUTPUT_DIR}/$NOW.log