Skip to content

Commit

Permalink
support finetuning with redpajama models
Browse files Browse the repository at this point in the history
  • Loading branch information
xzyaoi committed May 7, 2023
1 parent 2777e03 commit 39bc7c0
Show file tree
Hide file tree
Showing 4 changed files with 257 additions and 76 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,10 @@ dmypy.json
/pretrained/GPT-NeoX-20B/EleutherAI_gpt-neox-20b/
/pretrained/Pythia-6.9B-deduped/EleutherAI_pythia-6.9b-deduped/
/pretrained/RedPajama-3B/

# ignore training output
# (/model_ckpts/ is the --checkpoint-path root used by training/finetune_RedPajama-INCITE-Chat-3B-v1.sh)
/model_ckpts/
/huggingface_models/
/training/wandb/

# ignore JSON Lines files anywhere in the tree, e.g. the dataset that
# data/OIG-chip2/prepare.sh downloads as unified_chip2.jsonl
*.jsonl
1 change: 1 addition & 0 deletions data/OIG-chip2/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/usr/bin/env bash
#
# Download the OIG unified_chip2 dataset (JSON Lines) into the directory that
# contains this script.
#
# The destination is resolved from the script's own location rather than the
# caller's working directory, so the script can be run from anywhere. When it
# is run from the repository root the file lands at the same path as before:
# data/OIG-chip2/unified_chip2.jsonl.

set -euo pipefail

readonly DATASET_URL="https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"

# Absolute path of the directory holding this script.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

wget "${DATASET_URL}" -O "${script_dir}/unified_chip2.jsonl"
56 changes: 56 additions & 0 deletions training/finetune_RedPajama-INCITE-Chat-3B-v1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env bash
#
# Fine-tune RedPajama-INCITE-Chat-3B-v1 on the OIG-chip2 dataset by launching
# one dist_clm_train.py process per rank on the local machine.
#
# Usage: bash training/finetune_RedPajama-INCITE-Chat-3B-v1.sh
#
# Exit status: 0 when every rank exits 0, otherwise the status of a failed
# rank. Ctrl-C (SIGINT) signals the whole process group so no rank is left
# running in the background.

set -u

# Absolute path of the directory containing this script; every other path
# below is derived from it so the script does not depend on the caller's cwd.
DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) || {
  printf 'error: cannot resolve the script directory\n' >&2
  exit 1
}

# Network interface handed to Gloo, NCCL and --net-interface. All ranks are
# started on this machine and rendezvous on 127.0.0.1, hence loopback.
netif=lo
export GLOO_SOCKET_IFNAME=${netif}
export NCCL_SOCKET_IFNAME=${netif}

# Name of the checkpoint sub-directory under model_ckpts/.
# NOTE: the "fintuned" spelling is kept as-is so existing checkpoint paths
# remain valid.
export MODEL_NAME=rp-incite-chat-3b-fintuned

# Exported for the training processes; its meaning is defined by the training
# code, not by this script.
export SHOW_DATA=0

BASE_MODEL="${DIR}/../pretrained/RedPajama-3B/togethercomputer_RedPajama-INCITE-Chat-3B-v1"

CHECKPOINT_STEPS=100

# Value passed to --task-name: "<path>:<number>".
# NOTE(review): the trailing ":1" looks like a sampling weight - confirm
# against dist_clm_train.py.
DATASETS="${DIR}/../data/OIG-chip2/unified_chip2.jsonl:1"

# Number of ranks to launch. The same value is passed as --world-size, so the
# process count and the flag cannot drift apart.
# It must match --pipeline-group-size x --data-group-size below (4 x 2).
WORLD_SIZE=8

# Arguments shared by every rank. An array keeps each value a single word even
# if a path contains whitespace or glob characters.
ARGS=(
  --model-name "${BASE_MODEL}"
  --tokenizer-name "${BASE_MODEL}"
  --project-name together
  --model-type gptneox
  --optimizer adam
  --seed 42
  --load-pretrained-model true
  --task-name "${DATASETS}"
  --checkpoint-path "${DIR}/../model_ckpts/${MODEL_NAME}"
  --total-steps 200 --warmup-steps 10 --train-warmup-steps 0
  --checkpoint-steps "${CHECKPOINT_STEPS}"
  --lr 1e-5 --seq-length 2048 --batch-size 32 --micro-batch-size 1 --gradient-accumulate-step 1
  --dist-url tcp://127.0.0.1:7033
  --num-layers 8 --embedding-dim 2560
  --world-size "${WORLD_SIZE}" --pipeline-group-size 4 --data-group-size 2
  --job-id 0 --net-interface "${netif}"
  --fp16
  --dp-backend nccl
  --dp-mode allreduce
  --pp-mode gpipe --profiling no-profiling
)

# Launch the ranks inside a subshell so the SIGINT trap and the job list stay
# local to it. Rank i is given --cuda-id i. This subshell must remain the last
# command of the script: its exit status is the script's exit status.
(
  trap 'kill 0' SIGINT

  pids=()
  for ((rank = 0; rank < WORLD_SIZE; rank++)); do
    python "${DIR}/dist_clm_train.py" "${ARGS[@]}" \
      --cuda-id "${rank}" --rank "${rank}" &
    pids+=("$!")
  done

  # Wait on each rank individually: a bare `wait` always returns 0 and would
  # hide a crashed rank.
  status=0
  for pid in "${pids[@]}"; do
    wait "${pid}" || status=$?
  done
  exit "${status}"
)
Loading

0 comments on commit 39bc7c0

Please sign in to comment.