add support for pt
tothemoon96 committed Jul 19, 2023
1 parent 7b356b9 commit 8aad566
Showing 7 changed files with 693 additions and 40 deletions.
12 changes: 6 additions & 6 deletions train/README.md
@@ -96,7 +96,7 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
The training launch script lives in scripts/run.sh; adjust the parameters in run.sh to match your actual setup.

```bash
-bash scripts/run.sh
+bash scripts/run_sft.sh
```
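
For orientation, here is a minimal sketch of the variables usually edited before launching; the names are taken from the run_sft.sh added in this commit, and the values are placeholders:

```bash
# Placeholder values only; adjust to your environment before running the script.
export ABS_PATH=...                               # directory that contains BELLE/
model_name_or_path=/path_to_llm/hf_llama_7b/      # or bloomz-7b1-mt
train_file=belleMath.json
validation_file=belleMath-dev1K.json
output_dir="$ABS_PATH/saved_models/${WANDB_PROJECT}_${WANDB_RUN_ID}"
```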

- model_name_or_path refers to the pretrained model (for LLaMA models, convert the weights to hf format first so they can be loaded with from_pretrained)
@@ -116,7 +116,7 @@ run.sh contains the launch commands for both training modes, full-parameter fine-tuning and LoRA,
The command below runs full-parameter fine-tuning on a single machine with multiple GPUs, using deepspeed, with LLaMA as the base model

```bash
-torchrun --nproc_per_node 8 train.py \
+torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
--model_name_or_path ${model_name_or_path} \
--llama \
--deepspeed configs/deepspeed_config.json \
@@ -180,7 +180,7 @@ trainer_state.json records the changes in loss and learning_rate
#### 2.2.2 LoRA

```bash
-torchrun --nproc_per_node 8 train.py \
+torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
--model_name_or_path ${model_name_or_path} \
--llama \
--use_lora True \
@@ -284,7 +284,7 @@ torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_por
If you have made it this far, you have finished training. Now we load the trained model and check the quality of the text it generates.

```bash
-CUDA_VISIBLE_DEVICES=0 python src/inference.py \
+CUDA_VISIBLE_DEVICES=0 python src/entry_point/inference.py \
--model_name_or_path model_name_or_path \
--ckpt_path ckpt_path \
--llama \
@@ -307,7 +307,7 @@ CUDA_VISIBLE_DEVICES=0 python src/inference.py \
We also provide a simple interactive web interface based on gradio. Start the service:

```bash
-CUDA_VISIBLE_DEVICES=0 python src/interface.py \
+CUDA_VISIBLE_DEVICES=0 python src/entry_point/interface.py \
--model_name_or_path model_name_or_path \
--ckpt_path ckpt_path \
--llama \
@@ -334,7 +334,7 @@ bash scripts/run_multi_backend.sh
First, request access to the LLaMA model from [facebookresearch/llama](https://github.com/facebookresearch/llama) and download the official checkpoints

```bash
-python training_scripts/convert_llama_weights_to_hf.py --input_dir download_official_llama_path --model_size 7B --output_dir xx/llama-7b-hf
+python scripts/convert_llama_weights_to_hf.py --input_dir download_official_llama_path --model_size 7B --output_dir xx/llama-7b-hf
```

When running the training script, simply set model_name_or_path to xx/llama-7b-hf
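
For example, a hedged sketch of wiring the converted weights into the launch script (the xx/llama-7b-hf path is the placeholder used above):

```bash
# Edit train/scripts/run_sft.sh (or run_pt.sh) so that:
#   model_name_or_path=xx/llama-7b-hf
# then launch as usual:
bash scripts/run_sft.sh
```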
2 changes: 1 addition & 1 deletion train/scripts/multinode_run.sh
@@ -17,7 +17,7 @@ cutoff_len=1024
master_addr="10.111.112.223"

# #Multi-node
-torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_port 14545 --node_rank ${node_rank} src/entry_point/train.py \
+torchrun --nproc_per_node 8 --nnodes 2 --master_addr ${master_addr} --master_port 14545 --node_rank ${node_rank} src/entry_point/sft_train.py \
--model_name_or_path ${model_name_or_path} \
--llama \
--deepspeed configs/deepspeed_config.json \
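For context, a hedged two-node launch sketch; this workflow is an assumption rather than part of the commit, with node_rank set inside the script on each machine and master_addr left at the value shown above:

```bash
# Assumed workflow: run the same script on both machines, after setting
# node_rank inside multinode_run.sh on each one.
# On the master node (10.111.112.223), with node_rank=0:
bash scripts/multinode_run.sh
# On the second node, with node_rank=1:
bash scripts/multinode_run.sh
```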
6 changes: 3 additions & 3 deletions train/scripts/run.sh → train/scripts/run_pt.sh
@@ -17,7 +17,7 @@ mkdir -p ${cache_dir}
cutoff_len=1024

#FT
-# torchrun --nproc_per_node 8 src/entry_point/train.py \
+# torchrun --nproc_per_node 8 src/entry_point/pt_train.py \
# --ddp_timeout 36000 \
# --model_name_or_path ${model_name_or_path} \
# --llama \
@@ -46,7 +46,7 @@ cutoff_len=1024


#LoRA with 8bit
-# torchrun --nproc_per_node 8 src/entry_point/train.py \
+# torchrun --nproc_per_node 8 src/entry_point/pt_train.py \
# --ddp_timeout 36000 \
# --model_name_or_path ${model_name_or_path} \
# --llama \
@@ -76,7 +76,7 @@ cutoff_len=1024
# # --resume_from_checkpoint ...

# LoRA without 8bit
-torchrun --nproc_per_node 8 src/entry_point/train.py \
+torchrun --nproc_per_node 8 src/entry_point/pt_train.py \
--ddp_timeout 36000 \
--model_name_or_path ${model_name_or_path} \
--llama \
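Since run.sh is renamed to run_pt.sh here, a minimal usage sketch (assuming the paths inside the script have already been edited, as with the SFT variant):

```bash
# Launch continued pre-training with the renamed script from the train/ directory.
bash scripts/run_pt.sh
```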
106 changes: 106 additions & 0 deletions train/scripts/run_sft.sh
@@ -0,0 +1,106 @@
#! /bin/bash
export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
export WANDB_PROJECT=...
export WANDB_RUN_ID=...
export WANDB_RESUME=allow
export ABS_PATH=...
export PYTHONPATH="$ABS_PATH/BELLE/train"
model_name_or_path=/path_to_llm/hf_llama_7b/ # or bloomz-7b1-mt

train_file=belleMath.json
validation_file=belleMath-dev1K.json
output_dir="$ABS_PATH/saved_models/${WANDB_PROJECT}_${WANDB_RUN_ID}"
mkdir -p ${output_dir}

cache_dir=hf_cache_dir
mkdir -p ${cache_dir}
cutoff_len=1024

#FT
# torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
# --ddp_timeout 36000 \
# --model_name_or_path ${model_name_or_path} \
# --llama \
# --deepspeed configs/deepspeed_config.json \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --per_device_train_batch_size 2 \
# --per_device_eval_batch_size 2 \
# --gradient_accumulation_steps 4 \
# --num_train_epochs 2 \
# --model_max_length ${cutoff_len} \
# --save_strategy "steps" \
# --save_total_limit 3 \
# --learning_rate 8e-6 \
# --weight_decay 0.00001 \
# --warmup_ratio 0.05 \
# --lr_scheduler_type "cosine" \
# --logging_steps 10 \
# --evaluation_strategy "steps" \
# --fp16 \
# --seed 1234 \
# --gradient_checkpointing \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# # --resume_from_checkpoint ...


#LoRA with 8bit
# torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
# --ddp_timeout 36000 \
# --model_name_or_path ${model_name_or_path} \
# --llama \
# --use_lora \
# --use_int8_training \
# --lora_config configs/lora_config_llama.json \
# --train_file ${train_file} \
# --validation_file ${validation_file} \
# --per_device_train_batch_size 1 \
# --per_device_eval_batch_size 1 \
# --gradient_accumulation_steps 8 \
# --num_train_epochs 2 \
# --model_max_length ${cutoff_len} \
# --save_strategy "steps" \
# --save_total_limit 3 \
# --learning_rate 8e-6 \
# --weight_decay 0.00001 \
# --warmup_ratio 0.05 \
# --lr_scheduler_type "cosine" \
# --logging_steps 10 \
# --evaluation_strategy "steps" \
# --fp16 \
# --seed 1234 \
# --gradient_checkpointing \
# --cache_dir ${cache_dir} \
# --output_dir ${output_dir} \
# # --resume_from_checkpoint ...

# LoRA without 8bit
torchrun --nproc_per_node 8 src/entry_point/sft_train.py \
--ddp_timeout 36000 \
--model_name_or_path ${model_name_or_path} \
--llama \
--use_lora \
--deepspeed configs/deepspeed_config_stage3.json \
--lora_config configs/lora_config_llama.json \
--train_file ${train_file} \
--validation_file ${validation_file} \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 1 \
--num_train_epochs 10 \
--model_max_length ${cutoff_len} \
--save_strategy "steps" \
--save_total_limit 3 \
--learning_rate 3e-4 \
--weight_decay 0.00001 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 10 \
--evaluation_strategy "steps" \
--fp16 \
--seed 1234 \
--gradient_checkpointing \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
# --resume_from_checkpoint ...
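
A hedged usage sketch for the new script; the WANDB_* and ABS_PATH placeholders above must be filled in first, and launching from train/ is an assumption so that the relative config and data paths resolve:

```bash
# Assumed invocation: fill in the placeholders at the top of run_sft.sh,
# then run it from the train/ directory.
cd $ABS_PATH/BELLE/train
bash scripts/run_sft.sh
```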