update on att clip

yifan1130 · Jan 29, 2024 · a758969 · a758969
1 parent 9ebcb58
commit a758969
Show file tree

Hide file tree

Showing 218 changed files with 129,484 additions and 143 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 results
 models
 wandb
+data
 data/*
 # !data/processed
 output/

diff --git a/eval/gsm/run_eval.py b/eval/gsm/run_eval.py
@@ -106,7 +106,7 @@ def main(args):
                 model=model,
                 tokenizer=tokenizer,
                 prompts=prompts,
-                max_new_tokens=512,
+                max_new_tokens=256, ###Revision
                 batch_size=args.eval_batch_size,
                 stop_id_sequences=[[new_line_token]] if not args.use_chat_format else None,  # we only use stop token for non-chat format (usually applied to vanilla pretrained language models). For chat format, we will rely on the model knows when to stop.
                 do_sample=False,

diff --git a/eval_gsm8k.cmd b/eval_gsm8k.cmd
@@ -0,0 +1,12 @@
+# HTCondor submit description that runs gsm.sh as one vanilla-universe job,
+# pinned to a single named machine and requesting one GPU.
+requirements = (Machine ==  "isye-hpc0457.isye.gatech.edu")
+universe = vanilla
+getenv = true
+executable = gsm.sh
+notify_user = yyu429@gatech.edu
+# Log, stdout and stderr are written per job, keyed by $(Cluster).$(process).
+Log = /home/yyu429/eval_ppl2/$(Cluster).$(process).log
+output = /home/yyu429/eval_ppl2/$(Cluster).$(process).out
+error = /home/yyu429/eval_ppl2/$(Cluster).$(process).error
+# NOTE(review): `notification` is assigned twice; presumably only the last
+# assignment (complete) takes effect -- confirm against the HTCondor manual
+# and drop whichever line is not intended.
+notification = error
+notification = complete
+request_gpus = 1
+queue
diff --git a/finetune_alpaca.sh b/finetune_alpaca.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# LoRA fine-tuning launcher: runs open_instruct/finetune.py on
+# meta-llama/Llama-2-7b-hf with the gpt4_alpaca subset, through accelerate
+# with a DeepSpeed config, on a single GPU.
+# NOTE: the interpreter and library paths below are absolute and specific to
+# one user's environment; adjust them before running elsewhere.
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/home/yyu429/ENTER/envs/s5/lib
+# MODEL_SIZE is only used in the echo message (and in the commented-out merge
+# step at the bottom); the model actually trained is fixed by
+# --model_name_or_path in the launch command.
+MODEL_SIZE=7B
+NUM_GPUS=1
+BATCH_SIZE_PER_GPU=4
+TOTAL_BATCH_SIZE=16
+lr=5e-5
+seq_len=1024
+# Extra settings forwarded to finetune.py via --max_value, --max_value_final,
+# --num_token, --init_warmup and --final_warmup; several of them are also
+# embedded in the output directory name.
+max_value=0.4
+max_value_final=0.05
+num_token=8
+init_warmup=500
+final_warmup=1000
+# Integer (truncating) shell arithmetic: 16 / 1 / 4 = 4 with the values above.
+GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
+echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"
+
+# Lora training
+# The launcher module is invoked through an absolute-path interpreter; the
+# plain `accelerate launch` form is kept commented out directly below.
+
+#accelerate launch \
+/home/yyu429/ENTER/envs/s5/bin/python3 -m accelerate.commands.launch \
+    --mixed_precision bf16 \
+    --num_machines 1 \
+    --num_processes $NUM_GPUS \
+    --use_deepspeed \
+    --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
+    open_instruct/finetune.py \
+    --model_name_or_path meta-llama/Llama-2-7b-hf \
+    --use_lora \
+    --lora_rank 64 \
+    --lora_alpha 16 \
+    --lora_dropout 0.1 \
+    --tokenizer_name meta-llama/Llama-2-7b-hf \
+    --use_slow_tokenizer \
+    --train_file data/processed/tulu_v1/gpt4_alpaca_subset/gpt4_alpaca_data.jsonl \
+    --max_seq_length $seq_len \
+    --preprocessing_num_workers 16 \
+    --checkpointing_steps 1000 \
+    --per_device_train_batch_size $BATCH_SIZE_PER_GPU \
+    --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
+    --learning_rate $lr \
+    --lr_scheduler_type linear \
+    --warmup_ratio 0.03 \
+    --weight_decay 0. \
+    --num_train_epochs 2 \
+    --output_dir output/llama2-7b_lr${lr}_seq_len${seq_len}_bsz${TOTAL_BATCH_SIZE}_initwp${init_warmup}_finalwp${final_warmup}_maxvalue${max_value}_final${max_value_final} \
+    --with_tracking \
+    --report_to tensorboard \
+    --logging_steps 1 \
+    --max_value $max_value \
+    --num_token $num_token \
+    --max_value_final $max_value_final \
+    --init_warmup $init_warmup \
+    --final_warmup $final_warmup
+
+# Disabled follow-up step: merging the LoRA weights into the base model.
+# NOTE(review): the paths refer to a tulu_v2 output directory, not the
+# --output_dir used above -- update them before re-enabling.
+#python open_instruct/merge_lora.py \
+#    --base_model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \
+#    --lora_model_name_or_path output/tulu_v2_${MODEL_SIZE}_lora/ \
+#    --output_dir output/tulu_v2_${MODEL_SIZE}_lora_merged/ \
+#    --save_tokenizer
diff --git a/ft_alpaca.cmd b/ft_alpaca.cmd
@@ -0,0 +1,12 @@
+# HTCondor submit description that runs finetune_alpaca.sh as one
+# vanilla-universe job, pinned to a single named machine and requesting one GPU.
+requirements = (Machine ==  "isye-hpc0457.isye.gatech.edu")
+universe = vanilla
+getenv = true
+executable = finetune_alpaca.sh
+notify_user = yyu429@gatech.edu
+# Log, stdout and stderr are written per job, keyed by $(Cluster).$(process).
+Log = /home/yyu429/eval_ppl2/$(Cluster).$(process).log
+output = /home/yyu429/eval_ppl2/$(Cluster).$(process).out
+error = /home/yyu429/eval_ppl2/$(Cluster).$(process).error
+# NOTE(review): `notification` is assigned twice; presumably only the last
+# assignment (complete) takes effect -- confirm against the HTCondor manual
+# and drop whichever line is not intended.
+notification = error
+notification = complete
+request_gpus = 1
+queue
diff --git a/gsm.sh b/gsm.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Runs the GSM evaluation module (eval.gsm.run_eval) on
+# meta-llama/Llama-2-7b-hf with 8 in-context examples, capped at 200
+# evaluation examples, writing results under results/gsm/llama-7B-cot-8shot.
+# NOTE: the interpreter and library paths below are absolute and specific to
+# one user's environment; adjust them before running elsewhere.
+# Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation.
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/home/yyu429/ENTER/envs/s5/lib
+/home/yyu429/ENTER/envs/s5/bin/python3 -m eval.gsm.run_eval \
+    --data_dir data/eval/gsm/ \
+    --save_dir results/gsm/llama-7B-cot-8shot \
+    --model meta-llama/Llama-2-7b-hf \
+    --tokenizer meta-llama/Llama-2-7b-hf \
+    --n_shot 8 \
+    --use_slow_tokenizer \
+    --max_num_examples 200
diff --git a/open_instruct/finetune.py b/open_instruct/finetune.py
@@ -34,6 +34,26 @@
 from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
 
 logger = get_logger(__name__)
+class Scheduler:
+    """Step-indexed schedule that moves a scalar from ``max_value`` to ``max_value_final``.
+
+    The value is held at ``max_value`` while ``step <= init_warmup``, held at
+    ``max_value_final`` once ``step >= total_steps - final_warmup``, and
+    interpolated in between according to ``schedule``.
+
+    Args:
+        init_warmup: Last step (inclusive) at which ``max_value`` is returned.
+        final_warmup: Number of trailing steps that return ``max_value_final``.
+        total_steps: Total number of steps the schedule spans.
+        max_value: Value returned during the initial phase.
+        max_value_final: Value returned during the final phase.
+        schedule: Interpolation rule for the middle phase. Only ``'linear'``
+            is implemented.
+    """
+
+    def __init__(self, init_warmup, final_warmup, total_steps, max_value, max_value_final, schedule='linear'):
+        self.init_warmup = init_warmup
+        self.final_warmup = final_warmup
+        self.total_steps = total_steps
+        self.max_value = max_value
+        self.max_value_final = max_value_final
+        self.schedule = schedule
+
+    def calculate_schedule(self, step):
+        """Return the scheduled value for ``step``.
+
+        Raises:
+            ValueError: If ``step`` falls in the middle phase and
+                ``self.schedule`` is not a supported rule. Previously this
+                case silently returned ``None``.
+        """
+        if step <= self.init_warmup:
+            return self.max_value
+        if step >= self.total_steps - self.final_warmup:
+            return self.max_value_final
+        if self.schedule != 'linear':
+            raise ValueError(f"Unsupported schedule: {self.schedule!r}")
+        # Reaching here implies init_warmup < step < total_steps - final_warmup,
+        # so the span below is strictly positive and the division is safe.
+        decay_span = self.total_steps - self.init_warmup - self.final_warmup
+        return self.max_value - (self.max_value - self.max_value_final) * (step - self.init_warmup) / decay_span
+
 
 
 def parse_args():
@@ -223,6 +243,23 @@ def parse_args():
         action='store_true',
         help='Use 8bit optimizer from bitsandbytes. Not compatible with deepspeed (use deepspeed config instead).',
     )
+    parser.add_argument("--num_token",
+                        default=1,
+                        type=int)
+
+    parser.add_argument("--max_value",
+                        default=1.0,
+                        type=float)
+    parser.add_argument("--max_value_final",
+                        default=0.1,
+                        type=float)
+    parser.add_argument("--init_warmup",
+                        default=1000,
+                        type=float)
+    parser.add_argument("--final_warmup",
+                        default=1000,
+                        type=float)
+
     args = parser.parse_args()
 
     # Sanity checks
@@ -395,9 +432,14 @@ def main():
 
     # Load pretrained model and tokenizer
     if args.config_name:
-        config = AutoConfig.from_pretrained(args.config_name)
+        config = AutoConfig.from_pretrained(args.config_name, )
     elif args.model_name_or_path:
-        config = AutoConfig.from_pretrained(args.model_name_or_path)
+        # config = AutoConfig.from_pretrained(args.model_name_or_path)
+        ###Revision
+        config = transformers.AutoConfig.from_pretrained(
+            args.model_name_or_path, token="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", num_token=args.num_token,
+            max_value=args.max_value, _attn_implementation="eager"
+        )
     else:
         raise ValueError(
             "You are instantiating a new config instance from scratch. This is not supported by this script."
@@ -487,6 +529,9 @@ def main():
         model.print_trainable_parameters()
 
     # Preprocessing the datasets.
+    ###Revision
+    print(raw_datasets["train"])
+    print(f"max length is {args.max_seq_length}")
     if "prompt" in raw_datasets["train"].column_names and "completion" in raw_datasets["train"].column_names:
         encode_function = partial(
             encode_with_prompt_completion_format,
@@ -614,6 +659,8 @@ def main():
     completed_steps = 0
     starting_epoch = 0
 
+    ###Revision
+    clip_scheduler = Scheduler(init_warmup=args.init_warmup, final_warmup=args.final_warmup, total_steps=args.max_train_steps, max_value=args.max_value, max_value_final=args.max_value_final)
     # Potentially load in the weights and states from a previous save
     if args.resume_from_checkpoint:
         if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
@@ -666,6 +713,12 @@ def main():
         else:
             active_dataloader = train_dataloader
         for step, batch in enumerate(active_dataloader):
+            ###Revision
+            threshold = clip_scheduler.calculate_schedule(step)
+            for layer in model.module.model.model.layers:
+                layer.self_attn.max_value = threshold
+                layer.self_attn.clipping = True
+
             with accelerator.accumulate(model):
                 outputs = model(**batch, use_cache=False)                
                 loss = outputs.loss

diff --git a/open_instruct/reformat_datasets.py b/open_instruct/reformat_datasets.py
@@ -235,6 +235,7 @@ def convert_stanford_alpaca_data(data_dir, output_dir, num_examples=None):
     output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl")
     with open(output_path, "w") as fout:
         for idx, example in enumerate(examples):
+            print(example)
             encoded_example = encode_instruction_example(
                 instruction=example["instruction"], 
                 input=example["input"], 
@@ -786,4 +787,11 @@ def should_be_filtered(example):
                                 fout.write(line)
         else:
             print(f"Processing {dataset} data with default configurations...")
-            globals()[f"convert_{dataset}_data"](os.path.join(args.raw_data_dir, dataset), os.path.join(args.output_dir, dataset))
+            # globals()[f"convert_{dataset}_data"](os.path.join(args.raw_data_dir, dataset), os.path.join(args.output_dir, dataset))
+            convert_gpt4_alpaca_data(
+                data_dir=os.path.join(args.raw_data_dir, "gpt4_alpaca"),
+                output_dir=os.path.join(args.output_dir, "tulu_v1", "gpt4_alpaca_subset"),
+                load_en=True,
+                load_zh=False,
+                num_examples=None
+            )
-Original file line number
+Diff line change
@@ -1,6 +1,7 @@
     results
     models
     wandb
+    data
     data/*
     # !data/processed
     output/
@@ Expand Down @@