Commit

update on att clip
Yu committed Jan 29, 2024
1 parent 9ebcb58 commit a758969
Showing 218 changed files with 129,484 additions and 143 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
results
models
wandb
data
data/*
# !data/processed
output/
2 changes: 1 addition & 1 deletion eval/gsm/run_eval.py
@@ -106,7 +106,7 @@ def main(args):
model=model,
tokenizer=tokenizer,
prompts=prompts,
max_new_tokens=512,
max_new_tokens=256, ###Revision
batch_size=args.eval_batch_size,
stop_id_sequences=[[new_line_token]] if not args.use_chat_format else None, # we only use a stop token for the non-chat format (usually vanilla pretrained language models); for chat format, we rely on the model knowing when to stop.
do_sample=False,
12 changes: 12 additions & 0 deletions eval_gsm8k.cmd
@@ -0,0 +1,12 @@
requirements = (Machine == "isye-hpc0457.isye.gatech.edu")
universe = vanilla
getenv = true
executable = gsm.sh
notify_user = yyu429@gatech.edu
Log = /home/yyu429/eval_ppl2/$(Cluster).$(process).log
output = /home/yyu429/eval_ppl2/$(Cluster).$(process).out
error = /home/yyu429/eval_ppl2/$(Cluster).$(process).error
notification = error
notification = complete
request_gpus = 1
queue
60 changes: 60 additions & 0 deletions finetune_alpaca.sh
@@ -0,0 +1,60 @@
#!/usr/bin/env bash
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/home/yyu429/ENTER/envs/s5/lib
MODEL_SIZE=7B
NUM_GPUS=1
BATCH_SIZE_PER_GPU=4
TOTAL_BATCH_SIZE=16
lr=5e-5
seq_len=1024
max_value=0.4
max_value_final=0.05
num_token=8
init_warmup=500
final_warmup=1000
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_GPUS/$BATCH_SIZE_PER_GPU))
echo "Training llama model ${MODEL_SIZE} using $NUM_GPUS GPUs, $BATCH_SIZE_PER_GPU batch size per GPU, $GRADIENT_ACC_STEPS gradient accumulation steps"

# Lora training

#accelerate launch \
/home/yyu429/ENTER/envs/s5/bin/python3 -m accelerate.commands.launch \
--mixed_precision bf16 \
--num_machines 1 \
--num_processes $NUM_GPUS \
--use_deepspeed \
--deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf \
open_instruct/finetune.py \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--use_lora \
--lora_rank 64 \
--lora_alpha 16 \
--lora_dropout 0.1 \
--tokenizer_name meta-llama/Llama-2-7b-hf \
--use_slow_tokenizer \
--train_file data/processed/tulu_v1/gpt4_alpaca_subset/gpt4_alpaca_data.jsonl \
--max_seq_length $seq_len \
--preprocessing_num_workers 16 \
--checkpointing_steps 1000 \
--per_device_train_batch_size $BATCH_SIZE_PER_GPU \
--gradient_accumulation_steps $GRADIENT_ACC_STEPS \
--learning_rate $lr \
--lr_scheduler_type linear \
--warmup_ratio 0.03 \
--weight_decay 0. \
--num_train_epochs 2 \
--output_dir output/llama2-7b_lr${lr}_seq_len${seq_len}_bsz${TOTAL_BATCH_SIZE}_initwp${init_warmup}_finalwp${final_warmup}_maxvalue${max_value}_final${max_value_final} \
--with_tracking \
--report_to tensorboard \
--logging_steps 1 \
--max_value $max_value \
--num_token $num_token \
--max_value_final $max_value_final \
--init_warmup $init_warmup \
--final_warmup $final_warmup

#python open_instruct/merge_lora.py \
# --base_model_name_or_path ../hf_llama2_models/${MODEL_SIZE} \
# --lora_model_name_or_path output/tulu_v2_${MODEL_SIZE}_lora/ \
# --output_dir output/tulu_v2_${MODEL_SIZE}_lora_merged/ \
# --save_tokenizer
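With the values set at the top of this script (TOTAL_BATCH_SIZE=16, NUM_GPUS=1, BATCH_SIZE_PER_GPU=4), the GRADIENT_ACC_STEPS expression resolves to 4, so accumulation recovers the intended effective batch size of 16 on a single GPU. A quick standalone check of that arithmetic (a sketch, not part of the commit):

# Mirrors the GRADIENT_ACC_STEPS arithmetic in finetune_alpaca.sh.
total_batch_size = 16
num_gpus = 1
batch_size_per_gpu = 4

grad_acc_steps = total_batch_size // (num_gpus * batch_size_per_gpu)
effective_batch_size = grad_acc_steps * num_gpus * batch_size_per_gpu

print(grad_acc_steps)        # 4
print(effective_batch_size)  # 16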
12 changes: 12 additions & 0 deletions ft_alpaca.cmd
@@ -0,0 +1,12 @@
requirements = (Machine == "isye-hpc0457.isye.gatech.edu")
universe = vanilla
getenv = true
executable = finetune_alpaca.sh
notify_user = yyu429@gatech.edu
Log = /home/yyu429/eval_ppl2/$(Cluster).$(process).log
output = /home/yyu429/eval_ppl2/$(Cluster).$(process).out
error = /home/yyu429/eval_ppl2/$(Cluster).$(process).error
notification = error
notification = complete
request_gpus = 1
queue
12 changes: 12 additions & 0 deletions gsm.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Here we use 1 GPU for demonstration, but you can use multiple GPUs and larger eval_batch_size to speed up the evaluation.
export CUDA_VISIBLE_DEVICES=0
export LD_LIBRARY_PATH=/home/yyu429/ENTER/envs/s5/lib
/home/yyu429/ENTER/envs/s5/bin/python3 -m eval.gsm.run_eval \
--data_dir data/eval/gsm/ \
--save_dir results/gsm/llama-7B-cot-8shot \
--model meta-llama/Llama-2-7b-hf \
--tokenizer meta-llama/Llama-2-7b-hf \
--n_shot 8 \
--use_slow_tokenizer \
--max_num_examples 200
57 changes: 55 additions & 2 deletions open_instruct/finetune.py
@@ -34,6 +34,26 @@
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

logger = get_logger(__name__)
class Scheduler:
def __init__(self,init_warmup,final_warmup,total_steps, max_value, max_value_final,schedule='linear'):
self.init_warmup = init_warmup
self.final_warmup = final_warmup
self.total_steps = total_steps
self.max_value = max_value
self.max_value_final=max_value_final
self.schedule = schedule

def calculate_schedule(self, step):
if step<=self.init_warmup:
return self.max_value
elif step>=self.total_steps-self.final_warmup:
return self.max_value_final
else:
if self.schedule == 'linear':
return self.max_value - (self.max_value-self.max_value_final) * (step-self.init_warmup) / (self.total_steps-self.init_warmup-self.final_warmup)
else:
return



def parse_args():
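The Scheduler added above anneals the attention-clipping threshold: it holds max_value for the first init_warmup steps, holds max_value_final for the last final_warmup steps, and decays linearly in between. A minimal standalone sketch of the resulting schedule, using the hyperparameters from finetune_alpaca.sh (max_value=0.4, max_value_final=0.05, init_warmup=500, final_warmup=1000) and an assumed total of 3000 training steps:

def clip_threshold(step, init_warmup=500, final_warmup=1000, total_steps=3000,
                   max_value=0.4, max_value_final=0.05):
    # Mirrors Scheduler.calculate_schedule with the 'linear' schedule.
    if step <= init_warmup:
        return max_value                       # hold at the initial threshold
    if step >= total_steps - final_warmup:
        return max_value_final                 # hold at the final threshold
    decay_span = total_steps - init_warmup - final_warmup
    return max_value - (max_value - max_value_final) * (step - init_warmup) / decay_span

for s in (0, 500, 1000, 1500, 2000, 2500):
    print(s, round(clip_threshold(s), 3))      # 0.4, 0.4, 0.283, 0.167, 0.05, 0.05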
@@ -223,6 +243,23 @@ def parse_args():
action='store_true',
help='Use 8bit optimizer from bitsandbytes. Not compatible with deepspeed (use deepspeed config instead).',
)
parser.add_argument("--num_token",
default=1,
type=int)

parser.add_argument("--max_value",
default=1.0,
type=float)
parser.add_argument("--max_value_final",
default=0.1,
type=float)
parser.add_argument("--init_warmup",
default=1000,
type=float)
parser.add_argument("--final_warmup",
default=1000,
type=float)

args = parser.parse_args()

# Sanity checks
@@ -395,9 +432,14 @@ def main():

# Load pretrained model and tokenizer
if args.config_name:
config = AutoConfig.from_pretrained(args.config_name)
config = AutoConfig.from_pretrained(args.config_name, )
elif args.model_name_or_path:
config = AutoConfig.from_pretrained(args.model_name_or_path)
# config = AutoConfig.from_pretrained(args.model_name_or_path)
###Revision
config = transformers.AutoConfig.from_pretrained(
args.model_name_or_path, token="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", num_token=args.num_token,
max_value=args.max_value, _attn_implementation="eager"
)
else:
raise ValueError(
"You are instantiating a new config instance from scratch. This is not supported by this script."
@@ -487,6 +529,9 @@ def main():
model.print_trainable_parameters()

# Preprocessing the datasets.
###Revision
print(raw_datasets["train"])
print(f"max length is {args.max_seq_length}")
if "prompt" in raw_datasets["train"].column_names and "completion" in raw_datasets["train"].column_names:
encode_function = partial(
encode_with_prompt_completion_format,
@@ -614,6 +659,8 @@ def main():
completed_steps = 0
starting_epoch = 0

###Revision
clip_scheduler = Scheduler(init_warmup=args.init_warmup, final_warmup=args.final_warmup, total_steps=args.max_train_steps, max_value=args.max_value, max_value_final=args.max_value_final)
# Potentially load in the weights and states from a previous save
if args.resume_from_checkpoint:
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
@@ -666,6 +713,12 @@ def main():
else:
active_dataloader = train_dataloader
for step, batch in enumerate(active_dataloader):
###Revision
threshold = clip_scheduler.calculate_schedule(step)
for layer in model.module.model.model.layers:
layer.self_attn.max_value = threshold
layer.self_attn.clipping = True

with accelerator.accumulate(model):
outputs = model(**batch, use_cache=False)
loss = outputs.loss
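At each training step the loop above pushes the scheduled threshold into every decoder layer's self_attn module (max_value plus a clipping flag). The modified attention implementation itself is not included in this excerpt, so the following is only a hypothetical illustration of one way such a cap could act, clamping post-softmax attention probabilities at max_value and renormalizing; the function name and behavior are assumptions, not the repository's actual code:

import torch

def cap_attention_probs(attn_probs: torch.Tensor, max_value: float, clipping: bool) -> torch.Tensor:
    # Hypothetical sketch: if clipping is enabled, cap each attention probability
    # at max_value and renormalize each row so it still sums to 1.
    if not clipping:
        return attn_probs
    capped = attn_probs.clamp(max=max_value)
    return capped / capped.sum(dim=-1, keepdim=True)

# Example: a row of attention weights dominated by one token.
probs = torch.tensor([[0.7, 0.2, 0.1]])
print(cap_attention_probs(probs, max_value=0.4, clipping=True))  # ~[[0.571, 0.286, 0.143]]

Because the scheduler output shrinks from 0.4 toward 0.05 over training (per finetune_alpaca.sh), whatever cap the attention module applies is tightened as training progresses.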
10 changes: 9 additions & 1 deletion open_instruct/reformat_datasets.py
@@ -235,6 +235,7 @@ def convert_stanford_alpaca_data(data_dir, output_dir, num_examples=None):
output_path = os.path.join(output_dir, "stanford_alpaca_data.jsonl")
with open(output_path, "w") as fout:
for idx, example in enumerate(examples):
print(example)
encoded_example = encode_instruction_example(
instruction=example["instruction"],
input=example["input"],
@@ -786,4 +787,11 @@ def should_be_filtered(example):
fout.write(line)
else:
print(f"Processing {dataset} data with default configurations...")
globals()[f"convert_{dataset}_data"](os.path.join(args.raw_data_dir, dataset), os.path.join(args.output_dir, dataset))
# globals()[f"convert_{dataset}_data"](os.path.join(args.raw_data_dir, dataset), os.path.join(args.output_dir, dataset))
convert_gpt4_alpaca_data(
data_dir=os.path.join(args.raw_data_dir, "gpt4_alpaca"),
output_dir=os.path.join(args.output_dir, "tulu_v1", "gpt4_alpaca_subset"),
load_en=True,
load_zh=False,
num_examples=None
)