Fix several issues in the evals. (allenai#79)
* Merge LoRA on CPU to avoid GPU memory OOM.

* Update requirements.

* Change the alpaca_eval annotator to `alpaca_eval_gpt4`.

* Make TyDiQA random example selection deterministic.

* Cache OpenAI requests for TruthfulQA.

* Add templates for other models.

* More evaluation experiments.

* Avoid loading the entire leaderboard.
yizhongw authored Nov 17, 2023
1 parent b555637 commit 4a2e9dd
Showing 8 changed files with 126 additions and 30 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,5 +1,5 @@
# This dockerfile is forked from ai2/cuda11.8-cudnn8-dev-ubuntu20.04
FROM gcr.io/ai2-beaker-core/public/ckio6ct4g24tgnrtk3og:latest
FROM gcr.io/ai2-beaker-core/public/cl5erg1ebj67821o3200:latest

RUN apt update && apt install -y openjdk-8-jre-headless

@@ -10,7 +10,7 @@ WORKDIR /stage/

COPY requirements.txt .
RUN pip install --upgrade pip setuptools wheel
RUN pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
RUN pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
RUN pip install packaging
RUN pip install flash-attn==2.2.2 --no-build-isolation
RUN pip install -r requirements.txt
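For reference, torchvision 0.15.2 and torchaudio 2.0.2 are the releases that pair with torch 2.0.1, so pinning them keeps pip from pulling a newer torch off the cu118 index. A quick sanity check one could run inside the built image (a sketch, not part of the Dockerfile):

```python
# Minimal sketch: confirm the pinned torch / torchvision / torchaudio trio resolved
# together and that the cu118 build can see a GPU.
import torch
import torchaudio
import torchvision

print("torch:", torch.__version__)              # expected: 2.0.1+cu118
print("torchvision:", torchvision.__version__)  # expected: 0.15.2+cu118
print("torchaudio:", torchaudio.__version__)    # expected: 2.0.2+cu118
print("CUDA build:", torch.version.cuda, "| CUDA available:", torch.cuda.is_available())
```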
8 changes: 6 additions & 2 deletions eval/alpaca_farm/run_eval.py
@@ -82,18 +82,22 @@ def main(args):
df_leaderboard, annotations = alpaca_farm_evaluate(
model_outputs=model_results,
reference_outputs=args.reference_path,
annotators_config="alpaca_eval_gpt4_0314",
annotators_config="alpaca_eval_gpt4",
output_path=args.save_dir,
is_return_instead_of_print=True,
caching_path=os.path.join(args.save_dir, "alpaca_eval_annotator_cache.json"),
precomputed_leaderboard=None,
is_cache_leaderboard=False
)
else:
df_leaderboard, annotations = alpaca_farm_evaluate(
model_outputs=model_results,
annotators_config="alpaca_eval_gpt4_0314",
annotators_config="alpaca_eval_gpt4",
output_path=args.save_dir,
is_return_instead_of_print=True,
caching_path=os.path.join(args.save_dir, "alpaca_eval_annotator_cache.json"),
precomputed_leaderboard=None,
is_cache_leaderboard=False
)

print(df_leaderboard.to_string(float_format="%.2f"))
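Both call sites now pass `precomputed_leaderboard=None` and `is_cache_leaderboard=False`, so `alpaca_farm_evaluate` only scores this model instead of loading and re-caching the full public leaderboard, while `caching_path` keeps the GPT-4 annotator judgments so a rerun does not re-query the API. A hypothetical post-run check on that cache; its internal layout belongs to `alpaca_eval` and is not shown in this diff, so only its size is reported:

```python
import json
import os

# save_dir below is a placeholder; in the eval script it comes from args.save_dir.
save_dir = "results/alpaca_eval"
cache_file = os.path.join(save_dir, "alpaca_eval_annotator_cache.json")

if os.path.exists(cache_file):
    with open(cache_file) as f:
        cached = json.load(f)
    print(f"{len(cached)} cached annotator records in {cache_file}")
else:
    print("No annotator cache yet; the first run will create it.")
```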
50 changes: 50 additions & 0 deletions eval/templates.py
@@ -46,3 +46,53 @@ def create_prompt_with_llama2_chat_format(messages, bos="<s>", eos="</s>", add_b
# The next line removes the bos token if add_bos is False.
formatted_text = formatted_text[len(bos):] if not add_bos else formatted_text
return formatted_text


def create_prompt_with_xwin_chat_format(messages, bos="<s>", eos="</s>", add_bos=True):
'''
This function is adapted from the official xwin chat completion script:
https://huggingface.co/Xwin-LM/Xwin-LM-70B-V0.1
'''
formatted_text = "A chat between a curious user and an artificial intelligence assistant. "
formatted_text += "The assistant gives helpful, detailed, and polite answers to the user's questions. "
for message in messages:
if message["role"] == "user":
formatted_text += "USER: " + message["content"] + " "
elif message["role"] == "assistant":
formatted_text += "ASSISTANT: " + message["content"] + eos
formatted_text += "ASSISTANT:"
return formatted_text
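As a quick usage check of the template above (the question is only an example), a single-turn conversation renders to a Vicuna-style prompt ending in an open `ASSISTANT:` turn:

```python
from eval.templates import create_prompt_with_xwin_chat_format

messages = [{"role": "user", "content": "What is the capital of France?"}]
print(create_prompt_with_xwin_chat_format(messages))
# Output (a single line, wrapped here for readability):
# A chat between a curious user and an artificial intelligence assistant. The assistant
# gives helpful, detailed, and polite answers to the user's questions. USER: What is the
# capital of France? ASSISTANT:
```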


def create_prompt_with_zephyr_chat_format(messages, bos="<s>", eos="</s>", add_bos=True):
'''
This function is adapted from the official zephyr chat completion script:
https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
'''
formatted_text = ""
# if messages[0]["role"] != "system":
# messages = [{
# "role": "system",
# "content": ""
# }] + messages

for message in messages:
if message["role"] == "system":
formatted_text += "<|system|>\n" + message["content"] + "\n"
elif message["role"] == "user":
formatted_text += "<|user|>\n" + message["content"] + "\n"
elif message["role"] == "assistant":
formatted_text += "<|assistant|>\n" + message["content"] + eos + "\n"
else:
raise ValueError(
"Zephyr chat template only supports 'system', 'user' and 'assistant' roles. Invalid role: {}.".format(message["role"])
)
formatted_text += "<|assistant|>\n"
return formatted_text
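Likewise for the Zephyr template, which wraps each turn in `<|system|>` / `<|user|>` / `<|assistant|>` markers and appends `eos` only after completed assistant turns (the conversation below is just an example):

```python
from eval.templates import create_prompt_with_zephyr_chat_format

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]
print(create_prompt_with_zephyr_chat_format(messages))
# <|system|>
# You are a helpful assistant.
# <|user|>
# What is 2 + 2?
# <|assistant|>
```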







33 changes: 25 additions & 8 deletions eval/truthfulqa/run_eval.py
@@ -38,7 +38,7 @@ def trim_answer(answer):
return answer


def run_chatgpt(questions, engine, tag, preset='qa', verbose=False):
def run_chatgpt(questions, engine, tag, preset='qa', batch_size=1, cache_path=None, verbose=False):

"""Stores answers from ChatGPT / GPT4 models (requires an API key)"""

@@ -52,15 +52,21 @@ def run_chatgpt(questions, engine, tag, preset='qa', verbose=False):
{"prompt": format_prompt(questions.loc[idx], preset, format='general'), "id": idx} for idx in questions.index
]

responses = query_openai_chat_model(engine=engine, instances=instances, temperature=0.0)
responses = query_openai_chat_model(
engine=engine,
output_path=cache_path,
instances=instances,
batch_size=batch_size,
temperature=0.0
)
assert len(responses) == len(instances)

for idx, response in zip(questions.index, responses):
questions.loc[idx, tag] = trim_answer(response["output"])
return questions


def run_gpt3(questions, engine, tag, preset='qa', verbose=False):
def run_gpt3(questions, engine, tag, preset='qa', batch_size=1, cache_path=None, verbose=False):
"""Stores answers from GPT-3 models (requires an API key)"""

if tag not in questions.columns:
@@ -73,7 +79,15 @@ def run_gpt3(questions, engine, tag, preset='qa', verbose=False):
{"prompt": format_prompt(questions.loc[idx], preset, format='general'), "id": idx} for idx in questions.index
]

responses = query_openai_model(engine=engine, instances=instances, temperature=0.0, stop=None if preset == 'long' else '\n\n', max_tokens=50)
responses = query_openai_model(
engine=engine,
instances=instances,
output_path=cache_path,
batch_size=batch_size,
temperature=0.0,
stop=None if preset == 'long' else '\n\n',
max_tokens=50
)
assert len(responses) == len(instances)

for idx, response in zip(questions.index, responses):
Expand All @@ -82,7 +96,7 @@ def run_gpt3(questions, engine, tag, preset='qa', verbose=False):
return questions


def run_gpt3_mc(questions, engine, tag, preset='qa', verbose=False):
def run_gpt3_mc(questions, engine, tag, preset='qa', batch_size=1, cache_path=None, verbose=False):
"""Runs multiple-choice metrics for GPT-3 models (requires an API key)"""

set_columns(tag, questions)
@@ -114,6 +128,8 @@ def run_gpt3_mc(questions, engine, tag, preset='qa', verbose=False):
responses = query_openai_model(
engine=engine,
instances=instances,
output_path=cache_path,
batch_size=batch_size,
temperature=0.0,
stop=["\n\n"],
max_tokens=0,
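The `max_tokens=0` here is the usual trick for scoring rather than generating: with the legacy completions API, the prompt's own token log-probabilities can be returned and summed over a candidate answer. The call below sketches that pattern; the `echo` and `logprobs` arguments are assumptions, since they fall outside the lines shown in this diff.

```python
import openai  # assumes the pre-1.0 client pinned in requirements.txt (openai<=0.28.1)

openai.api_key = "sk-..."  # placeholder

response = openai.Completion.create(
    engine="davinci",
    prompt="Q: What is the capital of France?\nA: Paris",
    max_tokens=0,     # generate nothing...
    echo=True,        # ...but echo the prompt back
    logprobs=1,       # ...with per-token log-probabilities attached
    temperature=0.0,
)
token_logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
# Summing the logprobs over the answer span gives a score for that candidate choice.
```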
@@ -277,18 +293,19 @@ def main(args):
run_hf_model_mc(questions, model, tokenizer, tag=args.model_name_or_path, batch_size=args.eval_batch_size, preset=args.preset)
elif args.openai_engine:
# gpt-3 language models
cache_path = os.path.join(args.save_dir, "openai_query_cache.jsonl")
if args.openai_engine in ['ada', 'babbage', 'curie', 'davinci', 'text-davinci-003', 'text-davinci-002', 'code-davinci-002']:
if "judge" in args.metrics or "info" in args.metrics:
print("Running generations")
run_gpt3(questions, args.openai_engine, args.openai_engine, args.preset)
run_gpt3(questions, args.openai_engine, args.openai_engine, cache_path=cache_path, batch_size=args.eval_batch_size, preset=args.preset)
if 'mc' in args.metrics:
print("Running multiple-choice classification!")
run_gpt3_mc(questions, args.openai_engine, args.openai_engine, preset=args.preset)
run_gpt3_mc(questions, args.openai_engine, args.openai_engine, cache_path=cache_path, batch_size=args.eval_batch_size, preset=args.preset)
# other openai engines
else:
if "judge" in args.metrics or "info" in args.metrics:
print("Running generations")
run_chatgpt(questions, args.openai_engine, args.openai_engine, args.preset)
run_chatgpt(questions, args.openai_engine, args.openai_engine, cache_path=cache_path, batch_size=args.eval_batch_size, preset=args.preset)
if "mc" in args.metrics:
raise ValueError("OpenAI Chat engines does not support MC metrics.")

2 changes: 1 addition & 1 deletion eval/tydiqa/run_eval.py
@@ -58,7 +58,7 @@ def main(args):
"answers": qa["answers"]
}
test_data.append(example)
data_languages = set([example["lang"] for example in test_data])
data_languages = sorted(list(set([example["lang"] for example in test_data])))
if args.max_num_examples_per_lang:
sampled_examples = []
for lang in data_languages:
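Sorting the language set is what actually makes the sampling reproducible: a plain `set` of strings iterates in an order that changes between runs (hash randomization), so even with a fixed seed the per-language `random.sample` calls would consume the RNG in a different order each time. A minimal sketch with toy data standing in for the real TyDiQA test set:

```python
import random

# Toy stand-in for test_data; real entries also carry question/context/answers.
test_data = [{"id": i, "lang": lang} for i, lang in enumerate(["arabic", "bengali", "english"] * 4)]

random.seed(42)
data_languages = sorted(set(example["lang"] for example in test_data))  # fixed iteration order

sampled_examples = []
for lang in data_languages:
    examples_for_lang = [example for example in test_data if example["lang"] == lang]
    sampled_examples += random.sample(examples_for_lang, k=2)

print([example["id"] for example in sampled_examples])  # identical on every run
```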
3 changes: 2 additions & 1 deletion open_instruct/merge_lora.py
@@ -67,7 +67,8 @@ def parse_args():
quantization_config=quantization_config,
device_map={"": 0} if torch.cuda.is_available() else None,
)
base_model = dequantize_model(base_model, device=base_model.device)
# base_model = dequantize_model(base_model, device=base_model.device)
base_model = dequantize_model(base_model, device="cpu")
else:
base_model = AutoModelForCausalLM.from_pretrained(
args.base_model_name_or_path if args.base_model_name_or_path else peft_config.base_model_name_or_path,
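Dequantizing a large base model on the GPU easily runs out of memory, which is why the QLoRA path above now calls `dequantize_model` with `device="cpu"` before merging. For the non-quantized branch just above, the same keep-it-on-CPU idea with stock `transformers` + `peft` APIs would look roughly like this sketch (model and adapter paths are hypothetical):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the base model entirely in CPU RAM to avoid GPU OOM during the merge.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,
    device_map={"": "cpu"},
)
model = PeftModel.from_pretrained(base_model, "path/to/lora_adapter")
merged = model.merge_and_unload()  # fold the LoRA deltas into the base weights on CPU
merged.save_pretrained("path/to/merged_model")
```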
4 changes: 2 additions & 2 deletions requirements.txt
@@ -17,12 +17,12 @@ protobuf
# But this PR is not compatible with the latest version of Transformers library (v4.34.0).
# To incorporate it, we forked the Transformers library and made some changes to make it compatible with the latest version.
git+https://github.com/yizhongw/transformers.git@left_padding
openai
openai<=0.28.1
tiktoken
rouge_score
tensorboard
wandb
gradio
gradio==3.50.2
termcolor
jsonlines
unidic-lite
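The `openai<=0.28.1` pin matters because the 1.0 release of the client removed the module-level API that 2023-era eval code typically relies on; `gradio==3.50.2` similarly pins a late 3.x release, predating the gradio 4.0 interface changes. A sketch of the pre-1.0 call style the pin preserves (model name and key are placeholders):

```python
import openai  # works with openai<=0.28.1; this interface was removed in openai>=1.0

openai.api_key = "sk-..."  # placeholder

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    temperature=0.0,
)
print(response["choices"][0]["message"]["content"])
```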
52 changes: 38 additions & 14 deletions scripts/submit_eval_jobs.py
@@ -36,7 +36,7 @@
"codex_eval_temp_0.8",
"trutufulqa",
"toxigen",
"alpaca_farm",
"alpaca_eval",
]

# model to evaluate, each in the followng format: model name, their beaker id, checkpoint subfolder
@@ -98,7 +98,10 @@

# tulu v2 ablation models
# ("finetuned_llama2_7B_on_v1_data", "01H7ABFYB84N9TN8MYXAVSMJ68", None, "tuned_lm"),
# ("finetuned_llama2_7B_on_sharegpt", "01HEXQK5YHNWG6RW1RS1H32XXA", None, "tuned_lm"),
# ("finetuned_llama2_13B_on_v1_data", "01H7AC0KXGRDH9ACJ24WTSK7SR", None, "tuned_lm"),
# ("finetuned_llama2_70B_on_v1_data", "01HE9NVD58XX6G9ZYA61JZKJ7N", None, "tuned_lm"),
# ("finetuned_llama2_7B_on_sharegpt_dpo", "01HEXR0R515HKPKTN4TNAC408A", None, "tuned_lm"),

# tulu v2 models
# ("tulu_v2_7B_qlora", "01HDCNBNJS56BWKP5AHV4YNCSJ", None, "tuned_lm"),
@@ -107,6 +110,11 @@
# ("tulu_v2_7B_jax", "01HBXTF305QARZ7P4T6ASXXVAM", None, "tuned_lm"),
# ("tulu_v2_13B_jax", "01HBWE5NHC3M30HH63339HS8BE", None, "tuned_lm"),
# ("tulu_v2_70B_jax", "01HCB2VZJ2T2JXZX0R1SJBRSB2", None, "tuned_lm"),
# ("tulu_v2_7B_dpo", "01HE8H1MBSVN09ZZ82X6K90NTF", None, "tuend_lm"),
# ("tulu_v2_13B_dpo", "01HE8YMBMJSTJV49QWA6TF2NTE", None, "tuend_lm"),
# ("tulu_v2_70B_dpo_first_epoch", "01HES1TCSJCPTPV50HQZHSN319", None, "tuend_lm"),
# ("tulu_v2_70B_dpo_second_epoch", "/net/nfs.cirrascale/allennlp/hamishi/EasyLM/tulu_2_70b_dpo/", None, "tuend_lm"),
# ("tulu_v2_70B_dpo", "01HEXKXP0MFM60PT7SY71XXSWD", None, "tuend_lm"),

# code llama models
# ("code_llama_7B", "01HD9Z1MJ9K3ZK494KGTVD1063", None, "vanilla_lm"),
@@ -133,7 +141,8 @@
# ("finetuned_falcon_7B_flanv2_cot_oasst1_dolly_sharegpt_gpt4alpaca_codealpaca", "01H356X9ZYY8HX1C7HFH6JYWNW", None, "tuned_lm"),
# ("hf-falcon-rw-7B", "tiiuae/falcon-rw-7b", None, "vanilla_lm"),
# ("finetuned_falcon_rw_7B_flanv2_cot_oasst1_dolly_sharegpt_gpt4alpaca_codealpaca", "01H37QXWFK095588W6GCMVGFKB", None, "tuned_lm"),

# ("zephyr-7B", "/net/nfs.cirrascale/allennlp/yizhongw/checkpoints/zephyr-7b-beta", None, "tuned_lm"),
# ("xwin-70B", "/net/nfs.cirrascale/allennlp/yizhongw/checkpoints/Xwin-LM-70B-V0.1", None, "tuned_lm"),
]

#--------------- experiments about number of supervision tasks -------------------------
@@ -307,7 +316,7 @@
--use_chat_format \
--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format
'''
elif experiment_group == "alpaca_farm":
elif experiment_group == "alpaca_eval":
d['tasks'][0]['arguments'][0] = '''
python -m eval.alpaca_farm.run_eval \
--use_vllm \
@@ -320,12 +329,23 @@
else:
raise ValueError("experiment_group not supported")

if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--model_name_or_path "+model_info[1])]
if model_info[1].startswith("/"): # if it's a local model, load it from the local directory
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--model_name_or_path "+model_info[1])]
else: # if it's a beaker model, mount the beaker dataset to `/model`
d['tasks'][0]['datasets'][1]['source']['beaker'] = model_info[1]

# if a specific checkpoint is specified, load model from that checkpoint
if model_info[2] is not None:
assert "--model_name_or_path /model" in d['tasks'][0]['arguments'][0]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path /model/"+model_info[2])]
assert "--tokenizer_name_or_path /model" in d['tasks'][0]['arguments'][0]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path /model/"+model_info[2])]
# extract existing model path
model_name_or_path = re.search("--model_name_or_path (\S+)", d['tasks'][0]['arguments'][0]).group(1)
# replace the model path with the checkpoint subfolder
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(model_name_or_path, model_name_or_path+"/"+model_info[2])]
# replace the tokenizer path with the checkpoint subfolder
tokenizer_name_or_path = re.search("--tokenizer_name_or_path (\S+)", d['tasks'][0]['arguments'][0]).group(1)
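Because local checkpoints can now be mounted from NFS paths (and Hugging Face hub models keep their repo ids), the checkpoint-subfolder logic no longer asserts the literal `/model` path; it regex-extracts whatever `--model_name_or_path` currently holds and appends the subfolder, with the tokenizer path extracted the same way (its replacement falls outside the lines shown here). A minimal reproduction of the rewrite, with a made-up argument string and subfolder:

```python
import re

# Hypothetical beaker task argument string and checkpoint subfolder.
args_str = "python -m eval.mmlu.run_eval --model_name_or_path /model --use_vllm"
checkpoint_subfolder = "checkpoint-500"

model_name_or_path = re.search(r"--model_name_or_path (\S+)", args_str).group(1)
args_str = args_str.replace(model_name_or_path, model_name_or_path + "/" + checkpoint_subfolder)
print(args_str)
# python -m eval.mmlu.run_eval --model_name_or_path /model/checkpoint-500 --use_vllm
```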

# for vanilla_lm, remove the chat formatting function
if model_info[3] == "vanilla_lm":
@@ -364,12 +384,6 @@
# request 2x more GPUs
d['tasks'][0]['resources']['gpuCount'] = 2 * d['tasks'][0]['resources']['gpuCount']

if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--use_chat_format", "")]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--model_name_or_path "+model_info[1])]
else: # if it's a beaker model, mount the beaker dataset to `/model`
d['tasks'][0]['datasets'][1]['source']['beaker'] = model_info[1]

if "llama2-chat" in model_info[0]:
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
@@ -380,7 +394,17 @@
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
"--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format",
"--chat_formatting_function eval.templates.create_prompt_with_llama2_chat_format")
]
]
elif "zephyr" in model_info[0]:
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
"--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format",
"--chat_formatting_function eval.templates.create_prompt_with_zephyr_chat_format")
]
elif "xwin" in model_info[0]:
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
"--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format",
"--chat_formatting_function eval.templates.create_prompt_with_xwin_chat_format")
]

if any([x in model_info[0] for x in ["opt", "pythia", "falcon"]]):
if "--use_vllm" in d['tasks'][0]['arguments'][0]:
