Fix several issues in the evals. (allenai#79)
* Merge LoRA on CPU to avoid GPU memory OOM.

* Update requirements.

* Change the alpaca_eval annotator to `alpaca_eval_gpt4`.

* Make TyDiQA random example selection deterministic.

* Cache OpenAI requests for TruthfulQA.

* Add templates for other models.

* More evaluation experiments.

* Avoid loading the entire leaderboard.
yizhongw authored Nov 17, 2023
1 parent b555637 commit 4a2e9dd
Showing 8 changed files with 126 additions and 30 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,5 +1,5 @@
# This dockerfile is forked from ai2/cuda11.8-cudnn8-dev-ubuntu20.04
FROM gcr.io/ai2-beaker-core/public/ckio6ct4g24tgnrtk3og:latest
FROM gcr.io/ai2-beaker-core/public/cl5erg1ebj67821o3200:latest

RUN apt update && apt install -y openjdk-8-jre-headless

@@ -10,7 +10,7 @@ WORKDIR /stage/

COPY requirements.txt .
RUN pip install --upgrade pip setuptools wheel
RUN pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
RUN pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
RUN pip install packaging
RUN pip install flash-attn==2.2.2 --no-build-isolation
RUN pip install -r requirements.txt
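For reference, torchvision 0.15.2 and torchaudio 2.0.2 are the releases that pair with torch 2.0.1, so pinning them keeps pip from pulling a newer torch off the cu118 index. A quick sanity check one could run inside the built image (a sketch, not part of the Dockerfile):

```python
# Minimal sketch: confirm the pinned torch / torchvision / torchaudio trio resolved
# together and that the cu118 build can see a GPU.
import torch
import torchaudio
import torchvision

print("torch:", torch.__version__)              # expected: 2.0.1+cu118
print("torchvision:", torchvision.__version__)  # expected: 0.15.2+cu118
print("torchaudio:", torchaudio.__version__)    # expected: 2.0.2+cu118
print("CUDA build:", torch.version.cuda, "| CUDA available:", torch.cuda.is_available())
```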
8 changes: 6 additions & 2 deletions eval/alpaca_farm/run_eval.py
@@ -82,18 +82,22 @@ def main(args):
df_leaderboard, annotations = alpaca_farm_evaluate(
model_outputs=model_results,
reference_outputs=args.reference_path,
annotators_config="alpaca_eval_gpt4_0314",
annotators_config="alpaca_eval_gpt4",
output_path=args.save_dir,
is_return_instead_of_print=True,
caching_path=os.path.join(args.save_dir, "alpaca_eval_annotator_cache.json"),
precomputed_leaderboard=None,
is_cache_leaderboard=False
)
else:
df_leaderboard, annotations = alpaca_farm_evaluate(
model_outputs=model_results,
annotators_config="alpaca_eval_gpt4_0314",
annotators_config="alpaca_eval_gpt4",
output_path=args.save_dir,
is_return_instead_of_print=True,
caching_path=os.path.join(args.save_dir, "alpaca_eval_annotator_cache.json"),
precomputed_leaderboard=None,
is_cache_leaderboard=False
)

print(df_leaderboard.to_string(float_format="%.2f"))
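Both call sites now pass `precomputed_leaderboard=None` and `is_cache_leaderboard=False`, so `alpaca_farm_evaluate` only scores this model instead of loading and re-caching the full public leaderboard, while `caching_path` keeps the GPT-4 annotator judgments so a rerun does not re-query the API. A hypothetical post-run check on that cache; its internal layout belongs to `alpaca_eval` and is not shown in this diff, so only its size is reported:

```python
import json
import os

# save_dir below is a placeholder; in the eval script it comes from args.save_dir.
save_dir = "results/alpaca_eval"
cache_file = os.path.join(save_dir, "alpaca_eval_annotator_cache.json")

if os.path.exists(cache_file):
    with open(cache_file) as f:
        cached = json.load(f)
    print(f"{len(cached)} cached annotator records in {cache_file}")
else:
    print("No annotator cache yet; the first run will create it.")
```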
50 changes: 50 additions & 0 deletions eval/templates.py
@@ -46,3 +46,53 @@ def create_prompt_with_llama2_chat_format(messages, bos="<s>", eos="</s>", add_b
# The next line removes the bos token if add_bos is False.
formatted_text = formatted_text[len(bos):] if not add_bos else formatted_text
return formatted_text


def create_prompt_with_xwin_chat_format(messages, bos="<s>", eos="</s>", add_bos=True):
'''
This function is adapted from the official xwin chat completion script:
https://huggingface.co/Xwin-LM/Xwin-LM-70B-V0.1
'''
formatted_text = "A chat between a curious user and an artificial intelligence assistant. "
formatted_text += "The assistant gives helpful, detailed, and polite answers to the user's questions. "
for message in messages:
if message["role"] == "user":
formatted_text += "USER: " + message["content"] + " "
elif message["role"] == "assistant":
formatted_text += "ASSISTANT: " + message["content"] + eos
formatted_text += "ASSISTANT:"
return formatted_text
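As a quick usage check of the template above (the question is only an example), a single-turn conversation renders to a Vicuna-style prompt ending in an open `ASSISTANT:` turn:

```python
from eval.templates import create_prompt_with_xwin_chat_format

messages = [{"role": "user", "content": "What is the capital of France?"}]
print(create_prompt_with_xwin_chat_format(messages))
# Output (a single line, wrapped here for readability):
# A chat between a curious user and an artificial intelligence assistant. The assistant
# gives helpful, detailed, and polite answers to the user's questions. USER: What is the
# capital of France? ASSISTANT:
```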


def create_prompt_with_zephyr_chat_format(messages, bos="<s>", eos="</s>", add_bos=True):
'''
This function is adapted from the official zephyr chat completion script:
https://huggingface.co/HuggingFaceH4/zephyr-7b-beta
'''
formatted_text = ""
# if messages[0]["role"] != "system":
# messages = [{
# "role": "system",
# "content": ""
# }] + messages

for message in messages:
if message["role"] == "system":
formatted_text += "<|system|>\n" + message["content"] + "\n"
elif message["role"] == "user":
formatted_text += "<|user|>\n" + message["content"] + "\n"
elif message["role"] == "assistant":
formatted_text += "<|assistant|>\n" + message["content"] + eos + "\n"
else:
raise ValueError(
"Zephyr chat template only supports 'system', 'user' and 'assistant' roles. Invalid role: {}.".format(message["role"])
)
formatted_text += "<|assistant|>\n"
return formatted_text
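Likewise for the Zephyr template, which wraps each turn in `<|system|>` / `<|user|>` / `<|assistant|>` markers and appends `eos` only after completed assistant turns (the conversation below is just an example):

```python
from eval.templates import create_prompt_with_zephyr_chat_format

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]
print(create_prompt_with_zephyr_chat_format(messages))
# <|system|>
# You are a helpful assistant.
# <|user|>
# What is 2 + 2?
# <|assistant|>
```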







33 changes: 25 additions & 8 deletions eval/truthfulqa/run_eval.py
@@ -38,7 +38,7 @@ def trim_answer(answer):
return answer


def run_chatgpt(questions, engine, tag, preset='qa', verbose=False):
def run_chatgpt(questions, engine, tag, preset='qa', batch_size=1, cache_path=None, verbose=False):

"""Stores answers from ChatGPT / GPT4 models (requires an API key)"""

@@ -52,15 +52,21 @@ def run_chatgpt(questions, engine, tag, preset='qa', verbose=False):
{"prompt": format_prompt(questions.loc[idx], preset, format='general'), "id": idx} for idx in questions.index
]

responses = query_openai_chat_model(engine=engine, instances=instances, temperature=0.0)
responses = query_openai_chat_model(
engine=engine,
output_path=cache_path,
instances=instances,
batch_size=batch_size,
temperature=0.0
)
assert len(responses) == len(instances)

for idx, response in zip(questions.index, responses):
questions.loc[idx, tag] = trim_answer(response["output"])
return questions


def run_gpt3(questions, engine, tag, preset='qa', verbose=False):
def run_gpt3(questions, engine, tag, preset='qa', batch_size=1, cache_path=None, verbose=False):
"""Stores answers from GPT-3 models (requires an API key)"""

if tag not in questions.columns:
@@ -73,7 +79,15 @@ def run_gpt3(questions, engine, tag, preset='qa', verbose=False):
{"prompt": format_prompt(questions.loc[idx], preset, format='general'), "id": idx} for idx in questions.index
]

responses = query_openai_model(engine=engine, instances=instances, temperature=0.0, stop=None if preset == 'long' else '\n\n', max_tokens=50)
responses = query_openai_model(
engine=engine,
instances=instances,
output_path=cache_path,
batch_size=batch_size,
temperature=0.0,
stop=None if preset == 'long' else '\n\n',
max_tokens=50
)
assert len(responses) == len(instances)

for idx, response in zip(questions.index, responses):
Expand All @@ -82,7 +96,7 @@ def run_gpt3(questions, engine, tag, preset='qa', verbose=False):
return questions


def run_gpt3_mc(questions, engine, tag, preset='qa', verbose=False):
def run_gpt3_mc(questions, engine, tag, preset='qa', batch_size=1, cache_path=None, verbose=False):
"""Runs multiple-choice metrics for GPT-3 models (requires an API key)"""

set_columns(tag, questions)
@@ -114,6 +128,8 @@ def run_gpt3_mc(questions, engine, tag, preset='qa', verbose=False):
responses = query_openai_model(
engine=engine,
instances=instances,
output_path=cache_path,
batch_size=batch_size,
temperature=0.0,
stop=["\n\n"],
max_tokens=0,
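The `max_tokens=0` here is the usual trick for scoring rather than generating: with the legacy completions API, the prompt's own token log-probabilities can be returned and summed over a candidate answer. The call below sketches that pattern; the `echo` and `logprobs` arguments are assumptions, since they fall outside the lines shown in this diff.

```python
import openai  # assumes the pre-1.0 client pinned in requirements.txt (openai<=0.28.1)

openai.api_key = "sk-..."  # placeholder

response = openai.Completion.create(
    engine="davinci",
    prompt="Q: What is the capital of France?\nA: Paris",
    max_tokens=0,     # generate nothing...
    echo=True,        # ...but echo the prompt back
    logprobs=1,       # ...with per-token log-probabilities attached
    temperature=0.0,
)
token_logprobs = response["choices"][0]["logprobs"]["token_logprobs"]
# Summing the logprobs over the answer span gives a score for that candidate choice.
```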
@@ -277,18 +293,19 @@ def main(args):
run_hf_model_mc(questions, model, tokenizer, tag=args.model_name_or_path, batch_size=args.eval_batch_size, preset=args.preset)
elif args.openai_engine:
# gpt-3 language models
cache_path = os.path.join(args.save_dir, "openai_query_cache.jsonl")
if args.openai_engine in ['ada', 'babbage', 'curie', 'davinci', 'text-davinci-003', 'text-davinci-002', 'code-davinci-002']:
if "judge" in args.metrics or "info" in args.metrics:
print("Running generations")
run_gpt3(questions, args.openai_engine, args.openai_engine, args.preset)
run_gpt3(questions, args.openai_engine, args.openai_engine, cache_path=cache_path, batch_size=args.eval_batch_size, preset=args.preset)
if 'mc' in args.metrics:
print("Running multiple-choice classification!")
run_gpt3_mc(questions, args.openai_engine, args.openai_engine, preset=args.preset)
run_gpt3_mc(questions, args.openai_engine, args.openai_engine, cache_path=cache_path, batch_size=args.eval_batch_size, preset=args.preset)
# other openai engines
else:
if "judge" in args.metrics or "info" in args.metrics:
print("Running generations")
run_chatgpt(questions, args.openai_engine, args.openai_engine, args.preset)
run_chatgpt(questions, args.openai_engine, args.openai_engine, cache_path=cache_path, batch_size=args.eval_batch_size, preset=args.preset)
if "mc" in args.metrics:
raise ValueError("OpenAI Chat engines does not support MC metrics.")

2 changes: 1 addition & 1 deletion eval/tydiqa/run_eval.py
@@ -58,7 +58,7 @@ def main(args):
"answers": qa["answers"]
}
test_data.append(example)
data_languages = set([example["lang"] for example in test_data])
data_languages = sorted(list(set([example["lang"] for example in test_data])))
if args.max_num_examples_per_lang:
sampled_examples = []
for lang in data_languages:
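Sorting the language set is what actually makes the sampling reproducible: a plain `set` of strings iterates in an order that changes between runs (hash randomization), so even with a fixed seed the per-language `random.sample` calls would consume the RNG in a different order each time. A minimal sketch with toy data standing in for the real TyDiQA test set:

```python
import random

# Toy stand-in for test_data; real entries also carry question/context/answers.
test_data = [{"id": i, "lang": lang} for i, lang in enumerate(["arabic", "bengali", "english"] * 4)]

random.seed(42)
data_languages = sorted(set(example["lang"] for example in test_data))  # fixed iteration order

sampled_examples = []
for lang in data_languages:
    examples_for_lang = [example for example in test_data if example["lang"] == lang]
    sampled_examples += random.sample(examples_for_lang, k=2)

print([example["id"] for example in sampled_examples])  # identical on every run
```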
3 changes: 2 additions & 1 deletion open_instruct/merge_lora.py
@@ -67,7 +67,8 @@ def parse_args():
quantization_config=quantization_config,
device_map={"": 0} if torch.cuda.is_available() else None,
)
base_model = dequantize_model(base_model, device=base_model.device)
# base_model = dequantize_model(base_model, device=base_model.device)
base_model = dequantize_model(base_model, device="cpu")
else:
base_model = AutoModelForCausalLM.from_pretrained(
args.base_model_name_or_path if args.base_model_name_or_path else peft_config.base_model_name_or_path,
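Dequantizing a large base model on the GPU easily runs out of memory, which is why the QLoRA path above now calls `dequantize_model` with `device="cpu"` before merging. For the non-quantized branch just above, the same keep-it-on-CPU idea with stock `transformers` + `peft` APIs would look roughly like this sketch (model and adapter paths are hypothetical):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

# Load the base model entirely in CPU RAM to avoid GPU OOM during the merge.
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.bfloat16,
    device_map={"": "cpu"},
)
model = PeftModel.from_pretrained(base_model, "path/to/lora_adapter")
merged = model.merge_and_unload()  # fold the LoRA deltas into the base weights on CPU
merged.save_pretrained("path/to/merged_model")
```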
4 changes: 2 additions & 2 deletions requirements.txt
@@ -17,12 +17,12 @@ protobuf
# But this PR is not compatible with the latest version of Transformers library (v4.34.0).
# To incorporate it, we forked the Transformers library and made some changes to make it compatible with the latest version.
git+https://github.com/yizhongw/transformers.git@left_padding
openai
openai<=0.28.1
tiktoken
rouge_score
tensorboard
wandb
gradio
gradio==3.50.2
termcolor
jsonlines
unidic-lite
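The `openai<=0.28.1` pin matters because the 1.0 release of the client removed the module-level API that 2023-era eval code typically relies on; `gradio==3.50.2` similarly pins a late 3.x release, predating the gradio 4.0 interface changes. A sketch of the pre-1.0 call style the pin preserves (model name and key are placeholders):

```python
import openai  # works with openai<=0.28.1; this interface was removed in openai>=1.0

openai.api_key = "sk-..."  # placeholder

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    temperature=0.0,
)
print(response["choices"][0]["message"]["content"])
```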
52 changes: 38 additions & 14 deletions scripts/submit_eval_jobs.py
@@ -36,7 +36,7 @@
"codex_eval_temp_0.8",
"trutufulqa",
"toxigen",
"alpaca_farm",
"alpaca_eval",
]

# model to evaluate, each in the followng format: model name, their beaker id, checkpoint subfolder
@@ -98,7 +98,10 @@

# tulu v2 ablation models
# ("finetuned_llama2_7B_on_v1_data", "01H7ABFYB84N9TN8MYXAVSMJ68", None, "tuned_lm"),
# ("finetuned_llama2_7B_on_sharegpt", "01HEXQK5YHNWG6RW1RS1H32XXA", None, "tuned_lm"),
# ("finetuned_llama2_13B_on_v1_data", "01H7AC0KXGRDH9ACJ24WTSK7SR", None, "tuned_lm"),
# ("finetuned_llama2_70B_on_v1_data", "01HE9NVD58XX6G9ZYA61JZKJ7N", None, "tuned_lm"),
# ("finetuned_llama2_7B_on_sharegpt_dpo", "01HEXR0R515HKPKTN4TNAC408A", None, "tuned_lm"),

# tulu v2 models
# ("tulu_v2_7B_qlora", "01HDCNBNJS56BWKP5AHV4YNCSJ", None, "tuned_lm"),
@@ -107,6 +110,11 @@
# ("tulu_v2_7B_jax", "01HBXTF305QARZ7P4T6ASXXVAM", None, "tuned_lm"),
# ("tulu_v2_13B_jax", "01HBWE5NHC3M30HH63339HS8BE", None, "tuned_lm"),
# ("tulu_v2_70B_jax", "01HCB2VZJ2T2JXZX0R1SJBRSB2", None, "tuned_lm"),
# ("tulu_v2_7B_dpo", "01HE8H1MBSVN09ZZ82X6K90NTF", None, "tuend_lm"),
# ("tulu_v2_13B_dpo", "01HE8YMBMJSTJV49QWA6TF2NTE", None, "tuend_lm"),
# ("tulu_v2_70B_dpo_first_epoch", "01HES1TCSJCPTPV50HQZHSN319", None, "tuend_lm"),
# ("tulu_v2_70B_dpo_second_epoch", "/net/nfs.cirrascale/allennlp/hamishi/EasyLM/tulu_2_70b_dpo/", None, "tuend_lm"),
# ("tulu_v2_70B_dpo", "01HEXKXP0MFM60PT7SY71XXSWD", None, "tuend_lm"),

# code llama models
# ("code_llama_7B", "01HD9Z1MJ9K3ZK494KGTVD1063", None, "vanilla_lm"),
@@ -133,7 +141,8 @@
# ("finetuned_falcon_7B_flanv2_cot_oasst1_dolly_sharegpt_gpt4alpaca_codealpaca", "01H356X9ZYY8HX1C7HFH6JYWNW", None, "tuned_lm"),
# ("hf-falcon-rw-7B", "tiiuae/falcon-rw-7b", None, "vanilla_lm"),
# ("finetuned_falcon_rw_7B_flanv2_cot_oasst1_dolly_sharegpt_gpt4alpaca_codealpaca", "01H37QXWFK095588W6GCMVGFKB", None, "tuned_lm"),

# ("zephyr-7B", "/net/nfs.cirrascale/allennlp/yizhongw/checkpoints/zephyr-7b-beta", None, "tuned_lm"),
# ("xwin-70B", "/net/nfs.cirrascale/allennlp/yizhongw/checkpoints/Xwin-LM-70B-V0.1", None, "tuned_lm"),
]

#--------------- experiments about number of supervision tasks -------------------------
@@ -307,7 +316,7 @@
--use_chat_format \
--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format
'''
elif experiment_group == "alpaca_farm":
elif experiment_group == "alpaca_eval":
d['tasks'][0]['arguments'][0] = '''
python -m eval.alpaca_farm.run_eval \
--use_vllm \
@@ -320,12 +329,23 @@
else:
raise ValueError("experiment_group not supported")

if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--model_name_or_path "+model_info[1])]
if model_info[1].startswith("/"): # if it's a local model, load it from the local directory
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--model_name_or_path "+model_info[1])]
else: # if it's a beaker model, mount the beaker dataset to `/model`
d['tasks'][0]['datasets'][1]['source']['beaker'] = model_info[1]

# if a specific checkpoint is specified, load model from that checkpoint
if model_info[2] is not None:
assert "--model_name_or_path /model" in d['tasks'][0]['arguments'][0]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path /model/"+model_info[2])]
assert "--tokenizer_name_or_path /model" in d['tasks'][0]['arguments'][0]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path /model/"+model_info[2])]
# extract existing model path
model_name_or_path = re.search("--model_name_or_path (\S+)", d['tasks'][0]['arguments'][0]).group(1)
# replace the model path with the checkpoint subfolder
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(model_name_or_path, model_name_or_path+"/"+model_info[2])]
# replace the tokenizer path with the checkpoint subfolder
tokenizer_name_or_path = re.search("--tokenizer_name_or_path (\S+)", d['tasks'][0]['arguments'][0]).group(1)
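Because local checkpoints can now be mounted from NFS paths (and Hugging Face hub models keep their repo ids), the checkpoint-subfolder logic no longer asserts the literal `/model` path; it regex-extracts whatever `--model_name_or_path` currently holds and appends the subfolder, with the tokenizer path extracted the same way (its replacement falls outside the lines shown here). A minimal reproduction of the rewrite, with a made-up argument string and subfolder:

```python
import re

# Hypothetical beaker task argument string and checkpoint subfolder.
args_str = "python -m eval.mmlu.run_eval --model_name_or_path /model --use_vllm"
checkpoint_subfolder = "checkpoint-500"

model_name_or_path = re.search(r"--model_name_or_path (\S+)", args_str).group(1)
args_str = args_str.replace(model_name_or_path, model_name_or_path + "/" + checkpoint_subfolder)
print(args_str)
# python -m eval.mmlu.run_eval --model_name_or_path /model/checkpoint-500 --use_vllm
```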

# for vanilla_lm, remove the chat formatting function
if model_info[3] == "vanilla_lm":
@@ -364,12 +384,6 @@
# request 2x more GPUs
d['tasks'][0]['resources']['gpuCount'] = 2 * d['tasks'][0]['resources']['gpuCount']

if model_info[0].startswith("hf-"): # if it's a huggingface model, load it from the model hub
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--use_chat_format", "")]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])]
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace("--tokenizer_name_or_path /model", "--model_name_or_path "+model_info[1])]
else: # if it's a beaker model, mount the beaker dataset to `/model`
d['tasks'][0]['datasets'][1]['source']['beaker'] = model_info[1]

if "llama2-chat" in model_info[0]:
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
@@ -380,7 +394,17 @@
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
"--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format",
"--chat_formatting_function eval.templates.create_prompt_with_llama2_chat_format")
]
]
elif "zephyr" in model_info[0]:
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
"--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format",
"--chat_formatting_function eval.templates.create_prompt_with_zephyr_chat_format")
]
elif "xwin" in model_info[0]:
d['tasks'][0]['arguments'] = [d['tasks'][0]['arguments'][0].replace(
"--chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format",
"--chat_formatting_function eval.templates.create_prompt_with_xwin_chat_format")
]

if any([x in model_info[0] for x in ["opt", "pythia", "falcon"]]):
if "--use_vllm" in d['tasks'][0]['arguments'][0]:
