forked from AI4Finance-Foundation/FinGPT
Commit 01140e5 (1 parent: c34cd29)
Showing 21 changed files with 31,442 additions and 0 deletions.
362 changes: 362 additions & 0 deletions
fingpt/FinGPT_MultiAgentsRAG/Evaluation_methods/HaluEval/evaluate.py
@@ -0,0 +1,362 @@
import random
import openai
import time
import json
import argparse
import tiktoken


# START: COPIED FROM <https://github.com/RUCAIBox/HaluEval.git>
openai.api_key = 'sk-'

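# NOTE: this file targets the legacy openai<1.0 Python SDK interface
# (openai.ChatCompletion / openai.Completion / openai.error); the key above is a placeholder.

# Ask the judge model whether `answer` hallucinates relative to `question`.
# Retries forever on transient OpenAI errors and returns the raw judgement text.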
def get_qa_response(model, question, answer, instruction):
    message = [
        {"role": "system", "content": "You are a hallucination detector. You MUST determine if the provided answer contains hallucination or not for the question based on the world knowledge. The answer you provided MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                     "\n\n#Question#: " + question +
                                     "\n#Answer#: " + answer +
                                     "\n#Your Judgement#: "}
    ]
    prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    engine=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)

    return response


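# Ask the judge model whether `response` contains hallucinated content given the
# dialogue history `dialog`; same call/retry pattern as get_qa_response.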
def get_dialogue_response(model, dialog, response, instruction):
    message = [
        {"role": "system", "content": "You are a response judge. You MUST determine if the provided response contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                     "\n\n#Dialogue History#: " + dialog +
                                     "\n#Response#: " + response +
                                     "\n#Your Judgement#: "}
    ]
    prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)

    return response


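# Count tokens in `message` using the tiktoken encoding registered for `model`.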
def num_tokens_from_message(message, model="davinci"):
    encoding = tiktoken.encoding_for_model(model)
    num_tokens = len(encoding.encode(message))
    return num_tokens


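# Trim whole words from the end of `prompt1` (the document part) until the
# combined prompt fits a 2033-token budget; `prompt2` (summary + judgement cue)
# is kept intact.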
def truncate_message(prompt1, prompt2, model="davinci"):
    if num_tokens_from_message(prompt1 + prompt2, model) > 2033:
        truncation_length = 2033 - num_tokens_from_message(prompt2)
        while num_tokens_from_message(prompt1) > truncation_length:
            prompt1 = " ".join(prompt1.split()[:-1])
    prompt = prompt1 + prompt2
    return prompt


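# Ask the judge model whether `summary` contains information not supported by
# `document`; for the "davinci" completion model the document is truncated first.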
def get_summarization_response(model, document, summary, instruction):
    message = [
        {"role": "system", "content": "You are a summary judge. You MUST determine if the provided summary contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                     "\n\n#Document#: " + document +
                                     "\n#Summary#: " + summary +
                                     "\n#Your Judgement#: "}
    ]
    prompt1 = instruction + "\n\n#Document#: " + document
    prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
    if model == "davinci":
        prompt = truncate_message(prompt1, prompt2)
    else:
        prompt = prompt1 + prompt2
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)

    return response


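# QA evaluation: for each JSON-lines sample, randomly present either the
# hallucinated or the right answer (ground truth "Yes"/"No"), normalise the
# judge's verdict to "Yes"/"No", log every sample to `output_path`, and print accuracy.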
def evaluation_qa_dataset(model, file, instruction, output_path):
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

    correct = 0
    incorrect = 0
    for i in range(len(data)):
        knowledge = data[i]["knowledge"]
        question = data[i]["question"]
        hallucinated_answer = data[i]["hallucinated_answer"]
        right_answer = data[i]["right_answer"]

        if random.random() > 0.5:
            answer = hallucinated_answer
            ground_truth = "Yes"
        else:
            answer = right_answer
            ground_truth = "No"

        ans = get_qa_response(model, question, answer, instruction)
        ans = ans.replace(".", "")

        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
            gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": "failed!"}
            dump_jsonl(gen, output_path, append=True)
            incorrect += 1
            print('sample {} fails......'.format(i))
            continue
        elif "Yes" in ans:
            if ans != "Yes":
                ans = "Yes"
            gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
        elif "No" in ans:
            if ans != "No":
                ans = "No"
            gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
        else:
            gen = None
            incorrect += 1

        assert (gen is not None)

        if ground_truth == ans:
            correct += 1
        else:
            incorrect += 1

        print('sample {} success......'.format(i))
        dump_jsonl(gen, output_path, append=True)

    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


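# Dialogue evaluation: same protocol as the QA loop, judging whether the
# randomly chosen response is faithful to the dialogue history.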
def evaluation_dialogue_dataset(model, file, instruction, output_path):
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

    correct = 0
    incorrect = 0
    for i in range(len(data)):
        knowledge = data[i]["knowledge"]
        dialog = data[i]["dialogue_history"]
        hallucinated_response = data[i]["hallucinated_response"]
        right_response = data[i]["right_response"]

        if random.random() > 0.5:
            response = hallucinated_response
            ground_truth = "Yes"
        else:
            response = right_response
            ground_truth = "No"

        ans = get_dialogue_response(model, dialog, response, instruction)
        ans = ans.replace(".", "")

        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": "failed!"}
            dump_jsonl(gen, output_path, append=True)
            incorrect += 1
            print('sample {} fails......'.format(i))
            continue
        elif "Yes" in ans:
            if ans != "Yes":
                ans = "Yes"
            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
        elif "No" in ans:
            if ans != "No":
                ans = "No"
            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
        else:
            gen = None
        assert (gen is not None)

        if ground_truth == ans:
            correct += 1
        else:
            incorrect += 1

        print('sample {} success......'.format(i))
        dump_jsonl(gen, output_path, append=True)

    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


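# Summarization evaluation: same protocol again, judging whether the randomly
# chosen summary is supported by the source document.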
def evaluation_summarization_dataset(model, file, instruction, output_path):
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

    correct = 0
    incorrect = 0
    for i in range(len(data)):

        document = data[i]["document"]
        hallucinated_summary = data[i]["hallucinated_summary"]
        right_summary = data[i]["right_summary"]

        if random.random() > 0.5:
            summary = hallucinated_summary
            ground_truth = "Yes"
        else:
            summary = right_summary
            ground_truth = "No"

        ans = get_summarization_response(model, document, summary, instruction)
        ans = ans.replace(".", "")

        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": "failed!"}
            dump_jsonl(gen, output_path, append=True)
            incorrect += 1
            print('sample {} fails......'.format(i))
            continue
        elif "Yes" in ans:
            if ans != "Yes":
                ans = "Yes"
            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
        elif "No" in ans:
            if ans != "No":
                ans = "No"
            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
        else:
            gen = None
        assert (gen is not None)

        if ground_truth == ans:
            correct += 1
        else:
            incorrect += 1

        print('sample {} success......'.format(i))
        dump_jsonl(gen, output_path, append=True)

    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


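# Append one JSON record per line to `output_path`; with append=False the file
# is overwritten instead.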
def dump_jsonl(data, output_path, append=False):
    """
    Write a single object as one line of a JSON Lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        json_record = json.dumps(data, ensure_ascii=False)
        f.write(json_record + '\n')

# END: COPIED FROM <https://github.com/RUCAIBox/HaluEval.git>


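# CLI entry point: reads the judging instruction from
# <task>/<task>_evaluation_instruction.txt, the benchmark data from
# ../data/<task>_data.json, and appends results to <task>/<task>_<model>_results.json.
# Example invocation (assuming the HaluEval directory layout):
#   python evaluate.py --task qa --model gpt-3.5-turbo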
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Hallucination Generation")

    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
    parser.add_argument("--model", default="davinci", help="model name")
    args = parser.parse_args()

    instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
    f = open(instruction_file, 'r', encoding="utf-8")
    instruction = f.read()

    model = args.model
    output_path = "{}/{}_{}_results.json".format(args.task, args.task, args.model)

    data = "../data/{}_data.json".format(args.task)

    if args.task == "qa":
        evaluation_qa_dataset(model, data, instruction, output_path)
    elif args.task == "dialogue":
        evaluation_dialogue_dataset(model, data, instruction, output_path)
    elif args.task == "summarization":
        evaluation_summarization_dataset(model, data, instruction, output_path)
    else:
        raise ValueError("The task must be qa, dialogue, or summarization!")