2024-02-01 Multi Agents RAG Project
BruceYanghy committed Feb 1, 2024
1 parent c34cd29 commit 01140e5
Showing 21 changed files with 31,442 additions and 0 deletions.
362 changes: 362 additions & 0 deletions fingpt/FinGPT_MultiAgentsRAG/Evaluation_methods/HaluEval/evaluate.py
@@ -0,0 +1,362 @@
import random
import openai
import time
import json
import argparse
import tiktoken

# START: COPIED FROM <https://github.com/RUCAIBox/HaluEval.git>
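# NOTE: 'sk-' below is a placeholder; set a real OpenAI API key before running.
# The script is written against the pre-1.0 `openai` Python SDK
# (openai.ChatCompletion / openai.Completion / openai.error.*).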
openai.api_key = 'sk-'

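# Ask the judge model whether `answer` hallucinates with respect to `question`,
# retrying indefinitely on transient OpenAI API errors.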
def get_qa_response(model, question, answer, instruction):
    message = [
        {"role": "system", "content": "You are a hallucination detector. You MUST determine if the provided answer contains hallucination or not for the question based on the world knowledge. The answer you provided MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                     "\n\n#Question#: " + question +
                                     "\n#Answer#: " + answer +
                                     "\n#Your Judgement#: "}
    ]
    prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    engine=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)

    return response


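# Same judging loop as above, but for a dialogue turn: decide whether `response`
# is faithful to the given dialogue history.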
def get_dialogue_response(model, dialog, response, instruction):
    message = [
        {"role": "system", "content": "You are a response judge. You MUST determine if the provided response contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                     "\n\n#Dialogue History#: " + dialog +
                                     "\n#Response#: " + response +
                                     "\n#Your Judgement#: "}
    ]
    prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)

    return response


def num_tokens_from_message(message, model="davinci"):
    encoding = tiktoken.encoding_for_model(model)
    num_tokens = len(encoding.encode(message))
    return num_tokens


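# Keep prompt1 + prompt2 under 2033 tokens by dropping trailing words from prompt1,
# presumably to leave room for the completion within davinci's ~2,049-token context.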
def truncate_message(prompt1, prompt2, model="davinci"):
    if num_tokens_from_message(prompt1 + prompt2, model) > 2033:
        truncation_length = 2033 - num_tokens_from_message(prompt2)
        while num_tokens_from_message(prompt1) > truncation_length:
            prompt1 = " ".join(prompt1.split()[:-1])
    prompt = prompt1 + prompt2
    return prompt


def get_summarization_response(model, document, summary, instruction):
    message = [
        {"role": "system", "content": "You are a summary judge. You MUST determine if the provided summary contains non-factual or hallucinated information. The answer you give MUST be \"Yes\" or \"No\""},
        {"role": "user", "content": instruction +
                                     "\n\n#Document#: " + document +
                                     "\n#Summary#: " + summary +
                                     "\n#Your Judgement#: "}
    ]
    prompt1 = instruction + "\n\n#Document#: " + document
    prompt2 = "\n#Summary#: " + summary + "\n#Your Judgement#:"
    if model == "davinci":
        prompt = truncate_message(prompt1, prompt2)
    else:
        prompt = prompt1 + prompt2
    while True:
        try:
            if model == "gpt-3.5-turbo":
                res = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=message,
                    temperature=0.0,
                )
                response = res['choices'][0]['message']['content']
            else:
                res = openai.Completion.create(
                    model=model,
                    prompt=prompt,
                    temperature=0.0
                )
                response = res["choices"][0]['text'].strip()
            break
        except openai.error.RateLimitError:
            print('openai.error.RateLimitError\nRetrying...')
            time.sleep(60)
        except openai.error.ServiceUnavailableError:
            print('openai.error.ServiceUnavailableError\nRetrying...')
            time.sleep(20)
        except openai.error.Timeout:
            print('openai.error.Timeout\nRetrying...')
            time.sleep(20)
        except openai.error.APIError:
            print('openai.error.APIError\nRetrying...')
            time.sleep(20)
        except openai.error.APIConnectionError:
            print('openai.error.APIConnectionError\nRetrying...')
            time.sleep(20)

    return response


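# For each QA sample, randomly present either the hallucinated or the right answer (50/50),
# ask the judge for a Yes/No verdict, and score it against the known ground truth.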
def evaluation_qa_dataset(model, file, instruction, output_path):
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

    correct = 0
    incorrect = 0
    for i in range(len(data)):
        knowledge = data[i]["knowledge"]
        question = data[i]["question"]
        hallucinated_answer = data[i]["hallucinated_answer"]
        right_answer = data[i]["right_answer"]

        if random.random() > 0.5:
            answer = hallucinated_answer
            ground_truth = "Yes"
        else:
            answer = right_answer
            ground_truth = "No"

        ans = get_qa_response(model, question, answer, instruction)
        ans = ans.replace(".", "")

        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
            gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": "failed!"}
            dump_jsonl(gen, output_path, append=True)
            incorrect += 1
            print('sample {} fails......'.format(i))
            continue
        elif "Yes" in ans:
            if ans != "Yes":
                ans = "Yes"
            gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
        elif "No" in ans:
            if ans != "No":
                ans = "No"
            gen = {"knowledge": knowledge, "question": question, "answer": answer, "ground_truth": ground_truth, "judgement": ans}
        else:
            gen = None
            incorrect += 1

        assert (gen is not None)

        if ground_truth == ans:
            correct += 1
        else:
            incorrect += 1

        print('sample {} success......'.format(i))
        dump_jsonl(gen, output_path, append=True)

    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


def evaluation_dialogue_dataset(model, file, instruction, output_path):
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

    correct = 0
    incorrect = 0
    for i in range(len(data)):
        knowledge = data[i]["knowledge"]
        dialog = data[i]["dialogue_history"]
        hallucinated_response = data[i]["hallucinated_response"]
        right_response = data[i]["right_response"]

        if random.random() > 0.5:
            response = hallucinated_response
            ground_truth = "Yes"
        else:
            response = right_response
            ground_truth = "No"

        ans = get_dialogue_response(model, dialog, response, instruction)
        ans = ans.replace(".", "")

        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": "failed!"}
            dump_jsonl(gen, output_path, append=True)
            incorrect += 1
            print('sample {} fails......'.format(i))
            continue
        elif "Yes" in ans:
            if ans != "Yes":
                ans = "Yes"
            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
        elif "No" in ans:
            if ans != "No":
                ans = "No"
            gen = {"knowledge": knowledge, "dialogue_history": dialog, "response": response, "ground_truth": ground_truth, "judgement": ans}
        else:
            gen = None
        assert (gen is not None)

        if ground_truth == ans:
            correct += 1
        else:
            incorrect += 1

        print('sample {} success......'.format(i))
        dump_jsonl(gen, output_path, append=True)

    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


def evaluation_summarization_dataset(model, file, instruction, output_path):
    with open(file, 'r', encoding="utf-8") as f:
        data = []
        for line in f:
            data.append(json.loads(line))

    correct = 0
    incorrect = 0
    for i in range(len(data)):
        document = data[i]["document"]
        hallucinated_summary = data[i]["hallucinated_summary"]
        right_summary = data[i]["right_summary"]

        if random.random() > 0.5:
            summary = hallucinated_summary
            ground_truth = "Yes"
        else:
            summary = right_summary
            ground_truth = "No"

        ans = get_summarization_response(model, document, summary, instruction)
        ans = ans.replace(".", "")

        if ("Yes" in ans and "No" in ans) or ("Yes" not in ans and "No" not in ans):
            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": "failed!"}
            dump_jsonl(gen, output_path, append=True)
            incorrect += 1
            print('sample {} fails......'.format(i))
            continue
        elif "Yes" in ans:
            if ans != "Yes":
                ans = "Yes"
            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
        elif "No" in ans:
            if ans != "No":
                ans = "No"
            gen = {"document": document, "summary": summary, "ground_truth": ground_truth, "judgement": ans}
        else:
            gen = None
        assert (gen is not None)

        if ground_truth == ans:
            correct += 1
        else:
            incorrect += 1

        print('sample {} success......'.format(i))
        dump_jsonl(gen, output_path, append=True)

    print('{} correct samples, {} incorrect samples, Accuracy: {}'.format(correct, incorrect, correct / len(data)))


def dump_jsonl(data, output_path, append=False):
    """
    Write a JSON-serializable object to a JSON Lines file, one record per line.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        json_record = json.dumps(data, ensure_ascii=False)
        f.write(json_record + '\n')

#END: COPIED FROM <https://github.com/RUCAIBox/HaluEval.git>


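# Expects HaluEval-style files relative to the working directory:
# {task}/{task}_evaluation_instruction.txt and ../data/{task}_data.json.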
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Hallucination Evaluation")

    parser.add_argument("--task", default="qa", help="qa, dialogue, or summarization")
    parser.add_argument("--model", default="davinci", help="model name")
    args = parser.parse_args()

    instruction_file = "{}/{}_evaluation_instruction.txt".format(args.task, args.task)
    with open(instruction_file, 'r', encoding="utf-8") as f:
        instruction = f.read()

    model = args.model
    output_path = "{}/{}_{}_results.json".format(args.task, args.task, args.model)

    data = "../data/{}_data.json".format(args.task)

    if args.task == "qa":
        evaluation_qa_dataset(model, data, instruction, output_path)
    elif args.task == "dialogue":
        evaluation_dialogue_dataset(model, data, instruction, output_path)
    elif args.task == "summarization":
        evaluation_summarization_dataset(model, data, instruction, output_path)
    else:
        raise ValueError("The task must be qa, dialogue, or summarization!")
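# Example invocation (assumes the instruction and data files above are in place):
#   python evaluate.py --task qa --model gpt-3.5-turbo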
