test_rag.py
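"""Evaluation tests for the RAG pipeline in query_data.py.

Each test sends a question through query_rag() and then asks a local Ollama
model (phi3:medium) to judge, with a true/false answer, whether the actual
response matches the expected one. Running these tests assumes pytest is
installed and an Ollama server with the phi3:medium model is available.
"""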
from query_data import query_rag
from langchain_community.llms.ollama import Ollama

EVAL_PROMPT = """
Expected Response: {expected_response}
Actual Response: {actual_response}
---
(Answer with 'true' or 'false') Does the actual response match the expected response?
"""


### Positive test cases. We provide the correct answer and expect the model to agree.
def test_arctic_warming_rate_pos():
    assert query_and_validate(
        question="How fast is the Arctic warming? (Answer with the number only, plus 'times faster' or 'times slower')",
        expected_response="3 times faster",
    )


def test_alaska_precip_trend_pos():
    assert query_and_validate(
        question="In general, is precipitation increasing or decreasing across Alaska? (Answer with the word 'increasing' or 'decreasing')",
        expected_response="Increasing",
    )


def test_fbx_freezing_rain_trend_pos():
    assert query_and_validate(
        question="In Fairbanks, are there more freezing rain events now than in the past? (Answer with a simple 'yes' or 'no')",
        expected_response="Yes",
    )


### Negative test cases. We provide the incorrect answer and expect the model to disagree.
def test_arctic_warming_rate_neg():
    assert not query_and_validate(
        question="How fast is the Arctic warming? (Answer with the number only, plus 'times faster' or 'times slower')",
        expected_response="2 times faster",
    )


def test_alaska_precip_trend_neg():
    assert not query_and_validate(
        question="In general, is precipitation increasing or decreasing across Alaska? (Answer with the word 'increasing' or 'decreasing')",
        expected_response="Decreasing",
    )


def test_fbx_freezing_rain_trend_neg():
    assert not query_and_validate(
        question="In Fairbanks, are there more freezing rain events now than in the past? (Answer with a simple 'yes' or 'no')",
        expected_response="No",
    )


### The validation function: use the LLM to judge whether the actual response matches the expected one.
def query_and_validate(question: str, expected_response: str):
    # Query the RAG pipeline for an answer to the question.
    response_text = query_rag(question)
    prompt = EVAL_PROMPT.format(
        expected_response=expected_response, actual_response=response_text
    )

    # Ask a local Ollama model to judge the match, answering 'true' or 'false'.
    model = Ollama(model="phi3:medium")
    evaluation_results_str = model.invoke(prompt)
    evaluation_results_str_cleaned = evaluation_results_str.strip().lower()
    print(prompt)

    if "true" in evaluation_results_str_cleaned:
        # Print the response in green if it is correct.
        print("\033[92m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return True
    elif "false" in evaluation_results_str_cleaned:
        # Print the response in red if it is incorrect.
        print("\033[91m" + f"Response: {evaluation_results_str_cleaned}" + "\033[0m")
        return False
    else:
        raise ValueError(
            f"Invalid evaluation result. Cannot determine 'true' or 'false' in: {evaluation_results_str_cleaned!r}"
        )
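

### Optional convenience entry point (a minimal sketch; the usual way to run these
### tests is simply `pytest test_rag.py`). Assumes pytest is installed.
if __name__ == "__main__":
    import sys

    import pytest

    # Run only this file verbosely and forward pytest's exit code to the shell.
    sys.exit(pytest.main([__file__, "-v"]))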