feat: Wrap up Opik evaluation logic
iusztinpaul committed Oct 29, 2024
1 parent fa995ce commit 8f07875
Showing 4 changed files with 162 additions and 14 deletions.
46 changes: 40 additions & 6 deletions 5-inference/evaluate.py
@@ -1,7 +1,16 @@
import argparse

from config import settings
from evaluation import Style
from inference_pipeline import LLMTwin
from opik.evaluation import evaluate
from opik.evaluation.metrics import Hallucination
from opik.evaluation.metrics import (
ContextPrecision,
ContextRecall,
Hallucination,
LevenshteinRatio,
Moderation,
)

from core.logger_utils import get_logger
from core.opik_utils import create_dataset_from_artifacts
@@ -23,11 +32,25 @@ def evaluation_task(x: dict) -> dict:
"output": answer,
"context": context,
"expected_output": x["content"],
"reference": x["content"],
}


if __name__ == "__main__":
dataset_name = "LLMTwinArtifactTestDataset"
def main() -> None:
parser = argparse.ArgumentParser(description="Evaluate the LLM Twin on a test dataset built from artifacts.")
parser.add_argument(
"--dataset_name",
type=str,
default="LLMTwinMonitoringDataset",
help="Name of the dataset to evaluate",
)

args = parser.parse_args()

dataset_name = args.dataset_name

logger.info(f"Evaluating Opik dataset: '{dataset_name}'")

dataset = create_dataset_from_artifacts(
dataset_name="LLMTwinArtifactTestDataset",
artifact_names=[
@@ -43,10 +66,21 @@ def evaluation_task(x: dict) -> dict:
experiment_config = {
"model_id": settings.QWAK_DEPLOYMENT_MODEL_ID,
}

res = evaluate(
scoring_metrics = [
LevenshteinRatio(),
Hallucination(),
Moderation(),
ContextRecall(),
ContextPrecision(),
Style(),
]
evaluate(
dataset=dataset,
task=evaluation_task,
scoring_metrics=[Hallucination(model=settings.OPENAI_MODEL_ID)],
scoring_metrics=scoring_metrics,
experiment_config=experiment_config,
)


if __name__ == "__main__":
main()
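
For reference, a minimal usage sketch of the reworked script, run from the 5-inference directory (assuming the Opik/Comet credentials and the model IDs read by `config.settings` are already exported in the environment; the flag comes from the argparse definition above):

python evaluate.py --dataset_name LLMTwinArtifactTestDataset
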
35 changes: 28 additions & 7 deletions 5-inference/evaluate_monitoring.py
@@ -1,7 +1,10 @@
import argparse

import opik
from config import settings
from evaluation import Style
from opik.evaluation import evaluate
from opik.evaluation.metrics import Hallucination
from opik.evaluation.metrics import AnswerRelevance, Hallucination, Moderation

from core.logger_utils import get_logger

@@ -16,22 +19,40 @@ def evaluation_task(x: dict) -> dict:
}


if __name__ == "__main__":
def main() -> None:
parser = argparse.ArgumentParser(description="Evaluate monitoring script.")
parser.add_argument(
"--dataset_name",
type=str,
default="LLMTwinMonitoringDataset",
help="Name of the dataset to evaluate",
)

args = parser.parse_args()

dataset_name = args.dataset_name

logger.info(f"Evaluating Opik dataset: '{dataset_name}'")

client = opik.Opik()
dataset_name = "LLMTwinMonitoringDataset"
try:
dataset = client.get_dataset(dataset_name)
except Exception as e:
logger.error("Monitoring dataset not found in Opik. Exiting.")
except Exception:
logger.error(f"Monitoring dataset '{dataset_name}' not found in Opik. Exiting.")
exit(1)

experiment_config = {
"model_id": settings.QWAK_DEPLOYMENT_MODEL_ID,
}

res = evaluate(
scoring_metrics = [Hallucination(), Moderation(), AnswerRelevance(), Style()]
evaluate(
dataset=dataset,
task=evaluation_task,
scoring_metrics=[Hallucination(model=settings.OPENAI_MODEL_ID)],
scoring_metrics=scoring_metrics,
experiment_config=experiment_config,
)


if __name__ == "__main__":
main()
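
The monitoring variant works the same way, except that it looks up an existing Opik dataset instead of building one from artifacts. A usage sketch under the same assumptions (the dataset must already exist in the configured Opik workspace, otherwise the script logs an error and exits):

python evaluate_monitoring.py --dataset_name LLMTwinMonitoringDataset
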
3 changes: 2 additions & 1 deletion 5-inference/evaluation/__init__.py
@@ -1,4 +1,5 @@
from .llm import evaluate as evaluate_llm
from .rag import evaluate as evaluate_rag
from .style import Style

__all__ = ["evaluate_llm", "evaluate_rag"]
__all__ = ["evaluate_llm", "evaluate_rag", "Style"]
92 changes: 92 additions & 0 deletions 5-inference/evaluation/style.py
@@ -0,0 +1,92 @@
import json
from typing import Any

from config import settings
from opik.evaluation.metrics import base_metric, exceptions, score_result
from opik.evaluation.models import litellm_chat_model
from pydantic import BaseModel


class LLMJudgeStyleOutputResult(BaseModel):
score: int
reason: str


class Style(base_metric.BaseMetric):
"""
A metric that evaluates whether an LLM's output tone and writing style are appropriate for a blog post or social media content.
It uses another LLM as a judge to rate how well the tone and vocabulary of the output fit that register.
It returns a score of 1.0 if the style is appropriate, 0.5 if it is somewhere in the middle and 0.0 otherwise.
"""

def __init__(
self, name: str = "style_metric", model_name: str = settings.OPENAI_MODEL_ID
) -> None:
self.name = name
self.llm_client = litellm_chat_model.LiteLLMChatModel(model_name=model_name)
self.prompt_template = """
You are an impartial expert judge. Evaluate the quality of a given answer to an instruction based on its style.
Style: Is the tone and writing style appropriate for a blog post or social media content? It should use simple but technical words and avoid formal or academic language.
Style scale:
1 (Poor): Too formal, uses some overly complex words
2 (Good): Good balance of technical content and accessibility, but still uses formal words and expressions
3 (Excellent): Perfectly accessible language for blog/social media, uses simple but precise technical terms when necessary
Example of bad style: The Llama2 7B model constitutes a noteworthy progression in the field of artificial intelligence, serving as the successor to its predecessor, the original Llama architecture.
Example of excellent style: Llama2 7B outperforms the original Llama model across multiple benchmarks.
Instruction: {input}
Answer: {output}
Provide your evaluation in JSON format with the following structure:
{{
"score": 0,
"reason": "..."
}}
"""

def score(self, input: str, output: str, **ignored_kwargs: Any):
"""
Score the output of an LLM.
Args:
input: The instruction that was given to the LLM.
output: The output of the LLM to score.
**ignored_kwargs: Any additional keyword arguments. This is important so that the metric can be used in the `evaluate` function.
"""

prompt = self.prompt_template.format(input=input, output=output)

model_output = self.llm_client.generate_string(
input=prompt, response_format=LLMJudgeStyleOutputResult
)

return self._parse_model_output(model_output)

def _parse_model_output(self, content: str) -> score_result.ScoreResult:
try:
dict_content = json.loads(content)
except Exception:
raise exceptions.MetricComputationError("Failed to parse the model output.")

score = dict_content["score"]
try:
assert 1 <= score <= 3, f"Invalid score value: {score}"
except AssertionError as e:
raise exceptions.MetricComputationError(str(e))

score = (score - 1) / 2.0 # Normalize the score to be between 0 and 1

return score_result.ScoreResult(
name=self.name,
value=score,
reason=dict_content["reason"],
)
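
As a quick sanity check, the new Style metric can also be scored on its own, outside of `evaluate` — a minimal sketch, assuming `OPENAI_MODEL_ID` points at a chat model reachable through LiteLLM and the matching API key is exported:

from evaluation import Style

style_metric = Style()
result = style_metric.score(
    input="Explain what Llama2 7B is good at.",
    output="Llama2 7B is a small open-weights model that holds its own on most text benchmarks.",
)
# ScoreResult with the normalized value (0.0, 0.5 or 1.0) and the judge's reasoning
print(result.value, result.reason)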
