
Commit

fix for inference and comet llm
created function for inference evaluation using GPT
vladadu committed May 25, 2024
1 parent 6239bde commit fcac22f
Showing 8 changed files with 63 additions and 15 deletions.
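At a high level, this commit wires a GPT-based evaluation step into content generation: generate_content now returns a dict holding both the generated text and its evaluation instead of a plain string. A minimal sketch of the resulting flow, using the names from course/module-5/main.py below (where "tool" is the instantiated inference class):

# Sketch of the flow after this commit; names taken from the diffs below.
content = tool.generate_content(query="How does RAG work with vector DBs?")

print(content["content"])     # text generated by the fine-tuned model served on Qwak
print(content["evaluation"])  # GPT-based assessment from model_evaluation.evaluation.evaluate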
1 change: 1 addition & 0 deletions course/module-3/rag/reranking.py
@@ -1,4 +1,5 @@
from langchain_openai import ChatOpenAI

from llm_components.chain import GeneralChain
from llm_components.prompt_templates import RerankingTemplate
from settings import settings
2 changes: 1 addition & 1 deletion course/module-5/finetuning_model/model.py
@@ -118,7 +118,7 @@ def tokenize(self, prompt: str) -> dict:
result = self.tokenizer(
prompt,
padding="max_length",
max_length=2300,
max_length=100,
truncation=True,
)
result["labels"] = result["input_ids"].copy()
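For context, the change above caps every tokenized training example at 100 tokens instead of 2300, so shorter prompts are padded and longer ones truncated to exactly that length. A minimal sketch of the effect, using an illustrative Hugging Face tokenizer (the course's actual base model may differ):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")   # illustrative choice
tokenizer.pad_token = tokenizer.eos_token           # needed so padding="max_length" has a pad id

result = tokenizer(
    "a short training prompt",
    padding="max_length",
    max_length=100,
    truncation=True,
)
print(len(result["input_ids"]))  # 100: padded up (or truncated down) to max_length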
19 changes: 12 additions & 7 deletions course/module-5/inference.py
@@ -2,8 +2,9 @@
from qwak_inference import RealTimeClient

from rag.retriever import VectorRetriever
from llm_components.prompt_monitor import PromptMonitor
from model_evaluation.prompt_monitor import PromptMonitor
from llm_components.prompt_templates import InferenceTemplateV1
from model_evaluation.evaluation import evaluate

from settings import settings

@@ -16,7 +17,7 @@ def __init__(self):
self.template = InferenceTemplateV1()
self.prompt_monitor = PromptMonitor()

def generate_content(self, query: str) -> str:
def generate_content(self, query: str) -> dict:
retriever = VectorRetriever(query=query)
hits = retriever.retrieve_top_k(k=settings.TOP_K, to_expand_to_n_queries=settings.EXPAND_N_QUERY)
context = retriever.rerank(hits=hits, keep_top_k=settings.KEEP_TOP_K)
@@ -31,16 +32,20 @@ def generate_content(self, query: str) -> str:
'instruction': prompt
}
]
)
self.qwak_client = RealTimeClient(model_id=settings.MODEL_ID,
environment='llm-twin')
).to_json()

response = self.qwak_client.predict(input_)
evaluation = evaluate(query=query,
context=context,
output=str(response))

self.prompt_monitor.log_prompt(
template=template,
prompt=prompt,
prompt_template_variables={'question': query, 'context': context},
output=response
)

return response
return {
'content': response,
'evaluation': evaluation
}
24 changes: 23 additions & 1 deletion course/module-5/llm_components/prompt_templates.py
@@ -79,8 +79,30 @@ class InferenceTemplateV1(BasePromptTemplate):
Step 3: Generate the content, keeping in mind that it needs to be as cohesive and concise as possible, related to the subject presented in the query, and similar to the user's writing style and knowledge presented in the context.
"""

def create_template(self, ) -> PromptTemplate:
def create_template(self) -> PromptTemplate:
return PromptTemplate(
template=self.prompt,
input_variables=['question', 'context']
)


class EvaluationTemplate(BasePromptTemplate):
prompt: str = """You are an AI assistant and your task is to evaluate the output generated by another LLM.
The other LLM generates writing content based on a user query and a given context.
The given context is composed of custom data produced by the user, consisting of posts, articles, or code fragments.
Here is a list of steps you need to follow in order to solve this task:
Step 1: You need to analyze the user query: {query}
Step 2: You need to analyze the given context: {context}
Step 3: You need to analyze the generated output: {output}
Step 4: Generate the evaluation.
When doing the evaluation step, take the following into consideration:
- The evaluation needs to include some form of metrics.
- The generated content needs to be evaluated based on its similarity to the writing style in the context.
- The generated content needs to be evaluated based on its coherence and conciseness relative to the given query and context.
- The generated content needs to be evaluated based on how well it represents the user's knowledge extracted from the context."""

def create_template(self) -> PromptTemplate:
return PromptTemplate(
template=self.prompt,
input_variables=['query', 'context', 'output']
)
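To see which variables the new template expects, it can be rendered directly; a small sketch with placeholder values, assuming the module layout introduced in this commit:

from llm_components.prompt_templates import EvaluationTemplate

template = EvaluationTemplate().create_template()
filled = template.format(
    query="Write a short post about vector databases.",
    context="<reranked chunks retrieved for the user>",
    output="<content produced by the fine-tuned model>",
)
print(filled)  # the fully rendered evaluation prompt that is sent to the OpenAI model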
3 changes: 2 additions & 1 deletion course/module-5/main.py
@@ -10,4 +10,5 @@
I'm particularly interested in how RAG works and how it is integrated with vector DBs and large language models (LLMs).
"""
content = tool.generate_content(query=query)
print(content)
for key, value in content.items():
print(f"{key}: {value}")
21 changes: 21 additions & 0 deletions course/module-5/model_evaluation/evaluation.py
@@ -0,0 +1,21 @@
from langchain_openai import ChatOpenAI

from settings import settings
from llm_components.chain import GeneralChain
from llm_components.prompt_templates import EvaluationTemplate


def evaluate(query: str, context: list[str], output: str) -> str:
evaluation_template = EvaluationTemplate()
prompt_template = evaluation_template.create_template()

model = ChatOpenAI(model=settings.OPENAI_MODEL_ID)
chain = GeneralChain.get_chain(
llm=model,
output_key='evaluation',
template=prompt_template
)

response = chain.invoke({'query': query, 'context': context, 'output': output})

return response['evaluation']
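The new function can also be exercised on its own, outside the inference pipeline; a minimal sketch, assuming OPENAI_MODEL_ID and an OpenAI API key are configured in settings (the inputs below are placeholders):

from model_evaluation.evaluation import evaluate

query = "Write a short post about vector databases."
context = ["<reranked chunk 1>", "<reranked chunk 2>"]       # normally comes from the retriever
output = "<text returned by the fine-tuned model on Qwak>"   # normally the Qwak prediction

report = evaluate(query=query, context=context, output=output)
print(report)  # free-form evaluation text produced by the OpenAI model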
course/module-5/{llm_components → model_evaluation}/prompt_monitor.py
@@ -1,13 +1,12 @@
import comet_llm
from langchain.prompts import PromptTemplate

from settings import settings


class PromptMonitor:

@classmethod
def log_prompt(cls, template: PromptTemplate,
def log_prompt(cls,
prompt: str,
prompt_template_variables: dict,
output: str):
@@ -17,7 +16,6 @@ def log_prompt(cls, template: PromptTemplate,
workspace=settings.COMET_WORKSPACE,
project=settings.COMET_PROJECT,
api_key=settings.COMET_API_KEY,
prompt_template=template,
prompt=prompt,
prompt_template_variables=prompt_template_variables,
output=output,
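After dropping the template argument, the logging method would look roughly like this (a sketch reconstructed from the hunk above; code not shown in the diff is omitted):

import comet_llm

from settings import settings


class PromptMonitor:

    @classmethod
    def log_prompt(cls,
                   prompt: str,
                   prompt_template_variables: dict,
                   output: str):
        # Send the prompt, its variables, and the model output to Comet LLM for monitoring.
        comet_llm.log_prompt(
            workspace=settings.COMET_WORKSPACE,
            project=settings.COMET_PROJECT,
            api_key=settings.COMET_API_KEY,
            prompt=prompt,
            prompt_template_variables=prompt_template_variables,
            output=output,
        )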
4 changes: 2 additions & 2 deletions course/module-5/settings.py
@@ -35,8 +35,8 @@ class AppSettings(BaseSettings):

# CometML config
COMET_API_KEY: str | None = None
COMET_WORKSPACE: str = 'vlad_adu'
COMET_PROJECT: str = 'scrabble'
COMET_WORKSPACE: str = 'vladadu'
COMET_PROJECT: str = 'llm-twin'

# LLM Model config
TOKENIZERS_PARALLELISM: str = "false"
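Since AppSettings is a pydantic BaseSettings class, these values normally come from the environment or an env file; for illustration only, they could also be overridden explicitly (placeholder API key):

from settings import AppSettings

settings = AppSettings(
    COMET_API_KEY="<your Comet API key>",
    COMET_WORKSPACE="vladadu",
    COMET_PROJECT="llm-twin",
)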
