Eval & OOM issue #2

Merged
merged 8 commits into from
Dec 2, 2024
Merged
50 changes: 40 additions & 10 deletions ai-search-demo/README.md
@@ -5,13 +5,10 @@
This is a small demo showing how to build AI search on top of visually rich data (PDFs, images, etc.).


## Evaluation

Before developing this we want to understand how the system performs in general, for this we are going to generate synthetic data based on SmartHR data and evaluate. This is not a real estimate, but a starting point to automate some evaluation. In real life - data from actual use should be used for this.

## Architecture

Hitht leve diagram of the systen
High-level diagram of the system


```mermaid
@@ -38,22 +35,55 @@ graph TD;
H --> Q
```


## Evaluation

Before developing this further, we want to understand how the system performs in general. To do so, we generate synthetic data based on SmartHR data and evaluate retrieval against it. This is not a true estimate of quality, but a starting point for automating evaluation; in real life, data from actual usage should be used instead.
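
Each synthetic sample pairs one page image with a generated query in English and Japanese. A record in the resulting dataset looks roughly like the sketch below; the field names come from `ai_search_demo/evaluate_synthetic_data.py` added in this PR, while the example values are made up.

```
{
    "index": 0,
    "image": <PIL image of the PDF page>,
    "question_en": "How do I register a new employee in SmartHR?",
    "question_jp": "SmartHRで新しい従業員を登録するにはどうすればよいですか?",
    "pdf_name": "employee-registration.pdf",
    "pdf_page": 3
}
```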

### Results:

| Dataset | Language | NDCG@1 | NDCG@5 | Recall@1 | Recall@5 | Precision@1 | Precision@5 |
|---------|----------|--------|--------|----------|----------|-------------|-------------|
| [synthetic-data-single-image-single-query](https://huggingface.co/datasets/koml/smart-hr-synthetic-data-single-image-single-query) | English | 0.5190 | 0.7021 | 0.5190 | 0.8354 | 0.5190 | 0.1671 |
| [synthetic-data-single-image-single-query](https://huggingface.co/datasets/koml/smart-hr-synthetic-data-single-image-single-query) | Japanese | 0.7215 | 0.8342 | 0.7215 | 0.9241 | 0.7215 | 0.1848 |
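
Because each query has exactly one relevant page (the page it was generated from), NDCG@1, Recall@1 and Precision@1 coincide, and Precision@5 is simply Recall@5 divided by 5 (e.g. 0.8354 / 5 ≈ 0.1671 for English).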


### Process:

The evaluation process has two stages: generate synthetic data from existing SmartHR PDFs, then evaluate visual retrieval against it. To run a small test:

```
python ai_search_demo/evaluate_synthetic_data.py create-synthetic-dataset ./example_data/smart-hr ./example_data/smart-hr-dataset-test koml/smart-hr-synthetic-data-test
python ai_search_demo/evaluate_synthetic_data.py evaluate-on-synthetic-dataset koml/smart-hr-synthetic-data-test --collection-name small-eval
```

To run the large evaluation:

```
python ai_search_demo/evaluate_synthetic_data.py create-synthetic-dataset ./example_data/smart-hr ./example_data/smart-hr-synthetic-data-single-image-single-query koml/smart-hr-synthetic-data-single-image-single-query --num-samples 79
python ai_search_demo/evaluate_synthetic_data.py evaluate-on-synthetic-dataset koml/smart-hr-synthetic-data-single-image-single-query --collection-name smart-hr-synthetic-data-single-image-single-query


python ai_search_demo/evaluate_synthetic_data.py create-synthetic-dataset ./example_data/smart-hr ./example_data/smart-hr-synthetic-data-single-image-multiple-queries koml/smart-hr-synthetic-data-single-image-multiple-queries --num-samples 1000
```
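
Note that only the single-image-single-query dataset is scored in the results table above; the 1000-sample single-image-multiple-queries dataset is generated here but has no evaluation command listed yet.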


## LLM inference

Download models

```
modal run llm_serving_load_models.py --model-name Qwen/Qwen2.5-7B-Instruct --model-revision bb46c15ee4bb56c5b63245ef50fd7637234d6f75
modal run llm_serving_load_models.py --model-name Qwen/Qwen2-VL-7B-Instruct --model-revision 51c47430f97dd7c74aa1fa6825e68a813478097f
modal run llm_serving_load_models.py --model-name Qwen/Qwen2-VL-72B-Instruct --model-revision bb46c15ee4bb56c5b63245ef50fd7637234d6f75
modal run llm_serving_load_models.py --model-name vidore/colqwen2-v1.0-merged --model-revision 364a4f5df97231e233e15cbbaf0b9dbe352ba92c
modal run llm-inference/llm_serving_load_models.py --model-name Qwen/Qwen2.5-7B-Instruct --model-revision bb46c15ee4bb56c5b63245ef50fd7637234d6f75
modal run llm-inference/llm_serving_load_models.py --model-name Qwen/Qwen2-VL-7B-Instruct --model-revision 51c47430f97dd7c74aa1fa6825e68a813478097f
modal run llm-inference/llm_serving_load_models.py --model-name Qwen/Qwen2-VL-72B-Instruct --model-revision bb46c15ee4bb56c5b63245ef50fd7637234d6f75
modal run llm-inference/llm_serving_load_models.py --model-name vidore/colqwen2-v1.0-merged --model-revision 364a4f5df97231e233e15cbbaf0b9dbe352ba92c
```
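
The `--model-revision` flag pins an exact Hugging Face commit for each model, so repeated downloads and later deployments should use the same weights.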

Deploy models

```
modal deploy llm_serving.py
modal deploy llm_serving_colpali.py
modal deploy llm-inference/llm_serving.py
modal deploy llm-inference/llm_serving_colpali.py
```
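
Both scripts are deployed as Modal apps. The ColPali server (`llm_serving_colpali.py`, diffed at the bottom of this PR) exposes a FastAPI ASGI app and calls `volume.reload()` before loading the model, so it picks up the most recently downloaded weights.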

## DB
160 changes: 160 additions & 0 deletions ai-search-demo/ai_search_demo/evaluate_synthetic_data.py
@@ -0,0 +1,160 @@
import base64
import os
import random
from io import BytesIO
from typing import Dict, List

import PIL
import typer
from colpali_engine.trainer.eval_utils import CustomRetrievalEvaluator
from datasets import Dataset, load_dataset
from openai import OpenAI
from pydantic import BaseModel
from rich import print
from rich.table import Table
from tqdm import tqdm

from ai_search_demo.qdrant_inexing import SearchClient, pdfs_to_hf_dataset

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

class DataSample(BaseModel):
    japanese_query: str
    english_query: str

def generate_synthetic_question(image: PIL.Image.Image) -> DataSample:
    # Convert PIL image to base64 string
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")

    prompt = """
    I am developing a visual retrieval dataset to evaluate my system.
    Based on the image I provided, I want you to generate a query that this image will satisfy.
    For example, if a user types this query into the search box, this image would be extremely relevant.
    Generate the query in Japanese and English.
    """
    # Generate synthetic question using OpenAI
    chat_completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        response_format=DataSample,
        temperature=1,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
                    },
                ],
            }
        ],
    )

    sample = chat_completion.choices[0].message.parsed
    return sample

def create_synthetic_dataset(input_folder: str, output_folder: str, hub_repo: str, num_samples: int = 10) -> None:
    # Step 1: Read all PDFs and extract info
    dataset = pdfs_to_hf_dataset(input_folder)

    # Step 2: Randomly sample data points
    if num_samples > len(dataset):
        indices = random.choices(range(len(dataset)), k=num_samples)
        sampled_data = dataset.select(indices)
    else:
        sampled_data = dataset.shuffle().select(range(num_samples))

    synthetic_data: List[Dict] = []

    for index, data_point in enumerate(tqdm(sampled_data, desc="Generating synthetic questions")):
        image = data_point['image']
        pdf_name = data_point['pdf_name']
        pdf_page = data_point['pdf_page']

        # Step 3: Generate synthetic question
        sample = generate_synthetic_question(image)

        # Step 4: Store samples in a new dataset
        synthetic_data.append({
            "index": index,
            "image": image,
            "question_en": sample.english_query,
            "question_jp": sample.japanese_query,
            "pdf_name": pdf_name,
            "pdf_page": pdf_page
        })

    # Create a new dataset from synthetic data
    synthetic_dataset = Dataset.from_list(synthetic_data)
    synthetic_dataset.save_to_disk(output_folder)

    # Save the dataset card
    synthetic_dataset.push_to_hub(hub_repo, private=False)

def evaluate_on_synthetic_dataset(hub_repo: str, collection_name: str = "synthetic-dataset-evaluate-full") -> None:
    # Ingest collection with IngestClient
    print("Load data")
    synthetic_dataset = load_dataset(hub_repo)['train']

    print("Ingest data to qdrant")
    # ingest_client = IngestClient()
    # ingest_client.ingest(collection_name, synthetic_dataset)

    run_evaluation(synthetic_dataset=synthetic_dataset, collection_name=collection_name, query_text_key='question_en')
    run_evaluation(synthetic_dataset=synthetic_dataset, collection_name=collection_name, query_text_key='question_jp')

def run_evaluation(synthetic_dataset: Dataset, collection_name: str, query_text_key: str) -> None:
    search_client = SearchClient()
    relevant_docs: Dict[str, Dict[str, int]] = {}
    results: Dict[str, Dict[str, float]] = {}

    for x in synthetic_dataset:
        query_id = f"{x['pdf_name']}_{x['pdf_page']}"
        relevant_docs[query_id] = {query_id: 1}  # The most relevant document is itself

        response = search_client.search_images_by_text(query_text=x[query_text_key], collection_name=collection_name, top_k=10)

        results[query_id] = {}
        for point in response.points:
            doc_id = f"{point.payload['pdf_name']}_{point.payload['pdf_page']}"
            results[query_id][doc_id] = point.score

    mteb_evaluator = CustomRetrievalEvaluator()

    ndcg, _map, recall, precision, naucs = mteb_evaluator.evaluate(
        relevant_docs,
        results,
        mteb_evaluator.k_values,
    )

    mrr = mteb_evaluator.evaluate_custom(relevant_docs, results, mteb_evaluator.k_values, "mrr")

    scores = {
        **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
        **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
        **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()},
        **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()},
        **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr[0].items()},
        **{f"naucs_at_{k.split('@')[1]}": v for (k, v) in naucs.items()},
    }

    # Use rich to print scores beautifully
    table = Table(title=f"Evaluation Scores for {query_text_key}")
    table.add_column("Metric", justify="right", style="cyan", no_wrap=True)
    table.add_column("Score", style="magenta")

    for metric, score in scores.items():
        table.add_row(metric, f"{score:.4f}")

    print(table)


if __name__ == '__main__':
    app = typer.Typer()
    app.command()(create_synthetic_dataset)
    app.command()(evaluate_on_synthetic_dataset)
    app()
29 changes: 22 additions & 7 deletions ai-search-demo/ai_search_demo/qdrant_inexing.py
@@ -1,12 +1,13 @@
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm
from pdf2image import convert_from_path
from pypdf import PdfReader
import io
import requests
import tracemalloc
from pathlib import Path

import requests
from datasets import Dataset
from pdf2image import convert_from_path
from pypdf import PdfReader
from qdrant_client import QdrantClient
from qdrant_client.http import models
from tqdm import tqdm

# Constants
@@ -142,6 +143,8 @@ def search_images_by_text(self, query_text, collection_name: str, top_k=TOP_K):

        return search_result



def get_pdf_images(pdf_path):
    reader = PdfReader(pdf_path)
    page_texts = []
@@ -150,11 +153,12 @@ def get_pdf_images(pdf_path):
        text = page.extract_text()
        page_texts.append(text)
    # Convert to PIL images
    images = convert_from_path(pdf_path)
    images = convert_from_path(pdf_path, dpi=150, fmt="jpeg", jpegopt={"quality": 100, "progressive": True, "optimize": True})
    assert len(images) == len(page_texts)
    return images, page_texts

def pdfs_to_hf_dataset(path_to_folder):
    tracemalloc.start()  # Start tracing memory allocations

    data = []
    global_index = 0
@@ -173,6 +177,17 @@ def pdfs_to_hf_dataset(path_to_folder):
"page_text": text
})
global_index += 1
# Print memory usage after processing each image
current, peak = tracemalloc.get_traced_memory()

# Print memory usage after processing each PDF
current, peak = tracemalloc.get_traced_memory()
print(f"PDF: Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")

current, peak = tracemalloc.get_traced_memory()
print(f"TOTAL: Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
tracemalloc.stop() # Stop tracing memory allocations

print("Done processing")
dataset = Dataset.from_list(data)
print("Done converting to dataset")
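
The `dpi=150` change above is the main OOM mitigation in this file: pdf2image renders at 200 DPI by default, and every rendered page stays in memory as a PIL image until the dataset is built. A rough back-of-the-envelope sketch (assuming an A4 page and an uncompressed RGB bitmap; real PIL and Python overhead is somewhat higher) shows why lowering the DPI helps:

```
# Rough estimate of the in-memory size of one rendered A4 page as an RGB bitmap.
# Assumptions: A4 is 8.27 x 11.69 inches, 3 bytes per pixel, no PIL/Python overhead.
def page_bitmap_mb(dpi: int) -> float:
    width_px = int(8.27 * dpi)
    height_px = int(11.69 * dpi)
    return width_px * height_px * 3 / 10**6

for dpi in (200, 150):
    print(f"{dpi} DPI: ~{page_bitmap_mb(dpi):.0f} MB per page")
# 200 DPI: ~12 MB per page
# 150 DPI: ~7 MB per page
```

The tracemalloc prints added in the same hunk are diagnostics for confirming the effect rather than a fix in themselves.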
15 changes: 8 additions & 7 deletions ai-search-demo/ai_search_demo/ui.py
@@ -1,12 +1,12 @@
import streamlit as st
import os
import json
import os

import pandas as pd
import threading
from ai_search_demo.qdrant_inexing import IngestClient, pdfs_to_hf_dataset
from ai_search_demo.qdrant_inexing import SearchClient
import streamlit as st
from datasets import load_from_disk

from ai_search_demo.qdrant_inexing import IngestClient, SearchClient, pdfs_to_hf_dataset

STORAGE_DIR = "storage"
COLLECTION_INFO_FILENAME = "collection_info.json"
HF_DATASET_DIRNAME = "hf_dataset"
@@ -101,8 +101,9 @@ def process_and_ingest():
        with open(os.path.join(collection_dir, COLLECTION_INFO_FILENAME), "w") as json_file:
            json.dump(collection_info, json_file)

    # Run the processing and ingestion in a separate thread
    threading.Thread(target=process_and_ingest).start()
    # Run the processing and ingestion in the current function with a spinner
    with st.spinner('Processing and ingesting PDFs...'):
        process_and_ingest()

def display_all_collections():
st.header("Previously Uploaded Collections")
@@ -35,11 +35,11 @@
@modal.asgi_app()
def serve():
    import fastapi
    import torch
    from colpali_engine.models import ColQwen2, ColQwen2Processor
    from fastapi import APIRouter, Depends, HTTPException, Security
    from fastapi.middleware.cors import CORSMiddleware
    from fastapi.security import HTTPBearer
    from fastapi import HTTPException, Security, APIRouter, Depends
    import torch

    volume.reload()  # ensure we have the latest version of the weights
