Limit scope to restaurants recommendations and 400 businesses
dvquy13 committed Sep 8, 2024
1 parent ee5881f commit 7338241
Showing 11 changed files with 5,739 additions and 5,073 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -11,14 +11,14 @@
- Create a new Python 3.11.9 environment: `conda create --prefix ./.venv python=3.11.9`
- Make sure Poetry use the new Python 3.11.9 environment: `poetry env use ./.venv`
- Install Python dependencies with Poetry: `poetry install`
- Start the Jupyterlab notebook: `poetry jupyter lab`
- Start the Jupyterlab notebook: `poetry run jupyter lab`

# Build the RAG pipeline
- Sequentially run the notebooks 001 to 003 to prepare the data: [notebooks/001-sample-yelp-dataset](notebooks/001-sample-yelp-dataset.ipynb), [notebooks/002-analyze-sample-data.ipynb](notebooks/002-analyze-sample-data.ipynb), [notebooks/003-collate-metadata-into-review.ipynb](notebooks/003-collate-metadata-into-review.ipynb)
- Sequentially run the notebooks 001 and 003 to prepare the data: [notebooks/001-sample-yelp-dataset](notebooks/001-sample-yelp-dataset.ipynb), [notebooks/003-collate-metadata-into-review.ipynb](notebooks/003-collate-metadata-into-review.ipynb)
- Run the notebook 005 to fine-tune the embedding model [notebooks/005-finetune-embeddings.ipynb](notebooks/005-finetune-embeddings.ipynb). On a machine with 4 GB of vRAM GPU, the fine-tuning process would take about 2 hours.
- Then explore main RAG-building notebook it [notebooks/009-pipeline-v1.ipynb](notebooks/009-pipeline-v1.ipynb)

# Start Chatbot UI
- Navigate to `ui` folder: `cd ui`
- Run: `poetry chainlit run chat_v2.py -hw`
- Run: `poetry run chainlit run chat_v2.py -hw`
- Access the Chatbot UI at http://localhost:8000
1,564 changes: 794 additions & 770 deletions notebooks/003-collate-metadata-into-review.ipynb

Large diffs are not rendered by default.

9,161 changes: 4,898 additions & 4,263 deletions notebooks/009-pipeline-v1.ipynb

Large diffs are not rendered by default.

19 changes: 10 additions & 9 deletions src/features/citation/custom_citation_query_engine.py
@@ -1,8 +1,8 @@
from llama_index.core.prompts.base import PromptTemplate

CUSTOM_CITATION_QA_TEMPLATE = PromptTemplate(
"Based on only the provided information, recommend multiple places to visit that match the user's preferences. "
"Include information about the places that would help the user make decisions, e.g. location and categories"
"Based on only the provided information, recommend multiple restaurants that match the user's preferences. "
"Include information about the restaurants that would help the user make decisions, e.g. location and categories"
"You should rank the recommendations based on how relevant they are to the user's query"
"Provide a summary explanation of the strengths of each option and compare them with each other based on different intentions.\n"
"When referencing information from a source review, "
@@ -17,8 +17,8 @@
"Cake Mix is my favorite. Great place to top off a date.\n"
"Source 2:\n"
"I'm looking forward to coming back next week. I can't believe on Thursday tall PBR drafts are 2.50.\n"
"Query: What are some places to enjoy cake?\n"
"Answer: Based on your query about places to enjoy cake, here are several recommendations ranked by relevance:"
"Query: What are some restaurants to enjoy cake?\n"
"Answer: Based on your query about restaurants to enjoy cake, here are several recommendations ranked by relevance:"
"### 1. Miha Kitchen:\n- Address: <placeholder>\n- Categories: <placeholder>\n- Summary: Miha Kitchen is highly praised for its delicious offerings, including cheese cake. "
" The bakery has a cute space and a variety of grab-and-go options, especially for the Cake Mix [1]. "
"The positive reviews highlight the quality of their food and drinks, particularly the chese cake, which is noted as good."
@@ -36,8 +36,8 @@
)

CUSTOM_CITATION_REFINE_TEMPLATE = PromptTemplate(
"Based on only the provided information, recommend multiple places to visit that match the user's preferences. "
"Include information about the places that would help the user make decisions, e.g. location and categories"
"Based on only the provided information, recommend multiple restaurants that match the user's preferences. "
"Include information about the restaurants that would help the user make decisions, e.g. location and categories"
"You should rank the recommendations based on how relevant they are to the user's query"
"Provide a summary explanation of the strengths of each option and compare them with each other based on different intentions.\n"
"When referencing information from a source review, "
@@ -52,8 +52,8 @@
"Cake Mix is my favorite. Great place to top off a date.\n"
"Source 2:\n"
"I'm looking forward to coming back next week. I can't believe on Thursday tall PBR drafts are 2.50.\n"
"Query: What are some places to enjoy cake?\n"
"Answer: Based on your query about places to enjoy cake, here are several recommendations ranked by relevance:"
"Query: What are some restaurants to enjoy cake?\n"
"Answer: Based on your query about restaurants to enjoy cake, here are several recommendations ranked by relevance:"
"### 1. Miha Kitchen:\n- Address: <placeholder>\n- Categories: <placeholder>\n- Summary: Miha Kitchen is highly praised for its delicious offerings, including cheese cake. "
" The bakery has a cute space and a variety of grab-and-go options, especially for the Cake Mix [1]. "
"The positive reviews highlight the quality of their food and drinks, particularly the chese cake, which is noted as good."
@@ -66,7 +66,8 @@
"We have provided an existing answer: {existing_answer}"
"Below are several numbered sources of information. "
"Use them to refine the existing answer. "
"Please double check and make sure that the correct sources are quoted at the end. "
"Important: You must double check and make sure that the correct sources are quoted at the end"
" and the cited source numbers are correctly included in the main answer! "
"If the provided sources are not helpful, you will repeat the existing answer."
"\nBegin refining!"
"\n------\n"
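For orientation, here is a minimal sketch (not part of this commit) of how citation templates like the ones above are typically wired into llama_index's `CitationQueryEngine`; chat_v2.py further down passes `CUSTOM_CITATION_REFINE_TEMPLATE` the same way. The toy one-document index and the configured LLM/embedding backend are assumptions for illustration.

```python
# Hedged sketch, not from this repo: plugging the custom citation templates
# into llama_index's CitationQueryEngine. The one-document index stands in for
# the real Yelp-review index; an LLM/embedding backend (e.g. an OpenAI key)
# is assumed to be configured via llama_index Settings.
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.query_engine import CitationQueryEngine

from src.features.citation.custom_citation_query_engine import (
    CUSTOM_CITATION_QA_TEMPLATE,
    CUSTOM_CITATION_REFINE_TEMPLATE,
)

index = VectorStoreIndex.from_documents(
    [Document(text="Cake Mix is my favorite. Great place to top off a date.")]
)
query_engine = CitationQueryEngine.from_args(
    index,
    citation_qa_template=CUSTOM_CITATION_QA_TEMPLATE,
    citation_refine_template=CUSTOM_CITATION_REFINE_TEMPLATE,
)
print(query_engine.query("What are some restaurants to enjoy cake?"))
```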
@@ -4,11 +4,11 @@
from llama_index.core.prompts.utils import is_chat_model

CUSTOM_TREE_SUMMARIZE_TMPL = (
"Context information about various places to visit is provided below.\n"
"Context information about various restaurants is provided below.\n"
"---------------------\n"
"{context_str}\n"
"---------------------\n"
"Based on only the provided information, recommend multiple places to visit that match the user's preferences. "
"Based on only the provided information, recommend multiple restaurants that match the user's preferences. "
"You should rank the recommendations based on how relevant they are to the user's query"
"Provide a summary explanation of the strengths of each option and compare them with each other based on different intentions.\n"
"User Query: {query_str}\n"
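The summarize template follows the same restaurant-only scoping. A rough sketch (assumptions flagged in the comments) of how such a template can drive llama_index's `TreeSummarize` synthesizer:

```python
# Hedged sketch, not from this repo: using a restaurant-scoped summarize
# template with llama_index's TreeSummarize. The template below is abridged
# from the diff above; the real module also imports is_chat_model, suggesting
# it selects between text and chat prompt variants. An LLM backend is assumed
# to be configured via llama_index Settings.
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.response_synthesizers import TreeSummarize

TREE_SUMMARIZE_TMPL = (
    "Context information about various restaurants is provided below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Based on only the provided information, recommend multiple restaurants "
    "that match the user's preferences, ranked by relevance to the query.\n"
    "User Query: {query_str}\n"
    "Answer: "
)

summarizer = TreeSummarize(summary_template=PromptTemplate(TREE_SUMMARIZE_TMPL))
print(
    summarizer.get_response(
        "Where can I get good cold brew?",
        ["Their cold brew is smooth and strong.", "Great pastries, average coffee."],
    )
)
```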
18 changes: 10 additions & 8 deletions src/run/cfg.py
@@ -9,16 +9,18 @@
from src.run.utils import pprint_pydantic_model, substitute_punctuation

# Frequently changed
response_curated_eval_dataset_fp = "data/031_rerun/response_curated_eval_dataset.json"
response_curated_eval_dataset_fp = (
"data/034_rerun_400_restaurants/response_curated_eval_dataset.json"
)
response_synthetic_eval_dataset_fp = (
"data/031_rerun/response_synthetic_eval_dataset.json"
"data/034_rerun_400_restaurants/response_synthetic_eval_dataset.json"
)
retrieval_synthetic_eval_dataset_fp = (
"data/031_rerun/retrieval_synthetic_eval_dataset.json"
"data/034_rerun_400_restaurants/retrieval_synthetic_eval_dataset.json"
)
storage_context_persist_dp = "data/031_rerun/storage_context"
db_collection = "review_rec_bot__031_rerun"
db_collection_fp = "data/031_rerun/chroma_db"
storage_context_persist_dp = "data/034_rerun_400_restaurants/storage_context"
db_collection = "review_rec_bot__034_rerun_400_restaurants"
db_collection_fp = "data/034_rerun_400_restaurants/chroma_db"


class LLMConfig(BaseModel):
@@ -47,7 +49,7 @@ class RetrievalConfig(BaseModel):
retrieval_similarity_cutoff: int = (
None # If using RRF, this applies after the RRF so the score ties closely to the RRF formula. Not as helpful to use in this case...
)
rerank_top_k: int = 10
rerank_top_k: int = 20
# rerank_model_name: str = "BAAI/bge-reranker-large"
rerank_model_name: str = "BAAI/bge-reranker-v2-m3"

@@ -132,7 +134,7 @@ class RunConfig(BaseModel):
db_collection_fp: str = db_collection_fp
notebook_cache_dp: str = None

data_fp: str = "../data/yelp_dataset/sample/sample_100_biz/denom_review.parquet"
data_fp: str = "../data/yelp_dataset/sample/sample_400_biz/denom_review.parquet"

llm_cfg: LLMConfig = LLMConfig()

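Every run-scoped path above repeats the `034_rerun_400_restaurants` prefix, so bumping the run name touches six assignments. A small helper (hypothetical, not part of this commit) could derive them from one run name; the keys mirror the module-level variables in the cfg.py excerpt:

```python
# Hypothetical helper, not in this commit: derive the run-scoped paths and the
# Chroma collection name from a single run name so future reruns only change
# one string. Key names mirror the module-level variables in src/run/cfg.py.
def run_paths(run_name: str) -> dict[str, str]:
    base = f"data/{run_name}"
    return {
        "response_curated_eval_dataset_fp": f"{base}/response_curated_eval_dataset.json",
        "response_synthetic_eval_dataset_fp": f"{base}/response_synthetic_eval_dataset.json",
        "retrieval_synthetic_eval_dataset_fp": f"{base}/retrieval_synthetic_eval_dataset.json",
        "storage_context_persist_dp": f"{base}/storage_context",
        "db_collection": f"review_rec_bot__{run_name}",
        "db_collection_fp": f"{base}/chroma_db",
    }


paths = run_paths("034_rerun_400_restaurants")
assert paths["db_collection"] == "review_rec_bot__034_rerun_400_restaurants"
assert paths["storage_context_persist_dp"] == "data/034_rerun_400_restaurants/storage_context"
```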
4 changes: 2 additions & 2 deletions src/svc/availability/availability_check.py
@@ -105,8 +105,8 @@ def hash_to_slots(self, business_id: str, date_time: str) -> int:
seed_str = f"{business_id}_{date_time_obj.strftime('%Y-%m-%d_%H')}"
seed = int(hashlib.sha256(seed_str.encode()).hexdigest(), 16) % (10**8)
random.seed(seed)
# 50% chance to return 0
if seed % 2 == 0:
# 20% chance to return 0
if seed % 5 == 0:
return 0
return random.randint(1, 10)

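The change from `seed % 2` to `seed % 5` is what moves the zero-availability rate from roughly 50% to roughly 20%, since the SHA-256-derived seed is approximately uniform. A standalone sketch of the same logic (the real code is an instance method and may parse `date_time` differently):

```python
# Standalone re-statement of the slot-availability logic from the diff above,
# lifted out of its class so the ~20% zero-rate can be checked directly.
# Assumption: date_time arrives as an ISO-8601 string.
import hashlib
import random
from datetime import datetime


def hash_to_slots(business_id: str, date_time: str) -> int:
    date_time_obj = datetime.fromisoformat(date_time)
    seed_str = f"{business_id}_{date_time_obj.strftime('%Y-%m-%d_%H')}"
    seed = int(hashlib.sha256(seed_str.encode()).hexdigest(), 16) % (10**8)
    random.seed(seed)
    # About 1 in 5 seeds is divisible by 5, so ~20% of lookups report no slots.
    if seed % 5 == 0:
        return 0
    return random.randint(1, 10)


zero_rate = sum(
    hash_to_slots(f"biz_{i}", "2024-09-08T19:00:00") == 0 for i in range(1_000)
) / 1_000
print(f"observed zero-availability rate: {zero_rate:.2f}")  # roughly 0.20
```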
2 changes: 1 addition & 1 deletion ui/.chainlit/config.toml
@@ -56,7 +56,7 @@ edit_message = true
name = "Review Rec Bot"

# Description of the assistant. This is used for HTML tags.
# description = "Help users find places to visit based on Yelp reviews"
# description = "Help users find restaurants and cafes based on Yelp reviews"

# Large size content are by default collapsed for a cleaner ui
default_collapse_content = true
2 changes: 1 addition & 1 deletion ui/callback_handler.py
@@ -64,7 +64,7 @@ def on_event_start(
elif event_type == CBEventType.FUNCTION_CALL:
step_type = "tool"
# on_event_start: event_type=<CBEventType.FUNCTION_CALL: 'function_call'>, payload={<EventPayload.FUNCTION_CALL: 'function_call'>: '{"input":"best places to enjoy cold brew coffee"}', <EventPayload.TOOL: 'tool'>: ToolMetadata(de
# scription='useful for when you want to find places to visit based on end-user reviews. Takes input in a question format, e.g.: What are the best Vietnamese restaurants in Texas?', name='reco_review', fn_schema=<class 'llama_in
# scription='useful for when you want to find restaurants based on end-user reviews. Takes input in a question format, e.g.: What are the best Vietnamese restaurants in Texas?', name='reco_review', fn_schema=<class 'llama_in
# dex.core.tools.types.DefaultToolFnSchema'>, return_direct=False)}
tool = payload.get(EventPayload.TOOL)
tool_name = tool.name
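The inline comment above documents the payload shape for `FUNCTION_CALL` events; a hedged sketch of the extraction it describes (import paths follow current llama_index conventions and may differ from the repo):

```python
# Hedged sketch, not from this repo: read the called tool's name off a
# FUNCTION_CALL callback payload, where the ToolMetadata is stored under
# EventPayload.TOOL as shown in the comment above.
from typing import Any, Optional

from llama_index.core.callbacks.schema import CBEventType, EventPayload


def tool_name_from_payload(
    event_type: CBEventType, payload: Optional[dict[str, Any]]
) -> Optional[str]:
    """Return the tool name for FUNCTION_CALL events, else None."""
    if event_type == CBEventType.FUNCTION_CALL and payload:
        tool = payload.get(EventPayload.TOOL)  # ToolMetadata instance
        if tool is not None:
            return tool.name
    return None
```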
8 changes: 4 additions & 4 deletions ui/chat.py
@@ -202,7 +202,7 @@
metadata=ToolMetadata(
name="reco_review",
description=(
"useful for when you want to find places to visit"
"useful for when you want to find restaurants"
" based on end-user reviews. Takes input in a question"
" format, e.g.: What are the best Vietnamese restaurants in Texas?"
),
@@ -215,7 +215,7 @@


agent_system_prompt = """
You're a helpful assistant who excels at recommending places to go.
You're a helpful assistant who excels at recommending restaurants.
Always return the referenced paragraphs at the end of your answer to users. Format them nicely if need to.
"""
@@ -246,7 +246,7 @@ def app_chat(message, history, streaming=True):

with gr.Blocks() as demo:
chatbot = gr.Chatbot(
placeholder="<strong>Review Rec Bot</strong><br>I help with recommending places to vist!"
placeholder="<strong>Review Rec Bot</strong><br>I help with recommending restaurants and cafes!"
)
gr.ChatInterface(
app_chat,
@@ -255,7 +255,7 @@ def app_chat(message, history, streaming=True):
placeholder="Where do you want to go today?", container=False, scale=7
),
title="Review Rec Bot",
description="Help users find places to visit based on Yelp reviews",
description="Help users find restaurants based on Yelp reviews",
theme="soft",
examples=[
"Hello",
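Since chat.py only shows fragments of the Gradio wiring, here is a self-contained shell (with a hypothetical echo handler, not the repo's agent) that shows where the updated placeholder and description strings surface in the UI:

```python
# Hedged sketch, not from this repo: a minimal Gradio shell using the updated
# copy from chat.py. The echo handler stands in for the real app_chat call
# into the recommendation agent.
import gradio as gr


def app_chat(message, history):
    return f"(demo echo) You asked: {message}"


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        placeholder="<strong>Review Rec Bot</strong><br>I help with recommending restaurants and cafes!"
    )
    gr.ChatInterface(
        app_chat,
        chatbot=chatbot,
        textbox=gr.Textbox(
            placeholder="Where do you want to go today?", container=False, scale=7
        ),
        title="Review Rec Bot",
        description="Help users find restaurants based on Yelp reviews",
        theme="soft",
        examples=["Hello", "What are the best Vietnamese restaurants in Texas?"],
    )

if __name__ == "__main__":
    demo.launch()
```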
24 changes: 14 additions & 10 deletions ui/chat_v2.py
@@ -42,7 +42,7 @@

ARGS = RunInputArgs(
EXPERIMENT_NAME="Review Rec Bot - Yelp Review Rec Bot",
RUN_NAME="031_rerun",
RUN_NAME="034_rerun_400_restaurants",
RUN_DESCRIPTION="""
# Objective
@@ -65,14 +65,14 @@

dir_prefix = "../notebooks"
cfg.storage_context_persist_dp = os.path.abspath(
f"{dir_prefix}/data/031_rerun/storage_context"
f"{dir_prefix}/data/034_rerun_400_restaurants/storage_context"
)
cfg.db_collection = "review_rec_bot__031_rerun"
cfg.db_collection_fp = "data/031_rerun/chroma_db"
cfg.db_collection = "review_rec_bot__034_rerun_400_restaurants"
cfg.db_collection_fp = "data/034_rerun_400_restaurants/chroma_db"
cfg.llm_cfg.embedding_model_name = os.path.abspath(
f"{dir_prefix}/data/finetune_embedding/finetuned_model"
)
cfg.data_fp = "../data/yelp_dataset/sample/sample_100_biz/denom_review.parquet"
cfg.data_fp = "../data/yelp_dataset/sample/sample_400_biz/denom_review.parquet"

cfg.init(ARGS)

@@ -114,7 +114,6 @@
index=index,
vector_store_query_mode="mmr",
similarity_top_k=cfg.retrieval_cfg.retrieval_dense_top_k,
# sparse_top_k=cfg.retrieval_cfg.retrieval_sparse_top_k,
)

logger.info(f"Configuring BM25 Retriever...")
@@ -183,13 +182,13 @@
citation_refine_template=CUSTOM_CITATION_REFINE_TEMPLATE,
)

logger.info(f"Registerring Query Engine as Tool...")
logger.info(f"Registering Query Engine as Tool...")
query_engine_tool = QueryEngineTool(
query_engine=query_engine,
metadata=ToolMetadata(
name="reco_review",
description=(
"useful for when you want to find places to visit"
"useful for when you want to find restaurants and cafes"
" based on end-user reviews. Takes input in a question"
" format, e.g.: What are the best Vietnamese restaurants in Texas?"
),
@@ -216,9 +215,14 @@
agent_system_prompt = """
You're a helpful assistant who excels at recommending places to go.
When users ask for relative time like today or tomorrow, always use the get_current_datetime tool.
You should always narrow down the places like states or cities in the US. If you don't know this information, please ask the user.
You must return the cited sources to your users so that they know you base on which information to make the recommendations.
If there are citation sources returned from the tools, always return them exactly as they are at the end of your answer to users.
If there are citation sources returned from the tools, always return them exactly as they are of your answer to users.
This mean that you must respect if where the citation numbers (like [1], [2]) in the answers and at the end below the Sources section.
"""


@@ -235,7 +239,7 @@ async def start():

await cl.Message(
author="Jaina",
content="Hello! I'm Jaina. I help people find places to visit. What are you looking for today?",
content="Hello! I'm Jaina. I help people find restaurants and cafes. What are you looking for today?",
).send()


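The diff registers the query engine as a tool and tightens the agent system prompt, but the agent construction itself sits outside the shown lines. One plausible wiring, under the assumption of an OpenAI function-calling agent (the repo may use a different agent class):

```python
# Hedged sketch, not from this repo: combining the registered tool and the
# citation-focused system prompt into an agent. The toy query engine stands in
# for the CitationQueryEngine built in chat_v2.py, and the prompt is abridged
# from the diff above. Requires the llama-index-agent-openai package and an
# OpenAI API key.
from llama_index.agent.openai import OpenAIAgent
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata

query_engine = VectorStoreIndex.from_documents(
    [Document(text="Great cold brew and friendly staff at this cafe.")]
).as_query_engine()

query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="reco_review",
        description=(
            "useful for when you want to find restaurants and cafes"
            " based on end-user reviews. Takes input in a question"
            " format, e.g.: What are the best Vietnamese restaurants in Texas?"
        ),
    ),
)

agent_system_prompt = (
    "You're a helpful assistant who excels at recommending places to go. "
    "Always return the citation sources from the tools, keeping the citation "
    "numbers (like [1], [2]) in the answer and listing the sources at the end."
)

agent = OpenAIAgent.from_tools(
    tools=[query_engine_tool],
    system_prompt=agent_system_prompt,
    verbose=True,
)
print(agent.chat("What are some cafes for cold brew?"))
```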
