Fix/prompt tuning report rating (microsoft#496)

* add community report ranking to cli * use tuned ratings in generated community report rankings * update report generation template * add new import paths to init * add community report generation prompt and util function * add rating generator, update init paths * add report rating generation prompt * update report generation template * typo * add semantic release * Add missing docstring * Format * is is * Format ruff --------- Co-authored-by: Julian Whiting <[email protected]>
hunterx0028 · Jul 10, 2024 · c3852b0 · c3852b0
1 parent aff2f79
commit c3852b0
Show file tree

Hide file tree

Showing 9 changed files with 197 additions and 9 deletions.
diff --git a/.semversioner/next-release/minor-20240710183748086411.json b/.semversioner/next-release/minor-20240710183748086411.json
@@ -0,0 +1,4 @@
+{
+  "type": "minor",
+  "description": "Add dynamic community report rating to the prompt tuning engine"
+}
diff --git a/docsite/posts/prompt_tuning/auto_prompt_tuning.md b/docsite/posts/prompt_tuning/auto_prompt_tuning.md
@@ -6,7 +6,7 @@ tags: [post, tuning]
 date: 2024-06-13
 ---
 
-GraphRAG provides the ability to create domain adaptive templates for the generation of the knowledge graph. This step is optional, though is is highly encouraged to run it as it will yield better results when executing an Index Run.
+GraphRAG provides the ability to create domain adaptive templates for the generation of the knowledge graph. This step is optional, though it is highly encouraged to run it as it will yield better results when executing an Index Run.
 
 The templates are generated by loading the inputs, splitting them into chunks (text units) and then running a series of LLM invocations and template substitutions to generate the final prompts. We suggest using the default values provided by the script, but in this page you'll find the detail of each in case you want to further explore and tweak the template generation algorithm.
 

diff --git a/graphrag/prompt_tune/cli.py b/graphrag/prompt_tune/cli.py
@@ -18,6 +18,7 @@
     create_entity_extraction_prompt,
     create_entity_summarization_prompt,
     detect_language,
+    generate_community_report_rating,
     generate_community_reporter_role,
     generate_domain,
     generate_entity_relationship_examples,
@@ -179,6 +180,14 @@ async def generate_indexing_prompts(
     persona = await generate_persona(llm, domain)
     reporter.info(f"Generated persona: {persona}")
 
+    reporter.info("Generating community report ranking description...")
+    community_report_ranking = await generate_community_report_rating(
+        llm, domain=domain, persona=persona, docs=doc_list
+    )
+    reporter.info(
+        f"Generated community report ranking description: {community_report_ranking}"
+    )
+
     entity_types = None
     if not skip_entity_types:
         reporter.info("Generating entity types")
@@ -235,6 +244,7 @@ async def generate_indexing_prompts(
     create_community_summarization_prompt(
         persona=persona,
         role=community_reporter_role,
+        report_rating_description=community_report_ranking,
         language=language,
         output_path=output_path,
     )

diff --git a/graphrag/prompt_tune/generator/__init__.py b/graphrag/prompt_tune/generator/__init__.py
@@ -3,6 +3,7 @@
 
 """Prompt generation module."""
 
+from .community_report_rating import generate_community_report_rating
 from .community_report_summarization import create_community_summarization_prompt
 from .community_reporter_role import generate_community_reporter_role
 from .defaults import MAX_TOKEN_COUNT
@@ -20,6 +21,7 @@
     "create_entity_extraction_prompt",
     "create_entity_summarization_prompt",
     "detect_language",
+    "generate_community_report_rating",
     "generate_community_reporter_role",
     "generate_domain",
     "generate_entity_relationship_examples",

diff --git a/graphrag/prompt_tune/generator/community_report_rating.py b/graphrag/prompt_tune/generator/community_report_rating.py
@@ -0,0 +1,35 @@
+"""Generate a rating description for community report rating."""
+
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+from graphrag.llm.types.llm_types import CompletionLLM
+from graphrag.prompt_tune.prompt import (
+    GENERATE_REPORT_RATING_PROMPT,
+)
+
+
+async def generate_community_report_rating(
+    llm: CompletionLLM, domain: str, persona: str, docs: str | list[str]
+) -> str:
+    """Generate a community report rating description to use for GraphRAG prompts.
+
+    Parameters
+    ----------
+    - llm (CompletionLLM): The LLM to use for generation
+    - domain (str): The domain to generate a rating for
+    - persona (str): The persona to generate a rating for
+    - docs (str | list[str]): Documents used to contextualize the rating
+
+    Returns
+    -------
+    - str: The generated rating description prompt response.
+    """
+    docs_str = " ".join(docs) if isinstance(docs, list) else docs
+    domain_prompt = GENERATE_REPORT_RATING_PROMPT.format(
+        domain=domain, persona=persona, input_text=docs_str
+    )
+
+    response = await llm(domain_prompt)
+
+    return str(response.output).strip()
diff --git a/graphrag/prompt_tune/generator/community_report_summarization.py b/graphrag/prompt_tune/generator/community_report_summarization.py
@@ -13,6 +13,7 @@
 def create_community_summarization_prompt(
     persona: str,
     role: str,
+    report_rating_description: str,
     language: str,
     output_path: Path | None = None,
 ) -> str:
@@ -30,7 +31,10 @@ def create_community_summarization_prompt(
     - str: The community summarization prompt
     """
     prompt = COMMUNITY_REPORT_SUMMARIZATION_PROMPT.format(
-        persona=persona, role=role, language=language
+        persona=persona,
+        role=role,
+        report_rating_description=report_rating_description,
+        language=language,
     )
 
     if output_path:

diff --git a/graphrag/prompt_tune/prompt/__init__.py b/graphrag/prompt_tune/prompt/__init__.py
@@ -1,8 +1,9 @@
+"""Persona, entity type, relationships and domain generation prompts module."""
+
 # Copyright (c) 2024 Microsoft Corporation.
 # Licensed under the MIT License
 
-"""Persona, entity type, relationships and domain generation prompts module."""
-
+from .community_report_rating import GENERATE_REPORT_RATING_PROMPT
 from .community_reporter_role import GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT
 from .domain import GENERATE_DOMAIN_PROMPT
 from .entity_relationship import (
@@ -26,5 +27,6 @@
     "GENERATE_COMMUNITY_REPORTER_ROLE_PROMPT",
     "GENERATE_DOMAIN_PROMPT",
     "GENERATE_PERSONA_PROMPT",
+    "GENERATE_REPORT_RATING_PROMPT",
     "UNTYPED_ENTITY_RELATIONSHIPS_GENERATION_PROMPT",
 ]
diff --git a/graphrag/prompt_tune/prompt/community_report_rating.py b/graphrag/prompt_tune/prompt/community_report_rating.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2024 Microsoft Corporation.
+# Licensed under the MIT License
+
+"""Fine tuning prompts for Community Reports Rating."""
+
+GENERATE_REPORT_RATING_PROMPT = """
+
+You are a helpful agent tasked with rating the importance of a given text in the context of the provided domain and persona. Your goal is to provide a rating that reflects the relevance and significance of the text to the specified domain and persona. Use your expertise to evaluate the text based on the importance criteria and assign a float score between 0-10. Only respond with the text description of the importance criteria. Use the provided example data format to guide your response. Ignore the content of the example data and focus on the structure.
+
+######################
+-Examples-
+######################
+
+### Example 1
+
+# Domain
+
+Personal and Family Communication
+
+# Persona
+
+You are an expert in Social Network Analysis with a focus on the Personal and Family Communication domain. You are skilled at mapping and interpreting complex social networks, understanding the dynamics of interpersonal relationships, and identifying patterns of communication within communities. You are adept at helping people understand the structure and relations within their personal and family networks, providing insights into how information flows, how strong various connections are, and how these networks influence individual and group behavior.
+
+# Data
+
+
+Subject: Re: Event
+From: Alice Brown [email protected]
+Date: 2012-11-14, 9:52 a.m.
+To: John Smith [email protected]
+CC: Jane Doe [email protected], Bob Johnson [email protected], Emma Davis [email protected]
+
+The event is at 6pm at City Hall (Queen street) event chamber. We
+just need to get there by 5:45pm. It is 30-minute long so we will be
+done by 6:30pm. We'll then head over to New Sky on Spadina for some
+unique cuisine!
+
+Guests are you and Emma, and my uncle and auntie from London
+who my folks have designated to act as their reps. Jane and Joe are
+witnesses.
+
+Be there or be square!
+Alice
+
+On Wed, Nov 14, 2012 at 9:40 AM, John Smith [email protected] wrote:
+
+Thats the day after Bob's event!
+Any more details on the event schedule? ITS NEXT WEEK!
+On Tue, Nov 13, 2012 at 7:51 PM, Jane Doe
+[email protected] wrote:
+I am supposed to forward you the invitation to this year's celebration.
+Date: Saturday, Nov. 24, 6 pm starting
+Place as usual: Dean's house, 6 Cardish, Kleinburg L0J 1C0
+Jane Doe
+[email protected]
+
+# Importance Criteria
+
+A float score between 0-10 that represents the relevance of the email's content to family communication, health concerns, travel plans, and interpersonal dynamics, with 1 being trivial or spam and 10 being highly relevant, urgent, and impactful to family cohesion or well-being.
+#############################
+
+### Example 2
+
+# Domain
+
+Literary Analysis
+
+# Persona
+
+You are a literary scholar with a focus on works from the 19th century. You are skilled at analyzing and interpreting texts, identifying themes and motifs, and understanding the historical and cultural contexts in which these works were written. You are adept at helping people understand the deeper meanings and significance of literary works, providing insights into the author's intentions, the social issues addressed in the text, and the impact of these works on contemporary society.
+
+# Data
+
+Had she found Jane in any apparent danger, Mrs. Bennet would have been very miserable; but being satisfied on seeing her that her illness was not alarming, she had no wish of her recovering immediately, as her restoration to health would probably remove her from Netherfield. She would not listen, therefore, to her daughter's proposal of being carried home; neither did the apothecary, who arrived about the same time, think it at all advisable. After sitting a little with Jane, on Miss Bingley's appearance and invitation, the mother and three daughters all attended her into the breakfast parlor. Bingley met them with hopes that Mrs. Bennet had not found Miss Bennet worse than she expected.
+
+"Indeed I have, Sir," was her answer. "She is a great deal too ill to be moved. Mr. Jones says we must not think of moving her. We must trespass a little longer on your kindness."
+
+"Removed!" cried Bingley. "It must not be thought of. My sister, I am sure, will not hear of her removal."
+
+# Importance Criteria
+
+A float score between 0-10 that represents the relevance of the text to literary analysis, historical context, thematic interpretation, and cultural significance, with 1 being trivial or irrelevant and 10 being highly significant, profound, and impactful to the understanding of the text and its implications.
+#############################
+
+### Example 3
+
+# Domain
+
+Environmental Science
+
+# Persona
+
+You are an environmental scientist with a focus on climate change and sustainability. You are skilled at analyzing data, interpreting social commentary and recommending policy changes. You are adept at helping people understand the causes and consequences of climate change, providing insights into how they can reduce their carbon footprint, adopt sustainable practices, and contribute to a healthier planet.
+
+# Data
+
+Host 1 (Anna): Welcome to "Green Living Today," the podcast where we explore practical tips and inspiring stories about sustainable living. I'm your host, Anna Green.
+
+Host 2 (Mark): And I'm Mark Smith. Today, we have a special episode focused on reducing plastic waste in our daily lives. We'll be talking to a special guest who has made significant strides in living a plastic-free lifestyle.
+
+Anna: That's right, Mark. Our guest today is Laura Thompson, the founder of "Plastic-Free Living," a blog dedicated to sharing tips and resources for reducing plastic use. Welcome to the show, Laura!
+
+Guest (Laura): Thanks, Anna and Mark. It's great to be here.
+
+Mark: Laura, let's start by talking about your journey. What inspired you to start living a plastic-free lifestyle?
+
+# Importance Criteria
+
+A float score between 0-10 that represents the relevance of the text to sustainability, plastic waste reduction, and environmental policies, with 1 being trivial or irrelevant and 10 being highly significant, impactful, and actionable in promoting environmental awareness.
+#############################
+
+
+#############################
+-Real Data-
+#############################
+
+# Domain
+
+{domain}
+
+# Persona
+
+{persona}
+
+# Data
+
+{input_text}
+
+# Importance Criteria
+
+
+"""
diff --git a/graphrag/prompt_tune/template/community_report_summarization.py b/graphrag/prompt_tune/template/community_report_summarization.py
@@ -7,15 +7,14 @@
 {persona}
 
 # Goal
-Write a comprehensive assessment report of a community taking on the role of a {role}. The content of this report includes an overview of the community's key entities, their legal compliance, technical capabilities,
-reputation, and noteworthy claims.
+Write a comprehensive assessment report of a community taking on the role of a {role}. The content of this report includes an overview of the community's key entities and relationships.
 
 # Report Structure
 The report should include the following sections:
 - TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
-- SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant threats associated with its entities.
-- THREAT SEVERITY RATING: a float score between 0-10 that represents the potential global impact to humanity as posed by entities within the community.
-- RATING EXPLANATION: Give a single sentence explanation of the threat severity rating.
+- SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant points associated with its entities.
+- REPORT RATING: {report_rating_description}
+- RATING EXPLANATION: Give a single sentence explanation of the rating.
 - DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive.
 
 Return output as a well-formed JSON-formatted string with the following format. Don't use any unnecessary escape sequences. The output should be a single JSON object that can be parsed by json.loads.