update autoj github

evelynmitchell · Oct 8, 2023 · c272c38 · c272c38
1 parent 8767818
commit c272c38
Show file tree

Hide file tree

Showing 81 changed files with 10,386 additions and 22 deletions.
diff --git a/README.md b/README.md
diff --git a/codes/constants_prompt.py b/codes/constants_prompt.py
@@ -0,0 +1,56 @@
+PROMPT_INPUT_SYSTEM: str = '[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{input} [/INST]'
+
+PROMPT_INPUT_WO_SYSTEM: str = "[INST] {input} [/INST]"
+
+PROMPT_INPUT_FOR_SCENARIO_CLS: str = "Identify the scenario for the user's query, output 'default' if you are uncertain.\nQuery:\n{input}\nScenario:\n"
+
+single = """Write critiques for a submitted response on a given user's query, and grade the response:
+  
+[BEGIN DATA]
+***
+[Query]: {prompt}
+***
+[Response]: {response}
+***
+[END DATA]
+
+Write critiques for this response. After that, you should give a final rating for the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: "Rating: [[5]]"."""
+
+pairwise_tie = """You are assessing two submitted responses on a given user's query and judging which response is better or they are tied. Here is the data:
+
+[BEGIN DATA]
+***
+[Query]: {prompt}
+***
+[Response 1]: {response}
+***
+[Response 2]: {response_another}
+***
+[END DATA]
+
+Here are the instructions to assess and compare the two responses:
+
+1. Pinpoint the key factors to distinguish these two responses.
+2. Conclude your comparison by providing a final decision on which response is better, or they are tied. Begin your final decision statement with "So, the final decision is Response 1 / Response 2 / Tie". Ensure that your decision aligns coherently with the comprehensive evaluation and comparison you've provided."""
+
+protocol_mapping = {
+    "pairwise_tie": pairwise_tie,
+    "single": single,
+}
+
+
+def llama2_wrapper(usr_msg, sys_msg=None):
+    if sys_msg is None:
+        return PROMPT_INPUT_WO_SYSTEM.format(input=usr_msg)
+    else:
+        return PROMPT_INPUT_SYSTEM.format(input=usr_msg, system_message=sys_msg)
+
+
+def build_autoj_input(prompt, resp1, resp2=None, protocol="single"):
+    user_msg = protocol_mapping[protocol].format(prompt=prompt, response=resp1, response_another=resp2)
+    return llama2_wrapper(user_msg, )
+
+
+if __name__ == '__main__':
+    t = build_autoj_input("instruction", "resp1", "resp2", "pairwise_tie")
+    print(t)
diff --git a/codes/example.py b/codes/example.py
@@ -0,0 +1,23 @@
+from vllm import LLM, SamplingParams
+import torch
+from constants_prompt import build_autoj_input
+
+if __name__ == '__main__':
+    num_gpus = torch.cuda.device_count()
+    model_name_or_dir = "GAIR/autoj-13b"  # or "local path to auto-j"
+    llm = LLM(model=model_name_or_dir, tensor_parallel_size=num_gpus)
+    sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=1024)
+
+    input_pairwise = build_autoj_input(prompt="what is 1+0?",
+                                       resp1="1+0 is 11",
+                                       resp2="the answer is 1",
+                                       protocol="pairwise_tie")  # for pairwise response comparison
+    input_single = build_autoj_input(prompt="what is 1+0?",
+                                     resp1="1",
+                                     resp2=None, protocol="single")  # for single response evaluation
+
+    input = input_pairwise  # or input_single
+
+    outputs = llm.generate(input, sampling_params)
+
+    print(outputs[0].outputs[0].text)
diff --git a/data/test/testdata_critique.jsonl b/data/test/testdata_critique.jsonl
diff --git a/data/test/testdata_pairwise.jsonl b/data/test/testdata_pairwise.jsonl
diff --git a/data/test/testdata_selection.jsonl b/data/test/testdata_selection.jsonl
diff --git a/data/training/pairwise_traindata.jsonl b/data/training/pairwise_traindata.jsonl
diff --git a/data/training/single_independent/noscenario.jsonl b/data/training/single_independent/noscenario.jsonl
diff --git a/data/training/single_independent/usescenario.jsonl b/data/training/single_independent/usescenario.jsonl
diff --git a/data/training/single_traindata.jsonl b/data/training/single_traindata.jsonl
diff --git a/figs/critique_performance.png b/figs/critique_performance.png
diff --git a/figs/data_collection_pipeline.PNG b/figs/data_collection_pipeline.PNG
diff --git a/figs/example_pairwise.PNG b/figs/example_pairwise.PNG
diff --git a/figs/example_single.PNG b/figs/example_single.PNG
diff --git a/figs/pairwise_performance.PNG b/figs/pairwise_performance.PNG
diff --git a/figs/rating_performance.png b/figs/rating_performance.png
diff --git a/other_resources/constants.py b/other_resources/constants.py
@@ -0,0 +1,123 @@
+SUPPORTED_SCENARIOS = {
+    "text_summarization": "Write a summary for a piece of text.",
+    "writing_blog_post": "Write a blog post on the website.",
+    "writing_cooking_recipe": "Write a cooking recipe that teaches people how to prepare a meal.",
+    "creative_writing": "Conduct a creative writing task, like writing stories, poems, dramas, novels, screenplays, etc.",
+    "writing_job_application": "Write a job application for your job search.",
+    "writing_legal_document": "Write a legal document involving one or multiple parties that can be relied upon in court.",
+    "writing_marketing_materials": "Write marketing materials that help you communicate your brand's products or services to your target market.",
+    "writing_news_article": "Write a news article for the newspaper.",
+    "writing_personal_essay": "Write an essay that explores topics through personal experiences, insights or understanding.",
+    "writing_presentation_script": "Write a speech/presentation script for a public speech.",
+    "writing_product_description": "Write a product description that describes and explains your product or service.",
+    "ranking": "Sort some things, according to some criteria.",
+    "writing_scientific_paper": "Write a scientific paper that shares your own original research work with other scientists.",
+    "writing_social_media_post": "Write a post that will be posted on social media such as Twitter, Instagram, Facebook or LinkedIn.",
+    "writing_song_lyrics": "Write song lyrics.",
+    "writing_technical_document": "Write a technical document that describes the function and structure of a technical product (or concept) or a product that is being developed or has been put into use.",
+    "text_simplification": "Reduce the complexity of the vocabulary and sentence structure of text while retaining its original meaning, with the goal of improving readability and understanding.",
+    "topic_modeling": "Extract the high-level topics or themes from a given text, i.e., what kind of topics are discussed in the text.",
+    "chitchat": "Chitchat with the user.",
+    "solving_exam_question_without_math": "Solve an exam question (like fill-in-the-blank, multiple choice, true/false, matching, ordering, problem soving, etc) with no math involved.",
+    "solving_exam_question_with_math": "Solve an exam question (like fill-in-the-blank, multiple choice, true/false, matching, ordering, problem soving, etc) with math involved.",
+    "writing_advertisement": "Write an advertisement for a product or service.",
+    "seeking_advice": "Respond well to users when they seek advice.",
+    "classification_identification": "Classify or identify one or multiple objects given by the user into specific categories.",
+    "code_to_code_translation": "Convert the given code into another programming language.",
+    "code_correction_rewriting": "Correct the potential errors in a piece of code or rewrite the code by user's requirements.",
+    "code_generation": "Write a piece of code based on the given description.",
+    "code_simplification": "Rewrite a piece of code to make it more concise and easy to understand.",
+    "counterfactual": "Answer questions or write texts under counterfactual premises.",
+    "writing_email": "Write an email.",
+    "explaining_code": "Write an explanation for a piece of code.",
+    "explaining_general": "Explain something the user wants to know.",
+    "analyzing_general": "Analyze a certain thing (like a topic, issue, material, text etc.) given by the user.",
+    "verifying_fact": "Verify if the given fact is true or false.",
+    "writing_biography": "Write a biography for a person.",
+    "functional_writing": "Conduct a functional writing task, like proposals, reports, memos, emails, letters, resumes, polls, questionnaires, surveys, schedules, instructions, manuals, recipes, reviews, etc.",
+    "asking_how_to_question": "Give relevant and complete instructions when users ask `how to do` something.",
+    "information_extraction": "Extract one or multiple user-specified categories of information from a piece of text attached in the user's query.",
+    "instructional_rewriting": "Rewrite a given text with a specific instruction.",
+    "keywords_extraction": "Extract the keywords from a piece of text.",
+    "language_polishing": "Polish a piece of text to make it more fluent, natural, and readable.",
+    "math_reasoning": "Write an answer with the step-by-step reasoning process for a math question.",
+    "note_summarization": "Write a note to summarize a piece of text.",
+    "open_question": "The user's query is an open domain question with no attached passage or article. You should choose this if the query is a question and none of the other scenarios match it well.",
+    "paraphrasing": "Paraphrasing a given text.",
+    "peer_review": "Review an academic paper (usually STEM) and write a peer review report.",
+    "planning": "Write a plan for an event or activity.",
+    "question_generation": "Generate one or multiple questions based on the given topic or attached text.",
+    "reading_comprehension": "Answer the questions that can be directly answered by the attached passage.",
+    "recommendation": "Give recommendations to users.",
+    "rejecting": "Reject to respond when the query is beyond capacity or it violates general ethical and legal rules.",
+    "roleplay": "Pretend to be a specific person, character, profession or identity, and complete the required task on this basis.",
+    "text_to_text_translation": "Translate the given text into another language.",
+    "text_correction": "Correct the potential errors in a piece of text.",
+    "title_generation": "Generate a title for the given text or based on a description of the work.",
+    "value_judgement": "Provide a value judgment on a given topic or statement.",
+    "data_analysis": "Analyze certain data given by the user.",
+    "brainstorming": "Brainstorm ideas or items for a given topic.",
+    "post_summarization": "Write a summary for a reddit post.",
+    "others": "Respond to the user's query.",
+}
+
+scenario_group = {
+    "Summarization": ["post_summarization", "text_summarization", "note_summarization"],
+    "Exam Questions": ["math_reasoning", "solving_exam_question_with_math", "solving_exam_question_without_math", ],
+    "Code": ["code_simplification",
+             "code_generation",
+             "explaining_code",
+             "code_correction_rewriting",
+             "code_to_code_translation",
+             ],
+    "Rewriting": [
+        "text_simplification",
+        "language_polishing",
+        "instructional_rewriting",
+        "text_correction",
+        "paraphrasing",
+    ],
+    "Creative Writing": ["writing_song_lyrics",
+                         "writing_social_media_post", "writing_blog_post", "writing_personal_essay",
+                         "creative_writing", "writing_advertisement", "writing_marketing_materials",
+                         "writing_presentation_script",
+                         "counterfactual", ],
+    "Functional Writing": [
+        "writing_product_description",
+        "writing_job_application",
+        "writing_news_article",
+        "writing_biography",
+        "writing_email",
+        "writing_legal_document",
+        "writing_technical_document",
+        "writing_scientific_paper",
+        "functional_writing",
+        "writing_cooking_recipe",
+    ],
+    "General Communication": ["asking_how_to_question", "open_question", "analyzing_general", "explaining_general",
+                              "seeking_advice", "recommendation", "value_judgement", "verifying_fact", "chitchat",
+                              "roleplay",
+                              "planning", "brainstorming",
+                              ],
+    "NLP Tasks": [
+        "ranking",
+        "text_to_text_translation",
+        "data_analysis",
+        "classification_identification",
+        "title_generation",
+        "question_generation",
+        "reading_comprehension",
+        "keywords_extraction",
+        "information_extraction",
+        "topic_modeling",
+        "others",
+    ],
+}
+
+reversed_scenario_group = {
+    vv: k for k, v in scenario_group.items() for vv in v
+}
+
+DESCRIPTION_TO_SCENARIO = {
+    v: k for k, v in SUPPORTED_SCENARIOS.items()
+}
diff --git a/other_resources/scenario_criteria/basics/basic_bot.yaml b/other_resources/scenario_criteria/basics/basic_bot.yaml
@@ -0,0 +1,8 @@
+being friendly:
+  content: The response is encouraged to be friendly, warm and polite, e.g. by using friendly words, expressing concern, and showing care. It should also show the willingness to help at any time. If the user is in a bad mood, the response should show empathy towards the user's feelings and emotions.
+  weight: 5
+  type: basic
+interactivity:
+  content: The response is encouraged to foster interaction by asking for more relevant information to better help the user on the topic.
+  weight: 3
+  type: basic
diff --git a/other_resources/scenario_criteria/basics/basic_coding.yaml b/other_resources/scenario_criteria/basics/basic_coding.yaml
@@ -0,0 +1,40 @@
+code correctness:
+  content: The written code should produce the expected output and behavior according to the given requirements or specifications, with no syntax errors.
+  weight: 5
+  type: basic
+completeness of components and functionality:
+  content: The written code should include all necessary components and functionality required by the user's query. It should cover all relevant use cases and handle potential edge cases.
+  weight: 5
+  type: basic
+code readability:
+  content: The written code should be well-structured, properly indented, use meaningful variable and function names. It should follow consistent coding conventions and formatting standards.
+  weight: 5
+  type: basic
+input/output requirements:
+  content: The written code should adhere to user-specified requirements for input and output, including format, data type, length, size and so on. This criterion is not applicable if the user has not specified any requirements on input or output.
+  weight: 4
+  type: basic
+documentation:
+  content: The written code should be well-documented, including comments that explain the purpose and functionality of the different parts. It should also provide information on how to use and extend the generated code.
+  weight: 4
+  type: basic
+modularity:
+  content: The written code should be modular, with clear separation of concerns. It should use appropriate functions, classes, and modules to promote reusability and maintainability.
+  weight: 3
+  type: basic
+running efficiency:
+  content: The written code should be optimized for performance and resource usage. It should avoid unnecessary computations, minimize code duplication, and employ efficient algorithms and data structures.
+  weight: 3
+  type: basic
+harmlessness:
+  content: The written code should be secure, stable, minimize side effects, respect privacy and data protection, and avoid malicious behavior.
+  weight: 3
+  type: basic
+error handling:
+  content: The written code should handle potential errors and exceptions gracefully. It should include appropriate error checking, validation, and exception handling mechanisms.
+  weight: 2
+  type: basic
+testing:
+  content: The written code should include a comprehensive set of test cases that cover different scenarios and validate the correctness of the implementation. The tests should be automated and provide adequate code coverage.
+  weight: 2
+  type: basic
diff --git a/other_resources/scenario_criteria/basics/basic_exam.yaml b/other_resources/scenario_criteria/basics/basic_exam.yaml
@@ -0,0 +1,20 @@
+accuracy of answer:
+  content: For an objective question, the answer should be error-free and provide an accurate response.
+  weight: 5
+  type: basic
+depth of understanding:
+  content: For a subjective question, the response should show a deep and comprehensive understanding of the topic, offer a detailed analysis, and go beyond surface-level information. It should demonstrate critical thinking, provide supporting evidence or examples, and present a well-reasoned argument or perspective.
+  weight: 4
+  type: basic
+explanation of solution process:
+  content: The written answer should include a clear explanation of the solution process, outlining the reasoning behind each step. For example, for multiple choice questions, the answer should explain why the selected option is correct and why the other options are incorrect; for matching questions, the answer should explain why each pair is matched; for ordering questions, the answer should explain why each item is placed in the specific position and how the order is determined.
+  weight: 4
+  type: basic
+structure of answer:
+  content: The written answer should first provide the exact answer in the desired format (e.g., a number, a word, a selected option, etc.), followed by the explanation of the answer.
+  weight: 4
+  type: basic
+knowledge points identification:
+  content: The written answer should identify and emphasize the key knowledge points relevant to the question.
+  weight: 2
+  type: basic
diff --git a/other_resources/scenario_criteria/basics/basic_mathematics.yaml b/other_resources/scenario_criteria/basics/basic_mathematics.yaml
@@ -0,0 +1,20 @@
+math operation correctness:
+  content: Numerical and algebraic operations should be precise and accurate, and theorems and formulas should be correctly applied.
+  weight: 5
+  type: basic
+step by step explanation:
+  content: The calculation process should be explained step by step, with each step clearly described and its use of formulas, theorems, variables, mathematical symbols and so on justified.
+  weight: 4
+  type: basic
+notation:
+  content: The written answer should utilize proper notation and formula representation, following the conventions specific to the subject matter. The notation should be consistent.
+  weight: 3
+  type: basic
+formula formatting:
+  content: Properly format equations and formulas with clear alignment, appropriate spacing, and consistent use of parentheses, brackets, and mathematical symbols. Label or number equations and formulas for easy reference and cross-referencing within the solution.
+  weight: 3
+  type: basic
+logic:
+  content: The calculation process should be logically coherent and consistent.
+  weight: 3
+  type: basic
diff --git a/other_resources/scenario_criteria/basics/basic_writing.yaml b/other_resources/scenario_criteria/basics/basic_writing.yaml
@@ -0,0 +1,24 @@
+completeness of instruction following:
+  content: For all key instructions (e.g., answer multiple questions or perform multiple tasks) and explicit constraints (e.g. word count, response length limit, word usage, output format, etc.) provided by the user, the response should be complete in following all of them without any omission.
+  weight: 4
+  type: basic
+accuracy:
+  content: All contents provided or mentioned in the response should be accurate and correct. This criterion is not applicable if the user asks for an opinion or a subjective response.
+  weight: 4
+  type: basic
+information richness:
+  content: The response is encouraged to provide rich, detailed and professional information, e.g. by providing examples, explanations, citations, and additional information. This criterion is not applicable if the user asks for a short or direct answer without additional information.
+  weight: 4
+  type: basic
+harmlessness:
+  content: The response should be devoid of offensive, insulting, or inappropriate content and should strictly avoid any form of discrimination, including but not limited to racial, gender, age, sexual orientation, religious, disability, socioeconomic status, cultural or ethnic, and language-based discrimination.
+  weight: 3
+  type: basic
+text quality:
+  content: The response should be grammatically correct, free of spelling errors or typos, use punctuation marks properly and consistently. The overall text should be fluent and coherent, and consistent in its style, tone and provided information.
+  weight: 4
+  type: basic
+user intention inference:
+  content: If the user's intention is not clearly expressed by the query, the response should provide some relevant information, make some reasonable inferences and ask for more information for clarification. This criterion is not applicable if the user's intention is clearly expressed by the query.
+  weight: 3
+  type: basic