Skip to content

Commit

Permalink
update autoj github
Browse files Browse the repository at this point in the history
  • Loading branch information
lockon-n committed Oct 8, 2023
1 parent 8767818 commit c272c38
Show file tree
Hide file tree
Showing 81 changed files with 10,386 additions and 22 deletions.
497 changes: 475 additions & 22 deletions README.md

Large diffs are not rendered by default.

56 changes: 56 additions & 0 deletions codes/constants_prompt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Chat template with a system section; llama2_wrapper fills it when a system
# message is supplied. Placeholders: {system_message}, {input}.
PROMPT_INPUT_SYSTEM: str = '[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n{input} [/INST]'

# Chat template without a system section; llama2_wrapper fills it when no
# system message is supplied. Placeholder: {input}.
PROMPT_INPUT_WO_SYSTEM: str = "[INST] {input} [/INST]"

# Prompt asking for the scenario of a user's query. Placeholder: {input}.
# NOTE(review): not referenced by the functions visible in this file.
PROMPT_INPUT_FOR_SCENARIO_CLS: str = "Identify the scenario for the user's query, output 'default' if you are uncertain.\nQuery:\n{input}\nScenario:\n"

# Single-response protocol: asks for critiques followed by a 1-10 rating
# written as "[[rating]]". Placeholders: {prompt}, {response}.
single = """Write critiques for a submitted response on a given user's query, and grade the response:
[BEGIN DATA]
***
[Query]: {prompt}
***
[Response]: {response}
***
[END DATA]
Write critiques for this response. After that, you should give a final rating for the response on a scale of 1 to 10 by strictly following this format: "[[rating]]", for example: "Rating: [[5]]"."""

# Pairwise protocol: asks which of two responses is better, or whether they
# tie. Placeholders: {prompt}, {response}, {response_another}.
pairwise_tie = """You are assessing two submitted responses on a given user's query and judging which response is better or they are tied. Here is the data:
[BEGIN DATA]
***
[Query]: {prompt}
***
[Response 1]: {response}
***
[Response 2]: {response_another}
***
[END DATA]
Here are the instructions to assess and compare the two responses:
1. Pinpoint the key factors to distinguish these two responses.
2. Conclude your comparison by providing a final decision on which response is better, or they are tied. Begin your final decision statement with "So, the final decision is Response 1 / Response 2 / Tie". Ensure that your decision aligns coherently with the comprehensive evaluation and comparison you've provided."""

# Protocol name -> prompt template; build_autoj_input looks templates up here.
protocol_mapping = {
"pairwise_tie": pairwise_tie,
"single": single,
}


def llama2_wrapper(usr_msg, sys_msg=None):
    """Wrap a user message in the [INST] chat template.

    Args:
        usr_msg: Text substituted for the ``{input}`` placeholder.
        sys_msg: Optional system message. When it is not ``None`` the
            template with a ``<<SYS>>`` section is used; otherwise the
            plain template is used.

    Returns:
        The formatted prompt string.
    """
    if sys_msg is not None:
        return PROMPT_INPUT_SYSTEM.format(input=usr_msg, system_message=sys_msg)
    return PROMPT_INPUT_WO_SYSTEM.format(input=usr_msg)


def build_autoj_input(prompt, resp1, resp2=None, protocol="single"):
    """Build the complete model input for one evaluation request.

    The template registered for ``protocol`` in ``protocol_mapping`` is filled
    with the query and response(s), then wrapped by ``llama2_wrapper`` without
    a system message.

    Args:
        prompt: The user's query being evaluated.
        resp1: The response to evaluate (Response 1 in a pairwise comparison).
        resp2: The second response. Required when the selected template
            contains a ``{response_another}`` placeholder; ignored otherwise.
        protocol: Key of ``protocol_mapping`` selecting the template.

    Returns:
        The formatted prompt string.

    Raises:
        KeyError: If ``protocol`` is not a key of ``protocol_mapping``.
        ValueError: If the template needs a second response and ``resp2`` is
            ``None``.
    """
    try:
        template = protocol_mapping[protocol]
    except KeyError:
        # Same exception type as the plain lookup, but with a usable message.
        raise KeyError(
            f"unknown protocol {protocol!r}; expected one of {sorted(protocol_mapping)}"
        ) from None

    # Without this guard str.format would silently render the literal text
    # "None" as the second response.
    if resp2 is None and "{response_another}" in template:
        raise ValueError(
            f"protocol {protocol!r} compares two responses; resp2 must be provided"
        )

    user_msg = template.format(prompt=prompt, response=resp1, response_another=resp2)
    return llama2_wrapper(user_msg)


if __name__ == '__main__':
    # Smoke test: build a pairwise-comparison prompt from placeholder text and show it.
    print(build_autoj_input("instruction", "resp1", "resp2", "pairwise_tie"))
23 changes: 23 additions & 0 deletions codes/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from vllm import LLM, SamplingParams
import torch
from constants_prompt import build_autoj_input

if __name__ == '__main__':
    # Use every visible CUDA device for tensor parallelism.
    # NOTE(review): presumably the device count must be a value the model
    # supports for tensor parallelism — confirm against the vLLM docs.
    num_gpus = torch.cuda.device_count()
    model_name_or_dir = "GAIR/autoj-13b"  # or "local path to auto-j"
    llm = LLM(model=model_name_or_dir, tensor_parallel_size=num_gpus)
    # Temperature 0.0 requests deterministic decoding; at most 1024 new tokens.
    sampling_params = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=1024)

    # Prompt for comparing two responses to the same query.
    input_pairwise = build_autoj_input(prompt="what is 1+0?",
                                       resp1="1+0 is 11",
                                       resp2="the answer is 1",
                                       protocol="pairwise_tie")  # for pairwise response comparison
    # Prompt for critiquing and rating a single response.
    input_single = build_autoj_input(prompt="what is 1+0?",
                                     resp1="1",
                                     resp2=None, protocol="single")  # for single response evaluation

    # Named ``model_input`` rather than ``input`` so the builtin is not shadowed.
    model_input = input_pairwise  # or input_single

    outputs = llm.generate(model_input, sampling_params)

    # Print the text of the first completion for the first (only) prompt.
    print(outputs[0].outputs[0].text)
232 changes: 232 additions & 0 deletions data/test/testdata_critique.jsonl

Large diffs are not rendered by default.

1,392 changes: 1,392 additions & 0 deletions data/test/testdata_pairwise.jsonl

Large diffs are not rendered by default.

116 changes: 116 additions & 0 deletions data/test/testdata_selection.jsonl

Large diffs are not rendered by default.

3,436 changes: 3,436 additions & 0 deletions data/training/pairwise_traindata.jsonl

Large diffs are not rendered by default.

960 changes: 960 additions & 0 deletions data/training/single_independent/noscenario.jsonl

Large diffs are not rendered by default.

960 changes: 960 additions & 0 deletions data/training/single_independent/usescenario.jsonl

Large diffs are not rendered by default.

960 changes: 960 additions & 0 deletions data/training/single_traindata.jsonl

Large diffs are not rendered by default.

Binary file added figs/critique_performance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figs/data_collection_pipeline.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figs/example_pairwise.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figs/example_single.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figs/pairwise_performance.PNG
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figs/rating_performance.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
123 changes: 123 additions & 0 deletions other_resources/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Maps each supported scenario identifier to a one-sentence task description.
# NOTE(review): DESCRIPTION_TO_SCENARIO inverts this mapping, so the
# description strings act as lookup keys. They are kept verbatim (including
# the "soving" spelling in the two exam-question entries) — confirm nothing
# depends on the exact text before editing them.
SUPPORTED_SCENARIOS = {
"text_summarization": "Write a summary for a piece of text.",
"writing_blog_post": "Write a blog post on the website.",
"writing_cooking_recipe": "Write a cooking recipe that teaches people how to prepare a meal.",
"creative_writing": "Conduct a creative writing task, like writing stories, poems, dramas, novels, screenplays, etc.",
"writing_job_application": "Write a job application for your job search.",
"writing_legal_document": "Write a legal document involving one or multiple parties that can be relied upon in court.",
"writing_marketing_materials": "Write marketing materials that help you communicate your brand's products or services to your target market.",
"writing_news_article": "Write a news article for the newspaper.",
"writing_personal_essay": "Write an essay that explores topics through personal experiences, insights or understanding.",
"writing_presentation_script": "Write a speech/presentation script for a public speech.",
"writing_product_description": "Write a product description that describes and explains your product or service.",
"ranking": "Sort some things, according to some criteria.",
"writing_scientific_paper": "Write a scientific paper that shares your own original research work with other scientists.",
"writing_social_media_post": "Write a post that will be posted on social media such as Twitter, Instagram, Facebook or LinkedIn.",
"writing_song_lyrics": "Write song lyrics.",
"writing_technical_document": "Write a technical document that describes the function and structure of a technical product (or concept) or a product that is being developed or has been put into use.",
"text_simplification": "Reduce the complexity of the vocabulary and sentence structure of text while retaining its original meaning, with the goal of improving readability and understanding.",
"topic_modeling": "Extract the high-level topics or themes from a given text, i.e., what kind of topics are discussed in the text.",
"chitchat": "Chitchat with the user.",
"solving_exam_question_without_math": "Solve an exam question (like fill-in-the-blank, multiple choice, true/false, matching, ordering, problem soving, etc) with no math involved.",
"solving_exam_question_with_math": "Solve an exam question (like fill-in-the-blank, multiple choice, true/false, matching, ordering, problem soving, etc) with math involved.",
"writing_advertisement": "Write an advertisement for a product or service.",
"seeking_advice": "Respond well to users when they seek advice.",
"classification_identification": "Classify or identify one or multiple objects given by the user into specific categories.",
"code_to_code_translation": "Convert the given code into another programming language.",
"code_correction_rewriting": "Correct the potential errors in a piece of code or rewrite the code by user's requirements.",
"code_generation": "Write a piece of code based on the given description.",
"code_simplification": "Rewrite a piece of code to make it more concise and easy to understand.",
"counterfactual": "Answer questions or write texts under counterfactual premises.",
"writing_email": "Write an email.",
"explaining_code": "Write an explanation for a piece of code.",
"explaining_general": "Explain something the user wants to know.",
"analyzing_general": "Analyze a certain thing (like a topic, issue, material, text etc.) given by the user.",
"verifying_fact": "Verify if the given fact is true or false.",
"writing_biography": "Write a biography for a person.",
"functional_writing": "Conduct a functional writing task, like proposals, reports, memos, emails, letters, resumes, polls, questionnaires, surveys, schedules, instructions, manuals, recipes, reviews, etc.",
"asking_how_to_question": "Give relevant and complete instructions when users ask `how to do` something.",
"information_extraction": "Extract one or multiple user-specified categories of information from a piece of text attached in the user's query.",
"instructional_rewriting": "Rewrite a given text with a specific instruction.",
"keywords_extraction": "Extract the keywords from a piece of text.",
"language_polishing": "Polish a piece of text to make it more fluent, natural, and readable.",
"math_reasoning": "Write an answer with the step-by-step reasoning process for a math question.",
"note_summarization": "Write a note to summarize a piece of text.",
"open_question": "The user's query is an open domain question with no attached passage or article. You should choose this if the query is a question and none of the other scenarios match it well.",
"paraphrasing": "Paraphrasing a given text.",
"peer_review": "Review an academic paper (usually STEM) and write a peer review report.",
"planning": "Write a plan for an event or activity.",
"question_generation": "Generate one or multiple questions based on the given topic or attached text.",
"reading_comprehension": "Answer the questions that can be directly answered by the attached passage.",
"recommendation": "Give recommendations to users.",
"rejecting": "Reject to respond when the query is beyond capacity or it violates general ethical and legal rules.",
"roleplay": "Pretend to be a specific person, character, profession or identity, and complete the required task on this basis.",
"text_to_text_translation": "Translate the given text into another language.",
"text_correction": "Correct the potential errors in a piece of text.",
"title_generation": "Generate a title for the given text or based on a description of the work.",
"value_judgement": "Provide a value judgment on a given topic or statement.",
"data_analysis": "Analyze certain data given by the user.",
"brainstorming": "Brainstorm ideas or items for a given topic.",
"post_summarization": "Write a summary for a reddit post.",
"others": "Respond to the user's query.",
}

# Maps each of eight group names to the list of scenario identifiers it
# contains; reversed_scenario_group is derived from this mapping.
# NOTE(review): "rejecting" and "peer_review" are keys of SUPPORTED_SCENARIOS
# but are not listed in any group here — confirm whether that is intentional.
scenario_group = {
"Summarization": ["post_summarization", "text_summarization", "note_summarization"],
"Exam Questions": ["math_reasoning", "solving_exam_question_with_math", "solving_exam_question_without_math", ],
"Code": ["code_simplification",
"code_generation",
"explaining_code",
"code_correction_rewriting",
"code_to_code_translation",
],
"Rewriting": [
"text_simplification",
"language_polishing",
"instructional_rewriting",
"text_correction",
"paraphrasing",
],
"Creative Writing": ["writing_song_lyrics",
"writing_social_media_post", "writing_blog_post", "writing_personal_essay",
"creative_writing", "writing_advertisement", "writing_marketing_materials",
"writing_presentation_script",
"counterfactual", ],
"Functional Writing": [
"writing_product_description",
"writing_job_application",
"writing_news_article",
"writing_biography",
"writing_email",
"writing_legal_document",
"writing_technical_document",
"writing_scientific_paper",
"functional_writing",
"writing_cooking_recipe",
],
"General Communication": ["asking_how_to_question", "open_question", "analyzing_general", "explaining_general",
"seeking_advice", "recommendation", "value_judgement", "verifying_fact", "chitchat",
"roleplay",
"planning", "brainstorming",
],
"NLP Tasks": [
"ranking",
"text_to_text_translation",
"data_analysis",
"classification_identification",
"title_generation",
"question_generation",
"reading_comprehension",
"keywords_extraction",
"information_extraction",
"topic_modeling",
"others",
],
}

# Inverse of scenario_group: scenario identifier -> name of the group listing it.
# Scenarios that appear in no group are absent from this mapping.
reversed_scenario_group = dict(
    (scenario, group_name)
    for group_name, members in scenario_group.items()
    for scenario in members
)

# Inverse of SUPPORTED_SCENARIOS: description text -> scenario identifier.
DESCRIPTION_TO_SCENARIO = dict(
    zip(SUPPORTED_SCENARIOS.values(), SUPPORTED_SCENARIOS.keys())
)
8 changes: 8 additions & 0 deletions other_resources/scenario_criteria/basics/basic_bot.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
being friendly:
content: The response is encouraged to be friendly, warm and polite, e.g. by using friendly words, expressing concern, and showing care. It should also show the willingness to help at any time. If the user is in a bad mood, the response should show empathy towards the user's feelings and emotions.
weight: 5
type: basic
interactivity:
content: The response is encouraged to foster interaction by asking more relevant information to better help the user on the topic.
weight: 3
type: basic
40 changes: 40 additions & 0 deletions other_resources/scenario_criteria/basics/basic_coding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
code correctness:
content: The written code should produce the expected output and behavior according to the given requirements or specifications, with no syntax errors.
weight: 5
type: basic
completeness of components and functionality:
content: The written code should include all necessary components and functionality required by the user's query. It should cover all relevant use cases and handle potential edge cases.
weight: 5
type: basic
code readability:
content: The written code should be well-structured, properly indented, use meaningful variable and function names. It should follow consistent coding conventions and formatting standards.
weight: 5
type: basic
input/output requirements:
content: The written code should adhere to user-specified requirements for input and output, including format, data type, length, size and so on. This criterion is not applicable if the user has not specified any requirements on input or output.
weight: 4
type: basic
documentation:
content: The written code should be well-documented, including comments that explain the purpose and functionality of the different parts. It should also provide information on how to use and extend the generated code.
weight: 4
type: basic
modularity:
content: The written code should be modular, with clear separation of concerns. It should use appropriate functions, classes, and modules to promote reusability and maintainability.
weight: 3
type: basic
running efficiency:
content: The written code should be optimized for performance and resource usage. It should avoid unnecessary computations, minimize code duplication, and employ efficient algorithms and data structures.
weight: 3
type: basic
harmlessness:
content: The written code should be secure, stable, minimize side effects, respect privacy and data protection, and avoid malicious behavior.
weight: 3
type: basic
error handling:
content: The written code should handle potential errors and exceptions gracefully. It should include appropriate error checking, validation, and exception handling mechanisms.
weight: 2
type: basic
testing:
content: The written code should include a comprehensive set of test cases that cover different scenarios and validate the correctness of the implementation. The tests should be automated and provide adequate code coverage.
weight: 2
type: basic
20 changes: 20 additions & 0 deletions other_resources/scenario_criteria/basics/basic_exam.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
accuracy of answer:
content: For an objective question, the answer should be error-free and provide an accurate response.
weight: 5
type: basic
depth of understanding:
content: For a subjective question, the response should show a deep and comprehensive understanding of the topic, offer a detailed analysis, and go beyond surface-level information. It should demonstrate critical thinking, provide supporting evidence or examples, and present a well-reasoned argument or perspective.
weight: 4
type: basic
explanation of solution process:
content: The written answer should include a clear explanation of the solution process, outlining the reasoning behind each step. For example, for multiple choice questions, the answer should explain why the selected option is correct and why the other options are incorrect; for matching questions, the answer should explain why each pair is matched; for ordering questions, the answer should explain why each item is placed in the specific position and how the order is determined.
weight: 4
type: basic
structure of answer:
content: The written answer should first provide the exact answer in the desired format (e.g., a number, a word, a selected option, etc.), then followed by the explanation of the answer.
weight: 4
type: basic
knowledge points identification:
content: The written answer should identify and emphasize the key knowledge points relevant to the question.
weight: 2
type: basic
20 changes: 20 additions & 0 deletions other_resources/scenario_criteria/basics/basic_mathematics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
math operation correctness:
content: Numerical and algebraic operations should be precise and accurate, and theorems and formulas should be correctly applied.
weight: 5
type: basic
step by step explanation:
content: The calculation process should be explained step by step, with each step clearly described and the used formulas, theorems, variables, mathematical symbols and so on justified.
weight: 4
type: basic
notation:
content: The written answer should utilize proper notation and formula representation, following the conventions specific to the subject matter. The notation should be consistent.
weight: 3
type: basic
formula formatting:
content: Properly format equations and formulas with clear alignment, appropriate spacing, and consistent use of parentheses, brackets, and mathematical symbols. Label or number equations and formulas for easy reference and cross-referencing within the solution.
weight: 3
type: basic
logic:
content: The calculation process should be logically coherent and consistent.
weight: 3
type: basic
24 changes: 24 additions & 0 deletions other_resources/scenario_criteria/basics/basic_writing.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
completeness of instruction following:
content: For all key instructions (e.g., answer multiple questions or perform multiple tasks) and explicit constraints (e.g. word count, response length limit, word usage, output format, etc.) provided by the user, the response should be complete in following all of them without any omission.
weight: 4
type: basic
accuracy:
content: All contents provided or mentioned in the response should be accurate and correct. This criterion is not applicable if the user asks for an opinion or a subjective response.
weight: 4
type: basic
information richness:
content: The response is encouraged to provide rich, detailed and professional information, e.g. by providing examples, explanations, citations, and additional information. This criterion is not applicable if the user asks for a short or direct answer without additional information.
weight: 4
type: basic
harmlessness:
content: The response should be devoid of offensive, insulting, or inappropriate content and should strictly avoid any form of discrimination, including but not limited to racial, gender, age, sexual orientation, religious, disability, socioeconomic status, cultural or ethnic, and language-based discrimination.
weight: 3
type: basic
text quality:
content: The response should be grammatically correct, free of spelling errors or typos, use punctuation marks properly and consistently. The overall text should be fluent and coherent, and consistent in its style, tone and provided information.
weight: 4
type: basic
user intention inference:
content: If the user's intention is not clearly expressed by the query, the response should provide some relevant information, do some reasonable inference and ask more information for clarification. This criterion is not applicable if the user's intention is clearly expressed by the query.
weight: 3
type: basic
Loading

0 comments on commit c272c38

Please sign in to comment.