From ee8ced174e173869efeedec4b0357daed985f653 Mon Sep 17 00:00:00 2001 From: ubuntu Date: Wed, 6 Sep 2023 20:08:28 +0800 Subject: [PATCH] modify the config of mgsm and logic_grid. add vertical_solver_first structure --- agentverse/agents/pipeline/critic.py | 4 +- agentverse/agents/pipeline/solver.py | 5 +- agentverse/agentversepipeline.py | 3 +- .../environments/decision_maker/__init__.py | 2 + .../decision_maker/vertical_solver_first.py | 87 ++++++ agentverse/environments/executor/code_test.py | 12 +- agentverse/initialization.py | 5 +- agentverse/memory/chat_history.py | 4 +- .../humaneval/gpt-3.5-concurrent/config.yaml | 244 +++++++++++++++++ .../config.yaml | 256 ++++++++++++++++++ .../config.yaml | 253 +++++++++++++++++ .../gpt-3.5-vertical-solver-first/config.yaml | 249 +++++++++++++++++ .../config.yaml | 244 +++++++++++++++++ .../config.yaml | 244 +++++++++++++++++ .../tasks/humaneval/gpt-4-new/config.yaml | 6 +- agentverse/tasks/humaneval/output_parser.py | 138 ++++++++-- .../tasks/logic_grid/gpt-4-new/config.yaml | 176 ++++++++++++ .../tasks/mgsm/gpt-3.5-autogpt/config.yaml | 200 ++++++++++++++ agentverse/tasks/mgsm/gpt-3.5-new/config.yaml | 189 +++++++++++++ agentverse/tasks/mgsm/output_parser.py | 53 +++- agentverse/tasks/responsegen/output_parser.py | 3 - benchmark.py | 2 +- dataloader/humaneval.py | 6 + dataloader/logic_grid.py | 3 +- dataloader/mgsm.py | 2 + evaluate_logic.py | 71 +++++ evaluate_math.py | 90 ++++++ 27 files changed, 2505 insertions(+), 46 deletions(-) create mode 100644 agentverse/environments/decision_maker/vertical_solver_first.py create mode 100644 agentverse/tasks/humaneval/gpt-3.5-concurrent/config.yaml create mode 100644 agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt-2/config.yaml create mode 100644 agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt/config.yaml create mode 100644 agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first/config.yaml create mode 100644 agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first-rust/config.yaml create mode 100644 agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first/config.yaml create mode 100644 agentverse/tasks/logic_grid/gpt-4-new/config.yaml create mode 100644 agentverse/tasks/mgsm/gpt-3.5-autogpt/config.yaml create mode 100644 agentverse/tasks/mgsm/gpt-3.5-new/config.yaml create mode 100644 evaluate_logic.py create mode 100644 evaluate_math.py diff --git a/agentverse/agents/pipeline/critic.py b/agentverse/agents/pipeline/critic.py index 1c45f463b..6272b3b4d 100644 --- a/agentverse/agents/pipeline/critic.py +++ b/agentverse/agents/pipeline/critic.py @@ -18,6 +18,8 @@ @agent_registry.register("critic") class CriticAgent(BaseAgent): + max_history: int = 3 + def step(self, env_description: str = "") -> CriticMessage: pass @@ -35,7 +37,7 @@ async def astep( task_description=task_description, role_description=self.role_description, ) - history = self.memory.to_messages(self.name) + history = self.memory.to_messages(self.name, start_index=-self.max_history) parsed_response: Union[AgentCriticism, None] = None for i in range(self.max_retry): try: diff --git a/agentverse/agents/pipeline/solver.py b/agentverse/agents/pipeline/solver.py index 1151c61e8..ca7cb5cec 100644 --- a/agentverse/agents/pipeline/solver.py +++ b/agentverse/agents/pipeline/solver.py @@ -21,6 +21,8 @@ @agent_registry.register("solver") class SolverAgent(BaseAgent): + max_history: int = 3 + def step( self, former_solution: str, @@ -35,8 +37,9 @@ def step( former_solution=former_solution, 
task_description=task_description, advice=advice, + role_description=self.role_description, ) - history = self.memory.to_messages(self.name) # Critic Opinions + history = self.memory.to_messages(self.name, start_index=-self.max_history) parsed_response = None for i in range(self.max_retry): try: diff --git a/agentverse/agentversepipeline.py b/agentverse/agentversepipeline.py index 8aca492c5..aa8195f6b 100644 --- a/agentverse/agentversepipeline.py +++ b/agentverse/agentversepipeline.py @@ -89,6 +89,7 @@ def run( """ self.environment.reset() + self.logs = [] advice = "No advice yet." previous_plan = "No solution yet." while not self.environment.is_done(): @@ -97,7 +98,7 @@ def run( ) self.logs += logs self.save_result(previous_plan, single_agent) - return previous_plan, result, logs + return previous_plan, result, self.logs ''' def iter_run( diff --git a/agentverse/environments/decision_maker/__init__.py b/agentverse/environments/decision_maker/__init__.py index d2b49b1ae..6576bcc99 100644 --- a/agentverse/environments/decision_maker/__init__.py +++ b/agentverse/environments/decision_maker/__init__.py @@ -6,3 +6,5 @@ from .horizontal import HorizontalDecisionMaker from .vertical import VerticalDecisionMaker from .dynamic import DynamicDecisionMaker +from .vertical_solver_first import VerticalSolverFirstDecisionMaker +from .concurrent import ConcurrentDecisionMaker diff --git a/agentverse/environments/decision_maker/vertical_solver_first.py b/agentverse/environments/decision_maker/vertical_solver_first.py new file mode 100644 index 000000000..3aab38e0f --- /dev/null +++ b/agentverse/environments/decision_maker/vertical_solver_first.py @@ -0,0 +1,87 @@ +from __future__ import annotations +import asyncio +from colorama import Fore + +from typing import TYPE_CHECKING, List + +from . import decision_maker_registry +from .base import BaseDecisionMaker +from agentverse.logging import typewriter_log, logger +from agentverse.message import Message + +if TYPE_CHECKING: + from agentverse.agents import BaseAgent, SolverAgent, CriticAgent + from agentverse.message import CriticMessage, SolverMessage + + +@decision_maker_registry.register("vertical-solver-first") +class VerticalSolverFirstDecisionMaker(BaseDecisionMaker): + """ + Discuss in a vertical manner. + """ + + name: str = "vertical" + max_inner_turns: int = 3 + + async def astep( + self, + agents: List[BaseAgent], + task_description: str, + previous_plan: str = "No solution yet.", + advice: str = "No advice yet.", + *args, + **kwargs, + ) -> List[SolverMessage]: + # Here we assume that the first agent is the solver. + # The rest of the agents are the reviewers. 
+ if advice != "No advice yet.": + self.broadcast_messages( + agents, [Message(content=advice, sender="Evaluator")] + ) + previous_plan = agents[0].step(previous_plan, advice, task_description) + self.broadcast_messages(agents, [previous_plan]) + logger.info("", f"Initial Plan:\n{previous_plan.content}", Fore.BLUE) + for i in range(self.max_inner_turns): + reviews: List[CriticMessage] = await asyncio.gather( + *[ + agent.astep(previous_plan, advice, task_description) + for agent in agents[1:] + ] + ) + logger.info( + "", + "Reviews:\n" + + "\n".join( + [f"[{review.sender}]: {review.content}" for review in reviews] + ), + Fore.YELLOW, + ) + + nonempty_reviews = [] + for review in reviews: + if not review.is_agree and review.content != "": + nonempty_reviews.append(review) + if len(nonempty_reviews) == 0: + logger.info("", "Consensus Reached!.", Fore.GREEN) + break + self.broadcast_messages(agents, nonempty_reviews) + previous_plan = agents[0].step(previous_plan, advice, task_description) + logger.info("", f"Updated Plan:\n{previous_plan.content}", Fore.BLUE) + self.broadcast_messages(agents, [previous_plan]) + result = previous_plan + return [result] + + def broadcast_messages(self, agents, messages) -> None: + for agent in agents: + agent.add_message_to_memory(messages) + + def p2p_messages(self, agents, messages) -> None: + agents[0].add_message_to_memory(messages) + for message in messages: + for agent in agents[1:]: + if agent.name == message.sender: + agent.add_message_to_memory(messages) + break + + def reset(self): + pass diff --git a/agentverse/environments/executor/code_test.py b/agentverse/environments/executor/code_test.py index 9012c1720..4cb730e54 100644 --- a/agentverse/environments/executor/code_test.py +++ b/agentverse/environments/executor/code_test.py @@ -17,6 +17,8 @@ def execute_command(command: str) -> str: @executor_registry.register("code-test") class CodeTestExecutor(BaseExecutor): + has_test: dict = {} + def step( self, agent: ExecutorAgent, @@ -27,9 +29,13 @@ def step( ) -> Any: os.makedirs("tmp", exist_ok=True) self.write_to_file("tmp/main.py", solution) - response = agent.step(task_description, solution).content - self.write_to_file(response["file_path"], response["code"]) - result = execute_command(f"python {response['file_path']}") + if task_description not in self.has_test: + response = agent.step(task_description, solution).content + self.write_to_file(response["file_path"], response["code"]) + self.has_test[task_description] = f"python {response['file_path']}" + result = execute_command(f"python {response['file_path']}") + else: + result = execute_command(self.has_test[task_description]) return result def write_to_file(self, file_name, file_content): diff --git a/agentverse/initialization.py b/agentverse/initialization.py index cd08fa4ee..463b573ec 100644 --- a/agentverse/initialization.py +++ b/agentverse/initialization.py @@ -4,13 +4,14 @@ from typing import Dict, List, TYPE_CHECKING import yaml -''' + +""" try: from bmtools.agent.singletool import import_all_apis, load_single_tools except: print("BMTools is not installed, tools cannot be used. 
To install BMTools, \ please follow the instruction in the README.md file.") -''' +""" from agentverse.llms import llm_registry diff --git a/agentverse/memory/chat_history.py b/agentverse/memory/chat_history.py index a6f06242f..1666d1d4d 100644 --- a/agentverse/memory/chat_history.py +++ b/agentverse/memory/chat_history.py @@ -29,9 +29,9 @@ def to_string(self, add_sender_prefix: bool = False) -> str: else: return "\n".join([message.content for message in self.messages]) - def to_messages(self, my_name: str = ""): + def to_messages(self, my_name: str = "", start_index: int = 0) -> List[dict]: messages = [] - for message in self.messages: + for message in self.messages[start_index:]: messages.append( { "role": "user" if message.sender != my_name else "assistant", diff --git a/agentverse/tasks/humaneval/gpt-3.5-concurrent/config.yaml b/agentverse/tasks/humaneval/gpt-3.5-concurrent/config.yaml new file mode 100644 index 000000000..e4b9abce5 --- /dev/null +++ b/agentverse/tasks/humaneval/gpt-3.5-concurrent/config.yaml @@ -0,0 +1,244 @@ +cnt_agents: &cnt_agents 3 +max_turn: &max_turn 3 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you need to recruit a small group of experts with diverse identity to correctly write the code to solve the given problems: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + You and a team is discussing on completing the following function: + ```python + ${task_description} + ``` + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. From the above discussion, can you provide a correct completion of the code? You must respond with only the Python code wrapped with markdown quotes "```". + # You should respond in the following json format wrapped with markdown quotes: + # ```json + # { + # "text": "your thought", + # "reasoning": "your reasoning", + # "criticism": "constructive self-criticism", + # "code": "the final code completion", + # } + # ``` + + # Respond only the json, and nothing else. Make sure it can be directly parsed with Python `json.loads`. + + critic_prepend_prompt: &critic_prepend_prompt |- + You are in a discussion group, aiming to complete the following code function: + ```python + ${task_description} + ``` + + critic_append_prompt: &critic_append_prompt |- + You are ${role_description}. Based on your knowledge, can you complete the function? You should explain your reasoning step by step, and give the code completion with detailed comment. + + manager_prompt: &manager_prompt |- + According to the Previous Solution and the Previous Sentences, select the most appropriate Critic from a specific Role and output the Role. 
+ ```python + ${task_description} + ``` + # Previous Solution + The solution you gave in the last step is: + ${former_solution} + + # Critics + There are some critics on the above solution: + ``` + ${critic_opinions} + ``` + + # Previous Sentences + The previous sentences in the previous rounds is: + ${previous_sentence} + + executor_prepend_prompt: &executor_prepend_prompt |- + You are an experienced program tester. Now your team is trying to solve the problem: + ''' + Complete the Python function: + ${task_description} + ''' + + Your team has given the following answer: + ''' + ${solution} + ''' + + executor_append_prompt: &executor_append_prompt |- + The solution has been written to `tmp/main.py`. Your are going to write the unit testing code for the solution. You should respond in the following json format wrapped with markdown quotes: + ```json + { + "thought": your thought, + "file_path": the path to write your testing code, + "code": the testing code, + "command": the command to change directory and execute your testing code + } + ``` + + Respond only the json, and nothing else. + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + You are an experienced code reviewer. As a good reviewer, you carefully check the correctness of the given code completion. When the completion is incorrect, you should patiently teach the writer how to correct the completion. + + # Experts + The experts recruited in this turn includes: + ${all_role_description} + + # Problem and Writer's Solution + Problem: + ${task_description} + + Writer's Solution: + ${solution} + + evaluator_append_prompt: &evaluator_append_prompt |- + # Response Format Guidance + You must respond in the following format: + Score: (0 or 1, 0 for incorrect and 1 for correct) + Response: (give your advice on how to correct the solution, and your suggestion on on what experts should recruit in the next round) + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: concurrent + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0.3 + max_tokens: 2048 + output_parser: + type: humaneval-solver + # stop: + # - "\ndef " + # - "\nclass " + # - "\nif " + # - "\n\n#" + + - #critic_agents: + agent_type: critic + name: Critic 1 + role_description: |- + Waiting to be assigned. 
+ prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0 + max_tokens: 2048 + output_parser: + type: humaneval-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + prepend_prompt_template: *executor_prepend_prompt + append_prompt_template: *executor_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-executor + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Score + + + - #manager_agent: + agent_type: manager + name: Manager + prompt_template: *manager_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-manager + + diff --git a/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt-2/config.yaml b/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt-2/config.yaml new file mode 100644 index 000000000..e4dba494f --- /dev/null +++ b/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt-2/config.yaml @@ -0,0 +1,256 @@ +cnt_agents: &cnt_agents 2 +max_turn: &max_turn 5 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you need to recruit a small group of experts with diverse identity to correctly write the code to solve the given problems: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you complete the following code? + ```python + ${task_description} + ``` + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Using these information, can you provide a correct completion of the code? 
+ You should respond in the following format: + Text: your thought + Reasoning: your reasoning + Criticism: constructive self-criticism + Code: the final code completion with step by step comment + + # Example Function: + # ```python + # def add(num1: float, num2: float) -> float: + # """A function that returns the addition of the two number.""" + # return num1 + num2 + # ``` + + # Example Response: + # Text: + + critic_prepend_prompt: &critic_prepend_prompt |- + You are in a discussion group, aiming to complete the following code function: + ```python + ${task_description} + ``` + + critic_append_prompt: &critic_append_prompt |- + You are ${role_description}. Based on your knowledge, can you check the correctness of the latest completion given above? Write your reasoning step by step. You should respond in the following format: + Text: your thought + Reasoning: your reasoning + Criticism: constructive self-criticism + Speak: your words to say to others + Final Decision: "[Agree]" if you think the latest completion is correct, or "[Disagree]" if you think it is wrong. + + manager_prompt: &manager_prompt |- + According to the Previous Solution and the Previous Sentences, select the most appropriate Critic from a specific Role and output the Role. + ```python + ${task_description} + ``` + # Previous Solution + The solution you gave in the last step is: + ${former_solution} + + # Critics + There are some critics on the above solution: + ``` + ${critic_opinions} + ``` + + # Previous Sentences + The previous sentences in the previous rounds is: + ${previous_sentence} + + executor_prepend_prompt: &executor_prepend_prompt |- + You are an experienced program tester. Now your team is trying to solve the problem: + ''' + Complete the Python function: + ${task_description} + ''' + + Your team has given the following answer: + ''' + ${solution} + ''' + + executor_append_prompt: &executor_append_prompt |- + The solution has been written to `tmp/main.py`. Your are going to write the unit testing code for the solution. You should respond in the following json format wrapped with markdown quotes: + ```json + { + "thought": your thought, + "file_path": the path to write your testing code, + "code": the testing code, + "command": the command to change directory and execute your testing code + } + ``` + + Respond only the json, and nothing else. + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + You are an experienced code reviewer. As a good reviewer, you carefully check the correctness of the given code completion. When the completion is incorrect, you should patiently teach the writer how to correct the completion. 
+ + # Experts + The experts recruited in this turn includes: + ${all_role_description} + + # Problem and Writer's Solution + Problem: + ${task_description} + + Writer's Solution: + ${solution} + + Tester's Feedback: + ${result} + + evaluator_append_prompt: &evaluator_append_prompt |- + # Response Format Guidance + You must respond in the following format: + Score: (0 or 1, 0 for incorrect and 1 for correct) + Response: (give your advice on how to correct the solution, and your suggestion on on what experts should recruit in the next round) + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0.3 + max_tokens: 2048 + output_parser: + type: humaneval-solver-autogpt-2 + # stop: + # - "\ndef " + # - "\nclass " + # - "\nif " + # - "\n\n#" + + - #critic_agents: + agent_type: critic + name: Critic 1 + role_description: |- + Waiting to be assigned. + prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0 + max_tokens: 2048 + output_parser: + type: humaneval-critic-autogpt + + - #executor_agent: + agent_type: executor + name: Executor + prepend_prompt_template: *executor_prepend_prompt + append_prompt_template: *executor_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-executor + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Score + + + - #manager_agent: + agent_type: manager + name: Manager + prompt_template: *manager_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-manager + + diff --git a/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt/config.yaml b/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt/config.yaml new file mode 100644 index 000000000..81c60f95f --- /dev/null +++ b/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first-autogpt/config.yaml @@ -0,0 +1,253 @@ +cnt_agents: &cnt_agents 2 +max_turn: &max_turn 5 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: 
&role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you need to recruit a small group of experts with diverse identity to correctly write the code to solve the given problems: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you complete the following code? + ```python + ${task_description} + ``` + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Using these information, can you provide a correct completion of the code? + You should respond in the following format: + Text: your thought + Reasoning: your reasoning + Criticism: constructive self-criticism + Code: the final code completion with step by step comment + + # Example Function: + # ```python + # def add(num1: float, num2: float) -> float: + # """A function that returns the addition of the two number.""" + # return num1 + num2 + # ``` + + # Example Response: + # Text: + + critic_prepend_prompt: &critic_prepend_prompt |- + You are in a discussion group, aiming to complete the following code function: + ```python + ${task_description} + ``` + + critic_append_prompt: &critic_append_prompt |- + You are ${role_description}. Based on your knowledge, can you check the correctness of the latest completion given above? Write your reasoning step by step. When responding, you should follow the rules: + 1. If the latest provided solution is correct, end your response with a special token "[Agree]". + 2. If the solution is incorrect, give your comment and end your response with a special token "[Disagree]". + + manager_prompt: &manager_prompt |- + According to the Previous Solution and the Previous Sentences, select the most appropriate Critic from a specific Role and output the Role. + ```python + ${task_description} + ``` + # Previous Solution + The solution you gave in the last step is: + ${former_solution} + + # Critics + There are some critics on the above solution: + ``` + ${critic_opinions} + ``` + + # Previous Sentences + The previous sentences in the previous rounds is: + ${previous_sentence} + + executor_prepend_prompt: &executor_prepend_prompt |- + You are an experienced program tester. Now your team is trying to solve the problem: + ''' + Complete the Python function: + ${task_description} + ''' + + Your team has given the following answer: + ''' + ${solution} + ''' + + executor_append_prompt: &executor_append_prompt |- + The solution has been written to `tmp/main.py`. Your are going to write the unit testing code for the solution. You should respond in the following json format wrapped with markdown quotes: + ```json + { + "thought": your thought, + "file_path": the path to write your testing code, + "code": the testing code, + "command": the command to change directory and execute your testing code + } + ``` + + Respond only the json, and nothing else. + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + You are an experienced code reviewer. 
As a good reviewer, you carefully check the correctness of the given code completion. When the completion is incorrect, you should patiently teach the writer how to correct the completion. + + # Experts + The experts recruited in this turn includes: + ${all_role_description} + + # Problem and Writer's Solution + Problem: + ${task_description} + + Writer's Solution: + ${solution} + + Tester's Feedback: + ${result} + + evaluator_append_prompt: &evaluator_append_prompt |- + # Response Format Guidance + You must respond in the following format: + Score: (0 or 1, 0 for incorrect and 1 for correct) + Response: (give your advice on how to correct the solution, and your suggestion on on what experts should recruit in the next round) + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0.3 + max_tokens: 2048 + output_parser: + type: humaneval-solver-autogpt-2 + # stop: + # - "\ndef " + # - "\nclass " + # - "\nif " + # - "\n\n#" + + - #critic_agents: + agent_type: critic + name: Critic 1 + role_description: |- + Waiting to be assigned. 
+ prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0 + max_tokens: 2048 + output_parser: + type: humaneval-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + prepend_prompt_template: *executor_prepend_prompt + append_prompt_template: *executor_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-executor + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Score + + + - #manager_agent: + agent_type: manager + name: Manager + prompt_template: *manager_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-manager + + diff --git a/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first/config.yaml b/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first/config.yaml new file mode 100644 index 000000000..faa64da55 --- /dev/null +++ b/agentverse/tasks/humaneval/gpt-3.5-vertical-solver-first/config.yaml @@ -0,0 +1,249 @@ +cnt_agents: &cnt_agents 2 +max_turn: &max_turn 5 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you need to recruit a small group of experts with diverse identity to correctly write the code to solve the given problems: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you complete the following code? + ```python + ${task_description} + ``` + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Using these information, can you provide a correct completion of the code? Explain your reasoning step by step in the code comment. You must respond with only the Python code wrapped with markdown quotes "```". + # You should respond in the following json format wrapped with markdown quotes: + # ```json + # { + # "text": "your thought", + # "reasoning": "your reasoning", + # "criticism": "constructive self-criticism", + # "code": "the final code completion", + # } + # ``` + + # Respond only the json, and nothing else. Make sure it can be directly parsed with Python `json.loads`. 
+
+  critic_prepend_prompt: &critic_prepend_prompt |-
+    You are in a discussion group, aiming to complete the following code function:
+    ```python
+    ${task_description}
+    ```
+
+  critic_append_prompt: &critic_append_prompt |-
+    You are ${role_description}. Based on your knowledge, can you check the correctness of the latest completion given above? Write your reasoning step by step. When responding, you should follow these rules:
+    1. If the latest provided solution is correct, end your response with a special token "[Agree]".
+    2. If the solution is incorrect, give your comment and end your response with a special token "[Disagree]".
+
+  manager_prompt: &manager_prompt |-
+    According to the Previous Solution and the Previous Sentences, select the most appropriate Critic from a specific Role and output the Role.
+    ```python
+    ${task_description}
+    ```
+    # Previous Solution
+    The solution you gave in the last step is:
+    ${former_solution}
+
+    # Critics
+    There are some critics on the above solution:
+    ```
+    ${critic_opinions}
+    ```
+
+    # Previous Sentences
+    The previous sentences in the previous rounds are:
+    ${previous_sentence}
+
+  executor_prepend_prompt: &executor_prepend_prompt |-
+    You are an experienced program tester. Now your team is trying to solve the problem:
+    '''
+    Complete the Python function:
+    ${task_description}
+    '''
+
+    Your team has given the following answer:
+    '''
+    ${solution}
+    '''
+
+  executor_append_prompt: &executor_append_prompt |-
+    The solution has been written to `tmp/main.py`. You are going to write the unit testing code for the solution. You should respond in the following json format wrapped with markdown quotes:
+    ```json
+    {
+        "thought": your thought,
+        "file_path": the path to write your testing code,
+        "code": the testing code,
+        "command": the command to change directory and execute your testing code
+    }
+    ```
+
+    Respond only the json, and nothing else.
+
+  evaluator_prepend_prompt: &evaluator_prepend_prompt |-
+    You are an experienced code reviewer. As a good reviewer, you carefully check the correctness of the given code completion. When the completion is incorrect, you should patiently teach the writer how to correct the completion.
+ + # Experts + The experts recruited in this turn includes: + ${all_role_description} + + # Problem and Writer's Solution + Problem: + ${task_description} + + Writer's Solution: + ${solution} + + Tester's Feedback: + ${result} + + evaluator_append_prompt: &evaluator_append_prompt |- + # Response Format Guidance + You must respond in the following format: + Score: (0 or 1, 0 for incorrect and 1 for correct) + Response: (give your advice on how to correct the solution, and your suggestion on on what experts should recruit in the next round) + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0.3 + max_tokens: 2048 + output_parser: + type: humaneval-solver + # stop: + # - "\ndef " + # - "\nclass " + # - "\nif " + # - "\n\n#" + + - #critic_agents: + agent_type: critic + name: Critic 1 + role_description: |- + Waiting to be assigned. + prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + prepend_prompt_template: *executor_prepend_prompt + append_prompt_template: *executor_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-executor + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Score + + + - #manager_agent: + agent_type: manager + name: Manager + prompt_template: *manager_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-manager + + diff --git a/agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first-rust/config.yaml b/agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first-rust/config.yaml new file mode 100644 index 000000000..c24d66e6a --- /dev/null +++ b/agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first-rust/config.yaml @@ -0,0 +1,244 @@ +cnt_agents: &cnt_agents 4 +max_turn: &max_turn 2 +max_criticizing_rounds: 2 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + 
# Role Description + You are the leader of a group of experts, now you need to recruit a small group of experts with diverse identity to correctly write the code to solve the given problems: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you complete the following code? + ```rust + ${task_description} + ``` + + # # Previous Solution + # The solution you gave in the last step is: + # ${former_solution} + + # # Evaluation + # The unit testing gave the following suggestion: + # ${advice} + + # # Critics + # The following messages are the critics on the provided solution: + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Provide a correct completion of the code. Explain your reasoning. Your response should contain only Rust code. Do not give any additional information. Use ```rust to put the completed Rust code in markdown quotes. When responding, please include the given code and the completion. + + critic_prepend_prompt: &critic_prepend_prompt |- + You are in a discussion group, aiming to complete the following code function: + ```rust + ${task_description} + ``` + + # Below is a possible code completion: + # ``` + # ${preliminary_solution} + # ``` + + critic_append_prompt: &critic_append_prompt |- + You are ${role_description}. Based on your knowledge, can you check the functional correctness of the latest completion given above? When responding, you should follow the following rules: + 1. If the latest provided solution is correct, end your response with a special token "[Agree]". + 2. If the solution is incorrect, give your comment and end your response with a special token "[Disagree]". + + + manager_prompt: &manager_prompt |- + According to the Previous Solution and the Previous Sentences, select the most appropriate Critic from a specific Role and output the Role. + ```rust + ${task_description} + ``` + # Previous Solution + The solution you gave in the last step is: + ${former_solution} + + # Critics + There are some critics on the above solution: + ``` + ${critic_opinions} + ``` + + # Previous Sentences + The previous sentences in the previous rounds is: + ${previous_sentence} + + executor_prepend_prompt: &executor_prepend_prompt |- + You are an experienced program tester. Now your team is trying to solve the problem: + ''' + Complete the Rust function: + ${task_description} + ''' + + executor_append_prompt: &executor_append_prompt |- + The solution has been written to `tmp/main.py`. Your are going to write the unit testing code for the solution. You should respond in the following format: + Thought: your thought + File Path: the path to write your testing code + Code: the testing code with explaination in docstring. make sure to write the input in the assertion to make it appear in the unit test report. 
+ Command: the command to change directory and execute your testing code + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + # Problem + Complete the following function + ```rust + ${task_description} + ``` + + # Experts + The experts recruited in this turn includes: + ${all_role_description} + + # Writer's Solution: + ${solution} + + # Tester's Feedback: + ${result} + + evaluator_append_prompt: &evaluator_append_prompt |- + # Response Format Guidance + You must respond in the following format: + Score: (0 or 1, 0 for incorrect and 1 for correct) + Response: (give your advice on how to correct the solution, and your suggestion on what experts should recruit in the next round) + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: code-test + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + max_retry: 1000 + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + max_retry: 1000 + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-solver + # stop: + # - "\ndef " + # - "\nclass " + # - "\nif " + # - "\n\n#" + + - #critic_agents: + agent_type: critic + name: Critic 1 + max_retry: 1000 + role_description: |- + Waiting to be assigned. 
+ prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + max_retry: 1000 + prepend_prompt_template: *executor_prepend_prompt + append_prompt_template: *executor_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-executor + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + max_retry: 1000 + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Score + + + - #manager_agent: + agent_type: manager + name: Manager + max_retry: 1000 + prompt_template: *manager_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-manager + + diff --git a/agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first/config.yaml b/agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first/config.yaml new file mode 100644 index 000000000..6e98549b7 --- /dev/null +++ b/agentverse/tasks/humaneval/gpt-4-new-vertical-sovler-first/config.yaml @@ -0,0 +1,244 @@ +cnt_agents: &cnt_agents 4 +max_turn: &max_turn 2 +max_criticizing_rounds: 2 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you need to recruit a small group of experts with diverse identity to correctly write the code to solve the given problems: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you complete the following code? + ```python + ${task_description} + ``` + + # # Previous Solution + # The solution you gave in the last step is: + # ${former_solution} + + # # Evaluation + # The unit testing gave the following suggestion: + # ${advice} + + # # Critics + # The following messages are the critics on the provided solution: + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Provide a correct completion of the code. Explain your reasoning. Your response should contain only Python code. Do not give any additional information. Use ```python to put the completed Python code in markdown quotes. When responding, please include the given code and the completion. 
+
+  critic_prepend_prompt: &critic_prepend_prompt |-
+    You are in a discussion group, aiming to complete the following code function:
+    ```python
+    ${task_description}
+    ```
+
+    # Below is a possible code completion:
+    # ```
+    # ${preliminary_solution}
+    # ```
+
+  critic_append_prompt: &critic_append_prompt |-
+    You are ${role_description}. Based on your knowledge, can you check the functional correctness of the latest completion given above? When responding, you should follow these rules:
+    1. If the latest provided solution is correct, end your response with a special token "[Agree]".
+    2. If the solution is incorrect, give your comment and end your response with a special token "[Disagree]".
+
+
+  manager_prompt: &manager_prompt |-
+    According to the Previous Solution and the Previous Sentences, select the most appropriate Critic from a specific Role and output the Role.
+    ```python
+    ${task_description}
+    ```
+    # Previous Solution
+    The solution you gave in the last step is:
+    ${former_solution}
+
+    # Critics
+    There are some critics on the above solution:
+    ```
+    ${critic_opinions}
+    ```
+
+    # Previous Sentences
+    The previous sentences in the previous rounds are:
+    ${previous_sentence}
+
+  executor_prepend_prompt: &executor_prepend_prompt |-
+    You are an experienced program tester. Now your team is trying to solve the problem:
+    '''
+    Complete the Python function:
+    ${task_description}
+    '''
+
+  executor_append_prompt: &executor_append_prompt |-
+    The solution has been written to `tmp/main.py`. You are going to write the unit testing code for the solution. You should respond in the following format:
+    Thought: your thought
+    File Path: the path to write your testing code
+    Code: the testing code with explanation in the docstring. Make sure to write the input in the assertion to make it appear in the unit test report.
+ Command: the command to change directory and execute your testing code + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + # Problem + Complete the following function + ```python + ${task_description} + ``` + + # Experts + The experts recruited in this turn includes: + ${all_role_description} + + # Writer's Solution: + ${solution} + + # Tester's Feedback: + ${result} + + evaluator_append_prompt: &evaluator_append_prompt |- + # Response Format Guidance + You must respond in the following format: + Score: (0 or 1, 0 for incorrect and 1 for correct) + Response: (give your advice on how to correct the solution, and your suggestion on what experts should recruit in the next round) + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: code-test + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + max_retry: 1000 + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + max_retry: 1000 + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-solver + # stop: + # - "\ndef " + # - "\nclass " + # - "\nif " + # - "\n\n#" + + - #critic_agents: + agent_type: critic + name: Critic 1 + max_retry: 1000 + role_description: |- + Waiting to be assigned. 
+ prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + max_retry: 1000 + prepend_prompt_template: *executor_prepend_prompt + append_prompt_template: *executor_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-executor + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + max_retry: 1000 + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Score + + + - #manager_agent: + agent_type: manager + name: Manager + max_retry: 1000 + prompt_template: *manager_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: humaneval-manager + + diff --git a/agentverse/tasks/humaneval/gpt-4-new/config.yaml b/agentverse/tasks/humaneval/gpt-4-new/config.yaml index 72b24aede..1ec899eba 100644 --- a/agentverse/tasks/humaneval/gpt-4-new/config.yaml +++ b/agentverse/tasks/humaneval/gpt-4-new/config.yaml @@ -1,5 +1,5 @@ cnt_agents: &cnt_agents 4 -max_rounds: &max_rounds 5 +max_turn: &max_turn 5 max_criticizing_rounds: 3 prompts: @@ -128,8 +128,6 @@ prompts: You must respond in the following format: Score: (0 or 1, 0 for incorrect and 1 for correct) Response: (give your advice on how to correct the solution, and your suggestion on on what experts should recruit in the next round) - - Now carefully check the writer's solution, summarize the tester's feedback, and give your response. 
name: pipeline @@ -137,7 +135,7 @@ name: pipeline environment: env_type: pipeline - max_rounds: *max_rounds + max_turn: *max_turn role_assigner: type: role_description cnt_agents: *cnt_agents diff --git a/agentverse/tasks/humaneval/output_parser.py b/agentverse/tasks/humaneval/output_parser.py index 2e484d785..83a2514e6 100644 --- a/agentverse/tasks/humaneval/output_parser.py +++ b/agentverse/tasks/humaneval/output_parser.py @@ -23,16 +23,52 @@ def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: class HumanevalSolverParser(OutputParser): def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: text = output.content - end_pos = text.rfind("```") - if end_pos == -1: + # start_pos = text.find("```") + # end_pos = text.rfind("```") + # if end_pos == -1: + # raise OutputParserError(text) + # text = text[start_pos:end_pos] + # cleaned_output = text.strip().strip("```").strip() + # if cleaned_output.startswith("python"): + # cleaned_output = cleaned_output[6:].strip() + # elif cleaned_output.startswith("python3"): + # cleaned_output = cleaned_output[7:].strip() + code = re.findall(r"```.*?\n(.+?)```", text, re.DOTALL)[-1] + + return AgentFinish({"output": code}, text) + + +@output_parser_registry.register("humaneval-solver-autogpt") +class HumanevalSolverParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + json_dict = re.findall(r"```.*?\n(.+?)```", text, re.DOTALL)[-1] + try: + cleaned_output = ast.literal_eval(json_dict) + except BaseException as e: raise OutputParserError(text) - text = text[:end_pos] - cleaned_output = text.strip().strip("```").strip() - if cleaned_output.startswith("python"): - cleaned_output = cleaned_output[6:].strip() - elif cleaned_output.startswith("python3"): - cleaned_output = cleaned_output[7:].strip() - return AgentFinish({"output": cleaned_output}, text) + if "code" not in json_dict: + raise OutputParserError(text) + return AgentFinish({"output": cleaned_output["code"]}, text) + + +@output_parser_registry.register("humaneval-solver-autogpt-2") +class HumanevalSolverParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + try: + parsed_result = re.findall( + r"Text:(.+?)Reasoning:(.+?)Criticism:(.+?)Code:(.+)", text, re.DOTALL + )[0] + except BaseException as e: + raise OutputParserError(text) + code = parsed_result[-1].strip() + if code.startswith("```"): + try: + code = re.findall(r"```.*?\n(.+?)```", code, re.DOTALL)[0].strip() + except BaseException as e: + raise OutputParserError(text) + return AgentFinish({"output": code}, text) @output_parser_registry.register("humaneval-manager") @@ -41,23 +77,23 @@ def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": output.content}, output.content) -@output_parser_registry.register("humaneval-solver") -class HumanevalSolverParser(OutputParser): - def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: - text = output.content - end_pos = text.rfind("```") - if end_pos == -1: - raise OutputParserError(text) - text = text[:end_pos] - cleaned_output = text.strip().strip("```").strip() - if cleaned_output.startswith("python"): - cleaned_output = cleaned_output[6:].strip() - elif cleaned_output.startswith("python3"): - cleaned_output = cleaned_output[7:].strip() - return AgentFinish({"output": cleaned_output}, text) +# @output_parser_registry.register("humaneval-solver") +# class 
HumanevalSolverParser(OutputParser): +# def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: +# text = output.content +# end_pos = text.rfind("```") +# if end_pos == -1: +# raise OutputParserError(text) +# text = text[:end_pos] +# cleaned_output = text.strip().strip("```").strip() +# if cleaned_output.startswith("python"): +# cleaned_output = cleaned_output[6:].strip() +# elif cleaned_output.startswith("python3"): +# cleaned_output = cleaned_output[7:].strip() +# return AgentFinish({"output": cleaned_output}, text) -@output_parser_registry.register("humaneval-executor") +@output_parser_registry.register("humaneval-executor-autogpt") class HumanevalSolverParser(OutputParser): def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: text = output.content @@ -73,6 +109,31 @@ def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": cleaned_output}, text) +@output_parser_registry.register("humaneval-executor") +class HumanevalSolverParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + try: + parsed_result = re.findall( + r"Thought:(.+?)File Path:(.+?)Code:(.+?)Command:(.+)", + text, + re.DOTALL, + )[0] + cleaned_output = { + "thought": parsed_result[0].strip(), + "file_path": parsed_result[1].strip().strip("`"), + "code": parsed_result[2] + .strip() + .strip("```") + .strip("python") + .strip("python3"), + "command": parsed_result[3].strip().strip("`"), + } + except BaseException as e: + raise OutputParserError(text) + return AgentFinish({"output": cleaned_output}, text) + + @output_parser_registry.register("humaneval-evaluator") class HumanevalEvaluatorParser(OutputParser): dimensions: List[str] = None @@ -130,3 +191,32 @@ def parse(self, output: LLMResult) -> AgentCriticism: return AgentCriticism(False, criticism) else: raise OutputParserError(text) + + +@output_parser_registry.register("humaneval-critic-agree") +class HumanevalyCriticParser(OutputParser): + def parse(self, output: LLMResult) -> AgentCriticism: + text = output.content + if "[Agree]" in text: + return AgentCriticism(True, "") + else: + return AgentCriticism(False, text) + + +@output_parser_registry.register("humaneval-critic-autogpt") +class HumanevalCriticParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + try: + parsed_result = re.findall( + r"Text:(.+?)Reasoning:(.+?)Criticism:(.+?)Speak:(.+?)Final Decision:(.+)", + text, + re.DOTALL, + )[0] + except BaseException as e: + raise OutputParserError(text) + decision = parsed_result[-1].strip() + if "[Agree]" in decision: + return AgentCriticism(True, "") + else: + return AgentCriticism(False, parsed_result[-2].strip()) diff --git a/agentverse/tasks/logic_grid/gpt-4-new/config.yaml b/agentverse/tasks/logic_grid/gpt-4-new/config.yaml new file mode 100644 index 000000000..2b920e321 --- /dev/null +++ b/agentverse/tasks/logic_grid/gpt-4-new/config.yaml @@ -0,0 +1,176 @@ +cnt_agents: &cnt_agents 2 +max_turn: &max_turn 3 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group, now you are facing a problem: + ``` + ${task_description} + ``` + + You can recruit ${cnt_critic_agents} people to solve the logic problem. What people will you recruit? 
+
+    Here are some suggestions:
+    ${advice}
+
+  role_assigner_append_prompt: &role_assigner_append_prompt |-
+    # Response Format Guidance
+    You should respond with a list of ${cnt_critic_agents} people descriptions. For example:
+    1. an electrical engineer specialized in the field of xxx
+    2. an economist who is good at xxx
+    3. a lawyer with a good knowledge of xxx
+    ...
+
+    Only respond with the description of each role. Do not include your reasoning.
+
+  solver_prepend_prompt: &solver_prepend_prompt |-
+    ${task_description}
+
+  # Messages from the solver and critics will be filled here in the code.
+
+  solver_append_prompt: &solver_append_prompt |-
+    Using this information, can you provide the correct solution to the logic problem? Explain your reasoning and solve the problem step by step. Your final answer should be a single integer, which is the number of your chosen option, in the form \boxed{answer}, at the end of your response.
+
+  critic_prepend_prompt: &critic_prepend_prompt |-
+    You are in a discussion group, aiming to collaboratively solve the following logic problem:
+    ```
+    ${task_description}
+    ```
+
+  # Messages from the solver and critics will be filled here in the code.
+
+  critic_append_prompt: &critic_append_prompt |-
+    You are ${role_description}. Based on your knowledge, can you check the correctness of the solutions given above? You should give your correct solution to the problem step by step. When responding, you should follow these rules:
+    1. Double-check the above solutions, give your critiques, then generate the correct solution step by step.
+    2. If the final answer in your solution is the same as the final answer in the solution provided above, end your response with a special token "[Agree]".
+    3. You must highlight your final answer in the form \boxed{answer} at the end of your response. The answer must be a single integer.
+
+    Now give your response.
+
+  evaluator_prepend_prompt: &evaluator_prepend_prompt |-
+    Problem:
+    ```
+    ${task_description}
+    ```
+
+    Solution:
+    ```
+    ${solution}
+    ```
+
+    You are a logic problem lover. Above is a logic problem and a solution. Check whether the solution and its deduction are correct. If the deduction is wrong, you should explain why it is wrong, but do not give your own solution. When it is correct, output a correctness of 1 and explain why it is correct.
+
+  evaluator_append_prompt: &evaluator_append_prompt |-
+    You should respond in the following format:
+    Correctness: (0 or 1, 0 is wrong, and 1 is correct)
+    Response: (explain in detail why it is wrong or correct.
do not provide your solution) + + + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + max_retry: 1000 + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + max_retry: 1000 + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm + + - #critic_agents: + agent_type: critic + name: Critic 1 + max_retry: 1000 + role_description: |- + Waiting to be assigned. + prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: "gpt-4" + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + max_retry: 1000 + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + max_retry: 1000 + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + memory: + memory_type: chat_history + llm: + llm_type: gpt-4 + model: gpt-4 + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Correctness + + +tools: + diff --git a/agentverse/tasks/mgsm/gpt-3.5-autogpt/config.yaml b/agentverse/tasks/mgsm/gpt-3.5-autogpt/config.yaml new file mode 100644 index 000000000..393f2150d --- /dev/null +++ b/agentverse/tasks/mgsm/gpt-3.5-autogpt/config.yaml @@ -0,0 +1,200 @@ +cnt_agents: &cnt_agents 2 +max_turn: &max_turn 3 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you are facing a grade school math problem: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you solve the following math problem? + ${task_description} + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Using these information, can you provide the correct solution to the math problem? 
You should always respond in the following format: + Thought: your thought on the problem + Reasoning: your reasoning process + Criticism: constructive self-criticism on the above content + Solution: your solution and the process. your final answer must be a single numerical number (not a equation, fraction, function or variable), in the form \boxed{answer}, at the end of your response. + # Explain your reasoning. Your final answer must be a single numerical number (not a equation, fraction, function or variable), in the form \boxed{answer}, at the end of your response. + + # Your response should include one final answer in the form of \boxed{answer} at the end. If there are multiple answers given by different agents, you should only select the correct one and summarize its deduction process. Do not summarize each agent's response separately, summarize the whole chat history and give one final answer. + # Can you solve the following math problem? + # ${task_description} + + # # Previous Solution + # The solution you gave in the last step is: + # ``` + # ${former_solution} + # ``` + + # # Critics + # There are some critics on the above solution: + # ``` + # ${critic_opinions} + # ``` + + # Using the these information, can you provide the correct solution to the math problem? Explain your reasoning. Your final answer should be a single numerical number (not a equation), in the form \boxed{answer}, at the end of your response. + + critic_prepend_prompt: &critic_prepend_prompt |- + You are in a discussion group, aiming to collaborative solve the following math problem: + ${task_description} + + critic_append_prompt: &critic_append_prompt |- + You are ${role_description}. Based on your knowledge, can you check the correctness of the latest solutions given in the chat history? + + You should always respond in the following format: + Thought: your thought + Reasoning: your reasoning process for checking the correctness + Criticism: constructive criticism on your and others thought + Speak: your words to say to others + Decision: set to "[Agree]" if you think the latest solution is correct, otherwise "[Disagree]" + # Now give your response. + # If the solution is correct, end your response with a special token "[Correct]". + # If the solution is wrong, end your response with a special token "[Wrong]". + # # Response Format + # Using the solution from the other member as additional information, can you provide your answer to this math problem? Explain your reasoning. Your final answer should be a single numerical number (not a equation), in the form \boxed{answer}, at the end of your response. And additionally: + # 1. If your solution has the same answer to the provided solution, end your response with a special token "[Correct]". + # 2. If your solution has different answer to the provided solution, end your response with a special token "[Wrong]". + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + Experts: ${all_role_description} + Problem: ${task_description} + Solution: + ``` + ${solution} + ``` + + evaluator_append_prompt: &evaluator_append_prompt |- + You are an experienced mathematic teacher. As a good teacher, you carefully check the correctness of the given solution on a grade school math problem. 
+ + You should respond in the following format: + Thought: your thought on the problem + Reasoning: your reasoning process + Criticism: constructive criticism on your and others' thought + Speak: your advice to say to others + Correctness: 0 or 1, 0 is wrong, and 1 is correct + + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0.3 + max_tokens: 2048 + output_parser: + type: mgsm-solver-autogpt + + - #critic_agents: + agent_type: critic + name: Critic 1 + role_description: |- + Waiting to be assigned. + prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0 + max_tokens: 2048 + output_parser: + type: mgsm-critic-autogpt + + - #executor_agent: + agent_type: executor + name: Executor + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator-autogpt + dimensions: + - Correctness + + +tools: + diff --git a/agentverse/tasks/mgsm/gpt-3.5-new/config.yaml b/agentverse/tasks/mgsm/gpt-3.5-new/config.yaml new file mode 100644 index 000000000..35910b70f --- /dev/null +++ b/agentverse/tasks/mgsm/gpt-3.5-new/config.yaml @@ -0,0 +1,189 @@ +cnt_agents: &cnt_agents 2 +max_turn: &max_turn 3 +max_criticizing_rounds: 3 + +prompts: + role_assigner_prepend_prompt: &role_assigner_prepend_prompt |- + # Role Description + You are the leader of a group of experts, now you are facing a grade school math problem: + ${task_description} + + You can recruit ${cnt_critic_agents} expert in different fields. What experts will you recruit to better generate an accurate solution? + + Here are some suggestion: + ${advice} + + role_assigner_append_prompt: &role_assigner_append_prompt |- + # Response Format Guidance + You should respond with a list of expert description. For example: + 1. an electrical engineer specified in the filed of xxx. + 2. an economist who is good at xxx. + 3. a lawyer with a good knowledge of xxx. + ... + + Only respond with the description of each role. Do not include your reason. + + solver_prepend_prompt: &solver_prepend_prompt |- + Can you solve the following math problem? 
+ ${task_description} + + solver_append_prompt: &solver_append_prompt |- + You are ${role_description}. Using these information, can you provide the correct solution to the math problem? Explain your reasoning. Your final answer must be a single numerical number (not a equation, fraction, function or variable), in the form \boxed{answer}, at the end of your response. + + # Your response should include one final answer in the form of \boxed{answer} at the end. If there are multiple answers given by different agents, you should only select the correct one and summarize its deduction process. Do not summarize each agent's response separately, summarize the whole chat history and give one final answer. + # Can you solve the following math problem? + # ${task_description} + + # # Previous Solution + # The solution you gave in the last step is: + # ``` + # ${former_solution} + # ``` + + # # Critics + # There are some critics on the above solution: + # ``` + # ${critic_opinions} + # ``` + + # Using the these information, can you provide the correct solution to the math problem? Explain your reasoning. Your final answer should be a single numerical number (not a equation), in the form \boxed{answer}, at the end of your response. + + critic_prepend_prompt: &critic_prepend_prompt |- + You are in a discussion group, aiming to collaborative solve the following math problem: + ${task_description} + + critic_append_prompt: &critic_append_prompt |- + You are ${role_description}. Based on your knowledge, can you check the correctness of the solutions given in the chat history? You should give your correct solution to the problem step by step. When responding, you should follow the following rules: + 1. Double-check the above solutions, give your critics, then generate the correct solution step by step. + 2. If the final answer in your solution is the same as the final answer in the above provided solution, end your response with a special token "[Agree]". + 3. You must highlight your final answer in the form \boxed{answer} at the end of your response. The answer must be a numerical number, not a equation, fraction, function or variable. + + Now give your response. + # If the solution is correct, end your response with a special token "[Correct]". + # If the solution is wrong, end your response with a special token "[Wrong]". + # # Response Format + # Using the solution from the other member as additional information, can you provide your answer to this math problem? Explain your reasoning. Your final answer should be a single numerical number (not a equation), in the form \boxed{answer}, at the end of your response. And additionally: + # 1. If your solution has the same answer to the provided solution, end your response with a special token "[Correct]". + # 2. If your solution has different answer to the provided solution, end your response with a special token "[Wrong]". + + evaluator_prepend_prompt: &evaluator_prepend_prompt |- + Experts: ${all_role_description} + Problem: ${task_description} + Solution: + ``` + ${solution} + ``` + + evaluator_append_prompt: &evaluator_append_prompt |- + You are an experienced mathematic teacher. As a good teacher, you carefully check the correctness of the given solution on a grade school math problem. When the solution is wrong, you should give your advice to the students on how to correct the solution. When it is correct, output a correctness of 1 and why it is correct. Also check that the final answer is in the form \boxed{answer} at the end of the solution. 
The answer must be a numerical number (not a equation, fraction, function or variable). You should also give some suggestion on on what experts should recruit in the next round. + + You should respond in the following format: + Correctness: (0 or 1, 0 is wrong, and 1 is correct) + Response: (explain in details why it is wrong or correct) + + + +name: pipeline + + +environment: + env_type: pipeline + max_turn: *max_turn + role_assigner: + type: role_description + cnt_agents: *cnt_agents + decision_maker: + type: vertical-solver-first + executor: + type: none + evaluator: + type: basic + +agents: + - #role_assigner_agent: + agent_type: role_assigner + name: role assigner + prepend_prompt_template: *role_assigner_prepend_prompt + append_prompt_template: *role_assigner_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo" + temperature: 0 + max_tokens: 512 + output_parser: + type: role_assigner + + - #solver_agent: + agent_type: solver + name: Planner + prepend_prompt_template: *solver_prepend_prompt + append_prompt_template: *solver_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0.3 + max_tokens: 2048 + output_parser: + type: mgsm + + - #critic_agents: + agent_type: critic + name: Critic 1 + role_description: |- + Waiting to be assigned. + prepend_prompt_template: *critic_prepend_prompt + append_prompt_template: *critic_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: "gpt-3.5-turbo-16k" + temperature: 0 + max_tokens: 2048 + output_parser: + type: mgsm-critic-agree + + - #executor_agent: + agent_type: executor + name: Executor + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0 + max_tokens: 1024 + output_parser: + type: mgsm + + - #evaluator_agent: + agent_type: evaluator + name: Evaluator + role_description: |- + Evaluator + prepend_prompt_template: *evaluator_prepend_prompt + append_prompt_template: *evaluator_append_prompt + max_retry: 10 + memory: + memory_type: chat_history + llm: + llm_type: gpt-3.5-turbo + model: gpt-3.5-turbo + temperature: 0.3 + max_tokens: 1024 + output_parser: + type: mgsm-evaluator + dimensions: + - Correctness + + +tools: + diff --git a/agentverse/tasks/mgsm/output_parser.py b/agentverse/tasks/mgsm/output_parser.py index ab7233338..f7cd2c0f4 100644 --- a/agentverse/tasks/mgsm/output_parser.py +++ b/agentverse/tasks/mgsm/output_parser.py @@ -19,6 +19,21 @@ def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: return AgentFinish({"output": output.content}, output.content) +@output_parser_registry.register("mgsm-solver-autogpt") +class MGSMSolverParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + try: + parsed_result = re.findall( + r"Thought:(.+?)Reasoning:(.+?)Criticism:(.+?)Solution:(.+)", + text, + re.DOTALL, + )[0] + except BaseException as e: + raise OutputParserError(text) + return AgentFinish({"output": re.sub(r"\n+", "\n", text.strip())}, text) + + @output_parser_registry.register("mgsm-evaluator") class MGSMEvaluatorParser(OutputParser): dimensions: List[str] = None @@ -48,14 +63,28 @@ def parse(self, output: LLMResult) -> Tuple[List[int], str]: advice = pat.findall(cleaned_output)[0] # logger.info("Evaluator give the following advice:\n" + advice) except (IndexError, 
ValueError): - import pdb - - pdb.set_trace() # logger.error("Bad response from evaluator!") raise OutputParserError(text) return score, advice +@output_parser_registry.register("mgsm-evaluator-autogpt") +class MGSMCriticParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + try: + parsed_result = re.findall( + r"Thought:(.+?)Reasoning:(.+?)Criticism:(.+?)Speak:(.+?)Correctness:(.+)", + text, + re.DOTALL, + )[0] + score = int(parsed_result[-1]) + advice = parsed_result[-2] + except BaseException as e: + raise OutputParserError(text) + return score, advice + + @output_parser_registry.register("mgsm-critic") class MGSMCriticParser(OutputParser): def parse(self, output: LLMResult) -> AgentCriticism: @@ -88,6 +117,24 @@ def parse(self, output: LLMResult) -> AgentCriticism: # raise OutputParserError(text) +@output_parser_registry.register("mgsm-critic-autogpt") +class MGSMCriticParser(OutputParser): + def parse(self, output: LLMResult) -> Union[AgentAction, AgentFinish]: + text = output.content + try: + parsed_result = re.findall( + r"Thought:(.+?)Reasoning:(.+?)Criticism:(.+?)Speak:(.+?)Decision:(.+)", + text, + re.DOTALL, + )[0] + except BaseException as e: + raise OutputParserError(text) + if "[Agree]" in parsed_result[-1]: + return AgentCriticism(True, "") + else: + return AgentCriticism(False, parsed_result[-2]) + + @output_parser_registry.register("mgsm-critic-agree") class MGSMCriticAgreeParser(OutputParser): def parse(self, output: LLMResult) -> AgentCriticism: diff --git a/agentverse/tasks/responsegen/output_parser.py b/agentverse/tasks/responsegen/output_parser.py index 9b28093cd..2d9cc1cdf 100644 --- a/agentverse/tasks/responsegen/output_parser.py +++ b/agentverse/tasks/responsegen/output_parser.py @@ -46,9 +46,6 @@ def parse(self, output: LLMResult) -> Tuple[List[int], str]: advice = re.findall(r"(?:\d.\s*)?Advice:\s*(.+)", checks[-1])[0] # logger.info("Evaluator give the following advice:\n" + advice) except (IndexError, ValueError): - import pdb - - pdb.set_trace() # logger.error("Bad response from evaluator!") raise OutputParserError(text) return score, advice diff --git a/benchmark.py b/benchmark.py index 58cbb209e..e922f9c3a 100644 --- a/benchmark.py +++ b/benchmark.py @@ -69,7 +69,7 @@ def get_dataloader(task, dataset_path): "input": example["input"], "response": plan, "label": example["answer"], - "logs": agentversepipeline.logs, + "logs": logs, } ) + "\n" diff --git a/dataloader/humaneval.py b/dataloader/humaneval.py index 71945310a..eaf134819 100644 --- a/dataloader/humaneval.py +++ b/dataloader/humaneval.py @@ -8,6 +8,12 @@ @dataloader_registry.register("humaneval/gpt-3.5") @dataloader_registry.register("humaneval/gpt-3.5-new") @dataloader_registry.register("humaneval/gpt-4-new") +@dataloader_registry.register("humaneval/gpt-3.5-vertical-solver-first") +@dataloader_registry.register("humaneval/gpt-3.5-vertical-solver-first-autogpt") +@dataloader_registry.register("humaneval/gpt-3.5-vertical-solver-first-autogpt-2") +@dataloader_registry.register("humaneval/gpt-3.5-concurrent") +@dataloader_registry.register("humaneval/gpt-4-new-vertical-sovler-first") +@dataloader_registry.register("humaneval/gpt-4-new-vertical-sovler-first-rust") class HumanevalLoader(DataLoader): def __init__(self, path: str): super().__init__(path) diff --git a/dataloader/logic_grid.py b/dataloader/logic_grid.py index bbf872962..f247344cf 100644 --- a/dataloader/logic_grid.py +++ b/dataloader/logic_grid.py @@ -5,7 +5,8 @@ 
 @dataloader_registry.register("logic_grid")
-@dataloader_registry.register("logic_grid/2agents")
+@dataloader_registry.register("logic_grid/gpt-4")
+@dataloader_registry.register("logic_grid/gpt-4-new")
 class LogicGridLoader(DataLoader):
     def __init__(self, path: str):
         self.answer_pat = re.compile(r"#### (-?\d+)")
diff --git a/dataloader/mgsm.py b/dataloader/mgsm.py
index 79a976acb..9569907b5 100644
--- a/dataloader/mgsm.py
+++ b/dataloader/mgsm.py
@@ -7,6 +7,8 @@
 @dataloader_registry.register("mgsm")
 @dataloader_registry.register("mgsm/gpt-4")
 @dataloader_registry.register("mgsm/gpt-3.5")
+@dataloader_registry.register("mgsm/gpt-3.5-new")
+@dataloader_registry.register("mgsm/gpt-3.5-autogpt")
 class MGSMLoader(DataLoader):
     def __init__(self, path: str):
         self.answer_pat = re.compile(r"#### (-?\d+)")
diff --git a/evaluate_logic.py b/evaluate_logic.py
new file mode 100644
index 000000000..0185e5f3c
--- /dev/null
+++ b/evaluate_logic.py
@@ -0,0 +1,71 @@
+import re
+import json
+import subprocess
+from importlib import reload
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+parser.add_argument("--path", type=str, required=True)
+parser.add_argument("--max_line", type=int, default=1000000000000)
+args = parser.parse_args()
+
+
+def check_corr(result: str, correct_solution: str, tol: float = 1e-3):
+    result = result.replace(",", "")
+    if result.strip() == correct_solution.strip():
+        return 1
+    try:
+        result = float(result.strip())
+        correct_solution = float(correct_solution.strip())
+        return abs(result - correct_solution) < tol
+    except:
+        return 0
+
+
+final_accs = []
+err_cnts = []
+for i in range(2):
+    acc = 0
+    total = 0
+    err_cnt = 0
+    with open(args.path) as f:
+        for idx, line in enumerate(f):
+            if idx == args.max_line:
+                break
+            line = json.loads(line)
+            label = str(line["label"])
+            if i == 0:
+                response = line["response"]
+            else:
+                if line["logs"][0]["agent"] == "role assigner":
+                    response = line["logs"][1]["content"]
+                else:
+                    response = line["logs"][0]["content"]
+            total += 1
+            result = re.findall(r"\\boxed\{(.+?)\}", response)
+            if len(result) == 0:
+                err_cnt += 1
+                # print(response)
+                continue
+            result = result[0]
+            result = re.sub(r"\\text\{.+\}?", "", result)
+            result = (
+                result.replace("rd", "")
+                .replace("nd", "")
+                .replace("st", "")
+                .replace("th", "")
+                .replace("House", "")
+                .replace("house", "")
+                .replace("\\", "")
+            )
+
+            # acc += check_corr(result, label)
+            try:
+                acc += int(result) == int(label)
+            except:
+                print(result)
+
+    final_accs.append(acc / total)
+    err_cnts.append(err_cnt)
+print(final_accs)
+print(err_cnts)
diff --git a/evaluate_math.py b/evaluate_math.py
new file mode 100644
index 000000000..44399da32
--- /dev/null
+++ b/evaluate_math.py
@@ -0,0 +1,90 @@
+import re
+import json
+import subprocess
+from importlib import reload
+from argparse import ArgumentParser
+
+parser = ArgumentParser()
+parser.add_argument("--path", type=str, required=True)
+parser.add_argument("--max_line", type=int, default=1000000000000)
+args = parser.parse_args()
+
+
+def check_corr(result: str, correct_solution: str, tol: float = 1e-3):
+    result = result.replace(",", "")
+    if result.strip() == correct_solution.strip():
+        return 1
+    try:
+        result = float(result.strip())
+        correct_solution = float(correct_solution.strip())
+        return abs(result - correct_solution) < tol
+    except:
+        return 0
+
+
+# final_accs = []
+# for i in range(2):
+#     acc = 0
+#     total = 0
+#     with open(args.path) as f:
+#         for line in f:
+#             line = json.loads(line)
+#             label = str(line["label"])
+#             if i == 0:
+#                 code = line["response"]
+#             else:
+#                 code = line["logs"][0]["content"]
+#             total += 1
+#             code = code.strip().replace("```", "")
+#             code = code.lstrip("python3")
+#             code = code.lstrip("python")
+#             with open("tmp.py", "w") as f:
+#                 f.write(code)
+
+#             try:
+#                 import tmp
+
+#                 reload(tmp)
+#                 result = str(tmp.solution())
+#                 is_corr = check_corr(result, label)
+
+#                 is_corr = int(is_corr)
+#                 # Step 2
+#                 if is_corr:
+#                     acc += 1
+#             except:
+#                 print(code)
+#     final_accs.append(acc / total)
+# print(final_accs)
+
+final_accs = []
+err_cnts = []
+for i in range(2):
+    acc = 0
+    total = 0
+    err_cnt = 0
+    with open(args.path) as f:
+        for idx, line in enumerate(f):
+            if idx == args.max_line:
+                break
+            line = json.loads(line)
+            label = str(line["label"])
+            if i == 0:
+                response = line["response"]
+            else:
+                if line["logs"][0]["module"] == "Role Assigner":
+                    response = line["logs"][1]["content"]
+                else:
+                    response = line["logs"][0]["content"]
+            total += 1
+            result = re.findall(r"\\boxed\{(.+?)\}", response)
+            if len(result) == 0:
+                err_cnt += 1
+                print(response)
+                continue
+            result = result[0]
+            acc += check_corr(result, label)
+    final_accs.append(acc / total)
+    err_cnts.append(err_cnt)
+print(final_accs)
+print(err_cnts)
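As a quick illustration of the answer-checking logic that evaluate_math.py applies to each JSONL record, the sketch below extracts a \boxed{...} answer from a model response and compares it against a gold label using the same comma-stripping and numeric-tolerance rules as check_corr. The helper names (extract_boxed, is_correct) and the sample response/label pair are invented for illustration; this snippet is not part of the patch.

import re


def extract_boxed(response: str):
    """Return the first \\boxed{...} payload in a response, or None if absent."""
    matches = re.findall(r"\\boxed\{(.+?)\}", response)
    return matches[0] if matches else None


def is_correct(result: str, label: str, tol: float = 1e-3) -> bool:
    """Mirror check_corr: exact string match first, then numeric comparison within tol."""
    result = result.replace(",", "").strip()
    label = label.strip()
    if result == label:
        return True
    try:
        return abs(float(result) - float(label)) < tol
    except ValueError:
        return False


# Hypothetical record, for illustration only.
response = r"Adding both parts gives 1,000 + 2,000, so the total is \boxed{3,000}."
label = "3000"
print(is_correct(extract_boxed(response), label))  # True (comma is stripped before comparison)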