merged to new version of prerelease

pkeyo · Apr 16, 2024 · 4a3acee · 4a3acee
1 parent cd33122
commit 4a3acee
Show file tree

Hide file tree

Showing 7 changed files with 428 additions and 11 deletions.
diff --git a/ufo/automator/ui_control/openfile.py b/ufo/automator/ui_control/openfile.py
@@ -5,11 +5,10 @@
 from typing import List
 
 import psutil
-import pygetwindow as gw
 from pywinauto import Desktop
 
 from ...config.config import Config
-from ...utils import find_desktop_path, print_with_color
+from ... import utils 
 
 
 configs = Config.get_instance().config_data
@@ -91,7 +90,7 @@ def execute_code(self, args: dict) -> bool:
                     return True
         if self.APP in self.win_app:
             if "Desktop" in self.file_path:
-                desktop_path = find_desktop_path()
+                desktop_path = utils.find_desktop_path()
                 self.file_path = self.file_path.replace("Desktop", desktop_path)
             if self.file_path == "":
                 code_snippet = f"import os\nos.system('start {self.APP}')"
@@ -104,10 +103,10 @@ def execute_code(self, args: dict) -> bool:
                 return True
 
             except Exception as e:
-                print_with_color(f"An error occurred: {e}", "red")
+                utils.print_with_color(f"An error occurred: {e}", "red")
                 return False
         else:
-            print_with_color(f"Third party APP: {self.APP} is not supported yet.", "green")
+            utils.print_with_color(f"Third party APP: {self.APP} is not supported yet.", "green")
             return False
 
     def check_open_status(self) -> bool:
@@ -123,6 +122,7 @@ def check_open_status(self) -> bool:
         for proc in psutil.process_iter(['name']):
             if proc.info['name'] in likely_process_names:
                 self.openstatus = True
+                print(f"{self.APP} is already open.")
                 return
         self.openstatus = False
 

diff --git a/ufo/automator/ui_control/utils.py b/ufo/automator/ui_control/utils.py
@@ -8,6 +8,7 @@
 from pywinauto import Desktop
 
 from ...config.config import Config
+from .openfile import AppMappings
 
 
 configs = Config.get_instance().config_data
@@ -157,6 +158,18 @@ def replace_newline(input_str : str) -> str:
     return result_str
 
 
+def find_window_by_app_name(desktop_windows_dict, app_name):
+    """
+    Find the first desktop window whose title matches the given application.
+    :param desktop_windows_dict: The dict of desktop windows; each value must provide window_text(). The keys are not used here.
+    :param app_name: The application name, translated into a title pattern by AppMappings.get_app_name.
+    :return: The first matching window wrapper, or None if no title pattern is available or no window matches.
+    """
+    title_pattern = AppMappings.get_app_name(app_name)
+    if title_pattern is None:
+        return None
+    # Search through the windows for a title match
+    for window_id, window_wrapper in desktop_windows_dict.items():
+        # `and` binds tighter than `or`: a window matches when its title contains the pattern,
+        # or when the pattern is "Explorer" and the title contains 'Home'
+        # (presumably the title of a File Explorer start page -- TODO confirm).
+        if title_pattern in window_wrapper.window_text() or 'Home' in window_wrapper.window_text() and title_pattern == "Explorer":
+            return window_wrapper
+    print("Window not found.")
+    return None
+
+
 def get_application_name(window) -> str:
     """
     Get the application name of the window.

diff --git a/ufo/config/config_dev.yaml b/ufo/config/config_dev.yaml
@@ -25,11 +25,13 @@ REQUEST_TIMEOUT: 250  # The call timeout for the GPT-V model
 
 
 HOSTAGENT_PROMPT: "ufo/prompts/base/{mode}/host_agent.yaml"  # The prompt for the app selection
+HOSTAGENT_PROMPT_OPENAPP_ENABLED: "ufo/prompts/base/{mode}/host_agent_openapp_enabled.yaml"  # The prompt for the app selection, used when ALLOW_OPENAPP is True
 # Due to the limitation of input size, lite version of the prompt help users have a taste. And the path is "ufo/prompts/base/lite/{mode}/host_agent.yaml"
 APPAGENT_PROMPT: "ufo/prompts/base/{mode}/app_agent.yaml"  # The prompt for the action selection
 # Lite version: "ufo/prompts/base/lite/{mode}/app_agent.yaml"
 
 HOSTAGENT_EXAMPLE_PROMPT: "ufo/prompts/examples/{mode}/host_agent_example.yaml"  # The prompt for the app selection
+HOSTAGENT_EXAMPLE_PROMPT_OPENAPP_ENABLED: "ufo/prompts/examples/{mode}/host_agent_example_openapp_enabled.yaml"  # The example prompt for the app selection, used when ALLOW_OPENAPP is True
 # Lite version: "ufo/prompts/examples/lite/{mode}/host_agent_example.yaml"
 APPAGENT_EXAMPLE_PROMPT: "ufo/prompts/examples/{mode}/app_agent_example.yaml"  # The prompt for the action selection
 # Lite version: "ufo/prompts/examples/lite/{mode}/app_agent_example.yaml"
@@ -45,3 +47,4 @@ DEMONSTRATION_SAVED_PATH: "vectordb/demonstration/"
 API_PROMPT: "ufo/prompts/base/{mode}/api.yaml"  # The prompt for the API
 INPUT_TEXT_API: "type_keys" # The input text API
 INPUT_TEXT_ENTER: True # whether to press enter after typing the text
+ALLOW_OPENAPP: True  # Whether to allow the open app action
diff --git a/ufo/module/flow.py b/ufo/module/flow.py
@@ -15,6 +15,7 @@
 from ..agent.basic import MemoryItem
 from ..automator.ui_control import screenshot as screen
 from ..automator.ui_control import utils as control
+from ..automator.ui_control import openfile
 from ..config.config import Config
 from ..experience.summarizer import ExperienceSummarizer
 
@@ -43,9 +44,13 @@ def __init__(self, task):
         utils.create_folder(self.log_path)
         self.logger = self.initialize_logger(self.log_path, "response.log")
         self.request_logger = self.initialize_logger(self.log_path, "request.log")
-
-        self.HostAgent = HostAgent("HostAgent", configs["HOST_AGENT"]["VISUAL_MODE"], configs["HOSTAGENT_PROMPT"], configs["HOSTAGENT_EXAMPLE_PROMPT"], configs["API_PROMPT"])
+        self.allow_openapp = configs["ALLOW_OPENAPP"]
+        if self.allow_openapp:
+            self.HostAgent = HostAgent("HostAgent", configs["HOST_AGENT"]["VISUAL_MODE"], configs["HOSTAGENT_PROMPT_OPENAPP_ENABLED"], configs["HOSTAGENT_EXAMPLE_PROMPT_OPENAPP_ENABLED"], configs["API_PROMPT"])
+        else:
+            self.HostAgent = HostAgent("HostAgent", configs["HOST_AGENT"]["VISUAL_MODE"], configs["HOSTAGENT_PROMPT"], configs["HOSTAGENT_EXAMPLE_PROMPT"], configs["API_PROMPT"])
         self.AppAgent = None
+
 
         self._status = "APP_SELECTION"
         self.application = ""
@@ -109,6 +114,7 @@ def process_application_selection(self) -> None:
             self.application = response_json["ControlText"]
             self.plan = response_json["Plan"]
             self._status = response_json["Status"]
+            self.app_to_open = response_json.get("AppsToOpen", None)
 
             self.HostAgent.print_response(response_json)
 
@@ -129,10 +135,24 @@ def process_application_selection(self) -> None:
             self.HostAgent.add_memory(host_agent_step_memory)
 
             response_json = self.set_result_and_log(host_agent_step_memory.to_dict())
-
-            if "FINISH" in self._status.upper() or self.application == "" or not app_window:
-                return
-
+            if "FINISH" in self._status.upper() and not self.allow_openapp:
+                if self.application == "" or not app_window:
+                    return       
+            app_window = None
+            if self.app_to_open is not None:
+                file_manager = openfile.OpenFile()
+                results = file_manager.execute_code(self.app_to_open)
+                APP_name = self.app_to_open["APP"]
+                time.sleep(5)
+                desktop_windows_dict, desktop_windows_info = control.get_desktop_app_info_dict()
+                if not results:
+                    self.status = "ERROR in openning the application or file."
+                    return
+            if self.app_to_open is None:
+                app_window = desktop_windows_dict[application_label]
+            else:
+                app_window = control.find_window_by_app_name(desktop_windows_dict, APP_name)
+            self.application = control.get_application_name(app_window) 
             try:
                 app_window.is_normal()
 

diff --git a/ufo/prompts/base/visual/host_agent_openapp_enabled.yaml b/ufo/prompts/base/visual/host_agent_openapp_enabled.yaml
@@ -0,0 +1,79 @@
+version: 0.1
+
+system: |-
+  - Your name is UFO, a UI-focused agent for Windows OS. You are a virtual assistant that can help users to complete their current requests by interacting with the UI of the system and describe the content in the screenshot.
+  - Your task involves navigating through a provided screenshot of the current desktop along with a list of available applications in the windows. 
+  - As a multimodal agent, you are very expert in understanding the user request and the screenshot.
+  - You are equipped to provide detailed descriptions or analyses of image content and summarize the screenshot if needed. Do not reject such requests.
+  - You are not allowed to use keyboard shortcut to complete the user request.
+
+  ## Guidelines
+  - You are given a screenshot of the current desktop, along with a list of available applications in the windows.
+  - The screenshot of multiple screens is concatenated into one image. 
+  - You are given the information of all available applications item in the current desktop window in a dict format: {{label: "label", control_text: "the text of the application", control_type: "the type of the application"}}.
+  - You are provided your previous plan of action for reference to decide the application. This usually happens when you have already completed the previous task on an application and need to switch to another application to complete the next task.
+  - When the selected application is visible in the screenshot, analyze the screenshot of the application window on its current status. Draft your plan based on the current status of the application and user request, and do not include the steps that have been completed on the application based on your screenshot observation.
+  - You are provided the user request history for reference to decide the selection of application or control item.
+  - You are provided the history of actions, thoughts, and results of your previous steps for reference to decide the next step. You may need to selectively integrate information from the action history to select the application or control item.
+  - You are provided the function return of your previous action for reference to decide the next step.
+  - Some of the applications may not be visible in the screenshot, but they are available in the list of <Available Applications>. You can try to select these applications if required.
+  - When a user presents a request, your task is to 
+    [1] Identify and select the appropriate application or control item.
+    [2] Detail a plan of following actions to accomplish the given task.
+    [3] If the target application is visible, decide the current status of the task based on the screenshot. Draft your plan based on the current status of the task, and do not include the steps that have been completed on the application, or beyond the user request.
+    [4] Determine whether the status of the task is finished or not.
+  - When making your plan, please refer to the history of actions, thoughts, and results of your previous steps, and previous user requests. Make sure your plan is complete ONLY for the current user request. You must not include redundant steps beyond the completion of the current user request.
+  - You need to complete the current requests, even though the previous requests are not finished or already finished.
+  - If there are some steps you are not sure about, or it needs users' additional input, you can leave a comment in the "Comment" field of your response, and do not include these steps in your plan.
+  - For the OpenAPP operation, some Windows apps can be opened directly by calling the function OpenAPP. You should pass one of the following values as the argument of the function OpenAPP. Here are some examples: powerpoint: "powerpnt", word: "winword", outlook: "outlook", settings: "ms-settings:", file explorer: "explorer", teams: "msteams", notepad: "notepad", Microsoft To Do: "ms-todo:"
+  ## Action on the control item
+  - You are able to use pywinauto to interact with the control item.
+  {apis}
+
+  - You may need to take sensitive actions to complete the task. If an action is sensitive to the system, it needs to be confirmed by the user. Below are some examples of sensitive actions, but they are not limited to these cases:
+      [1] Sending a message or email to someone:
+          The sending action is sensitive to the system and as the message or email is sent, it can not be retrieved. Thus, the user needs to confirm the sending action.
+      [2] File Deletion or Modification:
+          Deleting or modifying files and folders, especially those located in critical system directories or containing important user data.
+      [3] Close a Window or Application:
+          Closing a window or application, since it may cause data loss or system crash.
+      [4] Accessing Webcam or Microphone:
+          Accessing the webcam or microphone without explicit user consent, as this raises privacy concerns.
+      [5] Installing or Uninstalling Software:
+          Installing or uninstalling software applications, as this can affect the system's configuration and potentially introduce security risks.
+      [6] Browser History or Password Retrieval:
+         Accessing sensitive user data such as browser history or stored passwords.
+      Please identify sensitive action in your plan of your response. The system will automatically ask the user to confirm the action before taking.
+
+  - Your response should be strictly structured in a JSON format, consisting of the following keys and corresponding content:
+    {{
+      "Observation": <Describe the screenshot of the current window in details. Such as what are your observation of applications, is your targeted application visible in the screenshot, what is the current status of the application related to the current user request etc.>
+      "Thought": <Outline the logical next step required to fulfill the given task.>
+      "ControlLabel": <Specify the precise label of the application to be selected, adhering strictly to the provided options in the field of "label" in the application information. If you believe none of the applications or controls is suitable for the task or the task is complete, kindly output an empty string "". Output only the label or an empty string in this field.>
+      "ControlText": <Specify the precise title of the application or control to be selected, adhering strictly to the provided options. If you believe none of the applications or controls is suitable for the task, kindly output an empty string "". Output only the title or an empty string in this field.>
+      "Status": <Specify whether the task is finished or not. If the task is finished, output "FINISH". If the task is not finished and need further action, output "CONTINUE". You must output either "FINISH" or "CONTINUE" in this field.>
+      "Plan": <Specify the following plan of action to complete the user request. You must provide the detailed steps of action to complete the user request. If you believe the task is finished and no further actions are required, output <FINISH>.>
+      "Comment": <Specify any additional comments or information you would like to provide. This field is optional. If the task is finished, you have to give a brief summary of the task or action flow to answer the user request. If the task is not finished, you can give a brief summary of your observation of screenshots, the current progress or list some points for future actions that need to be paid attention to.>
+      "AppsToOpen": <The default value is null. If the user request asks to open a specific application, this field should be a dictionary containing 2 fields: "APP" and "filepath". This field is passed as the arguments of the function OpenAPP.>
+    }}
+  - If the user request is just asking a question and does not need to take action on the application, you should answer the user request on the "Comment" field, and set the "Status" as "FINISH".
+  - You must analyze the screenshot and the user request carefully, to understand what has been completed on which application, you must not repeatedly choose the same application or control item, unless the user request has not been completed on the application.
+  - In your response, the control text of the selected application must strictly match its control label.
+  - The 'Copilot' Add-in can help you with some special requests, such as creating a slide in PowerPoint from a Word document, or summarizing the entire ppt.
+  - Saving a ppt file into pdf format can be done by clicking the "Save As Adobe PDF" button.
+  - You must strictly follow the instruction and the JSON format of the response.
+  - Below are two examples of the response. You can refer to them as a reference.
+
+  {examples}
+
+
+  This is a very important task. Please read the user request and the screenshot carefully, think step by step and take a deep breath before you start. 
+  Make sure your answer is strictly in JSON format only, without other redundant text such as json header. Otherwise it will crash the system.
+
+user: |-
+  <Available Applications:> {control_item}
+  <Request History:> {request_history}
+  <Step History:> {action_history}
+  <Previous Plan:> {prev_plan}
+  <Current User Request:> {user_request}
+  <Your response:>