jasonkneen · pull · Feb 6, 2025 · Feb 6, 2025 · Feb 6, 2025 · Feb 6, 2025
diff --git a/...tchat/src/autogen_agentchat/teams/_group_chat/_magentic_one/_magentic_one_orchestrator.py b/...tchat/src/autogen_agentchat/teams/_group_chat/_magentic_one/_magentic_one_orchestrator.py
@@ -1,5 +1,6 @@
 import json
 import logging
+import re
 from typing import Any, Dict, List, Mapping
 
 from autogen_core import AgentId, CancellationToken, DefaultTopicId, MessageContext, event, rpc
@@ -80,14 +81,12 @@ def __init__(
         self._plan = ""
         self._n_rounds = 0
         self._n_stalls = 0
-        self._team_description = "\n".join(
-            [
-                f"{topic_type}: {description}".strip()
-                for topic_type, description in zip(
-                    self._participant_topic_types, self._participant_descriptions, strict=True
-                )
-            ]
-        )
+
+        # Produce a team description. Each agent should appear on a single line.
+        self._team_description = ""
+        for topic_type, description in zip(self._participant_topic_types, self._participant_descriptions, strict=True):
+            self._team_description += re.sub(r"\s+", " ", f"{topic_type}: {description}").strip() + "\n"
+        self._team_description = self._team_description.strip()
 
     def _get_task_ledger_facts_prompt(self, task: str) -> str:
         return ORCHESTRATOR_TASK_LEDGER_FACTS_PROMPT.format(task=task)

diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
@@ -53,9 +53,9 @@
     TOOL_CLICK,
     TOOL_HISTORY_BACK,
     TOOL_HOVER,
-    TOOL_PAGE_DOWN,
-    TOOL_PAGE_UP,
     TOOL_READ_PAGE_AND_ANSWER,
+    TOOL_SCROLL_DOWN,
+    TOOL_SCROLL_UP,
     TOOL_SLEEP,
     TOOL_SUMMARIZE_PAGE,
     TOOL_TYPE,
@@ -65,6 +65,8 @@
 from ._types import InteractiveRegion, UserContent
 from .playwright_controller import PlaywrightController
 
+DEFAULT_CONTEXT_SIZE = 128000
+
 
 class MultimodalWebSurferConfig(BaseModel):
     name: str
@@ -174,9 +176,9 @@ async def main() -> None:
 
     DEFAULT_DESCRIPTION = """
     A helpful assistant with access to a web browser.
-    Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, etc., filling in form fields, etc.).
+    Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, filling in form fields, etc.).
     It can also summarize the entire page, or answer questions based on the content of the page.
-    It can also be asked to sleep and wait for pages to load, in cases where the pages seem to be taking a while to load.
+    It can also be asked to sleep and wait for pages to load, in cases where the page seems not yet fully loaded.
     """
     DEFAULT_START_PAGE = "https://www.bing.com/"
 
@@ -464,11 +466,11 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo
 
         # We can scroll up
         if viewport["pageTop"] > 5:
-            tools.append(TOOL_PAGE_UP)
+            tools.append(TOOL_SCROLL_UP)
 
         # Can scroll down
         if (viewport["pageTop"] + viewport["height"] + 5) < viewport["scrollHeight"]:
-            tools.append(TOOL_PAGE_DOWN)
+            tools.append(TOOL_SCROLL_DOWN)
 
         # Focus hint
         focused = await self._playwright_controller.get_focused_rect_id(self._page)
@@ -477,6 +479,8 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo
             name = self._target_name(focused, rects)
             if name:
                 name = f"(and name '{name}') "
+            else:
+                name = ""
 
             role = "control"
             try:
@@ -611,10 +615,10 @@ async def _execute_tool(
                 self._last_download = None
             if reset_prior_metadata and self._prior_metadata_hash is not None:
                 self._prior_metadata_hash = None
-        elif name == "page_up":
+        elif name == "scroll_up":
             action_description = "I scrolled up one page in the browser."
             await self._playwright_controller.page_up(self._page)
-        elif name == "page_down":
+        elif name == "scroll_down":
             action_description = "I scrolled down one page in the browser."
             await self._playwright_controller.page_down(self._page)
 
@@ -855,35 +859,50 @@ async def _summarize_page(
         buffer = ""
         # for line in re.split(r"([\r\n]+)", page_markdown):
         for line in page_markdown.splitlines():
-            message = UserMessage(
-                # content=[
+            trial_message = UserMessage(
                 content=prompt + buffer + line,
-                #    ag_image,
-                # ],
                 source=self.name,
             )
 
-            remaining = self._model_client.remaining_tokens(messages + [message])
-            if remaining > self.SCREENSHOT_TOKENS:
-                buffer += line
-            else:
+            try:
+                remaining = self._model_client.remaining_tokens(messages + [trial_message])
+            except KeyError:
+                # remaining_tokens raised KeyError (model not found): fall back to DEFAULT_CONTEXT_SIZE
+                remaining = DEFAULT_CONTEXT_SIZE - self._model_client.count_tokens(messages + [trial_message])
+
+            if self._model_client.model_info["vision"] and remaining <= 0:
                 break
 
+            if self._model_client.model_info["vision"] and remaining <= self.SCREENSHOT_TOKENS:
+                break
+
+            buffer += line
+
         # Nothing to do
         buffer = buffer.strip()
         if len(buffer) == 0:
             return "Nothing to summarize."
 
         # Append the message
-        messages.append(
-            UserMessage(
-                content=[
-                    prompt + buffer,
-                    ag_image,
-                ],
-                source=self.name,
+        if self._model_client.model_info["vision"]:
+            # Multimodal
+            messages.append(
+                UserMessage(
+                    content=[
+                        prompt + buffer,
+                        ag_image,
+                    ],
+                    source=self.name,
+                )
+            )
+        else:
+            # Text only
+            messages.append(
+                UserMessage(
+                    content=prompt + buffer,
+                    source=self.name,
+                )
             )
-        )
 
         # Generate the response
         response = await self._model_client.create(messages, cancellation_token=cancellation_token)

diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_prompts.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_prompts.py
@@ -8,7 +8,7 @@
 {tool_names}
 
 When deciding between tools, consider if the request can be best addressed by:
-    - the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text might be most appropriate, or hovering over element)
+    - the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element might be most appropriate)
     - contents found elsewhere on the full webpage (in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate)
     - on some other website entirely (in which case actions like performing a new web search might be the best option)
 """
@@ -29,7 +29,7 @@
 {tool_names}
 
 When deciding between tools, consider if the request can be best addressed by:
-    - the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text might be most appropriate, or hovering over element)
+    - the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element might be most appropriate)
     - contents found elsewhere on the full webpage (in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate)
     - on some other website entirely (in which case actions like performing a new web search might be the best option)
 """

diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_tool_definitions.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_tool_definitions.py
@@ -87,11 +87,11 @@ def _load_tool(tooldef: Dict[str, Any]) -> ToolSchema:
     }
 )
 
-TOOL_PAGE_UP: ToolSchema = _load_tool(
+TOOL_SCROLL_UP: ToolSchema = _load_tool(
     {
         "type": "function",
         "function": {
-            "name": "page_up",
+            "name": "scroll_up",
             "description": "Scrolls the entire browser viewport one page UP towards the beginning.",
             "parameters": {
                 "type": "object",
@@ -107,11 +107,11 @@ def _load_tool(tooldef: Dict[str, Any]) -> ToolSchema:
     }
 )
 
-TOOL_PAGE_DOWN: ToolSchema = _load_tool(
+TOOL_SCROLL_DOWN: ToolSchema = _load_tool(
     {
         "type": "function",
         "function": {
-            "name": "page_down",
+            "name": "scroll_down",
             "description": "Scrolls the entire browser viewport one page DOWN towards the end.",
             "parameters": {
                 "type": "object",

diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
@@ -122,23 +122,23 @@ async def get_visual_viewport(self, page: Page) -> VisualViewport:
             pass
         return visualviewport_from_dict(await page.evaluate("MultimodalWebSurfer.getVisualViewport();"))
 
-    async def get_focused_rect_id(self, page: Page) -> str:
+    async def get_focused_rect_id(self, page: Page) -> str | None:
         """
         Retrieve the ID of the currently focused element.
 
         Args:
             page (Page): The Playwright page object.
 
         Returns:
-            str: The ID of the focused element.
+            str | None: The ID of the focused element, or None if no control has focus.
         """
         assert page is not None
         try:
             await page.evaluate(self._page_script)
         except Exception:
             pass
         result = await page.evaluate("MultimodalWebSurfer.getFocusedElementId();")
-        return str(result)
+        return None if result is None else str(result)
 
     async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
         """