Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pull] main from microsoft:main #124

Merged
merged 3 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
import re
from typing import Any, Dict, List, Mapping

from autogen_core import AgentId, CancellationToken, DefaultTopicId, MessageContext, event, rpc
Expand Down Expand Up @@ -80,14 +81,12 @@ def __init__(
self._plan = ""
self._n_rounds = 0
self._n_stalls = 0
self._team_description = "\n".join(
[
f"{topic_type}: {description}".strip()
for topic_type, description in zip(
self._participant_topic_types, self._participant_descriptions, strict=True
)
]
)

# Produce a team description. Each agent should appear on a single line.
self._team_description = ""
for topic_type, description in zip(self._participant_topic_types, self._participant_descriptions, strict=True):
self._team_description += re.sub(r"\s+", " ", f"{topic_type}: {description}").strip() + "\n"
self._team_description = self._team_description.strip()

def _get_task_ledger_facts_prompt(self, task: str) -> str:
return ORCHESTRATOR_TASK_LEDGER_FACTS_PROMPT.format(task=task)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
TOOL_CLICK,
TOOL_HISTORY_BACK,
TOOL_HOVER,
TOOL_PAGE_DOWN,
TOOL_PAGE_UP,
TOOL_READ_PAGE_AND_ANSWER,
TOOL_SCROLL_DOWN,
TOOL_SCROLL_UP,
TOOL_SLEEP,
TOOL_SUMMARIZE_PAGE,
TOOL_TYPE,
Expand All @@ -65,6 +65,8 @@
from ._types import InteractiveRegion, UserContent
from .playwright_controller import PlaywrightController

DEFAULT_CONTEXT_SIZE = 128000


class MultimodalWebSurferConfig(BaseModel):
name: str
Expand Down Expand Up @@ -174,9 +176,9 @@ async def main() -> None:

DEFAULT_DESCRIPTION = """
A helpful assistant with access to a web browser.
Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, etc., filling in form fields, etc.).
Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, filling in form fields, etc.).
It can also summarize the entire page, or answer questions based on the content of the page.
It can also be asked to sleep and wait for pages to load, in cases where the pages seem to be taking a while to load.
It can also be asked to sleep and wait for pages to load, in cases where the page seems not yet fully loaded.
"""
DEFAULT_START_PAGE = "https://www.bing.com/"

Expand Down Expand Up @@ -464,11 +466,11 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo

# We can scroll up
if viewport["pageTop"] > 5:
tools.append(TOOL_PAGE_UP)
tools.append(TOOL_SCROLL_UP)

# We can scroll down
if (viewport["pageTop"] + viewport["height"] + 5) < viewport["scrollHeight"]:
tools.append(TOOL_PAGE_DOWN)
tools.append(TOOL_SCROLL_DOWN)

# Focus hint
focused = await self._playwright_controller.get_focused_rect_id(self._page)
Expand All @@ -477,6 +479,8 @@ async def _generate_reply(self, cancellation_token: CancellationToken) -> UserCo
name = self._target_name(focused, rects)
if name:
name = f"(and name '{name}') "
else:
name = ""

role = "control"
try:
Expand Down Expand Up @@ -611,10 +615,10 @@ async def _execute_tool(
self._last_download = None
if reset_prior_metadata and self._prior_metadata_hash is not None:
self._prior_metadata_hash = None
elif name == "page_up":
elif name == "scroll_up":
action_description = "I scrolled up one page in the browser."
await self._playwright_controller.page_up(self._page)
elif name == "page_down":
elif name == "scroll_down":
action_description = "I scrolled down one page in the browser."
await self._playwright_controller.page_down(self._page)

Expand Down Expand Up @@ -855,35 +859,50 @@ async def _summarize_page(
buffer = ""
# for line in re.split(r"([\r\n]+)", page_markdown):
for line in page_markdown.splitlines():
message = UserMessage(
# content=[
trial_message = UserMessage(
content=prompt + buffer + line,
# ag_image,
# ],
source=self.name,
)

remaining = self._model_client.remaining_tokens(messages + [message])
if remaining > self.SCREENSHOT_TOKENS:
buffer += line
else:
try:
remaining = self._model_client.remaining_tokens(messages + [trial_message])
except KeyError:
# remaining_tokens raised KeyError (model not recognized): fall back to DEFAULT_CONTEXT_SIZE minus the counted tokens
remaining = DEFAULT_CONTEXT_SIZE - self._model_client.count_tokens(messages + [trial_message])

if self._model_client.model_info["vision"] and remaining <= 0:
break

if self._model_client.model_info["vision"] and remaining <= self.SCREENSHOT_TOKENS:
break

buffer += line

# Nothing to do
buffer = buffer.strip()
if len(buffer) == 0:
return "Nothing to summarize."

# Append the message
messages.append(
UserMessage(
content=[
prompt + buffer,
ag_image,
],
source=self.name,
if self._model_client.model_info["vision"]:
# Multimodal
messages.append(
UserMessage(
content=[
prompt + buffer,
ag_image,
],
source=self.name,
)
)
else:
# Text only
messages.append(
UserMessage(
content=prompt + buffer,
source=self.name,
)
)
)

# Generate the response
response = await self._model_client.create(messages, cancellation_token=cancellation_token)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
{tool_names}

When deciding between tools, consider if the request can be best addressed by:
- the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text might be most appropriate, or hovering over element)
- the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element might be most appropriate)
- contents found elsewhere on the full webpage (in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate)
- on some other website entirely (in which case actions like performing a new web search might be the best option)
"""
Expand All @@ -29,7 +29,7 @@
{tool_names}

When deciding between tools, consider if the request can be best addressed by:
- the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text might be most appropriate, or hovering over element)
- the contents of the current viewport (in which case actions like clicking links, clicking buttons, inputting text, or hovering over an element might be most appropriate)
- contents found elsewhere on the full webpage (in which case actions like scrolling, summarization, or full-page Q&A might be most appropriate)
- on some other website entirely (in which case actions like performing a new web search might be the best option)
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,11 @@ def _load_tool(tooldef: Dict[str, Any]) -> ToolSchema:
}
)

TOOL_PAGE_UP: ToolSchema = _load_tool(
TOOL_SCROLL_UP: ToolSchema = _load_tool(
{
"type": "function",
"function": {
"name": "page_up",
"name": "scroll_up",
"description": "Scrolls the entire browser viewport one page UP towards the beginning.",
"parameters": {
"type": "object",
Expand All @@ -107,11 +107,11 @@ def _load_tool(tooldef: Dict[str, Any]) -> ToolSchema:
}
)

TOOL_PAGE_DOWN: ToolSchema = _load_tool(
TOOL_SCROLL_DOWN: ToolSchema = _load_tool(
{
"type": "function",
"function": {
"name": "page_down",
"name": "scroll_down",
"description": "Scrolls the entire browser viewport one page DOWN towards the end.",
"parameters": {
"type": "object",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,23 +122,23 @@ async def get_visual_viewport(self, page: Page) -> VisualViewport:
pass
return visualviewport_from_dict(await page.evaluate("MultimodalWebSurfer.getVisualViewport();"))

async def get_focused_rect_id(self, page: Page) -> str:
async def get_focused_rect_id(self, page: Page) -> str | None:
"""
Retrieve the ID of the currently focused element.

Args:
page (Page): The Playwright page object.

Returns:
str: The ID of the focused element.
str | None: The ID of the focused element, or None if no control has focus.
"""
assert page is not None
try:
await page.evaluate(self._page_script)
except Exception:
pass
result = await page.evaluate("MultimodalWebSurfer.getFocusedElementId();")
return str(result)
return None if result is None else str(result)

async def get_page_metadata(self, page: Page) -> Dict[str, Any]:
"""
Expand Down