Skip to content

Commit

Permalink
Merge pull request OpenInterpreter#309 from benxu3/livekit-realtime
Browse files Browse the repository at this point in the history
add realtime livekit multimodal worker
  • Loading branch information
KillianLucas authored Oct 2, 2024
2 parents 63cc2a2 + 21e04c1 commit 207ec08
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 7 deletions.
11 changes: 10 additions & 1 deletion software/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from dotenv import load_dotenv
import signal
from source.server.livekit.worker import main as worker_main
from source.server.livekit.multimodal import main as multimodal_main
import warnings
import requests

Expand Down Expand Up @@ -71,6 +72,11 @@ def run(
"--debug",
help="Print latency measurements and save microphone recordings locally for manual playback",
),
multimodal: bool = typer.Option(
False,
"--multimodal",
help="Run the multimodal agent",
),
):

threads = []
Expand Down Expand Up @@ -274,7 +280,10 @@ def display_qr_code():

for attempt in range(30):
try:
worker_main(local_livekit_url)
if multimodal:
multimodal_main(local_livekit_url)
else:
worker_main(local_livekit_url)
except KeyboardInterrupt:
print("Exiting.")
raise
Expand Down
12 changes: 6 additions & 6 deletions software/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ readme = "../README.md"

[tool.poetry.dependencies]
python = ">=3.10,<3.12"
livekit = "^0.12.1"
livekit-agents = "^0.8.6"
livekit-plugins-deepgram = "^0.6.5"
livekit-plugins-openai = "^0.8.1"
livekit-plugins-silero = "^0.6.4"
livekit-plugins-elevenlabs = "^0.7.3"
livekit = "^0.17.2"
livekit-agents = "^0.10.0"
livekit-plugins-deepgram = "^0.6.7"
livekit-plugins-openai = "^0.10.1"
livekit-plugins-silero = "^0.7.1"
livekit-plugins-elevenlabs = "^0.7.5"
segno = "^1.6.1"
open-interpreter = {extras = ["os", "server"], version = "^0.3.12"} # You should add a "browser" extra, so selenium isn't in the main package
ngrok = "^1.4.0"
Expand Down
54 changes: 54 additions & 0 deletions software/source/server/livekit/multimodal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from __future__ import annotations
import sys
from livekit.agents import (
AutoSubscribe,
JobContext,
WorkerOptions,
cli,
llm,
)
from livekit.agents.multimodal import MultimodalAgent
from livekit.plugins import openai
from dotenv import load_dotenv
import os

load_dotenv()

async def entrypoint(ctx: JobContext):
    """Run one realtime multimodal agent session for a LiveKit job.

    Connects to the job's room subscribing to audio only, waits for a
    participant to join, starts a ``MultimodalAgent`` backed by the OpenAI
    realtime model, and then asks the model to produce an opening response.

    Args:
        ctx: Job context handed to the entrypoint by the LiveKit worker.
    """
    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)

    participant = await ctx.wait_for_participant()

    model = openai.realtime.RealtimeModel(
        instructions="You are a helpful assistant and you love open-source software",
        voice="shimmer",
        temperature=0.8,
        modalities=["audio", "text"],
        # Read from the environment (populated by load_dotenv() at import time).
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url="wss://api.openai.com/v1",
    )
    assistant = MultimodalAgent(model=model)
    # Hand the agent the participant we just waited for, so it is linked to
    # that participant explicitly instead of picking one from the room itself.
    assistant.start(ctx.room, participant)

    # NOTE(review): presumably start() has opened the model's first session by
    # this point -- confirm against the installed livekit-agents version.
    session = model.sessions[0]
    # Seed the conversation with a user turn so the model speaks first.
    session.conversation.item.create(
        llm.ChatMessage(
            role="user",
            content="Please begin the interaction with the user in a manner consistent with your instructions.",
        )
    )
    session.response.create()

def main(livekit_url):
    """Start the multimodal LiveKit worker against ``livekit_url``.

    Args:
        livekit_url: WebSocket URL of the LiveKit server, passed to the
            worker as ``ws_url``.
    """
    # Workers have to be run as CLIs right now, so we simulate running
    # "[this file] dev": the path to this file becomes the first argument
    # and 'dev' the second.
    sys.argv = [str(__file__), "dev"]

    # Describe the worker: which entrypoint to run and how to reach the server.
    worker_options = WorkerOptions(
        entrypoint_fnc=entrypoint,
        api_key="devkey",
        api_secret="secret",
        ws_url=livekit_url,
        port=8082,
    )

    # Initialize the worker with the entrypoint.
    cli.run_app(worker_options)

0 comments on commit 207ec08

Please sign in to comment.