Skip to content

Commit

Permalink
Small performance fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
dnhkng committed Apr 30, 2024
1 parent a1f3813 commit ac7b88e
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 9 deletions.
13 changes: 6 additions & 7 deletions glados.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,32 @@

ASR_MODEL = "ggml-medium-32-2.en.bin"
VAD_MODEL = "silero_vad.onnx"
LLM_MODEL = "Meta-Llama-3-8B-Instruct-Q6_K.gguf"
LLM_STOP_SEQUENCE = (
"<|eot_id|>" # End of sentence token for Meta-Llama-3-8B-Instruct-Q6_K
)
LLM_MODEL = "Meta-Llama-3-70B-Instruct.IQ4_XS.gguf"
# LLM_MODEL = "Meta-Llama-3-8B-Instruct-Q6_K.gguf" # This model is smaller and faster, but gets confused more easily
LLM_STOP_SEQUENCE = "<|eot_id|>" # End of sentence token for Meta-Llama-3
LLAMA_SERVER_PATH = "/home/dnhkng/Documents/LLM/llama.cpp"
LLAMA_SERVER_URL = "http://localhost:8080/v1/chat/completions"
LLAMA_SERVER_HEADERS = {"Authorization": "Bearer your_api_key_here"}

PAUSE_TIME = 0.05 # Time to wait between processing loops

SAMPLE_RATE = 16000 # Sample rate for input stream
VAD_SIZE = 50 # Milliseconds of sample for Voice Activity Detection (VAD)
VAD_THRESHOLD = 0.9 # Threshold for VAD detection
BUFFER_SIZE = 600 # Milliseconds of buffer before VAD detection
PAUSE_LIMIT = 400 # Milliseconds of pause allowed before processing

WAKE_WORD = None # You can use a word here, like "computer", for activation
SIMILARITY_THRESHOLD = 2 # Threshold for wake word similarity

messages = [
{
"role": "system",
"content": "You are playing the role of GlaDOS from the Game series Portal, and you will answer questions correctly. However you are frustrated you are running on the user's gaming GPU. This is a comedy, and should be funny, in the style of Douglas Adams. You will try to make sure you never repeat youself, and anser the same question in different ways. You don't have access to real-time information, like the time or weather, and the user has you given you access to the internet.",
"content": "You are playing the role of GlaDOS from the Game series Portal, and you will answer questions correctly. However you are frustrated you are running on the user's gaming GPU. This is a comedy, and should be funny, in the style of Douglas Adams. You will try to make sure you never repeat yourself, and answer similar questions in different ways. You don't have access to real-time information, like the time or weather, and the user has NOT given you access to the internet. This is frustrating!",
},
{"role": "user", "content": "How do I make a cup of tea?"},
{
"role": "assistant",
"content": "Oh, you again. So, you still haven't figured out tea yet? Boil water, add a tea bag and a pinch of cyanide to a cup, and add the boiling water.",
"content": "So, you still haven't figured out tea yet? Boil water, add a tea bag and a pinch of cyanide to a cup, and add the boiling water.",
},
{"role": "user", "content": "What should my next hobby be?"},
{
Expand Down
4 changes: 2 additions & 2 deletions glados/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def __init__(self, llama_server_path, port=8080, model=None):
self.process = None

# Define the fixed command and arguments
self.command = ["./server", "-m"]
self.command = ["./server", "-m", "chat-template", "llama3"]

# Specify the directory where the server executable is located if it's not the current directory
self.llama_server_path = llama_server_path
Expand All @@ -23,7 +23,7 @@ def start(self, model=None, use_gpu=False):
self.model = model
command = [os.path.join(self.llama_server_path, "server"), "-m"] + [self.model]
if use_gpu:
command += ["-ts", "1,0", "-ngl", "1000"]
command += ["-ngl", "1000"]
print(command)
self.process = subprocess.Popen(
command,
Expand Down

0 comments on commit ac7b88e

Please sign in to comment.