Commit
optimize tts output, add ui visuals and spectrums, optimize ui, improve main logic, fix turn handling
lef-fan committed Feb 19, 2024
1 parent 1eed76e commit 5faaa25
Showing 12 changed files with 224 additions and 91 deletions.
Binary file added assets/loading.gif
Binary file added assets/muted_mic.gif
Binary file added assets/transition.gif
File renamed without changes.
4 changes: 2 additions & 2 deletions components/ap.py
@@ -8,8 +8,8 @@ def __init__(self, params=None):
self.device = self.params.get('device', None)
self.listening_sound_path = self.params.get('assets', None).get('listening_sound', None)
self.listening_sound, self.listening_sound_sr = sf.read(self.listening_sound_path)
self.speaking_sound_path = self.params.get('assets', None).get('speaking_sound', None)
self.speaking_sound, self.speaking_sound_sr = sf.read(self.speaking_sound_path)
self.transition_sound_path = self.params.get('assets', None).get('transition_sound', None)
self.transition_sound, self.transition_sound_sr = sf.read(self.transition_sound_path)

if self.device == "default":
self.device = None
15 changes: 6 additions & 9 deletions components/llm.py
@@ -2,8 +2,6 @@
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from .utils import remove_emojis
from .utils import remove_multiple_dots
from .utils import remove_code_blocks


class Llm:
@@ -47,6 +45,8 @@ def get_answer(self, ui, tts, data):
self.messages,
stream=self.streaming_output
)
ui.load_visual("Aria")

if self.streaming_output:
llm_output = ""
tts_text_buffer = []
@@ -56,7 +56,7 @@ def get_answer(self, ui, tts, data):
for i, out in enumerate(outputs):
if "content" in out['choices'][0]["delta"]:
output_chunk_txt = out['choices'][0]["delta"]['content']
if output_chunk_txt == "``":
if output_chunk_txt == "``": # TODO remove the remaining backticks in ui
skip_code_block_on_tts = not skip_code_block_on_tts
color_code_block = not color_code_block
if i == 1:
@@ -69,10 +69,10 @@ def get_answer(self, ui, tts, data):
llm_output += output_chunk_txt
if not skip_code_block_on_tts:
tts_text_buffer.append(output_chunk_txt)
if tts_text_buffer[-1] in [".", "!", "?", ":", "..", "..."]:
tts.run_tts(remove_emojis("".join(tts_text_buffer).strip()))
if tts_text_buffer[-1] in [".", "!", "?", ":", "..", "..."]: # TODO fix last sentence when not code and not have any of the stops
tts.run_tts(ui, remove_emojis("".join(tts_text_buffer).strip())) # TODO remove multi dots
tts_text_buffer = []
tts.check_last_chunk()
tts.check_audio_finished()
print()
llm_output = llm_output.strip()
else:
@@ -85,7 +85,4 @@ def get_answer(self, ui, tts, data):
}
)

if not self.streaming_output:
llm_output = remove_emojis(remove_multiple_dots(remove_code_blocks(llm_output)))

return llm_output
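
The change above moves speech output from post-hoc cleanup to sentence-level streaming: tokens accumulate in tts_text_buffer and are flushed to tts.run_tts as soon as a stop character arrives, so playback can begin before the LLM finishes. A minimal sketch of that buffering pattern (illustrative names; synthesize stands in for tts.run_tts, and the trailing flush sketches the fix the TODO on the stop-character check alludes to):

SENTENCE_STOPS = {".", "!", "?", ":", "..", "..."}

def stream_sentences(token_stream, synthesize):
    buffer = []
    for token in token_stream:
        buffer.append(token)
        if token in SENTENCE_STOPS:
            # Flush each completed sentence immediately instead of
            # waiting for the whole LLM answer.
            synthesize("".join(buffer).strip())
            buffer = []
    if buffer:
        # Trailing text without a stop character would otherwise be dropped.
        synthesize("".join(buffer).strip())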
50 changes: 35 additions & 15 deletions components/tts.py
@@ -1,4 +1,5 @@
import os
import time
import warnings
import pyaudio
import numpy as np
@@ -11,6 +12,7 @@
class Tts:
def __init__(self, params=None):
self.params = params or {}
self.audio_device = self.params.get('audio_device', None)
self.device = self.params.get('device', None)
self.samplerate = self.params.get('samplerate', None)
self.buffer_size = self.params.get('buffer_size', None)
@@ -22,16 +24,22 @@ def __init__(self, params=None):
self.verbose = self.params.get('verbose', None)
self.voice_to_clone = self.params.get('assets', None).get('voice_to_clone', None)
self.sample_format = pyaudio.paFloat32

if self.audio_device == "default":
self.audio_device = None

p = pyaudio.PyAudio()
self.stream = p.open(format=self.sample_format,
channels=self.channels,
rate=self.samplerate,
frames_per_buffer=self.buffer_size,
output=True
output=True,
output_device_index=self.audio_device,
stream_callback=self._callback
)

self.last_chunk_length = self.buffer_size

self.audio_buffer = None
self.ui = None

if not self.verbose:
warnings.filterwarnings("ignore", module="TTS")
@@ -53,11 +61,23 @@ def __init__(self, params=None):
self.gpt_cond_latent, self.speaker_embedding = self.model.get_conditioning_latents(
audio_path=[self.voice_to_clone]
)

def audio_stream(self, buffer):
self.stream.write(buffer.tobytes())

def run_tts(self, data):

def _callback(self, in_data, frame_count, time_info, status):
if self.audio_buffer is None:
data = np.zeros(frame_count)
elif len(self.audio_buffer) >= frame_count:
data = self.audio_buffer[:frame_count]
self.audio_buffer = self.audio_buffer[frame_count:]
else:
shortfall = frame_count - len(self.audio_buffer)
data = np.concatenate((self.audio_buffer, np.zeros(shortfall, dtype=np.float32)))
self.audio_buffer = None
if self.ui is not None:
self.ui.update_visual("Aria", data)
return (data.tobytes(), pyaudio.paContinue)

def run_tts(self, ui, data):
self.ui = ui
tts_stream = self.model.inference_stream(
data,
"en",
@@ -66,15 +86,15 @@ def run_tts(self, data):
enable_text_splitting=self.text_splitting
)
for chunk in tts_stream:
self.last_chunk_length = chunk.shape[-1]
chunk = chunk.squeeze()
if self.device == 'gpu':
chunk = chunk.cpu()
self.audio_stream(chunk.numpy())
if self.audio_buffer is None:
self.audio_buffer = chunk.numpy()
else:
self.audio_buffer = np.concatenate((self.audio_buffer, chunk.numpy()))

return 'tts done'
return 'tts_done'

def check_last_chunk(self):
if self.last_chunk_length < self.buffer_size:
chunk = np.zeros(self.buffer_size - self.last_chunk_length)
self.audio_stream(chunk)
def check_audio_finished(self):
time.sleep(len(self.audio_buffer) / self.samplerate)
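
tts.py now decouples synthesis from playback: run_tts appends model chunks to self.audio_buffer while a PyAudio callback drains it in fixed-size frames, zero-padding on underrun and feeding each frame to the UI visualizer. A standalone sketch of that callback pattern under assumed parameters (24 kHz mono float32; not the commit's exact code):

import numpy as np
import pyaudio

SAMPLERATE = 24000  # assumed sample rate
audio_buffer = np.zeros(0, dtype=np.float32)

def callback(in_data, frame_count, time_info, status):
    global audio_buffer
    if len(audio_buffer) >= frame_count:
        frame, audio_buffer = audio_buffer[:frame_count], audio_buffer[frame_count:]
    else:
        # Not enough queued audio: pad with float32 silence instead of
        # underrunning the device.
        pad = np.zeros(frame_count - len(audio_buffer), dtype=np.float32)
        frame, audio_buffer = np.concatenate((audio_buffer, pad)), np.zeros(0, dtype=np.float32)
    return (frame.tobytes(), pyaudio.paContinue)

p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paFloat32, channels=1, rate=SAMPLERATE,
                frames_per_buffer=1024, output=True, stream_callback=callback)

Note that check_audio_finished sleeps for the remaining buffer duration, which assumes run_tts has already populated audio_buffer; a still-None buffer would raise a TypeError.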
190 changes: 140 additions & 50 deletions components/ui.py
@@ -1,6 +1,7 @@
import tkinter as tk
import numpy as np
import scipy.fft
from PIL import Image, ImageTk, ImageSequence


class Ui:
@@ -9,6 +10,9 @@ def __init__(self, params=None):
self.window_title = self.params.get('window_title', None)
self.window_size = self.params.get('window_size', None)
self.icon = self.params.get('assets', None).get('icon', None)
self.loading_gif = self.params.get('assets', None).get('loading_gif', None)
self.transition_gif = self.params.get('assets', None).get('transition_gif', None)
self.muted_mic_gif = self.params.get('assets', None).get('muted_mic_gif', None)

self.root = tk.Tk()
self.root.title(self.window_title)
@@ -18,17 +22,12 @@ def __init__(self, params=None):
self.root.configure(bg="black")
self.root.resizable(True, True)

self.spectrum_widget = tk.Canvas(self.root, bg="black", width=self.window_size, height=int(int(self.window_size)/2))
self.spectrum_widget.pack(expand=True, fill="both", padx=10, pady=10)

self.center_x, self.center_y = int(int(self.window_size)/2), int(int(self.window_size)/4)
self.radius = 100
self.min_radius = 50
self.circle = self.spectrum_widget.create_oval(self.center_x - self.radius,
self.center_y - self.radius,
self.center_x + self.radius,
self.center_y + self.radius,
outline="white", width=2, fill="white")
self.visual_widget = tk.Canvas(
self.root,
bg="black",
width=self.window_size,
height=int(int(self.window_size)/2))
self.visual_widget.pack(expand=True, fill="both", padx=10, pady=10)

self.scrollbar = tk.Scrollbar(self.root, bg="black")
self.scrollbar.pack(side="right", fill="y")
@@ -46,11 +45,140 @@ def __init__(self, params=None):
self.text_widget.bind("<Button-3>", self.show_context_menu)
self.text_widget.bind("<Button-1>", self.close_context_menu)

self.update_spectrum_viz("system", None)
loading_gif = Image.open(self.loading_gif)
self.loading_frames = [
ImageTk.PhotoImage(frame.convert("RGBA").resize((250, 250), Image.ANTIALIAS))
for frame in ImageSequence.Iterator(loading_gif)]

transition_gif = Image.open(self.transition_gif)
self.transition_frames = [
ImageTk.PhotoImage(frame.convert("RGBA").resize((250, 250), Image.ANTIALIAS))
for frame in ImageSequence.Iterator(transition_gif)]

muted_mic_gif = Image.open(self.muted_mic_gif)
self.muted_mic_frames = [
ImageTk.PhotoImage(frame.convert("RGBA").resize((250, 250), Image.ANTIALIAS))
for frame in ImageSequence.Iterator(muted_mic_gif)]

self.visual_x = int(int(self.window_size)/2)
self.visual_y = int(int(self.window_size)/4)

self.listening_color = "#FFFFFF"
self.listening_max_percentage = 0.85
self.listening_sensitivity_factor = 1000
self.listening_min_radius = 50
self.listening_radius = 100

self.speaking_BAR_COUNT = 5
# self.speaking_MIN_SIZES = [10, 30, 50, 30, 10]
self.speaking_MIN_SIZES = [0, 2, 5, 2, 0]
self.speaking_MAX_SIZES = [70, 90, 110, 90, 70]
self.speaking_BAR_WIDTH = 20
self.speaking_BAR_SPACING = 10

self.root.bind("<Configure>", self.on_resize)
self.root.protocol("WM_DELETE_WINDOW", self.on_closing)
self.kill = False

self.visual_stop = False
self.load_visual("system_init")

def on_resize(self, event):
self.visual_x = int(int(self.root.winfo_width())/2)
self.visual_y = int(int(self.root.winfo_height())/4)

def run_visual(self, frames, frame_num):
self.visual_widget.coords(self.visual_widget_item, self.visual_x, self.visual_y)
frame = frames[frame_num]
self.visual_widget.itemconfig(self.visual_widget_item, image=frame)
frame_num = (frame_num + 1) % len(frames)
if not self.visual_stop:
self.visual_after_id = self.root.after(20, self.run_visual, frames, frame_num)

def update_visual(self, user_name, data, time_color_warning=0):
if user_name == "You":
spectrum = np.abs(scipy.fft.fft(data))
amplitude = spectrum.mean()
max_radius = min(self.visual_widget.winfo_reqwidth(), self.visual_widget.winfo_reqheight()) * self.listening_max_percentage / 2
scaled_radius = min(amplitude * self.listening_sensitivity_factor, max_radius)
self.listening_radius = int(0.8 * self.listening_radius + 0.2 * max(scaled_radius, self.listening_min_radius))
oval_coords = (
self.visual_x - self.listening_radius,
self.visual_y - self.listening_radius,
self.visual_x + self.listening_radius,
self.visual_y + self.listening_radius
)
if 0 < time_color_warning < 0.5:
self.listening_color = "#FF0000"
elif 0.5 < time_color_warning < 1:
self.listening_color = "#B93C3C"
elif 1 < time_color_warning < 1.5:
self.listening_color = "#8A4B4B"
elif 1.5 < time_color_warning < 2:
self.listening_color = "#584848"
else:
self.listening_color = "#FFFFFF"
self.visual_widget.coords(self.visual_widget_item, oval_coords)
self.visual_widget.itemconfig(self.visual_widget_item, outline=self.listening_color , fill=self.listening_color)
elif user_name == "Aria":
spectrum = np.abs(scipy.fft.fft(data))
amplitude = spectrum.mean()
for i in range(self.speaking_BAR_COUNT):
height = self.speaking_MIN_SIZES[i] + (self.speaking_MAX_SIZES[i] - self.speaking_MIN_SIZES[i]) * amplitude
self.visual_widget.coords(self.visual_widget_items[i],
self.visual_x + i * (self.speaking_BAR_WIDTH + self.speaking_BAR_SPACING) - (self.speaking_BAR_WIDTH + self.speaking_BAR_SPACING) * self.speaking_BAR_COUNT / 2,
self.visual_y - self.bar_height / 2 + height / 2 + self.bar_height / 2,
(self.visual_x + i * (self.speaking_BAR_WIDTH + self.speaking_BAR_SPACING) - (self.speaking_BAR_WIDTH + self.speaking_BAR_SPACING) * self.speaking_BAR_COUNT / 2) + self.speaking_BAR_WIDTH,
self.visual_y - self.bar_height / 2 - height / 2 + self.bar_height / 2)

def load_visual(self, user_name):
self.stop_visual()
if user_name == "system_init":
self.visual_widget_item = self.visual_widget.create_image(
self.visual_x, self.visual_y, image=self.loading_frames[0])
self.start_visual()
self.run_visual(self.loading_frames, 0)
elif user_name == "system_transition":
self.visual_widget_item = self.visual_widget.create_image(
self.visual_x, self.visual_y, image=self.transition_frames[0])
self.start_visual()
self.run_visual(self.transition_frames, 0)
elif user_name == "system_muted_mic":
self.visual_widget_item = self.visual_widget.create_image(
self.visual_x, self.visual_y, image=self.muted_mic_frames[0])
self.start_visual()
self.run_visual(self.muted_mic_frames, 0)
elif user_name == "You":
oval_coords = (
self.visual_x - self.listening_radius,
self.visual_y - self.listening_radius,
self.visual_x + self.listening_radius,
self.visual_y + self.listening_radius
)
self.visual_widget_item = self.visual_widget.create_oval(oval_coords,
outline=self.listening_color,
width=2,
fill=self.listening_color)
elif user_name == "Aria":
self.bar_height = max(self.speaking_MAX_SIZES) + 20
self.visual_widget_items = []
for i in range(self.speaking_BAR_COUNT):
x0 = self.visual_x + i * (self.speaking_BAR_WIDTH + self.speaking_BAR_SPACING) - (self.speaking_BAR_WIDTH + self.speaking_BAR_SPACING) * self.speaking_BAR_COUNT / 2
y0 = self.visual_y - self.bar_height / 2 - self.speaking_MIN_SIZES[i] / 2 + self.bar_height / 2
x1 = x0 + self.speaking_BAR_WIDTH
y1 = self.visual_y - self.bar_height / 2 + self.speaking_MIN_SIZES[i] / 2 + self.bar_height / 2
bar = self.visual_widget.create_rectangle(x0, y0, x1, y1, fill='#832DFF')
self.visual_widget_items.append(bar)

def stop_visual(self):
self.visual_stop = True
if hasattr(self, 'visual_after_id'):
self.root.after_cancel(self.visual_after_id)
self.visual_widget.delete("all")

def start_visual(self):
self.visual_stop = False

def show_context_menu(self, event):
self.context_menu.post(event.x_root, event.y_root)
if self.text_widget.tag_ranges(tk.SEL):
@@ -67,44 +195,6 @@ def copy_text(self):
selected_text = self.text_widget.get(tk.SEL_FIRST, tk.SEL_LAST)
self.text_widget.clipboard_clear()
self.text_widget.clipboard_append(selected_text)

def update_spectrum_viz(self, user_name, data, time_color_warning=0):
if user_name == "You":
spectrum = np.abs(scipy.fft.fft(data))
amplitude = spectrum.mean()
max_percentage = 0.85
max_radius = min(self.spectrum_widget.winfo_reqwidth(), self.spectrum_widget.winfo_reqheight()) * max_percentage / 2
sensitivity_factor = 1000
scaled_radius = min(amplitude * sensitivity_factor, max_radius)
self.radius = int(0.8 * self.radius + 0.2 * max(scaled_radius, self.min_radius))
self.spectrum_widget.delete("all")
current_window_width = self.root.winfo_width()
current_window_height = self.root.winfo_height()
self.center_x, self.center_y = int(int(current_window_width)/2), int(int(current_window_height)/4)
oval_coords = (
self.center_x - self.radius,
self.center_y - self.radius,
self.center_x + self.radius,
self.center_y + self.radius
)
if 0 < time_color_warning < 0.5:
color = "#FF0000"
elif 0.5 < time_color_warning < 1:
color = "#B93C3C"
elif 1 < time_color_warning < 1.5:
color = "#8A4B4B"
elif 1.5 < time_color_warning < 2:
color = "#584848"
else:
color = "#FFFFFF"
self.spectrum_widget.create_oval(oval_coords, outline=color, width=2, fill=color)
self.spectrum_widget.update()
elif user_name == "Aria":
self.spectrum_widget.delete("all")
self.spectrum_widget.update()
elif user_name == "system":
self.spectrum_widget.delete("all")
self.spectrum_widget.update()

def add_message(self, user_name, text, new_entry=False, color_code_block=False):
color = "white"
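
The rewritten ui.py replaces the single spectrum canvas with a small state machine: load_visual builds the right canvas items per state ("system_init", "system_transition", "system_muted_mic", "You", "Aria"), run_visual advances GIF frames every 20 ms via root.after, and update_visual maps FFT amplitude onto the listening circle or the speaking bars. A sketch of the amplitude smoothing behind the circle (constants mirror the listening_* fields in __init__; max_radius here stands in for the widget-size-derived cap):

import numpy as np
import scipy.fft

def smoothed_radius(chunk, prev_radius, min_radius=50,
                    sensitivity=1000, max_radius=300):
    # Mean FFT magnitude as a rough loudness measure for the frame.
    amplitude = np.abs(scipy.fft.fft(chunk)).mean()
    target = min(amplitude * sensitivity, max_radius)
    # Exponential smoothing (80% previous, 20% new), clamped below by
    # min_radius so the circle never collapses between words.
    return int(0.8 * prev_radius + 0.2 * max(target, min_radius))

The 20 ms after() reschedule in run_visual keeps GIF playback on the Tk event loop, and stop_visual cancels the pending callback before clearing the canvas so stale frames cannot redraw.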
6 changes: 5 additions & 1 deletion components/vad.py
@@ -42,7 +42,11 @@ def __init__(self, params=None):
sampling_rate=self.samplerate,
min_silence_duration_ms=100,
speech_pad_ms=30
)
)

def reset_vad(self):
self.no_voice_sec = 0
self.vad_iterator.reset_states()

def check(self, mic_chunk, chunk_time):
speech_dict = self.vad_iterator(mic_chunk, return_seconds=False)
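
The new reset_vad clears both the accumulated-silence counter and the Silero iterator's internal state, so each conversation turn starts fresh; this is part of the turn-handling fix named in the commit message. A hedged sketch of where such a reset fits (the loop, mic, and is_turn_over are illustrative assumptions, not code from this commit):

def is_turn_over(speech_dict):
    # Hypothetical end-of-turn predicate; the real logic is assumed
    # to live in the main loop, which this commit does not show.
    return speech_dict is not None and "end" in speech_dict

def conversation_loop(vad, mic, handle_turn):
    while True:
        chunk = mic.read()  # one fixed-size mic buffer (illustrative API)
        speech_dict = vad.check(chunk, chunk_time=len(chunk) / vad.samplerate)
        if is_turn_over(speech_dict):
            handle_turn()
            vad.reset_vad()  # silence timer and iterator state back to zero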