From c58c68cb8991e4a645b9df58c4cb0d5a4bcaa6d9 Mon Sep 17 00:00:00 2001 From: haden Date: Tue, 28 Feb 2023 07:49:28 -0800 Subject: [PATCH] Adds main script --- main.py | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100755 main.py diff --git a/main.py b/main.py new file mode 100755 index 0000000..b519af3 --- /dev/null +++ b/main.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +import pickle +import pprint +import time +import wave + +import numpy as np +import pyaudio +import whisper +from pynput import keyboard + + +class SpeachToText: + def __init__(self, device_name) -> None: + self.p = pyaudio.PyAudio() + self.stream = None + + # find the index of the specified device + device_index = None + for i in range(self.p.get_device_count()): + info = self.p.get_device_info_by_index(i) + if info['name'] == device_name: + device_index = info['index'] + break + + if device_index is None: + raise ValueError(f"Could not find audio device named {device_name}") + + # open the audio stream + self.CHUNK = 512 + FORMAT = pyaudio.paInt16 + CHANNELS = 1 + RATE = 16000 + + print() + # print ready in green + print("\033[92m{}\033[00m".format("ready!")) + + self.stream = self.p.open(format=FORMAT, + channels=CHANNELS, + rate=RATE, + input_device_index=device_index, + input=True, + frames_per_buffer=self.CHUNK) + + # setup whisper + self.model = whisper.load_model("base.en") + + self.keyboard = keyboard.Controller() + + # make destructor to close the stream and terminate the pyaudio instance + def __del__(self): + # if there is a stream open close it + if self.stream is not None: + self.stream.stop_stream() + self.stream.close() + if self.p is not None: + self.p.terminate() + + # note that + def infer(self, frames): + # a np array containing the audio waveform + data = b''.join(frames) + data = np.frombuffer(data, dtype=np.int16) + + data_to_transcribe = data.copy().flatten().astype(np.float32) / 32768.0 + + whisper_output = self.model.transcribe(data_to_transcribe, language="en") + + import pprint + + # pprint.pprint(result) + # set the output text and remove the leading space + output_text = whisper_output["text"][1:] + print(output_text) + + self.keyboard.type(output_text) + + return output_text + + def run(self): + current_keys = set() + + with keyboard.Listener(on_press=lambda key: current_keys.add(key), on_release=lambda key: current_keys.discard(key)) as listener: + + # F16 key + activation_key = 269025095 + activation_key_pressed = False + + frames = [] + + try: + while True: + + if keyboard.KeyCode.from_vk(activation_key) in current_keys: + if not activation_key_pressed: + print() + # print listening in cyan + print("\033[96m{}\033[00m".format("listening...")) + self.stream.start_stream() + activation_key_pressed = True + frames_to_read = self.stream.get_read_available() + if frames_to_read > 0: + data = self.stream.read(frames_to_read, exception_on_overflow=False) + frames.append(data) + elif activation_key_pressed: + # process the frames after the activation key is released + self.stream.stop_stream() + self.infer(frames) + frames = [] + activation_key_pressed = False + + except KeyboardInterrupt: + # handle the Ctrl+C interrupt + print("Exiting...") + + +def graph_frames_from_pyaudio(frames): + import matplotlib.pyplot as plt + import numpy as np + + # get the audio data from the frames + data = b''.join(frames) + data = np.frombuffer(data, dtype=np.int16) + + # plot the audio data + plt.plot(data) + plt.show() + + +def get_devices(): + p = pyaudio.PyAudio() + + # list the devices + print() + for i in range(p.get_device_count()): + print() + pprint.pprint(p.get_device_info_by_index(i)) + + +def pickle_frames(frames): + data = b''.join(frames) + data = np.frombuffer(data, dtype=np.int16) + with open('frames.pickle', 'wb') as f: + pickle.dump(data, f) + + +def unpickle_frames(): + print("Loading frames from pickle...") + with open('frames.pickle', 'rb') as f: + data = pickle.load(f) + print("done loading") + return data + + +if __name__ == '__main__': + speech_to_text = SpeachToText('') + speech_to_text.run()