Adds options for zoomer mode and for keeping leading whitespace

spartanhaden · Mar 16, 2023 · 35eb95f
1 parent 8c34931
commit 35eb95f
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 28 deletions.
diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@ python3 main.py --help
 ```
 
 ```
-usage: main.py [-h] [--model_name {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}] [--mic_device MIC_DEVICE] [--inference_device INFERENCE_DEVICE] [--activation_key ACTIVATION_KEY]
+usage: main.py [-h] [--model_name {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}] [--mic_device MIC_DEVICE] [--inference_device INFERENCE_DEVICE] [--activation_key ACTIVATION_KEY] [--keep_leading_whitespace] [--zoomer_mode]
 
 a small script that types what you say using whisper while holding a hotkey
 
@@ -39,6 +39,9 @@ options:
                         the device to run the inference on. can be cpu, cuda, or cuda:<device number> will automatically select the best device if not specified
   --activation_key ACTIVATION_KEY
                         the key to use for push to talk. can be like <ctrl_r> or <alt_l> or <f1> or e or 1 or 2 or 3 etc
+  --keep_leading_whitespace
+                        keep the leading space that whisper outputs
+  --zoomer_mode         makes everything lowercase and removes all trailing periods
 ```
 
 Defaults
@@ -48,4 +51,6 @@ model_name: "base.en"
 mic_device: "default"
 inference_device: None
 activation_key: "<ctrl_r>"
+keep_leading_whitespace: False
+zoomer_mode: False
 ```
diff --git a/main.py b/main.py
@@ -12,21 +12,21 @@
 
 class SpeachToText:
     def __init__(self) -> None:
-        args = self.process_args()
+        self.args = self.process_args()
 
         self.p = pyaudio.PyAudio()
 
         # find the index of the specified device
         device_index = None
         for i in range(self.p.get_device_count()):
             info = self.p.get_device_info_by_index(i)
-            if info['name'] == args.mic_device:
+            if info['name'] == self.args.mic_device:
                 device_index = info['index']
                 break
 
         if device_index is None:
             self.p.terminate()
-            raise ValueError(f'Could not find audio device named {args.mic_device}')
+            raise ValueError(f'Could not find audio device named {self.args.mic_device}')
 
         # open the audio stream
         self.chunk_size = 512
@@ -50,7 +50,7 @@ def __init__(self) -> None:
         start_time = time.time()
 
         # load the whisper model
-        self.model = whisper.load_model(name=args.model_name, device=args.inference_device)
+        self.model = whisper.load_model(name=self.args.model_name, device=self.args.inference_device)
 
         # make first inference on real data faster by forcing the model to finish loading since in_memory does not seem to fully preload the model
         self.model.transcribe(np.zeros(201, dtype=np.float32))
@@ -99,6 +99,8 @@ def process_args(self):
         parser.add_argument('--inference_device', type=str, default=None,
                             help='the device to run the inference on. can be cpu, cuda, or cuda:<device number> will automatically select the best device if not specified')
         parser.add_argument('--activation_key', type=str, default='<ctrl_r>', help='the key to use for push to talk. can be like <ctrl_r> or <alt_l> or <f1> or e or 1 or 2 or 3 etc')
+        parser.add_argument('--keep_leading_whitespace', action='store_true', help='keep the leading space that whisper outputs')
+        parser.add_argument('--zoomer_mode', action='store_true', help='makes everything lowercase and removes all trailing periods')
         args = parser.parse_args()
 
         # print the model
@@ -137,6 +139,31 @@ def infer(self, frames):
         # return just the text
         return whisper_output['text']
 
+    # cleans up the output text: always strips leading whitespace and returns '' when nothing but periods is left; with --zoomer_mode it also drops trailing periods and lowercases the text; with --keep_leading_whitespace a single space is put back in front
+    def sanitize_output(self, output_text):
+        # strip leading whitespace (always done here; optionally restored as a single space at the end)
+        output_text = output_text.lstrip()
+
+        # nothing worth returning if the text is empty or consists only of periods
+        if output_text == '' or output_text == '.' * len(output_text):
+            return ''
+
+        # zoomer mode only: remove all the trailing periods and make everything lowercase (the emptiness check inside the loop is just a safeguard, all-period text has already returned above)
+        if self.args.zoomer_mode:
+            while output_text[-1] == '.':
+                output_text = output_text[:-1]
+                if output_text == '':
+                    return ''
+
+            # convert the text to lowercase
+            output_text = output_text.lower()
+
+        # re-add a single leading space if the user wants it (the whitespace stripped above is not restored verbatim)
+        if self.args.keep_leading_whitespace:
+            output_text = ' ' + output_text
+
+        return output_text
+
     # listens to the audio stream and then processes the frames and types the output text
     def listen(self):
         self.activation_key_pressed = True
@@ -163,32 +190,17 @@ def listen(self):
         # reset the frames
         self.frames = []
 
-        # strip leading whitespace
-        output_text = output_text.lstrip()
+        output_text = self.sanitize_output(output_text)
 
-        # check if the output text is empty or is just periods
-        if output_text == '' or output_text == '.' * len(output_text):
+        if output_text == '':
             print('nothing detected')
-            self.activation_key_pressed = False
-            return
+        else:
+            print(output_text)
 
-        # remove all the trailing periods but also handle the case where there is only periods
-        while output_text[-1] == '.':
-            output_text = output_text[:-1]
-            if output_text == '':
-                print('nothing detected')
-                self.activation_key_pressed = False
-                return
-
-        # convert the text to lowercase
-        output_text = output_text.lower()
-
-        print(output_text)
-
-        # type the output text
-        self.currently_typing = True
-        self.keyboard.type(output_text)
-        self.currently_typing = False
+            # type the output text
+            self.currently_typing = True
+            self.keyboard.type(output_text)
+            self.currently_typing = False
 
         # reset the activation key
         self.activation_key_pressed = False