Skip to content

Commit

Permalink
Adds option for zoomer mode and to keep leading whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
spartanhaden committed Mar 16, 2023
1 parent 8c34931 commit 35eb95f
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 28 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ python3 main.py --help
```

```
usage: main.py [-h] [--model_name {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}] [--mic_device MIC_DEVICE] [--inference_device INFERENCE_DEVICE] [--activation_key ACTIVATION_KEY]
usage: main.py [-h] [--model_name {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large}] [--mic_device MIC_DEVICE] [--inference_device INFERENCE_DEVICE] [--activation_key ACTIVATION_KEY] [--keep_leading_whitespace] [--zoomer_mode]
a small script that types what you say using whisper while holding a hotkey
Expand All @@ -39,6 +39,9 @@ options:
the device to run the inference on. can be cpu, cuda, or cuda:<device number> will automatically select the best device if not specified
--activation_key ACTIVATION_KEY
the key to use for push to talk. can be like <ctrl_r> or <alt_l> or <f1> or e or 1 or 2 or 3 etc
--keep_leading_whitespace
keep the leading space that whisper outputs
--zoomer_mode makes everything lowercase and removes all trailing periods
```

Defaults
Expand All @@ -48,4 +51,6 @@ model_name: "base.en"
mic_device: "default"
inference_device: None
activation_key: "<ctrl_r>"
keep_leading_whitespace: False
zoomer_mode: False
```
66 changes: 39 additions & 27 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,21 @@

class SpeachToText:
def __init__(self) -> None:
args = self.process_args()
self.args = self.process_args()

self.p = pyaudio.PyAudio()

# find the index of the specified device
device_index = None
for i in range(self.p.get_device_count()):
info = self.p.get_device_info_by_index(i)
if info['name'] == args.mic_device:
if info['name'] == self.args.mic_device:
device_index = info['index']
break

if device_index is None:
self.p.terminate()
raise ValueError(f'Could not find audio device named {args.mic_device}')
raise ValueError(f'Could not find audio device named {self.args.mic_device}')

# open the audio stream
self.chunk_size = 512
Expand All @@ -50,7 +50,7 @@ def __init__(self) -> None:
start_time = time.time()

# load the whisper model
self.model = whisper.load_model(name=args.model_name, device=args.inference_device)
self.model = whisper.load_model(name=self.args.model_name, device=self.args.inference_device)

# make first inference on real data faster by forcing the model to finish loading since in_memory does not seem to fully preload the model
self.model.transcribe(np.zeros(201, dtype=np.float32))
Expand Down Expand Up @@ -99,6 +99,8 @@ def process_args(self):
parser.add_argument('--inference_device', type=str, default=None,
help='the device to run the inference on. can be cpu, cuda, or cuda:<device number> will automatically select the best device if not specified')
parser.add_argument('--activation_key', type=str, default='<ctrl_r>', help='the key to use for push to talk. can be like <ctrl_r> or <alt_l> or <f1> or e or 1 or 2 or 3 etc')
parser.add_argument('--keep_leading_whitespace', action='store_true', help='keep the leading space that whisper outputs')
parser.add_argument('--zoomer_mode', action='store_true', help='makes everything lowercase and removes all trailing periods')
args = parser.parse_args()

# print the model
Expand Down Expand Up @@ -137,6 +139,31 @@ def infer(self, frames):
# return just the text
return whisper_output['text']

# cleans up raw whisper text before it is typed: always drops leading whitespace,
# and depending on the command line flags lowercases it, strips trailing periods,
# and puts a single leading space back
def sanitize_output(self, output_text):
    """Return the cleaned-up version of ``output_text``.

    Reads ``self.args.zoomer_mode`` and ``self.args.keep_leading_whitespace``.

    Args:
        output_text: the raw transcription string.

    Returns:
        ``''`` when the text is empty, whitespace-only, or consists solely of
        periods once leading whitespace is removed (regardless of flags).
        Otherwise the text with leading whitespace removed; with
        ``zoomer_mode`` the trailing periods are also removed and the text is
        lowercased; with ``keep_leading_whitespace`` exactly one space is
        prepended (a single ``' '``, not the whitespace that was stripped).
    """
    text = output_text.lstrip()

    # nothing worth typing: empty, or nothing but periods
    if not text.strip('.'):
        return ''

    # the guard above guarantees at least one non-period character, so
    # stripping the trailing periods can never leave an empty string here
    if self.args.zoomer_mode:
        text = text.rstrip('.').lower()

    # re-add the leading space if the user wants it
    if self.args.keep_leading_whitespace:
        text = ' ' + text

    return text

# listens to the audio stream and then processes the frames and types the output text
def listen(self):
self.activation_key_pressed = True
Expand All @@ -163,32 +190,17 @@ def listen(self):
# reset the frames
self.frames = []

# strip leading whitespace
output_text = output_text.lstrip()
output_text = self.sanitize_output(output_text)

# check if the output text is empty or is just periods
if output_text == '' or output_text == '.' * len(output_text):
if output_text == '':
print('nothing detected')
self.activation_key_pressed = False
return
else:
print(output_text)

# remove all the trailing periods but also handle the case where there is only periods
while output_text[-1] == '.':
output_text = output_text[:-1]
if output_text == '':
print('nothing detected')
self.activation_key_pressed = False
return

# convert the text to lowercase
output_text = output_text.lower()

print(output_text)

# type the output text
self.currently_typing = True
self.keyboard.type(output_text)
self.currently_typing = False
# type the output text
self.currently_typing = True
self.keyboard.type(output_text)
self.currently_typing = False

# reset the activation key
self.activation_key_pressed = False
Expand Down

0 comments on commit 35eb95f

Please sign in to comment.