Skip to content

Commit

Permalink
Fix speaker id selection
Browse files Browse the repository at this point in the history
  • Loading branch information
redromnon committed Jun 4, 2024
1 parent aa7d8d3 commit a3a60ae
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 4 deletions.
8 changes: 5 additions & 3 deletions dimits/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,16 @@

class Dimits():
"""Dimits"""
def __init__(self, voice: str, verbose: bool = True, modelDirectory: str = None):
def __init__(self, voice: str, verbose: bool = True, modelDirectory: str = None, speaker_id: int = None):
"""
Initialize a new instance of Dimits with the provided voice and verbosity.
Args:
voice (str): The voice to use for text-to-speech.
verbose (bool): Whether to print verbose output.
modelDirectory (str): The local path to the model file. If not provided (i.e., None), the model hosted on GitHub is used by default.
speaker_id (int): A particular speaker from the voice. The first one is used by default.
Returns:
None
Expand Down Expand Up @@ -64,6 +65,7 @@ def __init__(self, voice: str, verbose: bool = True, modelDirectory: str = None)
# Set the path to the ONNX voice file

self.voice_onnx = voice
self.speaker = speaker_id
self.voice_onnx = os.path.join(
self.parent_destn, str(self.voice_onnx) + '.onnx')
logger('Using ' + str(self.voice_onnx), verbose=verbose)
Expand Down Expand Up @@ -150,7 +152,7 @@ def text_2_audio_file(self, text: str, filename: str, directory: str, format: st
filepath = os.path.join(directory, f'{filename}.{format}')


out_bin = self.tts_model.synthesize(text,length_scale=1.0, noise_scale=1.0, noise_w=1.0)
out_bin = self.tts_model.synthesize(text,length_scale=1.0, noise_scale=1.0, noise_w=1.0, speaker_id=self.speaker)
with open(filepath, 'wb') as f:
f.write(out_bin)

Expand Down
6 changes: 5 additions & 1 deletion dimits/ttsmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ def synthesize(
length_scale = length_scale or self.config.inference.length_scale
noise_scale = noise_scale or self.config.inference.noise_scale
noise_w = noise_w or self.config.inference.noise_w

# Set default speaker
if (self.config.num_speakers > 1) and (speaker_id is None):
speaker_id = 0

# Convert text to phonemes
phonemes = self._text_to_phonemes(text)
Expand Down Expand Up @@ -152,7 +156,7 @@ def _create_inputs(self, phoneme_ids, speaker_id, noise_scale, length_scale, noi
"input": phoneme_ids,
"input_lengths": length,
"scales": scales,
"speaker": speaker
"sid": speaker
}

def _float_to_int16(self, audio: np.ndarray) -> np.ndarray:
Expand Down

0 comments on commit a3a60ae

Please sign in to comment.