Fix speaker id selection

Reqeique · Jun 4, 2024 · a3a60ae · a3a60ae
1 parent aa7d8d3
commit a3a60ae
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 4 deletions.
diff --git a/dimits/main.py b/dimits/main.py
@@ -18,15 +18,16 @@
 
 class Dimits():
     """Dimits"""         
-    def __init__(self, voice: str, verbose: bool = True, modelDirectory: str = None):
+    def __init__(self, voice: str, verbose: bool = True, modelDirectory: str = None, speaker_id: int = None):
         """
         Initialize a new instance of Dimits with the provided voice and verbosity.
 
         Args:
             voice (str): The voice to use for text-to-speech.
             verbose (bool): Whether to print verbose output.
             model (str): represents the local path to the model file. If not provided (i.e., None), the default behavior is to utilize the model hosted on GitHub.
-
+            speaker_id (int, optional): Index of the speaker to use when the voice has multiple speakers. If None, the first speaker (id 0) is used.
+            
         Returns:
             None
         
@@ -64,6 +65,7 @@ def __init__(self, voice: str, verbose: bool = True, modelDirectory: str = None)
         # Set the path to the ONNX voice file
 
         self.voice_onnx = voice
+        self.speaker = speaker_id
         self.voice_onnx = os.path.join(
         self.parent_destn, str(self.voice_onnx) + '.onnx')
         logger('Using ' + str(self.voice_onnx), verbose=verbose)
@@ -150,7 +152,7 @@ def text_2_audio_file(self, text: str, filename: str, directory: str, format: st
         filepath = os.path.join(directory, f'{filename}.{format}')
 
 
-        out_bin = self.tts_model.synthesize(text,length_scale=1.0, noise_scale=1.0, noise_w=1.0)
+        out_bin = self.tts_model.synthesize(text,length_scale=1.0, noise_scale=1.0, noise_w=1.0, speaker_id=self.speaker)
         with open(filepath, 'wb') as f:
             f.write(out_bin)
 

diff --git a/dimits/ttsmodel.py b/dimits/ttsmodel.py
@@ -100,6 +100,10 @@ def synthesize(
         length_scale = length_scale or self.config.inference.length_scale
         noise_scale = noise_scale or self.config.inference.noise_scale
         noise_w = noise_w or self.config.inference.noise_w
+
+        # Set default speaker
+        if (self.config.num_speakers > 1) and (speaker_id is None):
+            speaker_id = 0
 
         # Convert text to phonemes
         phonemes = self._text_to_phonemes(text)
@@ -152,7 +156,7 @@ def _create_inputs(self, phoneme_ids, speaker_id, noise_scale, length_scale, noi
             "input": phoneme_ids,
             "input_lengths": length, 
             "scales": scales,
-            "speaker": speaker
+            "sid": speaker
         }
 
     def _float_to_int16(self, audio: np.ndarray) -> np.ndarray: