v2.2

del18687058912 · May 6, 2022 · ffd0238 · ffd0238
1 parent b327be5
commit ffd0238
Show file tree

Hide file tree

Showing 32 changed files with 77 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -9,6 +9,11 @@ This repo contains all the code needed to run Tortoise TTS in inference mode.
 
 ### New features
 
+#### v2.2; 2022/5/5
+- Added several new voices from the training set.
+- Automated redaction. Wrap text in brackets if you want it to prompt the model but not be spoken.
+- Bug fixes
+
 #### v2.1; 2022/5/2
 - Added ability to produce totally random voices.
 - Added ability to download voice conditioning latent via a script, and then use a user-provided conditioning latent.
@@ -95,11 +100,9 @@ For the those in the ML space: this is created by projecting a random vector ont
 
 ### Provided voices
 
-This repo comes with several pre-packaged voices. You will be familiar with many of them. :)
-
-Most of the provided voices were not found in the training set. Experimentally, it seems that voices from the training set
-produce more realistic outputs then those outside of the training set. Any voice prepended with "train" came from the
-training set.
+This repo comes with several pre-packaged voices. Voices prepended with "train_" came from the training set and perform
+far better than the others. If your goal is high quality speech, I recommend you pick one of them. If you want to see
+what Tortoise can do for zero-shot mimicking, take a look at the others.
 
 ### Adding a new voice
 

diff --git a/examples/prompting/angry.mp3 b/examples/prompting/angry.mp3
diff --git a/examples/prompting/happy.mp3 b/examples/prompting/happy.mp3
diff --git a/examples/prompting/sad.mp3 b/examples/prompting/sad.mp3
diff --git a/examples/prompting/scared.mp3 b/examples/prompting/scared.mp3
diff --git a/examples/various/desktop.ini b/examples/various/desktop.ini
diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 setuptools.setup(
     name="TorToiSe",
     packages=setuptools.find_packages(),
-    version="2.1.3",
+    version="2.2.0",
     author="James Betker",
     author_email="[email protected]",
     description="A high quality multi-voice text-to-speech library",

diff --git a/tortoise/models/vocoder.py b/tortoise/models/vocoder.py
@@ -284,8 +284,6 @@ def eval(self, inference=False):
             self.remove_weight_norm()
 
     def remove_weight_norm(self):
-        print('Removing weight norm...')
-
         nn.utils.remove_weight_norm(self.conv_pre)
 
         for layer in self.conv_post:

diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
@@ -137,7 +137,7 @@ def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
         self.stft_fn = STFT(filter_length, hop_length, win_length)
         from librosa.filters import mel as librosa_mel_fn
         mel_basis = librosa_mel_fn(
-            sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
+            sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)
         mel_basis = torch.from_numpy(mel_basis).float()
         self.register_buffer('mel_basis', mel_basis)
 

diff --git a/tortoise/utils/wav2vec_alignment.py b/tortoise/utils/wav2vec_alignment.py
@@ -66,7 +66,7 @@ def align(self, audio, expected_text, audio_sample_rate=24000):
         logits = logits[0]
         pred_string = self.tokenizer.decode(logits.argmax(-1).tolist())
 
-        fixed_expectation = max_alignment(expected_text, pred_string)
+        fixed_expectation = max_alignment(expected_text.lower(), pred_string)
         w2v_compression = orig_len // logits.shape[0]
         expected_tokens = self.tokenizer.encode(fixed_expectation)
         expected_chars = list(fixed_expectation)
@@ -100,7 +100,10 @@ def pop_till_you_win():
                     break
 
         pop_till_you_win()
-        assert len(expected_tokens) == 0, "This shouldn't happen. My coding sucks."
+        if not (len(expected_tokens) == 0 and len(alignments) == len(expected_text)):
+            torch.save([audio, expected_text], 'alignment_debug.pth')
+            assert False, "Something went wrong with the alignment algorithm. I've dumped a file, 'alignment_debug.pth' to" \
+                          "your current working directory. Please report this along with the file so it can get fixed."
 
         # Now fix up alignments. Anything with -1 should be interpolated.
         alignments.append(orig_len)  # This'll get removed but makes the algorithm below more readable.

diff --git a/tortoise/voices/applejack/1.wav b/tortoise/voices/applejack/1.wav
diff --git a/tortoise/voices/applejack/2.wav b/tortoise/voices/applejack/2.wav
diff --git a/tortoise/voices/applejack/3.wav b/tortoise/voices/applejack/3.wav
diff --git a/tortoise/voices/rainbow/1.wav b/tortoise/voices/rainbow/1.wav
diff --git a/tortoise/voices/rainbow/2.wav b/tortoise/voices/rainbow/2.wav
diff --git a/tortoise/voices/rainbow/3.wav b/tortoise/voices/rainbow/3.wav
diff --git a/tortoise/voices/train_daws/1.mp3 b/tortoise/voices/train_daws/1.mp3
diff --git a/tortoise/voices/train_daws/2.mp3 b/tortoise/voices/train_daws/2.mp3
diff --git a/tortoise/voices/train_daws/3.mp3 b/tortoise/voices/train_daws/3.mp3
diff --git a/tortoise/voices/train_dreams/1.mp3 b/tortoise/voices/train_dreams/1.mp3
diff --git a/tortoise/voices/train_dreams/2.mp3 b/tortoise/voices/train_dreams/2.mp3
diff --git a/tortoise/voices/train_dreams/3.mp3 b/tortoise/voices/train_dreams/3.mp3
diff --git a/tortoise/voices/train_empire/1.mp3 b/tortoise/voices/train_empire/1.mp3
diff --git a/tortoise/voices/train_empire/2.mp3 b/tortoise/voices/train_empire/2.mp3
diff --git a/tortoise/voices/train_empire/3.mp3 b/tortoise/voices/train_empire/3.mp3
diff --git a/tortoise/voices/train_mouse/1.mp3 b/tortoise/voices/train_mouse/1.mp3
diff --git a/tortoise/voices/train_mouse/2.mp3 b/tortoise/voices/train_mouse/2.mp3
diff --git a/tortoise/voices/train_mouse/3.mp3 b/tortoise/voices/train_mouse/3.mp3
diff --git a/tortoise/voices/yannic/00045.mp3 b/tortoise/voices/yannic/00045.mp3
diff --git a/tortoise/voices/yannic/00055.mp3 b/tortoise/voices/yannic/00055.mp3
diff --git a/tortoise/voices/yannic/00203.mp3 b/tortoise/voices/yannic/00203.mp3
diff --git a/tortoise_v2_examples.html b/tortoise_v2_examples.html