Skip to content

Commit

Permalink
GTA synthesis
Browse files Browse the repository at this point in the history
Signed-off-by: begeekmyfriend <[email protected]>
  • Loading branch information
begeekmyfriend committed Apr 28, 2019
1 parent 6f5fe3f commit 1ec3de8
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 19 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ logs will be stored inside **logs-Wavenet**.

**Note:**
- If model argument is not provided, training will default to Tacotron-2 model training. (both models)
- Please refer to train arguments under [train.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/train.py) for a set of options you can use.
- Please refer to train arguments under [train.py](https://github.com/begeekmyfriend/Tacotron-2/blob/master/train.py) for a set of options you can use.
- It is now possible to make wavenet preprocessing alone using **wavenet_preprocess.py**.

# Synthesis
Expand Down Expand Up @@ -175,7 +175,7 @@ Synthesizing the **waveforms** conditioned on previously synthesized Mel-spectr
**Note:**
- If model argument is not provided, synthesis will default to Tacotron-2 model synthesis. (End-to-End TTS)
- Please refer to synthesis arguments under [synthesize.py](https://github.com/Rayhane-mamah/Tacotron-2/blob/master/synthesize.py) for a set of options you can use.
- Please refer to synthesis arguments under [synthesize.py](https://github.com/begeekmyfriend/Tacotron-2/blob/master/synthesize.py) for a set of options you can use.


# References and Resources:
Expand Down
2 changes: 1 addition & 1 deletion hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
tacotron_batch_size = 48, #number of training samples on each training steps
#Tacotron Batch synthesis supports ~16x the training batch size (no gradients during testing).
#Training Tacotron with unmasked paddings makes it aware of them, which makes synthesis times different from training. We thus recommend masking the encoder.
tacotron_synthesis_batch_size = 32 * 16, #DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
tacotron_synthesis_batch_size = 1, #DO NOT MAKE THIS BIGGER THAN 1 IF YOU DIDN'T TRAIN TACOTRON WITH "mask_encoder=True"!!
tacotron_test_size = 0.05, #% of data to keep as test data, if None, tacotron_test_batches must be not None. (5% is enough to have a good idea about overfit)
tacotron_test_batches = None, #number of test batches.

Expand Down
32 changes: 16 additions & 16 deletions tacotron/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
target_lengths = [len(np_target) for np_target in np_targets]
padded_targets = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
feed_dict[self.model.mel_targets] = padded_targets.reshape(len(np_targets), -1, 80)
feed_dict[self.model.mel_targets] = padded_targets.reshape(len(np_targets), -1, hparams.num_mels)

if self.gta or not hparams.predict_linear:
mels, alignments = self.session.run([self.mel_outputs, self.alignments], feed_dict=feed_dict)
Expand Down Expand Up @@ -118,37 +118,37 @@ def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
speaker_ids = []
for i, mel in enumerate(mels):
#Get speaker id for global conditioning (only used with GTA generally)
# speaker_id = '<no_g>'
# speaker_ids.append(speaker_id)
speaker_id = '<no_g>'
speaker_ids.append(speaker_id)

# Write the spectrogram to disk
# Note: outputs mel-spectrogram files and target ones have same names, just different folders
# mel_filename = os.path.join(out_dir, 'mel-{:03d}.npy'.format(basenames[i]))
# np.save(mel_filename, mel, allow_pickle=False)
# saved_mels_paths.append(mel_filename)
mel_filename = os.path.join(out_dir, '{}.npy'.format(basenames[i]))
np.save(mel_filename, mel, allow_pickle=False)
saved_mels_paths.append(mel_filename)

if log_dir is not None:
#save wav (mel -> wav)
# wav = audio.inv_mel_spectrogram(mel.T, hparams)
# audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{:03d}-mel.wav'.format(basenames[i])), hparams)
wav = audio.inv_mel_spectrogram(mel.T, hparams)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), hparams)

#save alignments
# plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{:03d}.png'.format(basenames[i])),
# info='{}'.format(texts[i]), split_title=True)
plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
info='{}'.format(texts[i]), split_title=True)

#save mel spectrogram plot
# plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{:03d}.png'.format(basenames[i])),
# info='{}'.format(texts[i]), split_title=True)
plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
info='{}'.format(texts[i]), split_title=True)

if hparams.predict_linear:
if hparams.predict_linear and not self.gta:
#save wav (linear -> wav)
linear_wav = self.session.run(self.linear_wav_outputs, feed_dict={self.linear_spectrograms: linears[i]})
wav = audio.inv_preemphasis(linear_wav, hparams.preemphasis)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{:03d}-linear.wav'.format(i)), hparams)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(i)), hparams)

#save mel spectrogram plot
# plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{:03d}.png'.format(basenames[i])),
# info='{}'.format(texts[i]), split_title=True, auto_aspect=True)
plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
info='{}'.format(texts[i]), split_title=True, auto_aspect=True)

return saved_mels_paths, speaker_ids

Expand Down

0 comments on commit 1ec3de8

Please sign in to comment.