T2 synthesis, T+W batch synthesis, fix checkpoints

- Checkpoint loading for Wavenet and Tacotron during Tacotron-2 (T2) synthesis has been fixed - Wavenet time alignments have been fixed permanently for both GTA and ground-truth mels (any remaining issue with this is caused by misuse of the model) - Added batch synthesis for Tacotron (GTA) and Wavenet - Fixed bugs and added missing pieces for Tacotron-2 synthesis.
tuong-olli · Aug 12, 2018 · e24217a · e24217a
1 parent 19abfe8
commit e24217a
Show file tree

Hide file tree

Showing 12 changed files with 329 additions and 118 deletions.
diff --git a/datasets/wavenet_preprocessor.py b/datasets/wavenet_preprocessor.py
@@ -126,7 +126,7 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
 
 	#global condition features
 	if hparams.gin_channels > 0:
-		raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 128 of datasets/wavenet_preprocessor.py to use them during training')
+		raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
 		speaker_id = '<no_g>' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable)
 	else:
 		speaker_id = '<no_g>'

diff --git a/hparams.py b/hparams.py
@@ -142,6 +142,9 @@
 	tacotron_test_batches = 48, #number of test batches (For Ljspeech: 10% ~= 41 batches of 32 samples)
 	tacotron_data_random_state=1234, #random state for train test split repeatability
 
+	#Usually your GPU can handle 16x tacotron_batch_size during synthesis for the same memory amount during training (because no gradients to keep and ops to register for backprop)
+	tacotron_synthesis_batch_size = 32 * 16, #This ensures GTA synthesis goes up to 40x faster than one sample at a time and uses 100% of your GPU computation power.
+
 	tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
 	tacotron_start_decay = 50000, #Step at which learning decay starts
 	tacotron_decay_steps = 50000, #Determines the learning rate decay slope (UNDER TEST)
@@ -183,6 +186,10 @@
 	wavenet_test_batches = None, #number of test batches.
 	wavenet_data_random_state = 1234, #random state for train test split repeatability
 
+	#During synthesis, there is no max_time_steps limitation so the model can sample much longer audio than 8000 steps. (Audio can go up to 500k steps, equivalent to ~21sec on 24kHz)
+	#Usually your GPU can handle 1x~2x wavenet_batch_size during synthesis for the same memory amount during training (because no gradients to keep and ops to register for backprop)
+	wavenet_synthesis_batch_size = 4 * 2, #This ensure that wavenet synthesis goes up to 4x~8x faster when synthesizing multiple sentences. Watch out for OOM with long audios.
+
 	wavenet_learning_rate = 1e-3,
 	wavenet_adam_beta1 = 0.9,
 	wavenet_adam_beta2 = 0.999,
@@ -227,11 +234,10 @@
 	'it appears that oswald had only one caller in response to all of his fpcc activities,',
 	'he relied on the absence of the strychnia.',
 	'scoggins thought it was lighter.',
-	'''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least,
-	and would have possessed so much moral dignity''',
-	'''Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.
-	This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that
-	the adopted architecture is able to perform this task with wild success.''',
+	'would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, and would have possessed so much moral dignity',
+	'Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\
+	This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that\
+	the adopted architecture is able to perform this task with wild success.',
 	'Thank you so much for your support!',
 	]
 

diff --git a/synthesize.py b/synthesize.py
@@ -2,6 +2,8 @@
 import os
 from warnings import warn
 
+import tensorflow as tf
+
 from hparams import hparams
 from infolog import log
 from tacotron.synthesize import tacotron_synthesize
@@ -31,6 +33,8 @@ def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
 	log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model))
 	log('Synthesizing mel-spectrograms from text..')
 	wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
+	#Delete Tacotron model from graph
+	tf.reset_default_graph()
 	log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
 	wavenet_synthesize(args, hparams, wave_checkpoint)
 	log('Tacotron-2 TTS synthesis complete!')
@@ -53,6 +57,7 @@ def main():
 	parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes))
 	parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
 	parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
+	parser.add_argument('--speaker_id', default=None, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids')
 	args = parser.parse_args()
 
 	accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2']

diff --git a/tacotron/synthesize.py b/tacotron/synthesize.py
@@ -60,9 +60,9 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
 	with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
 		for i, text in enumerate(tqdm(sentences)):
 			start = time.time()
-			mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None)
+			mel_filename, speaker_id = synth.synthesize([text], [i+1], eval_dir, log_dir, None)
 
-			file.write('{}|{}\n'.format(text, mel_filename))
+			file.write('{}|{}|{}\n'.format(text, mel_filename[0], speaker_id[0]))
 	log('synthesized mel spectrograms at {}'.format(eval_dir))
 	return eval_dir
 
@@ -90,18 +90,21 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
 		hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
 		log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))
 
+	metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]
+
 	log('starting synthesis')
 	mel_dir = os.path.join(args.input_dir, 'mels')
 	wav_dir = os.path.join(args.input_dir, 'audio')
 	with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
 		for i, meta in enumerate(tqdm(metadata)):
-			text = meta[5]
-			mel_filename = os.path.join(mel_dir, meta[1])
-			wav_filename = os.path.join(wav_dir, meta[0])
-			basename = os.path.basename(mel_filename).replace('.npy', '').replace('mel-', '')
-			mel_output_filename, speaker_id = synth.synthesize(text, basename, synth_dir, None, mel_filename)
-
-			file.write('{}|{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, speaker_id, text))
+			texts = [m[5] for m in meta]
+			mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
+			wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
+			basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
+			mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames)
+
+			for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
+				file.write('|'.join([str(x) for x in elems]) + '\n')
 	log('synthesized mel spectrograms at {}'.format(synth_dir))
 	return os.path.join(synth_dir, 'map.txt')
 

diff --git a/tacotron/synthesizer.py b/tacotron/synthesizer.py
@@ -17,9 +17,9 @@
 class Synthesizer:
 	def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
 		log('Constructing model: %s' % model_name)
-		inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
-		input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
-		targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
+		inputs = tf.placeholder(tf.int32, [None, None], 'inputs')
+		input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths')
+		targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets')
 		with tf.variable_scope('model') as scope:
 			self.model = create_model(model_name, hparams)
 			if gta:
@@ -28,41 +28,58 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
 				self.model.initialize(inputs, input_lengths)
 			self.mel_outputs = self.model.mel_outputs
 			self.linear_outputs = self.model.linear_outputs if (hparams.predict_linear and not gta) else None
-			self.alignment = self.model.alignments[0]
+			self.alignments = self.model.alignments
 
 		self.gta = gta
 		self._hparams = hparams
+		#pad input sequences with the <pad_token> 0 ( _ )
+		self._pad = 0
+		#explicitly setting the padding to a value that doesn't originally exist in the spectrogram
+		#to avoid any possible conflicts, without affecting the output range of the model too much
+		if hparams.symmetric_mels:
+			self._target_pad = -(hparams.max_abs_value + .1)
+		else:
+			self._target_pad = -0.1
 
 		log('Loading checkpoint: %s' % checkpoint_path)
-		self.session = tf.Session()
+		#Memory allocation on the GPU as needed
+		config = tf.ConfigProto()
+		config.gpu_options.allow_growth = True
+
+		self.session = tf.Session(config=config)
 		self.session.run(tf.global_variables_initializer())
+
 		saver = tf.train.Saver()
 		saver.restore(self.session, checkpoint_path)
 
 
-	def synthesize(self, text, index, out_dir, log_dir, mel_filename):
+	def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
 		hparams = self._hparams
 		cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
-		seq = text_to_sequence(text, cleaner_names)
+		seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
+		input_lengths = [len(seq) for seq in seqs]
+		seqs = self._prepare_inputs(seqs)
 		feed_dict = {
-			self.model.inputs: [np.asarray(seq, dtype=np.int32)],
-			self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
+			self.model.inputs: seqs,
+			self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
 		}
 
 		if self.gta:
-			feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)
+			np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
+			target_lengths = [len(np_target) for np_target in np_targets]
+			padded_targets = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
+			feed_dict[self.model.mel_targets] = padded_targets.reshape(len(np_targets), -1, 80)
 
 		if self.gta or not hparams.predict_linear:
-			mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)
+			mels, alignments = self.session.run([self.mel_outputs, self.alignments], feed_dict=feed_dict)
+			if self.gta:
+				mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] #Take off the reduction factor padding frames for time consistency with wavenet
+				assert len(mels) == len(np_targets)
 
 		else:
-			linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict)
-			linear = linear.reshape(-1, hparams.num_freq)
-
-		mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out
+			linears, mels, alignments = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments], feed_dict=feed_dict)
 
-
-		if index is None:
+		if basenames is None:
 			#Generate wav and read it
 			wav = audio.inv_mel_spectrogram(mels.T, hparams)
 			audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way
@@ -86,34 +103,64 @@ def synthesize(self, text, index, out_dir, log_dir, mel_filename):
 			return
 
 
-		#Get speaker id for global conditioning (only used with GTA generally)
-		if hparams.gin_channels > 0:
-			raise RuntimeError('Please set the speaker_id rule in line 89 of tacotron/synthesizer.py to allow for global condition usage later.')
-			speaker_id = '<no_g>' #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "index" variable)
-		else:
-			speaker_id = '<no_g>'
+		saved_mels_paths = []
+		speaker_ids = []
+		for i, mel in enumerate(mels):
+			#Get speaker id for global conditioning (only used with GTA generally)
+			if hparams.gin_channels > 0:
+				raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.')
+				speaker_id = '<no_g>' #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
+				speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
+			else:
+				speaker_id = '<no_g>'
+				speaker_ids.append(speaker_id)
 
-		# Write the spectrogram to disk
-		# Note: outputs mel-spectrogram files and target ones have same names, just different folders
-		mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(index))
-		np.save(mel_filename, mels, allow_pickle=False)
+			# Write the spectrogram to disk
+			# Note: outputs mel-spectrogram files and target ones have same names, just different folders
+			mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
+			np.save(mel_filename, mel, allow_pickle=False)
+			saved_mels_paths.append(mel_filename)
 
-		if log_dir is not None:
-			#save wav (mel -> wav)
-			wav = audio.inv_mel_spectrogram(mels.T, hparams)
-			audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(index)), sr=hparams.sample_rate)
+			if log_dir is not None:
+				#save wav (mel -> wav)
+				wav = audio.inv_mel_spectrogram(mel.T, hparams)
+				audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate)
+
+				#save alignments
+				plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
+					info='{}'.format(texts[i]), split_title=True)
+
+				#save mel spectrogram plot
+				plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
+					info='{}'.format(texts[i]), split_title=True)
+
+				if hparams.predict_linear:
+					#save wav (linear -> wav)
+					wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
+					audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate)
+
+					#save linear spectrogram plot
+					plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
+						info='{}'.format(texts[i]), split_title=True, auto_aspect=True)
+
+
+
+		return saved_mels_paths, speaker_ids
+
+	def _round_up(self, x, multiple):
+		remainder = x % multiple
+		return x if remainder == 0 else x + multiple - remainder
 
-			if hparams.predict_linear:
-				#save wav (linear -> wav)
-				wav = audio.inv_linear_spectrogram(linear.T, hparams)
-				audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(index)), sr=hparams.sample_rate)
+	def _prepare_inputs(self, inputs):
+		max_len = max([len(x) for x in inputs])
+		return np.stack([self._pad_input(x, max_len) for x in inputs])
 
-			#save alignments
-			plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/alignment-{}.png'.format(index)),
-				info='{}'.format(text), split_title=True)
+	def _pad_input(self, x, length):
+		return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad)
 
-			#save mel spectrogram plot
-			plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/mel-{}.png'.format(index)),
-				info='{}'.format(text), split_title=True)
+	def _prepare_targets(self, targets, alignment):
+		max_len = max([len(t) for t in targets])
+		return np.stack([self._pad_target(t, self._round_up(max_len, alignment)) for t in targets])
 
-		return mel_filename, speaker_id
+	def _pad_target(self, t, length):
+		return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad)
diff --git a/tacotron/train.py b/tacotron/train.py
@@ -157,7 +157,7 @@ def train(log_dir, args, hparams):
 
 					if (checkpoint_state and checkpoint_state.model_checkpoint_path):
 						log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
-						saver.restore(sess, checkpoint_state.model_checkpoint_path)
+						load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
 					else:
 						log('No model to load at {}'.format(save_dir), slack=True)
 

diff --git a/tacotron/utils/plot.py b/tacotron/utils/plot.py
@@ -40,7 +40,7 @@ def plot_alignment(alignment, path, info=None, split_title=False, max_len=None):
 	plt.close()
 
 
-def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None):
+def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
 	if max_len is not None:
 		target_spectrogram = target_spectrogram[:max_len]
 		pred_spectrogram = pred_spectrogram[:max_len]
@@ -60,14 +60,20 @@ def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, targe
 		ax1 = fig.add_subplot(311)
 		ax2 = fig.add_subplot(312)
 
-		im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
+		if auto_aspect:
+			im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
+		else:
+			im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
 		ax1.set_title('Target Mel-Spectrogram')
 		fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
 		ax2.set_title('Predicted Mel-Spectrogram')
 	else:
 		ax2 = fig.add_subplot(211)
 
-	im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
+	if auto_aspect:
+		im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
+	else:
+		im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
 	fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)
 
 	plt.tight_layout()