Add EOS at the end of each sentence
Signed-off-by: begeekmyfriend <[email protected]>
begeekmyfriend committed Oct 25, 2019
1 parent 22ba511 commit 9c343a5
Showing 9 changed files with 75 additions and 98 deletions.
35 changes: 0 additions & 35 deletions datasets/audio.py
@@ -24,20 +24,6 @@ def preemphasis(wav, k):
def inv_preemphasis(wav, k):
return signal.lfilter([1], [1, -k], wav)

#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break

assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold

return start, end
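
For the record, the removed helper scanned 8-bit quantized (mu-law) samples, where silence sits at 127, for the first and last indices exceeding the threshold. A toy sketch of its behavior:

import numpy as np
quantized = np.array([127, 127, 131, 127, 120, 127])
# |131 - 127| = 4 > 2 at index 2; |120 - 127| = 7 > 2 at index 4
# start_and_end_indices(quantized, silence_threshold=2) -> (2, 4)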

def trim_silence(wav, hparams):
'''Trim leading and trailing silence
@@ -149,27 +135,6 @@ def _stft(y, hparams):
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M


def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
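
These two removed helpers computed WaveNet-style framing; a worked pass through the arithmetic, assuming fsize=1024, fshift=256 and 1 s of 16 kHz audio (values illustrative):

length = 16000                            # 16000 % 256 = 128, so the else branch applies
pad = 1024 - 256                          # 768
M = (16000 + 2 * 768 - 1024) // 256 + 2   # 64 + 2 = 66 frames
T = 16000 + 2 * 768                       # 17536
r = (66 - 1) * 256 + 1024 - 17536         # 128
# pad_lr(x, 1024, 256) -> (768, 896), i.e. pad 768 samples left and 768 + 128 right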


# Conversions
_mel_basis = None
_inv_mel_basis = None
15 changes: 7 additions & 8 deletions synthesize.py
@@ -1,12 +1,10 @@
import argparse
import os
from warnings import warn

import tensorflow as tf

from hparams import hparams
from infolog import log
from tacotron.synthesize import tacotron_synthesize
from tacotron.utils.symbols import _eos


def prepare_run(args):
@@ -21,18 +19,19 @@ def prepare_run(args):
wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint)
return taco_checkpoint, wave_checkpoint, modified_hp


def get_sentences(args):
if args.text_list:
if args.text:
try:
f = open(args.text_list)
f = open(args.text)
lines = f.readlines()
except UnicodeDecodeError:
f = open(args.text_list, encoding='gbk')
f = open(args.text, encoding='gbk')
lines = f.readlines()
sentences = list(map(lambda l: l.strip(), lines[1::2]))
else:
sentences = hparams.sentences
return sentences
return list(map(lambda s: s + _eos, sentences))
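
This function is the heart of the commit: every sentence now ends with the EOS symbol, giving the model an explicit stop marker. A minimal sketch of what the updated get_sentences() yields, assuming the common Tacotron convention _eos == '~' and a text file that alternates id lines with sentence lines (both assumptions, not shown in this diff):

lines = ['000001\n', 'hello there\n', '000002\n', 'how are you\n']
sentences = [l.strip() for l in lines[1::2]]  # every second line holds the text
sentences = [s + '~' for s in sentences]      # append EOS, as _eos would
# -> ['hello there~', 'how are you~']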


def main():
@@ -49,7 +48,7 @@ def main():
parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
parser.add_argument('--mode', default='eval', help=f'mode of run: can be one of {accepted_modes}')
parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
parser.add_argument('--text', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
parser.add_argument('--speaker_id', default=0, type=int, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids')
args = parser.parse_args()
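
With the flag renamed from --text_list to --text, an eval run would be invoked along these lines (file names illustrative, other flags left at their defaults):

python synthesize.py --mode eval --text sentences.txt --output_dir output/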

6 changes: 4 additions & 2 deletions tacotron/feeder.py
@@ -136,7 +136,8 @@ def make_test_batches(self):
# Bucket examples based on similar output sequence length for efficiency
batches = [examples[i: i+n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)
log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
end = time.time() - start
log(f'Generated {len(batches)} test batches of size {n} in {end:.3f} sec')
return batches

def _enqueue_next_train_group(self):
@@ -153,7 +154,8 @@ def _enqueue_next_train_group(self):
# Bucket examples based on similar output sequence length for efficiency
batches = [examples[i: i+n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)
log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
end = time.time() - start
log(f'Generated {len(batches)} train batches of size {n} in {end:.3f} sec')
for batch in batches:
feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, self._hparams.outputs_per_step)))
self._session.run(self._enqueue_op, feed_dict=feed_dict)
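
Both hunks share the same bucketing idiom: length-sorted examples are chunked into fixed-size batches and only the batch order is shuffled, so each batch keeps examples of similar length. A standalone sketch with toy data and an assumed batch size:

import numpy as np
examples = ['a', 'cc', 'ee', 'bbb', 'dddd', 'ffffff']  # already sorted by length
n = 2                                                  # assumed batch size
batches = [examples[i: i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)  # shuffle across batches, never within one
# e.g. [['bbb', 'dddd'], ['a', 'cc'], ['ee', 'ffffff']]
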
19 changes: 9 additions & 10 deletions tacotron/models/modules.py
@@ -102,7 +102,7 @@ def __call__(self, inputs):
x = inputs
for i in range(self.enc_conv_num_layers):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
self.is_training, self.bnorm, 'conv_layer_{}_'.format(i + 1)+self.scope)
self.is_training, self.bnorm, f'conv_layer_{i+1}_' + self.scope)
return x


@@ -175,11 +175,11 @@ def __call__(self, inputs):
with tf.variable_scope(self.scope):
for i, size in enumerate(self.layers_sizes):
dense = tf.layers.dense(x, units=size, activation=self.activation,
name='dense_{}'.format(i + 1))
name=f'dense_{i+1}')
#The paper discussed introducing diversity in generation at inference time
#by using a dropout of 0.5 only in prenet layers (in both training and inference).
x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
name='dropout_{}'.format(i + 1) + self.scope)
name=f'dropout_{i+1}' + self.scope)
return x


@@ -206,7 +206,7 @@ def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None):
self.rnn_layers = [ZoneoutLSTMCell(size, is_training,
zoneout_factor_cell=zoneout,
zoneout_factor_output=zoneout,
name='decoder_LSTM_{}'.format(i+1)) for i in range(layers)]
name=f'decoder_LSTM_{i+1}') for i in range(layers)]

self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)

@@ -231,14 +231,14 @@ def __init__(self, shape=80, activation=None, scope=None):
self.activation = activation

self.scope = 'Linear_projection' if scope is None else scope
self.dense = tf.layers.Dense(units=shape, activation=activation, name='projection_{}'.format(self.scope))
self.dense = tf.layers.Dense(units=shape, activation=activation, name=f'projection_{self.scope}')

def __call__(self, inputs):
with tf.variable_scope(self.scope):
#If activation==None, this returns a simple Linear projection
#else the projection will be passed through an activation function
# output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
# name='projection_{}'.format(self.scope))
# name=f'projection_{self.scope}')
output = self.dense(inputs)

return output
@@ -265,8 +265,7 @@ def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):

def __call__(self, inputs):
with tf.variable_scope(self.scope):
output = tf.layers.dense(inputs, units=self.shape,
activation=None, name='projection_{}'.format(self.scope))
output = tf.layers.dense(inputs, units=self.shape, activation=None, name=f'projection_{self.scope}')

#During training, don't use activation as it is integrated inside the sigmoid_cross_entropy loss function
if self.is_training:
@@ -302,9 +301,9 @@ def __call__(self, inputs):
x = inputs
for i in range(self.postnet_num_layers - 1):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
self.is_training, self.bnorm, 'conv_layer_{}_'.format(i + 1)+self.scope)
self.is_training, self.bnorm, f'conv_layer_{i+1}_' + self.scope)
x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training, self.bnorm,
'conv_layer_{}_'.format(5)+self.scope)
'conv_layer_5_' + self.scope)
return x


24 changes: 12 additions & 12 deletions tacotron/models/tacotron.py
@@ -140,18 +140,18 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
self.mel_targets = mel_targets
self.targets_lengths = targets_lengths
log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
log(' Train mode: {}'.format(is_training))
log(' Eval mode: {}'.format(is_evaluating))
log(' GTA mode: {}'.format(gta))
log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
log(' embedding: {}'.format(embedded_inputs.shape))
log(' enc conv out: {}'.format(enc_conv_output_shape))
log(' encoder out: {}'.format(encoder_outputs.shape))
log(' decoder out: {}'.format(decoder_output.shape))
log(' residual out: {}'.format(residual.shape))
log(' projected residual out: {}'.format(projected_residual.shape))
log(' mel out: {}'.format(mel_outputs.shape))
log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
log(f' Train mode: {is_training}')
log(f' Eval mode: {is_evaluating}')
log(f' GTA mode: {gta}')
log(f' Synthesis mode: {not (is_training or is_evaluating)}')
log(f' embedding: {embedded_inputs.shape}')
log(f' enc conv out: {enc_conv_output_shape}')
log(f' encoder out: {encoder_outputs.shape}')
log(f' decoder out: {decoder_output.shape}')
log(f' residual out: {residual.shape}')
log(f' projected residual out: {projected_residual.shape}')
log(f' mel out: {mel_outputs.shape}')
log(f' <stop_token> out: {stop_token_prediction.shape}')


def add_loss(self):
9 changes: 8 additions & 1 deletion tacotron/synthesize.py
@@ -21,6 +21,7 @@ def run_live(args, checkpoint_path, hparams):
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth.session_open()

#Generate fast greeting message
greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
@@ -39,6 +40,8 @@ def run_live(args, checkpoint_path, hparams):
generate_fast(synth, leave)
sleep(2)
break
synth.session_close()


def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
eval_dir = os.path.join(output_dir, 'eval')
@@ -56,6 +59,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth.session_open()

sentences = list(map(lambda s: s.strip(), sentences))
delta_size = hparams.tacotron_synthesis_batch_size if hparams.tacotron_synthesis_batch_size < len(sentences) else len(sentences)
@@ -69,6 +73,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
audio.save_wav(wav, os.path.join(eval_dir, f'{i:03d}.wav'), hparams)
end = time.time() - start
log(f'Generated total batch of {delta_size} in {end:.3f} sec')
synth.session_close()


def run_synthesis(args, checkpoint_path, output_dir, hparams):
@@ -87,6 +92,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams, gta=GTA)
synth.session_open()

speaker_num = len(hparams.anchor_dirs)
metadata_groups = [[] for i in range(speaker_num)]
@@ -97,7 +103,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
for m in metadata:
metadata_groups[int(m[0])].append(m[1:])
for i in range(speaker_num):
hours = sum([int(x[2]) for x in metadata_groups[i]]) * frame_shift_ms / (3600)
hours = sum([int(x[2]) for x in metadata_groups[i]]) * frame_shift_ms / 3600
log(f'Loaded {hparams.anchor_dirs[i]} for {len(metadata_groups[i])} examples ({hours:.2f} hours)')
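
A note on the cleaned-up line: despite the _ms suffix, frame_shift_ms must hold seconds per frame for the division by 3600 to yield hours. Assuming it is hop_size / sample_rate with a hop of 200 samples at 16 kHz (values not from this diff):

frame_shift = 200 / 16000            # 0.0125 s per frame
hours = 288000 * frame_shift / 3600  # 288,000 frames -> 3600 s -> 1.0 h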

log('starting synthesis')
@@ -112,6 +118,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, speaker_id)
log(f'synthesized mel spectrograms at {synth_dir}')
synth.session_close()


def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
10 changes: 9 additions & 1 deletion tacotron/synthesizer.py
@@ -37,7 +37,11 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
#to avoid any possible conflicts, without affecting the output range of the model too much
self._target_pad = -(hparams.max_abs_value + .1) if hparams.symmetric_mels else -0.1

self.checkpoint_path = checkpoint_path
log('Loading checkpoint: %s' % checkpoint_path)


def session_open(self):
#Memory allocation on the GPU as needed
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
@@ -46,7 +50,11 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
self.session.run(tf.global_variables_initializer())

saver = tf.train.Saver()
saver.restore(self.session, checkpoint_path)
saver.restore(self.session, self.checkpoint_path)


def session_close(self):
self.session.close()


def synthesize(self, batch, basenames, out_dir, log_dir, mel_filenames, speaker_id):
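Taken together, splitting session_open()/session_close() out of load() implies the lifecycle below; the run_eval() and run_synthesis() hunks above follow exactly this pattern (a sketch only, argument values illustrative):

synth = Synthesizer()
synth.load(checkpoint_path, hparams)  # build the graph, remember the checkpoint path
synth.session_open()                  # create the session and restore the weights
synth.synthesize(texts, basenames, out_dir, log_dir, mel_filenames, speaker_id)
synth.session_close()                 # release the session explicitly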
