diff --git a/datasets/audio.py b/datasets/audio.py
index df5d5d40..f21a86bf 100644
--- a/datasets/audio.py
+++ b/datasets/audio.py
@@ -24,20 +24,6 @@ def preemphasis(wav, k):
 def inv_preemphasis(wav, k):
 	return signal.lfilter([1], [1, -k], wav)
 
-#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
-def start_and_end_indices(quantized, silence_threshold=2):
-	for start in range(quantized.size):
-		if abs(quantized[start] - 127) > silence_threshold:
-			break
-	for end in range(quantized.size - 1, 1, -1):
-		if abs(quantized[end] - 127) > silence_threshold:
-			break
-
-	assert abs(quantized[start] - 127) > silence_threshold
-	assert abs(quantized[end] - 127) > silence_threshold
-
-	return start, end
-
 def trim_silence(wav, hparams):
 	'''Trim leading and trailing silence
@@ -149,27 +135,6 @@ def _stft(y, hparams):
 def _istft(y, hparams):
 	return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)
 
-def num_frames(length, fsize, fshift):
-	"""Compute number of time frames of spectrogram
-	"""
-	pad = (fsize - fshift)
-	if length % fshift == 0:
-		M = (length + pad * 2 - fsize) // fshift + 1
-	else:
-		M = (length + pad * 2 - fsize) // fshift + 2
-	return M
-
-
-def pad_lr(x, fsize, fshift):
-	"""Compute left and right padding
-	"""
-	M = num_frames(len(x), fsize, fshift)
-	pad = (fsize - fshift)
-	T = len(x) + 2 * pad
-	r = (M - 1) * fshift + fsize - T
-	return pad, pad + r
-
-
 # Conversions
 _mel_basis = None
 _inv_mel_basis = None
diff --git a/synthesize.py b/synthesize.py
index 9417eb39..4bef4e51 100644
--- a/synthesize.py
+++ b/synthesize.py
@@ -1,12 +1,10 @@
 import argparse
 import os
-from warnings import warn
-
 import tensorflow as tf
-
 from hparams import hparams
 from infolog import log
 from tacotron.synthesize import tacotron_synthesize
+from tacotron.utils.symbols import _eos
 
 
 def prepare_run(args):
@@ -21,18 +19,19 @@ def prepare_run(args):
 	wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint)
 	return taco_checkpoint, wave_checkpoint, modified_hp
 
+
 def get_sentences(args):
-	if args.text_list:
+	if args.text:
 		try:
-			f = open(args.text_list)
+			f = open(args.text)
 			lines = f.readlines()
 		except UnicodeDecodeError:
-			f = open(args.text_list, encoding='gbk')
+			f = open(args.text, encoding='gbk')
 			lines = f.readlines()
 		sentences = list(map(lambda l: l.strip(), lines[1::2]))
 	else:
 		sentences = hparams.sentences
-	return sentences
+	return list(map(lambda s: s + _eos, sentences))
 
 
 def main():
@@ -49,7 +48,7 @@ def main():
 	parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
 	parser.add_argument('--mode', default='eval', help=f'mode of run: can be one of {accepted_modes}')
 	parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
-	parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
+	parser.add_argument('--text', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
 	parser.add_argument('--speaker_id', default=0, type=int, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids')
 	args = parser.parse_args()
diff --git a/tacotron/feeder.py b/tacotron/feeder.py
index 6a1992d8..2c380646 100644
--- a/tacotron/feeder.py
+++ b/tacotron/feeder.py
@@ -136,7 +136,8 @@ def make_test_batches(self):
 		# Bucket examples based on similar output sequence length for efficiency
 		batches = [examples[i: i+n] for j in range(0, len(examples), n)]
 		np.random.shuffle(batches)
-		log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
+		end = time.time() - start
+		log(f'Generated {len(batches)} test batches of size {n} in {end:.3f} sec')
 		return batches
 
 	def _enqueue_next_train_group(self):
@@ -153,7 +154,8 @@ def _enqueue_next_train_group(self):
 			# Bucket examples based on similar output sequence length for efficiency
 			batches = [examples[i: i+n] for i in range(0, len(examples), n)]
 			np.random.shuffle(batches)
-			log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
+			end = time.time() - start
+			log(f'Generated {len(batches)} train batches of size {n} in {end:.3f} sec')
 			for batch in batches:
 				feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, self._hparams.outputs_per_step)))
 				self._session.run(self._enqueue_op, feed_dict=feed_dict)
diff --git a/tacotron/models/modules.py b/tacotron/models/modules.py
index f47b33c8..7b40f0bd 100644
--- a/tacotron/models/modules.py
+++ b/tacotron/models/modules.py
@@ -102,7 +102,7 @@ def __call__(self, inputs):
 			x = inputs
 			for i in range(self.enc_conv_num_layers):
 				x = conv1d(x, self.kernel_size, self.channels, self.activation,
-					self.is_training, self.bnorm, 'conv_layer_{}_'.format(i + 1)+self.scope)
+					self.is_training, self.bnorm, f'conv_layer_{i+1}_' + self.scope)
 
 		return x
@@ -175,11 +175,11 @@ def __call__(self, inputs):
 		with tf.variable_scope(self.scope):
 			for i, size in enumerate(self.layers_sizes):
 				dense = tf.layers.dense(x, units=size, activation=self.activation,
-					name='dense_{}'.format(i + 1))
+					name=f'dense_{i+1}')
 
 				#The paper discussed introducing diversity in generation at inference time
 				#by using a dropout of 0.5 only in prenet layers (in both training and inference).
 				x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
-					name='dropout_{}'.format(i + 1) + self.scope)
+					name=f'dropout_{i+1}' + self.scope)
 		return x
@@ -206,7 +206,7 @@ def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None):
 		self.rnn_layers = [ZoneoutLSTMCell(size, is_training,
 			zoneout_factor_cell=zoneout, zoneout_factor_output=zoneout,
-			name='decoder_LSTM_{}'.format(i+1)) for i in range(layers)]
+			name=f'decoder_LSTM_{i+1}') for i in range(layers)]
 
 		self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)
@@ -231,14 +231,14 @@ def __init__(self, shape=80, activation=None, scope=None):
 		self.activation = activation
 		self.scope = 'Linear_projection' if scope is None else scope
-		self.dense = tf.layers.Dense(units=shape, activation=activation, name='projection_{}'.format(self.scope))
+		self.dense = tf.layers.Dense(units=shape, activation=activation, name=f'projection_{self.scope}')
 
 	def __call__(self, inputs):
 		with tf.variable_scope(self.scope):
 			#If activation==None, this returns a simple Linear projection
 			#else the projection will be passed through an activation function
 			# output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
-			# 	name='projection_{}'.format(self.scope))
+			# 	name=f'projection_{self.scope}')
 			output = self.dense(inputs)
 
 			return output
@@ -265,8 +265,7 @@ def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):
 
 	def __call__(self, inputs):
 		with tf.variable_scope(self.scope):
-			output = tf.layers.dense(inputs, units=self.shape,
-				activation=None, name='projection_{}'.format(self.scope))
+			output = tf.layers.dense(inputs, units=self.shape, activation=None, name=f'projection_{self.scope}')
 
 			#During training, don't use activation as it is integrated inside the sigmoid_cross_entropy loss function
 			if self.is_training:
@@ -302,9 +301,9 @@ def __call__(self, inputs):
 			x = inputs
 			for i in range(self.postnet_num_layers - 1):
 				x = conv1d(x, self.kernel_size, self.channels, self.activation,
-					self.is_training, self.bnorm, 'conv_layer_{}_'.format(i + 1)+self.scope)
+					self.is_training, self.bnorm, f'conv_layer_{i+1}_' + self.scope)
 			x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training, self.bnorm,
-				'conv_layer_{}_'.format(5)+self.scope)
+				'conv_layer_5_' + self.scope)
 		return x
diff --git a/tacotron/models/tacotron.py b/tacotron/models/tacotron.py
index ad7a37a7..5a7bbec2 100644
--- a/tacotron/models/tacotron.py
+++ b/tacotron/models/tacotron.py
@@ -140,18 +140,18 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
 		self.mel_targets = mel_targets
 		self.targets_lengths = targets_lengths
 		log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
-		log(' Train mode: {}'.format(is_training))
-		log(' Eval mode: {}'.format(is_evaluating))
-		log(' GTA mode: {}'.format(gta))
-		log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
-		log(' embedding: {}'.format(embedded_inputs.shape))
-		log(' enc conv out: {}'.format(enc_conv_output_shape))
-		log(' encoder out: {}'.format(encoder_outputs.shape))
-		log(' decoder out: {}'.format(decoder_output.shape))
-		log(' residual out: {}'.format(residual.shape))
-		log(' projected residual out: {}'.format(projected_residual.shape))
-		log(' mel out: {}'.format(mel_outputs.shape))
-		log(' out: {}'.format(stop_token_prediction.shape))
+		log(f' Train mode: {is_training}')
+		log(f' Eval mode: {is_evaluating}')
+		log(f' GTA mode: {gta}')
+		log(f' Synthesis mode: {not (is_training or is_evaluating)}')
+		log(f' embedding: {embedded_inputs.shape}')
+		log(f' enc conv out: {enc_conv_output_shape}')
+		log(f' encoder out: {encoder_outputs.shape}')
+		log(f' decoder out: {decoder_output.shape}')
+		log(f' residual out: {residual.shape}')
+		log(f' projected residual out: {projected_residual.shape}')
+		log(f' mel out: {mel_outputs.shape}')
+		log(f' out: {stop_token_prediction.shape}')
 
 
 	def add_loss(self):
diff --git a/tacotron/synthesize.py b/tacotron/synthesize.py
index 5681f864..f6bd82f7 100644
--- a/tacotron/synthesize.py
+++ b/tacotron/synthesize.py
@@ -21,6 +21,7 @@ def run_live(args, checkpoint_path, hparams):
 	log(hparams_debug_string())
 	synth = Synthesizer()
 	synth.load(checkpoint_path, hparams)
+	synth.session_open()
 
 	#Generate fast greeting message
 	greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
@@ -39,6 +40,8 @@ def run_live(args, checkpoint_path, hparams):
 			generate_fast(synth, leave)
 			sleep(2)
 			break
+	synth.session_close()
+
 
 def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
 	eval_dir = os.path.join(output_dir, 'eval')
@@ -56,6 +59,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
 	log(hparams_debug_string())
 	synth = Synthesizer()
 	synth.load(checkpoint_path, hparams)
+	synth.session_open()
 
 	sentences = list(map(lambda s: s.strip(), sentences))
 	delta_size = hparams.tacotron_synthesis_batch_size if hparams.tacotron_synthesis_batch_size < len(sentences) else len(sentences)
@@ -69,6 +73,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
 			audio.save_wav(wav, os.path.join(eval_dir, f'{i:03d}.wav'), hparams)
 		end = time.time() - start
 		log(f'Generated total batch of {delta_size} in {end:.3f} sec')
+	synth.session_close()
 
 
 def run_synthesis(args, checkpoint_path, output_dir, hparams):
@@ -87,6 +92,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
 	log(hparams_debug_string())
 	synth = Synthesizer()
 	synth.load(checkpoint_path, hparams, gta=GTA)
+	synth.session_open()
 
 	speaker_num = len(hparams.anchor_dirs)
 	metadata_groups = [[] for i in range(speaker_num)]
@@ -97,7 +103,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
 		for m in metadata:
 			metadata_groups[int(m[0])].append(m[1:])
 		for i in range(speaker_num):
-			hours = sum([int(x[2]) for x in metadata_groups[i]]) * frame_shift_ms / (3600)
+			hours = sum([int(x[2]) for x in metadata_groups[i]]) * frame_shift_ms / 3600
 			log(f'Loaded {hparams.anchor_dirs[i]} for {len(metadata_groups[i])} examples ({hours:.2f} hours)')
 
 	log('starting synthesis')
@@ -112,6 +118,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
 			basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
 			synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, speaker_id)
 	log(f'synthesized mel spectrograms at {synth_dir}')
+	synth.session_close()
 
 
 def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
diff --git a/tacotron/synthesizer.py b/tacotron/synthesizer.py
index 294f0c3c..90f32d40 100644
--- a/tacotron/synthesizer.py
+++ b/tacotron/synthesizer.py
@@ -37,7 +37,11 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
 		#to avoid any possible conflicts, without affecting the output range of the model too much
 		self._target_pad = -(hparams.max_abs_value + .1) if hparams.symmetric_mels else -0.1
 
+		self.checkpoint_path = checkpoint_path
 		log('Loading checkpoint: %s' % checkpoint_path)
+
+
+	def session_open(self):
 		#Memory allocation on the GPU as needed
 		config = tf.ConfigProto()
 		config.gpu_options.allow_growth = True
@@ -46,7 +50,11 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
 		self.session.run(tf.global_variables_initializer())
 
 		saver = tf.train.Saver()
-		saver.restore(self.session, checkpoint_path)
+		saver.restore(self.session, self.checkpoint_path)
+
+
+	def session_close(self):
+		self.session.close()
 
 
 	def synthesize(self, batch, basenames, out_dir, log_dir, mel_filenames, speaker_id):
diff --git a/tacotron/train.py b/tacotron/train.py
index 5dbfe058..8ba09beb 100644
--- a/tacotron/train.py
+++ b/tacotron/train.py
@@ -97,9 +97,9 @@ def train(log_dir, args, hparams):
 	checkpoint_path = os.path.join(save_dir, 'tacotron_model.ckpt')
 	input_path = os.path.join(args.base_dir, args.input_dir)
-	log('Checkpoint path: {}'.format(checkpoint_path))
-	log('Loading training data from: {}'.format(input_path))
-	log('Using model: {}'.format(args.model))
+	log(f'Checkpoint path: {checkpoint_path}')
+	log(f'Loading training data from: {input_path}')
+	log(f'Using model: {args.model}')
 	log(hparams_debug_string())
 
 	#Start by setting a seed for repeatability
@@ -121,7 +121,7 @@ def train(log_dir, args, hparams):
 	loss_window = ValueWindow(100)
 	saver = tf.train.Saver(max_to_keep=1)
 
-	log('Tacotron training set to a maximum of {} steps'.format(args.tacotron_train_steps))
+	log(f'Tacotron training set to a maximum of {args.tacotron_train_steps} steps')
 
 	#Memory allocation on the GPU as needed
 	config = tf.ConfigProto()
@@ -139,13 +139,13 @@ def train(log_dir, args, hparams):
 			try:
 				checkpoint_state = tf.train.get_checkpoint_state(save_dir)
 				if (checkpoint_state and checkpoint_state.model_checkpoint_path):
-					log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
+					log(f'Loading checkpoint {checkpoint_state.model_checkpoint_path}', slack=True)
 					saver.restore(sess, checkpoint_state.model_checkpoint_path)
 				else:
-					log('No model to load at {}'.format(save_dir), slack=True)
+					log(f'No model to load at {save_dir}', slack=True)
 			except tf.errors.OutOfRangeError as e:
-				log('Cannot restore checkpoint: {}'.format(e), slack=True)
+				log(f'Cannot restore checkpoint: {e}', slack=True)
 
 		else:
 			log('Starting new training!', slack=True)
@@ -158,21 +158,20 @@ def train(log_dir, args, hparams):
 				step, loss, opt = sess.run([global_step, model.loss, model.optimize])
 				time_window.append(time.time() - start_time)
 				loss_window.append(loss)
-				message = 'Step {:7d} [{:.3f} sec/step, loss={:.5f}, avg_loss={:.5f}]'.format(
-					step, time_window.average, loss, loss_window.average)
+				message = f'Step {step:7d} [{time_window.average:.3f} sec/step, loss={loss:.5f}, avg_loss={loss_window.average:.5f}]'
 				log(message, end='\r', slack=(step % args.checkpoint_interval == 0))
 
 				if loss > 100 or np.isnan(loss):
-					log('Loss exploded to {:.5f} at step {}'.format(loss, step))
+					log(f'Loss exploded to {loss:.5f} at step {step}')
 					raise Exception('Loss exploded')
 
 				if step % args.summary_interval == 0:
-					log('\nWriting summary at step {}'.format(step))
+					log(f'Writing summary at step {step}')
 					summary_writer.add_summary(sess.run(stats), step)
 
 				if step % args.eval_interval == 0:
 					#Run eval and save eval stats
-					log('\nRunning evaluation at step {}'.format(step))
+					log(f'Running evaluation at step {step}')
 
 					eval_losses = []
 					before_losses = []
@@ -197,19 +196,19 @@ def train(log_dir, args, hparams):
 					stop_token_loss = sum(stop_token_losses) / len(stop_token_losses)
 					attention_loss = sum(attention_losses) / len(attention_losses)
 
-					log('Saving eval log to {}..'.format(eval_dir))
+					log(f'Saving eval log to {eval_dir}..')
 					# #Save some log to monitor model improvement on same unseen sequence
 					wav = audio.inv_mel_spectrogram(mel_p.T, hparams)
-					audio.save_wav(wav, os.path.join(eval_wav_dir, 'step-{}-eval-waveform-mel.wav'.format(step)), hparams)
+					audio.save_wav(wav, os.path.join(eval_wav_dir, f'step-{step}-eval-waveform-mel.wav'), hparams)
 
-					plot.plot_alignment(align, os.path.join(eval_plot_dir, 'step-{}-eval-align.png'.format(step)),
-						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, eval_loss),
+					plot.plot_alignment(align, os.path.join(eval_plot_dir, f'step-{step}-eval-align.png'),
+						info=f'{args.model}, {time_string()}, step={step}, loss={eval_loss:.5f}',
 						max_len=t_len // hparams.outputs_per_step)
 
-					plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, 'step-{}-eval-mel-spectrogram.png'.format(step)),
-						info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, eval_loss), target_spectrogram=mel_t,
+					plot.plot_spectrogram(mel_p, os.path.join(eval_plot_dir, f'step-{step}-eval-mel-spectrogram.png'),
+						info=f'{args.model}, {time_string()}, step={step}, loss={eval_loss:.5f}', target_spectrogram=mel_t,
 						max_len=t_len)
 
-					log('Eval loss for global step {}: {:.3f}'.format(step, eval_loss))
+					log(f'Eval loss for global step {step}: {eval_loss:.3f}')
 					log('Writing eval summary!')
 					add_eval_stats(summary_writer, step, before_loss, after_loss, stop_token_loss, attention_loss, eval_loss)
@@ -228,27 +227,27 @@ def train(log_dir, args, hparams):
 						])
 
 					#save predicted mel spectrogram to disk (debug)
-					mel_filename = 'mel-prediction-step-{}.npy'.format(step)
+					mel_filename = f'mel-prediction-step-{step}.npy'
 					np.save(os.path.join(mel_dir, mel_filename), mel_prediction.T, allow_pickle=False)
 
 					#save griffin lim inverted wav for debug (mel -> wav)
 					wav = audio.inv_mel_spectrogram(mel_prediction.T, hparams)
-					audio.save_wav(wav, os.path.join(wav_dir, 'step-{}-wave-from-mel.wav'.format(step)), hparams)
+					audio.save_wav(wav, os.path.join(wav_dir, f'step-{step}-wave-from-mel.wav'), hparams)
 
 					#save alignment plot to disk (control purposes)
-					plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)),
-						info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss),
+					plot.plot_alignment(alignment, os.path.join(plot_dir, f'step-{step}-align.png'),
+						info=f'{args.model}, {time_string()}, step={step}, loss={loss:.5f}',
 						max_len=target_length // hparams.outputs_per_step)
 
 					#save real and predicted mel-spectrogram plot to disk (control purposes)
-					plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, 'step-{}-mel-spectrogram.png'.format(step)),
-						info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss), target_spectrogram=target,
+					plot.plot_spectrogram(mel_prediction, os.path.join(plot_dir, f'step-{step}-mel-spectrogram.png'),
+						info=f'{args.model}, {time_string()}, step={step}, loss={loss:.5}', target_spectrogram=target,
 						max_len=target_length)
 
-			log('Tacotron training complete after {} global steps!'.format(args.tacotron_train_steps), slack=True)
+			log(f'Tacotron training complete after {args.tacotron_train_steps} global steps!', slack=True)
 			return save_dir
 
 		except Exception as e:
-			log('Exiting due to exception: {}'.format(e), slack=True)
+			log(f'Exiting due to exception: {e}', slack=True)
 			traceback.print_exc()
 			coord.request_stop(e)
diff --git a/train.py b/train.py
index 03fd6173..1479caa9 100644
--- a/train.py
+++ b/train.py
@@ -9,8 +9,6 @@
 from tacotron.synthesize import tacotron_synthesize
 from tacotron.train import tacotron_train
 
-log = infolog.log
-
 
 def save_seq(file, sequence, input_path):
 	'''Save Tacotron-2 training state to disk. (To skip for future runs)
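Reviewer note (not part of the patch): a minimal sketch of how the refactored Synthesizer is expected to be driven after this change. load() now only builds the graph and records the checkpoint path, while the new session_open()/session_close() own the TensorFlow session, and get_sentences() appends the _eos symbol to every sentence. The checkpoint path, output directory, basename and sentence below are hypothetical placeholders; the synthesize() argument order is taken from the call in run_synthesis() above.

# Sketch only: illustrates the new Synthesizer session lifecycle introduced by this diff.
# The checkpoint path, output directory, basename and sentence are placeholders.
from hparams import hparams
from tacotron.synthesizer import Synthesizer
from tacotron.utils.symbols import _eos

checkpoint_path = 'logs-Tacotron/taco_pretrained/tacotron_model.ckpt'  # hypothetical path
sentences = [s + _eos for s in ['Hello, world.']]  # mirrors the new get_sentences() behaviour

synth = Synthesizer()
synth.load(checkpoint_path, hparams)  # builds the graph and records checkpoint_path
synth.session_open()                  # creates the tf.Session and restores the weights

# synthesize(batch, basenames, out_dir, log_dir, mel_filenames, speaker_id), as called in run_synthesis()
synth.synthesize(sentences, ['sentence_000'], 'output/eval', None, None, 0)

synth.session_close()                 # explicitly releases the session when done

Factoring session management out of load() is what lets run_live(), run_eval() and run_synthesis() share one graph build and close the session explicitly once synthesis finishes.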