Skip to content

Commit

Permalink
T2 synthesis, T+W batch synthesis, fix checkpoints
Browse files Browse the repository at this point in the history
- Checkpoints loading for Wavenet and Tacotron during T2 synthesis has been fixed
- Wavenet time alignments have been fixed permanently for both GTA and Ground Truth mels (any issue with this is caused by a misuse of the model)
- Add batch synthesis for Tacotron (GTA) and Wavenet
- Fix bugs and add missing pieces for Tacotron-2 synthesis.
  • Loading branch information
Rayhane-mamah authored Aug 12, 2018
1 parent 19abfe8 commit e24217a
Show file tree
Hide file tree
Showing 12 changed files with 329 additions and 118 deletions.
2 changes: 1 addition & 1 deletion datasets/wavenet_preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):

#global condition features
if hparams.gin_channels > 0:
raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 128 of datasets/wavenet_preprocessor.py to use them during training')
raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
speaker_id = '<no_g>' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable)
else:
speaker_id = '<no_g>'
Expand Down
16 changes: 11 additions & 5 deletions hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@
tacotron_test_batches = 48, #number of test batches (For Ljspeech: 10% ~= 41 batches of 32 samples)
tacotron_data_random_state=1234, #random state for train test split repeatability

#Usually your GPU can handle 16x tacotron_batch_size during synthesis for the same memory amount during training (because no gradients to keep and ops to register for backprop)
tacotron_synthesis_batch_size = 32 * 16, #This ensures GTA synthesis goes up to 40x faster than one sample at a time and uses 100% of your GPU computation power.

tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
tacotron_start_decay = 50000, #Step at which learning decay starts
tacotron_decay_steps = 50000, #Determines the learning rate decay slope (UNDER TEST)
Expand Down Expand Up @@ -183,6 +186,10 @@
wavenet_test_batches = None, #number of test batches.
wavenet_data_random_state = 1234, #random state for train test split repeatability

#During synthesis, there is no max_time_steps limitation so the model can sample much longer audio than 8000 steps. (Audio can go up to 500k steps, equivalent to ~21sec on 24kHz)
#Usually your GPU can handle 1x~2x wavenet_batch_size during synthesis for the same memory amount during training (because no gradients to keep and ops to register for backprop)
wavenet_synthesis_batch_size = 4 * 2, #This ensure that wavenet synthesis goes up to 4x~8x faster when synthesizing multiple sentences. Watch out for OOM with long audios.

wavenet_learning_rate = 1e-3,
wavenet_adam_beta1 = 0.9,
wavenet_adam_beta2 = 0.999,
Expand Down Expand Up @@ -227,11 +234,10 @@
'it appears that oswald had only one caller in response to all of his fpcc activities,',
'he relied on the absence of the strychnia.',
'scoggins thought it was lighter.',
'''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least,
and would have possessed so much moral dignity''',
'''Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.
This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that
the adopted architecture is able to perform this task with wild success.''',
'would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, and would have possessed so much moral dignity',
'Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.\
This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that\
the adopted architecture is able to perform this task with wild success.',
'Thank you so much for your support!',
]

Expand Down
5 changes: 5 additions & 0 deletions synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import os
from warnings import warn

import tensorflow as tf

from hparams import hparams
from infolog import log
from tacotron.synthesize import tacotron_synthesize
Expand Down Expand Up @@ -31,6 +33,8 @@ def synthesize(args, hparams, taco_checkpoint, wave_checkpoint, sentences):
log('Running End-to-End TTS Evaluation. Model: {}'.format(args.name or args.model))
log('Synthesizing mel-spectrograms from text..')
wavenet_in_dir = tacotron_synthesize(args, hparams, taco_checkpoint, sentences)
#Delete Tacotron model from graph
tf.reset_default_graph()
log('Synthesizing audio from mel-spectrograms.. (This may take a while)')
wavenet_synthesize(args, hparams, wave_checkpoint)
log('Tacotron-2 TTS synthesis complete!')
Expand All @@ -53,6 +57,7 @@ def main():
parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes))
parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
parser.add_argument('--speaker_id', default=None, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids')
args = parser.parse_args()

accepted_models = ['Tacotron', 'WaveNet', 'Tacotron-2']
Expand Down
21 changes: 12 additions & 9 deletions tacotron/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
with open(os.path.join(eval_dir, 'map.txt'), 'w') as file:
for i, text in enumerate(tqdm(sentences)):
start = time.time()
mel_filename = synth.synthesize(text, i+1, eval_dir, log_dir, None)
mel_filename, speaker_id = synth.synthesize([text], [i+1], eval_dir, log_dir, None)

file.write('{}|{}\n'.format(text, mel_filename))
file.write('{}|{}|{}\n'.format(text, mel_filename[0], speaker_id[0]))
log('synthesized mel spectrograms at {}'.format(eval_dir))
return eval_dir

Expand Down Expand Up @@ -90,18 +90,21 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]

log('starting synthesis')
mel_dir = os.path.join(args.input_dir, 'mels')
wav_dir = os.path.join(args.input_dir, 'audio')
with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
for i, meta in enumerate(tqdm(metadata)):
text = meta[5]
mel_filename = os.path.join(mel_dir, meta[1])
wav_filename = os.path.join(wav_dir, meta[0])
basename = os.path.basename(mel_filename).replace('.npy', '').replace('mel-', '')
mel_output_filename, speaker_id = synth.synthesize(text, basename, synth_dir, None, mel_filename)

file.write('{}|{}|{}|{}|{}\n'.format(wav_filename, mel_filename, mel_output_filename, speaker_id, text))
texts = [m[5] for m in meta]
mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames)

for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
file.write('|'.join([str(x) for x in elems]) + '\n')
log('synthesized mel spectrograms at {}'.format(synth_dir))
return os.path.join(synth_dir, 'map.txt')

Expand Down
131 changes: 89 additions & 42 deletions tacotron/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
class Synthesizer:
def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
log('Constructing model: %s' % model_name)
inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
targets = tf.placeholder(tf.float32, [1, None, hparams.num_mels], 'mel_targets')
inputs = tf.placeholder(tf.int32, [None, None], 'inputs')
input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths')
targets = tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets')
with tf.variable_scope('model') as scope:
self.model = create_model(model_name, hparams)
if gta:
Expand All @@ -28,41 +28,58 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
self.model.initialize(inputs, input_lengths)
self.mel_outputs = self.model.mel_outputs
self.linear_outputs = self.model.linear_outputs if (hparams.predict_linear and not gta) else None
self.alignment = self.model.alignments[0]
self.alignments = self.model.alignments

self.gta = gta
self._hparams = hparams
#pad input sequences with the <pad_token> 0 ( _ )
self._pad = 0
#explicitly setting the padding to a value that doesn't originally exist in the spectrogram
#to avoid any possible conflicts, without affecting the output range of the model too much
if hparams.symmetric_mels:
self._target_pad = -(hparams.max_abs_value + .1)
else:
self._target_pad = -0.1

log('Loading checkpoint: %s' % checkpoint_path)
self.session = tf.Session()
#Memory allocation on the GPU as needed
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

self.session = tf.Session(config=config)
self.session.run(tf.global_variables_initializer())

saver = tf.train.Saver()
saver.restore(self.session, checkpoint_path)


def synthesize(self, text, index, out_dir, log_dir, mel_filename):
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
hparams = self._hparams
cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
seq = text_to_sequence(text, cleaner_names)
seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
input_lengths = [len(seq) for seq in seqs]
seqs = self._prepare_inputs(seqs)
feed_dict = {
self.model.inputs: [np.asarray(seq, dtype=np.int32)],
self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32),
self.model.inputs: seqs,
self.model.input_lengths: np.asarray(input_lengths, dtype=np.int32),
}

if self.gta:
feed_dict[self.model.mel_targets] = np.load(mel_filename).reshape(1, -1, 80)
np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
target_lengths = [len(np_target) for np_target in np_targets]
padded_targets = self._prepare_targets(np_targets, self._hparams.outputs_per_step)
feed_dict[self.model.mel_targets] = padded_targets.reshape(len(np_targets), -1, 80)

if self.gta or not hparams.predict_linear:
mels, alignment = self.session.run([self.mel_outputs, self.alignment], feed_dict=feed_dict)
mels, alignments = self.session.run([self.mel_outputs, self.alignments], feed_dict=feed_dict)
if self.gta:
mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)] #Take off the reduction factor padding frames for time consistency with wavenet
assert len(mels) == len(np_targets)

else:
linear, mels, alignment = self.session.run([self.linear_outputs, self.mel_outputs, self.alignment], feed_dict=feed_dict)
linear = linear.reshape(-1, hparams.num_freq)

mels = mels.reshape(-1, hparams.num_mels) #Thanks to @imdatsolak for pointing this out
linears, mels, alignments = self.session.run([self.linear_outputs, self.mel_outputs, self.alignments], feed_dict=feed_dict)


if index is None:
if basenames is None:
#Generate wav and read it
wav = audio.inv_mel_spectrogram(mels.T, hparams)
audio.save_wav(wav, 'temp.wav', sr=hparams.sample_rate) #Find a better way
Expand All @@ -86,34 +103,64 @@ def synthesize(self, text, index, out_dir, log_dir, mel_filename):
return


#Get speaker id for global conditioning (only used with GTA generally)
if hparams.gin_channels > 0:
raise RuntimeError('Please set the speaker_id rule in line 89 of tacotron/synthesizer.py to allow for global condition usage later.')
speaker_id = '<no_g>' #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "index" variable)
else:
speaker_id = '<no_g>'
saved_mels_paths = []
speaker_ids = []
for i, mel in enumerate(mels):
#Get speaker id for global conditioning (only used with GTA generally)
if hparams.gin_channels > 0:
raise RuntimeError('Please set the speaker_id rule in line 99 of tacotron/synthesizer.py to allow for global condition usage later.')
speaker_id = '<no_g>' #set the rule to determine speaker id. By using the file basename maybe? (basenames are inside "basenames" variable)
speaker_ids.append(speaker_id) #finish by appending the speaker id. (allows for different speakers per batch if your model is multispeaker)
else:
speaker_id = '<no_g>'
speaker_ids.append(speaker_id)

# Write the spectrogram to disk
# Note: outputs mel-spectrogram files and target ones have same names, just different folders
mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(index))
np.save(mel_filename, mels, allow_pickle=False)
# Write the spectrogram to disk
# Note: outputs mel-spectrogram files and target ones have same names, just different folders
mel_filename = os.path.join(out_dir, 'mel-{}.npy'.format(basenames[i]))
np.save(mel_filename, mel, allow_pickle=False)
saved_mels_paths.append(mel_filename)

if log_dir is not None:
#save wav (mel -> wav)
wav = audio.inv_mel_spectrogram(mels.T, hparams)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(index)), sr=hparams.sample_rate)
if log_dir is not None:
#save wav (mel -> wav)
wav = audio.inv_mel_spectrogram(mel.T, hparams)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-mel.wav'.format(basenames[i])), sr=hparams.sample_rate)

#save alignments
plot.plot_alignment(alignments[i], os.path.join(log_dir, 'plots/alignment-{}.png'.format(basenames[i])),
info='{}'.format(texts[i]), split_title=True)

#save mel spectrogram plot
plot.plot_spectrogram(mel, os.path.join(log_dir, 'plots/mel-{}.png'.format(basenames[i])),
info='{}'.format(texts[i]), split_title=True)

if hparams.predict_linear:
#save wav (linear -> wav)
wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(basenames[i])), sr=hparams.sample_rate)

#save linear spectrogram plot
plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
info='{}'.format(texts[i]), split_title=True, auto_aspect=True)



return saved_mels_paths, speaker_ids

def _round_up(self, x, multiple):
remainder = x % multiple
return x if remainder == 0 else x + multiple - remainder

if hparams.predict_linear:
#save wav (linear -> wav)
wav = audio.inv_linear_spectrogram(linear.T, hparams)
audio.save_wav(wav, os.path.join(log_dir, 'wavs/wav-{}-linear.wav'.format(index)), sr=hparams.sample_rate)
def _prepare_inputs(self, inputs):
max_len = max([len(x) for x in inputs])
return np.stack([self._pad_input(x, max_len) for x in inputs])

#save alignments
plot.plot_alignment(alignment, os.path.join(log_dir, 'plots/alignment-{}.png'.format(index)),
info='{}'.format(text), split_title=True)
def _pad_input(self, x, length):
return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=self._pad)

#save mel spectrogram plot
plot.plot_spectrogram(mels, os.path.join(log_dir, 'plots/mel-{}.png'.format(index)),
info='{}'.format(text), split_title=True)
def _prepare_targets(self, targets, alignment):
max_len = max([len(t) for t in targets])
return np.stack([self._pad_target(t, self._round_up(max_len, alignment)) for t in targets])

return mel_filename, speaker_id
def _pad_target(self, t, length):
return np.pad(t, [(0, length - t.shape[0]), (0, 0)], mode='constant', constant_values=self._target_pad)
2 changes: 1 addition & 1 deletion tacotron/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def train(log_dir, args, hparams):

if (checkpoint_state and checkpoint_state.model_checkpoint_path):
log('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path), slack=True)
saver.restore(sess, checkpoint_state.model_checkpoint_path)
load_averaged_model(sess, sh_saver, checkpoint_state.model_checkpoint_path)
else:
log('No model to load at {}'.format(save_dir), slack=True)

Expand Down
12 changes: 9 additions & 3 deletions tacotron/utils/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def plot_alignment(alignment, path, info=None, split_title=False, max_len=None):
plt.close()


def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None):
def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False):
if max_len is not None:
target_spectrogram = target_spectrogram[:max_len]
pred_spectrogram = pred_spectrogram[:max_len]
Expand All @@ -60,14 +60,20 @@ def plot_spectrogram(pred_spectrogram, path, info=None, split_title=False, targe
ax1 = fig.add_subplot(311)
ax2 = fig.add_subplot(312)

im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
if auto_aspect:
im = ax1.imshow(np.rot90(target_spectrogram), aspect='auto', interpolation='none')
else:
im = ax1.imshow(np.rot90(target_spectrogram), interpolation='none')
ax1.set_title('Target Mel-Spectrogram')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
ax2.set_title('Predicted Mel-Spectrogram')
else:
ax2 = fig.add_subplot(211)

im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
if auto_aspect:
im = ax2.imshow(np.rot90(pred_spectrogram), aspect='auto', interpolation='none')
else:
im = ax2.imshow(np.rot90(pred_spectrogram), interpolation='none')
fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax2)

plt.tight_layout()
Expand Down
Loading

0 comments on commit e24217a

Please sign in to comment.