Add EOS at the end of each sentence
Signed-off-by: begeekmyfriend <[email protected]>
begeekmyfriend committed Oct 25, 2019
1 parent 22ba511 commit 9c343a5
Showing 9 changed files with 75 additions and 98 deletions.
35 changes: 0 additions & 35 deletions datasets/audio.py
@@ -24,20 +24,6 @@ def preemphasis(wav, k):
def inv_preemphasis(wav, k):
return signal.lfilter([1], [1, -k], wav)

#From https://github.com/r9y9/wavenet_vocoder/blob/master/audio.py
def start_and_end_indices(quantized, silence_threshold=2):
for start in range(quantized.size):
if abs(quantized[start] - 127) > silence_threshold:
break
for end in range(quantized.size - 1, 1, -1):
if abs(quantized[end] - 127) > silence_threshold:
break

assert abs(quantized[start] - 127) > silence_threshold
assert abs(quantized[end] - 127) > silence_threshold

return start, end
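
For the record, the removed helper scanned 8-bit quantized (mu-law) samples, where silence sits at 127, for the first and last indices exceeding the threshold. A toy sketch of its behavior:

import numpy as np
quantized = np.array([127, 127, 131, 127, 120, 127])
# |131 - 127| = 4 > 2 at index 2; |120 - 127| = 7 > 2 at index 4
# start_and_end_indices(quantized, silence_threshold=2) -> (2, 4)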

def trim_silence(wav, hparams):
'''Trim leading and trailing silence
@@ -149,27 +135,6 @@ def _stft(y, hparams):
def _istft(y, hparams):
return librosa.istft(y, hop_length=get_hop_size(hparams), win_length=hparams.win_size)

def num_frames(length, fsize, fshift):
"""Compute number of time frames of spectrogram
"""
pad = (fsize - fshift)
if length % fshift == 0:
M = (length + pad * 2 - fsize) // fshift + 1
else:
M = (length + pad * 2 - fsize) // fshift + 2
return M


def pad_lr(x, fsize, fshift):
"""Compute left and right padding
"""
M = num_frames(len(x), fsize, fshift)
pad = (fsize - fshift)
T = len(x) + 2 * pad
r = (M - 1) * fshift + fsize - T
return pad, pad + r
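
These two removed helpers computed WaveNet-style framing; a worked pass through the arithmetic, assuming fsize=1024, fshift=256 and 1 s of 16 kHz audio (values illustrative):

length = 16000                            # 16000 % 256 = 128, so the else branch applies
pad = 1024 - 256                          # 768
M = (16000 + 2 * 768 - 1024) // 256 + 2   # 64 + 2 = 66 frames
T = 16000 + 2 * 768                       # 17536
r = (66 - 1) * 256 + 1024 - 17536         # 128
# pad_lr(x, 1024, 256) -> (768, 896), i.e. pad 768 samples left and 768 + 128 right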


# Conversions
_mel_basis = None
_inv_mel_basis = None
15 changes: 7 additions & 8 deletions synthesize.py
@@ -1,12 +1,10 @@
import argparse
import os
from warnings import warn

import tensorflow as tf

from hparams import hparams
from infolog import log
from tacotron.synthesize import tacotron_synthesize
from tacotron.utils.symbols import _eos


def prepare_run(args):
@@ -21,18 +19,19 @@ def prepare_run(args):
wave_checkpoint = os.path.join('logs-' + run_name, 'wave_' + args.checkpoint)
return taco_checkpoint, wave_checkpoint, modified_hp


def get_sentences(args):
if args.text_list:
if args.text:
try:
f = open(args.text_list)
f = open(args.text)
lines = f.readlines()
except UnicodeDecodeError:
f = open(args.text_list, encoding='gbk')
f = open(args.text, encoding='gbk')
lines = f.readlines()
sentences = list(map(lambda l: l.strip(), lines[1::2]))
else:
sentences = hparams.sentences
return sentences
return list(map(lambda s: s + _eos, sentences))
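
This function is the heart of the commit: every sentence now ends with the EOS symbol, giving the model an explicit stop marker. A minimal sketch of what the updated get_sentences() yields, assuming the common Tacotron convention _eos == '~' and a text file that alternates id lines with sentence lines (both assumptions, not shown in this diff):

lines = ['000001\n', 'hello there\n', '000002\n', 'how are you\n']
sentences = [l.strip() for l in lines[1::2]]  # every second line holds the text
sentences = [s + '~' for s in sentences]      # append EOS, as _eos would
# -> ['hello there~', 'how are you~']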


def main():
@@ -49,7 +48,7 @@ def main():
parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
parser.add_argument('--mode', default='eval', help=f'mode of run: can be one of {accepted_modes}')
parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
parser.add_argument('--text_list', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
parser.add_argument('--text', default='', help='Text file contains list of texts to be synthesized. Valid if mode=eval')
parser.add_argument('--speaker_id', default=0, type=int, help='Defines the speakers ids to use when running standalone Wavenet on a folder of mels. this variable must be a comma-separated list of ids')
args = parser.parse_args()
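
With the flag renamed from --text_list to --text, an eval run would be invoked along these lines (file names illustrative, other flags left at their defaults):

python synthesize.py --mode eval --text sentences.txt --output_dir output/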

6 changes: 4 additions & 2 deletions tacotron/feeder.py
@@ -136,7 +136,8 @@ def make_test_batches(self):
# Bucket examples based on similar output sequence length for efficiency
batches = [examples[i: i+n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)
log('\nGenerated {} test batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
end = time.time() - start
log(f'Generated {len(batches)} test batches of size {n} in {end:.3f} sec')
return batches

def _enqueue_next_train_group(self):
@@ -153,7 +154,8 @@ def _enqueue_next_train_group(self):
# Bucket examples based on similar output sequence length for efficiency
batches = [examples[i: i+n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)
log('\nGenerated {} train batches of size {} in {:.3f} sec'.format(len(batches), n, time.time() - start))
end = time.time() - start
log(f'Generated {len(batches)} train batches of size {n} in {end:.3f} sec')
for batch in batches:
feed_dict = dict(zip(self._placeholders, self._prepare_batch(batch, self._hparams.outputs_per_step)))
self._session.run(self._enqueue_op, feed_dict=feed_dict)
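
Both hunks share the same bucketing idiom: length-sorted examples are chunked into fixed-size batches and only the batch order is shuffled, so each batch keeps examples of similar length. A standalone sketch with toy data and an assumed batch size:

import numpy as np
examples = ['a', 'cc', 'ee', 'bbb', 'dddd', 'ffffff']  # already sorted by length
n = 2                                                  # assumed batch size
batches = [examples[i: i + n] for i in range(0, len(examples), n)]
np.random.shuffle(batches)  # shuffle across batches, never within one
# e.g. [['bbb', 'dddd'], ['a', 'cc'], ['ee', 'ffffff']]
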
19 changes: 9 additions & 10 deletions tacotron/models/modules.py
@@ -102,7 +102,7 @@ def __call__(self, inputs):
x = inputs
for i in range(self.enc_conv_num_layers):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
self.is_training, self.bnorm, 'conv_layer_{}_'.format(i + 1)+self.scope)
self.is_training, self.bnorm, f'conv_layer_{i+1}_' + self.scope)
return x


@@ -175,11 +175,11 @@ def __call__(self, inputs):
with tf.variable_scope(self.scope):
for i, size in enumerate(self.layers_sizes):
dense = tf.layers.dense(x, units=size, activation=self.activation,
name='dense_{}'.format(i + 1))
name=f'dense_{i+1}')
#The paper discussed introducing diversity in generation at inference time
#by using a dropout of 0.5 only in prenet layers (in both training and inference).
x = tf.layers.dropout(dense, rate=self.drop_rate, training=True,
name='dropout_{}'.format(i + 1) + self.scope)
name=f'dropout_{i+1}' + self.scope)
return x


@@ -206,7 +206,7 @@ def __init__(self, is_training, layers=2, size=1024, zoneout=0.1, scope=None):
self.rnn_layers = [ZoneoutLSTMCell(size, is_training,
zoneout_factor_cell=zoneout,
zoneout_factor_output=zoneout,
name='decoder_LSTM_{}'.format(i+1)) for i in range(layers)]
name=f'decoder_LSTM_{i+1}') for i in range(layers)]

self._cell = tf.contrib.rnn.MultiRNNCell(self.rnn_layers, state_is_tuple=True)

@@ -231,14 +231,14 @@ def __init__(self, shape=80, activation=None, scope=None):
self.activation = activation

self.scope = 'Linear_projection' if scope is None else scope
self.dense = tf.layers.Dense(units=shape, activation=activation, name='projection_{}'.format(self.scope))
self.dense = tf.layers.Dense(units=shape, activation=activation, name=f'projection_{self.scope}')

def __call__(self, inputs):
with tf.variable_scope(self.scope):
#If activation==None, this returns a simple Linear projection
#else the projection will be passed through an activation function
# output = tf.layers.dense(inputs, units=self.shape, activation=self.activation,
# name='projection_{}'.format(self.scope))
# name=f'projection_{self.scope}')
output = self.dense(inputs)

return output
@@ -265,8 +265,7 @@ def __init__(self, is_training, shape=1, activation=tf.nn.sigmoid, scope=None):

def __call__(self, inputs):
with tf.variable_scope(self.scope):
output = tf.layers.dense(inputs, units=self.shape,
activation=None, name='projection_{}'.format(self.scope))
output = tf.layers.dense(inputs, units=self.shape, activation=None, name=f'projection_{self.scope}')

#During training, don't use activation as it is integrated inside the sigmoid_cross_entropy loss function
if self.is_training:
@@ -302,9 +301,9 @@ def __call__(self, inputs):
x = inputs
for i in range(self.postnet_num_layers - 1):
x = conv1d(x, self.kernel_size, self.channels, self.activation,
self.is_training, self.bnorm, 'conv_layer_{}_'.format(i + 1)+self.scope)
self.is_training, self.bnorm, f'conv_layer_{i+1}_' + self.scope)
x = conv1d(x, self.kernel_size, self.channels, lambda _: _, self.is_training, self.bnorm,
'conv_layer_{}_'.format(5)+self.scope)
'conv_layer_5_' + self.scope)
return x


24 changes: 12 additions & 12 deletions tacotron/models/tacotron.py
@@ -140,18 +140,18 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
self.mel_targets = mel_targets
self.targets_lengths = targets_lengths
log('Initialized Tacotron model. Dimensions (? = dynamic shape): ')
log(' Train mode: {}'.format(is_training))
log(' Eval mode: {}'.format(is_evaluating))
log(' GTA mode: {}'.format(gta))
log(' Synthesis mode: {}'.format(not (is_training or is_evaluating)))
log(' embedding: {}'.format(embedded_inputs.shape))
log(' enc conv out: {}'.format(enc_conv_output_shape))
log(' encoder out: {}'.format(encoder_outputs.shape))
log(' decoder out: {}'.format(decoder_output.shape))
log(' residual out: {}'.format(residual.shape))
log(' projected residual out: {}'.format(projected_residual.shape))
log(' mel out: {}'.format(mel_outputs.shape))
log(' <stop_token> out: {}'.format(stop_token_prediction.shape))
log(f' Train mode: {is_training}')
log(f' Eval mode: {is_evaluating}')
log(f' GTA mode: {gta}')
log(f' Synthesis mode: {not (is_training or is_evaluating)}')
log(f' embedding: {embedded_inputs.shape}')
log(f' enc conv out: {enc_conv_output_shape}')
log(f' encoder out: {encoder_outputs.shape}')
log(f' decoder out: {decoder_output.shape}')
log(f' residual out: {residual.shape}')
log(f' projected residual out: {projected_residual.shape}')
log(f' mel out: {mel_outputs.shape}')
log(f' <stop_token> out: {stop_token_prediction.shape}')


def add_loss(self):
9 changes: 8 additions & 1 deletion tacotron/synthesize.py
@@ -21,6 +21,7 @@ def run_live(args, checkpoint_path, hparams):
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth.session_open()

#Generate fast greeting message
greetings = 'Hello, Welcome to the Live testing tool. Please type a message and I will try to read it!'
@@ -39,6 +40,8 @@ def run_live(args, checkpoint_path, hparams):
generate_fast(synth, leave)
sleep(2)
break
synth.session_close()


def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
eval_dir = os.path.join(output_dir, 'eval')
@@ -56,6 +59,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth.session_open()

sentences = list(map(lambda s: s.strip(), sentences))
delta_size = hparams.tacotron_synthesis_batch_size if hparams.tacotron_synthesis_batch_size < len(sentences) else len(sentences)
@@ -69,6 +73,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
audio.save_wav(wav, os.path.join(eval_dir, f'{i:03d}.wav'), hparams)
end = time.time() - start
log(f'Generated total batch of {delta_size} in {end:.3f} sec')
synth.session_close()


def run_synthesis(args, checkpoint_path, output_dir, hparams):
@@ -87,6 +92,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams, gta=GTA)
synth.session_open()

speaker_num = len(hparams.anchor_dirs)
metadata_groups = [[] for i in range(speaker_num)]
@@ -97,7 +103,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
for m in metadata:
metadata_groups[int(m[0])].append(m[1:])
for i in range(speaker_num):
hours = sum([int(x[2]) for x in metadata_groups[i]]) * frame_shift_ms / (3600)
hours = sum([int(x[2]) for x in metadata_groups[i]]) * frame_shift_ms / 3600
log(f'Loaded {hparams.anchor_dirs[i]} for {len(metadata_groups[i])} examples ({hours:.2f} hours)')
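
A note on the cleaned-up line: despite the _ms suffix, frame_shift_ms must hold seconds per frame for the division by 3600 to yield hours. Assuming it is hop_size / sample_rate with a hop of 200 samples at 16 kHz (values not from this diff):

frame_shift = 200 / 16000            # 0.0125 s per frame
hours = 288000 * frame_shift / 3600  # 288,000 frames -> 3600 s -> 1.0 h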

log('starting synthesis')
@@ -112,6 +118,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
synth.synthesize(texts, basenames, synth_dir, None, mel_filenames, speaker_id)
log(f'synthesized mel spectrograms at {synth_dir}')
synth.session_close()


def tacotron_synthesize(args, hparams, checkpoint, sentences=None):
10 changes: 9 additions & 1 deletion tacotron/synthesizer.py
@@ -37,7 +37,11 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
#to avoid any possible conflicts, without affecting the output range of the model too much
self._target_pad = -(hparams.max_abs_value + .1) if hparams.symmetric_mels else -0.1

self.checkpoint_path = checkpoint_path
log('Loading checkpoint: %s' % checkpoint_path)


def session_open(self):
#Memory allocation on the GPU as needed
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
@@ -46,7 +50,11 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
self.session.run(tf.global_variables_initializer())

saver = tf.train.Saver()
saver.restore(self.session, checkpoint_path)
saver.restore(self.session, self.checkpoint_path)


def session_close(self):
self.session.close()


def synthesize(self, batch, basenames, out_dir, log_dir, mel_filenames, speaker_id):
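Taken together, splitting session_open()/session_close() out of load() implies the lifecycle below; the run_eval() and run_synthesis() hunks above follow exactly this pattern (a sketch only, argument values illustrative):

synth = Synthesizer()
synth.load(checkpoint_path, hparams)  # build the graph, remember the checkpoint path
synth.session_open()                  # create the session and restore the weights
synth.synthesize(texts, basenames, out_dir, log_dir, mel_filenames, speaker_id)
synth.session_close()                 # release the session explicitly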
