
Commit

minor bugs, hparams optim
Rayhane-mamah authored Jan 20, 2019
1 parent 432bbe1 commit 1e99756
Showing 10 changed files with 76 additions and 50 deletions.
4 changes: 2 additions & 2 deletions datasets/audio.py
@@ -15,7 +15,7 @@ def save_wav(wav, path, sr):
     wavfile.write(path, sr, wav.astype(np.int16))
 
 def save_wavenet_wav(wav, path, sr, inv_preemphasize, k):
-    wav = inv_preemphasis(wav, k, inv_preemphasize)
+    # wav = inv_preemphasis(wav, k, inv_preemphasize)
     wav *= 32767 / max(0.01, np.max(np.abs(wav)))
     wavfile.write(path, sr, wav.astype(np.int16))
 
@@ -160,7 +160,7 @@ def pad_lr(x, fsize, fshift):
 ##########################################################
 #Librosa correct padding
 def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
-    '''compute right padding (final frame)
+    '''compute right padding (final frame) or both sides padding (first and final frames)
     '''
     assert pad_sides in (1, 2)
     # return int(fsize // 2)
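Note on the save_wavenet_wav change: with this commit the spectrogram features are computed from a pre-emphasized copy of the audio (see datasets/preprocessor.py below), while the waveform WaveNet actually trains on stays un-emphasized, so no inverse filter is needed when saving vocoder output. For reference, a minimal NumPy sketch of the filter pair; the repo's audio.preemphasis/inv_preemphasis wrap scipy.signal.lfilter, and k = 0.97 here is the usual default, an assumption:

import numpy as np

def preemphasis(wav, k=0.97):
    #y[t] = x[t] - k * x[t-1]: boosts high frequencies before spectral analysis
    return np.append(wav[0], wav[1:] - k * wav[:-1])

def inv_preemphasis(wav, k=0.97):
    #x[t] = y[t] + k * x[t-1]: exactly undoes the filter above
    out = np.zeros_like(wav)
    for t in range(len(wav)):
        out[t] = wav[t] + (k * out[t - 1] if t > 0 else 0.)
    return out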
24 changes: 14 additions & 10 deletions datasets/preprocessor.py
@@ -69,20 +69,23 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
             wav_path))
         return None
 
+    #Trim lead/trail silences
+    if hparams.trim_silence:
+        wav = audio.trim_silence(wav, hparams)
+
     #Pre-emphasize
-    wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
+    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
 
     #rescale wav
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max
 
-        #Assert all audio is in [-1, 1]
-        if (wav > 1.).any() or (wav < -1.).any():
-            raise RuntimeError('wav has invalid value: {}'.format(wav))
-
-    #M-AILABS extra silence specific
-    if hparams.trim_silence:
-        wav = audio.trim_silence(wav, hparams)
+        #Assert all audio is in [-1, 1]
+        if (wav > 1.).any() or (wav < -1.).any():
+            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
+        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
+            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
 
     #Mu-law quantize
     if is_mulaw_quantize(hparams.input_type):
@@ -92,6 +95,7 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
         #Trim silences
         start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
         wav = wav[start: end]
+        preem_wav = preem_wav[start: end]
         out = out[start: end]
 
         constant_values = mulaw_quantize(0, hparams.quantize_channels)
@@ -110,14 +114,14 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
         out_dtype = np.float32
 
     # Compute the mel scale spectrogram from the wav
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
     mel_frames = mel_spectrogram.shape[1]
 
     if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
         return None
 
     #Compute the linear scale spectrogram from the wav
-    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
+    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
     linear_frames = linear_spectrogram.shape[1]
 
     #sanity check
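The point of this change: silence trimming now happens once, before the pre-emphasized copy is made, and every later rescale or slice is applied to both wav (the saved WaveNet target) and preem_wav (used only for spectrograms), keeping the two aligned sample-for-sample. Condensed, the new flow looks roughly like this (a sketch, not the full function; audio.* and hparams are the repo's own, the mu-law branch is omitted, and extract_features is a hypothetical name). The same change is mirrored in wavenet_preprocessor.py below.

import numpy as np
from datasets import audio

def extract_features(wav, hparams):
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)  #trim once, before copying
    #features come from a pre-emphasized copy; the saved wav stays un-emphasized
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max
    #any further trimming must slice both copies identically to keep them aligned
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    return wav, mel_spectrogram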
22 changes: 13 additions & 9 deletions datasets/wavenet_preprocessor.py
@@ -63,20 +63,23 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
             wav_path))
         return None
 
+    #M-AILABS extra silence specific
+    if hparams.trim_silence:
+        wav = audio.trim_silence(wav, hparams)
+
     #Pre-emphasize
-    wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
+    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
 
     #rescale wav
     if hparams.rescale:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max
 
-        #Assert all audio is in [-1, 1]
-        if (wav > 1.).any() or (wav < -1.).any():
-            raise RuntimeError('wav has invalid value: {}'.format(wav))
-
-    #M-AILABS extra silence specific
-    if hparams.trim_silence:
-        wav = audio.trim_silence(wav, hparams)
+        #Assert all audio is in [-1, 1]
+        if (wav > 1.).any() or (wav < -1.).any():
+            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
+        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
+            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
 
     #Mu-law quantize
     if is_mulaw_quantize(hparams.input_type):
@@ -86,6 +89,7 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
         #Trim silences
         start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
         wav = wav[start: end]
+        preem_wav = preem_wav[start: end]
         out = out[start: end]
 
         constant_values = mulaw_quantize(0, hparams.quantize_channels)
@@ -104,7 +108,7 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
         out_dtype = np.float32
 
     # Compute the mel scale spectrogram from the wav
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
     mel_frames = mel_spectrogram.shape[1]
 
     if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
4 changes: 2 additions & 2 deletions griffin_lim_synthesis_tool.ipynb
@@ -21,7 +21,7 @@
 "os.makedirs(out_dir, exist_ok=True)\n",
 "\n",
 "#mel_file = os.path.join(mel_folder, mel_file)\n",
-"mel_file = 'training_data/mels/mel-LJ001-0005.npy'\n",
+"mel_file = 'training_data/mels/mel-LJ001-0008.npy'\n",
 "mel_spectro = np.load(mel_file)\n",
 "mel_spectro.shape"
 ]
@@ -55,7 +55,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"lin_file = 'training_data/linear/linear-LJ001-0005.npy'\n",
+"lin_file = 'training_data/linear/linear-LJ001-0008.npy'\n",
 "lin_spectro = np.load(lin_file)\n",
 "lin_spectro.shape"
 ]
15 changes: 8 additions & 7 deletions hparams.py
@@ -87,7 +87,7 @@
     trim_silence = True, #Whether to clip silence in Audio (at beginning and end of audio only, not the middle)
     trim_fft_size = 2048, #Trimming window size
     trim_hop_size = 512, #Trimming hop length
-    trim_top_db = 45, #Trimming db difference from reference db (smaller==harder trim.)
+    trim_top_db = 40, #Trimming db difference from reference db (smaller==harder trim.)
 
     #Mel and Linear spectrograms normalization/scaling and clipping
     signal_normalization = True, #Whether to normalize mel spectrograms to some predefined range (following below parameters)
@@ -120,6 +120,7 @@
     outputs_per_step = 1, #number of frames to generate at each decoding step (increase to speed up computation and allows for higher batch size, decreases G&L audio quality)
     stop_at_any = True, #Determines whether the decoder should stop when predicting <stop> to any frame or to all of them (True works pretty well)
     batch_norm_position = 'after', #Can be in ('before', 'after'). Determines whether we use batch norm before or after the activation function (relu). Matter for debate.
+    clip_outputs = True, #Whether to clip spectrograms to T2_output_range (even in loss computation). i.e. don't penalize the model for exceeding the output range, just bring it back to the borders.
 
     #Input parameters
     embedding_dim = 512, #dimension of embedding space
@@ -190,7 +191,7 @@
     #Model Losses parameters
     #Minimal scales ranges for MoL and Gaussian modeling
     log_scale_min=float(np.log(1e-14)), #Mixture of logistic distributions minimal log scale
-    log_scale_min_gauss = float(np.log(9.1188196 * 1e-4)), #Gaussian distribution minimal allowed log scale
+    log_scale_min_gauss = float(np.log(1e-7)), #Gaussian distribution minimal allowed log scale
     #Loss type
     cdf_loss = True, #Whether to use CDF loss in Gaussian modeling. Advantages: non-negative loss term and more training stability. (Automatically True for MoL)
 
@@ -216,10 +217,10 @@
     upsample_type = 'SubPixel', #Type of the upsampling deconvolution. Can be ('1D' or '2D', 'Resize', 'SubPixel').
     upsample_activation = 'Relu', #Activation function used during upsampling. Can be ('LeakyRelu', 'Relu' or None)
     upsample_scales = [5, 5, 11], #prod(upsample_scales) should be equal to hop_size
-    freq_axis_kernel_size = 5, #Only used for 2D upsampling types. This is the number of frequency bands that are spanned at a time for each frame.
+    freq_axis_kernel_size = 2, #Only used for 2D upsampling types. This is the number of frequency bands that are spanned at a time for each frame.
     leaky_alpha = 0.4, #slope of the negative portion of LeakyRelu (LeakyRelu: y=x if x>0 else y=alpha * x)
     NN_init = True, #Determines whether we want to initialize upsampling kernels/biases in a way to ensure upsample is initialized to Nearest neighbor upsampling. (Mostly for debug)
-    NN_scaler = 0.1, #Determines the initial Nearest Neighbor upsample values scale. i.e: upscaled_input_values = input_values * NN_scaler (1. to disable)
+    NN_scaler = 0.3, #Determines the initial Nearest Neighbor upsample values scale. i.e: upscaled_input_values = input_values * NN_scaler (1. to disable)
 
     #global conditioning
     gin_channels = -1, #Set this to -1 to disable global conditioning, Only used for multi speaker dataset. It defines the depth of the embeddings (Recommended: 16)
@@ -249,10 +250,10 @@
     #Learning rate schedule
     tacotron_decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
     tacotron_start_decay = 40000, #Step at which learning decay starts
-    tacotron_decay_steps = 24500, #Determines the learning rate decay slope (UNDER TEST)
+    tacotron_decay_steps = 48000, #Determines the learning rate decay slope (UNDER TEST)
     tacotron_decay_rate = 0.5, #learning rate decay rate (UNDER TEST)
     tacotron_initial_learning_rate = 1e-3, #starting learning rate
-    tacotron_final_learning_rate = 1e-5, #minimal learning rate
+    tacotron_final_learning_rate = 1e-4, #minimal learning rate
 
     #Optimization parameters
     tacotron_adam_beta1 = 0.9, #AdamOptimizer beta1 parameter
@@ -304,7 +305,7 @@
 
     #Learning rate schedule
     wavenet_lr_schedule = 'exponential', #learning rate schedule. Can be ('exponential', 'noam')
-    wavenet_learning_rate = 1e-4, #wavenet initial learning rate
+    wavenet_learning_rate = 1e-3, #wavenet initial learning rate
     wavenet_warmup = float(4000), #Only used with 'noam' scheme. Defines the number of ascending learning rate steps.
     wavenet_decay_rate = 0.5, #Only used with 'exponential' scheme. Defines the decay rate.
     wavenet_decay_steps = 200000, #Only used with 'exponential' scheme. Defines the decay steps.
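The schedule changes are easier to read as numbers. Assuming the usual delayed exponential decay with a floor (as the hparams comments describe), the Tacotron learning rate now halves every 48k steps after step 40k and bottoms out at 1e-4 instead of 1e-5. A sketch:

def tacotron_lr(step, init_lr=1e-3, start_decay=40000, decay_steps=48000,
                decay_rate=0.5, final_lr=1e-4):
    #flat until start_decay, then exponential decay, clipped to the floor
    if step < start_decay:
        return init_lr
    lr = init_lr * decay_rate ** ((step - start_decay) / decay_steps)
    return max(lr, final_lr)

#tacotron_lr(40000) -> 1e-3, tacotron_lr(88000) -> 5e-4,
#and the 1e-4 floor is reached near step 200k (vs. far later for 1e-5)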
13 changes: 12 additions & 1 deletion tacotron/models/tacotron.py
@@ -83,6 +83,8 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
             if p_linear_targets is not None:
                 tower_linear_targets.append(tf.reshape(p_linear_targets[i], [batch_size, -1, linear_channels]))
 
+        T2_output_range = (-hp.max_abs_value, hp.max_abs_value) if hp.symmetric_mels else (0, hp.max_abs_value)
+
         self.tower_decoder_output = []
         self.tower_alignments = []
         self.tower_stop_token_prediction = []
@@ -176,6 +178,9 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
                 decoder_output = tf.reshape(frames_prediction, [batch_size, -1, hp.num_mels])
                 stop_token_prediction = tf.reshape(stop_token_prediction, [batch_size, -1])
 
+                if hp.clip_outputs:
+                    decoder_output = tf.minimum(tf.maximum(decoder_output, T2_output_range[0]), T2_output_range[1])
+
                 #Postnet
                 postnet = Postnet(is_training, hparams=hp, scope='postnet_convolutions')
 
@@ -190,6 +195,9 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
 
                 #Compute the mel spectrogram
                 mel_outputs = decoder_output + projected_residual
+
+                if hp.clip_outputs:
+                    mel_outputs = tf.minimum(tf.maximum(mel_outputs, T2_output_range[0]), T2_output_range[1])
 
 
                 if post_condition:
@@ -207,6 +215,9 @@ def initialize(self, inputs, input_lengths, mel_targets=None, stop_token_targets
                     #[batch_size, decoder_steps(linear_frames), num_freq]
                     linear_outputs = linear_specs_projection(post_outputs)
 
+                    if hp.clip_outputs:
+                        linear_outputs = tf.minimum(tf.maximum(linear_outputs, T2_output_range[0]), T2_output_range[1])
+
                 #Grab alignments from the final decoder state
                 alignments = tf.transpose(final_decoder_state.alignment_history.stack(), [1, 2, 0])
 
@@ -387,7 +398,7 @@ def add_optimizer(self, global_step):
             # Device placement
             with tf.device(tf.train.replica_device_setter(ps_tasks=1, ps_device="/cpu:0", worker_device=gpus[i])):
                 with tf.variable_scope('optimizer') as scope:
-                    update_vars = [v for v in self.all_vars if not ('inputs_embedding' in v or 'encoder_' in v)] if hp.tacotron_fine_tuning else None
+                    update_vars = [v for v in self.all_vars if not ('inputs_embedding' in v.name or 'encoder_' in v.name)] if hp.tacotron_fine_tuning else None
                     gradients = optimizer.compute_gradients(self.tower_loss[i], var_list=update_vars)
                     tower_gradients.append(gradients)
 
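Two things happen in this file. First, clip_outputs clamps decoder, mel and linear predictions to the training target range T2_output_range before losses are computed, so the model is not penalized for excursions past the representable range. Stripped of the tower plumbing, each branch reduces to this (a sketch; max_abs_value = 4. and symmetric_mels = True are assumptions taken from the usual hparams):

import tensorflow as tf

max_abs_value = 4.  #hparams.max_abs_value
T2_output_range = (-max_abs_value, max_abs_value)  #or (0, max_abs_value) when not symmetric_mels

def clip_to_range(x):
    #equivalent to tf.clip_by_value(x, *T2_output_range)
    return tf.minimum(tf.maximum(x, T2_output_range[0]), T2_output_range[1])

Second, the add_optimizer hunk is a genuine bug fix: 'inputs_embedding' in v tested membership against the tf.Variable object itself (a TypeError in graph mode at worst, wrong filtering at best), while 'inputs_embedding' in v.name correctly excludes embedding and encoder variables from fine-tuning updates.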
6 changes: 4 additions & 2 deletions tacotron/synthesizer.py
@@ -67,6 +67,8 @@ def load(self, checkpoint_path, hparams, gta=False, model_name='Tacotron'):
     def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
         hparams = self._hparams
         cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
+        #[-max, max] or [0, max]
+        T2_output_range = (-hparams.max_abs_value, hparams.max_abs_value) if hparams.symmetric_mels else (0, hparams.max_abs_value)
 
         #Repeat last sample until number of samples is divisible by the number of GPUs (last run scenario)
         while len(texts) % hparams.tacotron_synthesis_batch_size != 0:
@@ -145,8 +147,10 @@ def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
         #Take off the batch wise padding
         mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
         linears = [linear[:target_length, :] for linear, target_length in zip(linears, target_lengths)]
+        linears = np.clip(linears, T2_output_range[0], T2_output_range[1])
         assert len(mels) == len(linears) == len(texts)
 
+        mels = np.clip(mels, T2_output_range[0], T2_output_range[1])
 
         if basenames is None:
             #Generate wav and read it
@@ -207,8 +211,6 @@ def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):
             plot.plot_spectrogram(linears[i], os.path.join(log_dir, 'plots/linear-{}.png'.format(basenames[i])),
                 title='{}'.format(texts[i]), split_title=True, auto_aspect=True)
 
-
-
         return saved_mels_paths, speaker_ids
 
     def _round_up(self, x, multiple):
20 changes: 12 additions & 8 deletions wavenet_vocoder/models/modules.py
@@ -526,12 +526,13 @@ class SubPixelConvolution(tf.layers.Conv2D):
     They serve the purpose of upsampling (like deconvolutions) but are faster and less prone to checkerboard artifacts with the right initialization.
     In contrast to ResizeConvolutions, SubPixel convolutions have the same computation speed (when using the same n° of params), but a larger receptive field as they operate on low resolution.
     '''
-    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, name=None, **kwargs):
+    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, up_layers, name=None, **kwargs):
         #Output channels = filters * H_upsample * W_upsample
         conv_filters = filters * strides[0] * strides[1]
 
         #Create initial kernel
         self.NN_init = NN_init
+        self.up_layers = up_layers
         self.NN_scaler = NN_scaler
         init_kernel = tf.constant_initializer(self._init_kernel(kernel_size, strides, conv_filters), dtype=tf.float32) if NN_init else None
 
@@ -634,12 +635,13 @@ def _init_kernel(self, kernel_size, strides, filters):
 
         init_kernel = np.tile(np.expand_dims(init_kernel, 3), [1, 1, 1, filters])
 
-        return init_kernel * (self.NN_scaler)**(1/3)
+        return init_kernel * (self.NN_scaler)**(1/self.up_layers)
 
 
 class ResizeConvolution(tf.layers.Conv2D):
-    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, name=None, **kwargs):
+    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, up_layers, name=None, **kwargs):
         #Create initial kernel
+        self.up_layers = up_layers
         self.NN_scaler = NN_scaler
         init_kernel = tf.constant_initializer(self._init_kernel(kernel_size, strides), dtype=tf.float32) if NN_init else None
 
@@ -677,15 +679,16 @@ def _init_kernel(kernel_size, strides):
         for j_i in j:
             init_kernel[i, j_i] = 1. / overlap if kernel_size[1] % 2 == 0 else 1.
 
-        return init_kernel * (self.NN_scaler)**(1/3)
+        return init_kernel * (self.NN_scaler)**(1/self.up_layers)
 
 class ConvTranspose1D(tf.layers.Conv2DTranspose):
-    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, name=None, **kwargs):
+    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, up_layers, name=None, **kwargs):
         #convert 1D filters to 2D.
         kernel_size = (1, ) + kernel_size #(ks, ) -> (1, ks). Inputs supposed [batch_size, channels, freq, time_steps].
         strides = (1, ) + strides #(s, ) -> (1, s).
 
         #Create initial kernel
+        self.up_layers = up_layers
         self.NN_scaler = NN_scaler
         init_kernel = tf.constant_initializer(self._init_kernel(kernel_size, strides, filters), dtype=tf.float32) if NN_init else None
 
@@ -714,14 +717,15 @@ def _init_kernel(self, kernel_size, strides, filters):
         init_kernel = np.tile(init_kernel, [kernel_size[0], kernel_size[1], 1, 1])
         init_kernel = init_kernel / overlap if kernel_size[1] % 2 == 0 else init_kernel
 
-        return init_kernel * (self.NN_scaler)**(1/3)
+        return init_kernel * (self.NN_scaler)**(1/self.up_layers)
 
 
 class ConvTranspose2D(tf.layers.Conv2DTranspose):
-    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, name=None, **kwargs):
+    def __init__(self, filters, kernel_size, padding, strides, NN_init, NN_scaler, up_layers, name=None, **kwargs):
         freq_axis_kernel_size = kernel_size[0]
 
         #Create initial kernel
+        self.up_layers = up_layers
         self.NN_scaler = NN_scaler
         init_kernel = tf.constant_initializer(self._init_kernel(kernel_size, strides), dtype=tf.float32) if NN_init else None
 
@@ -750,7 +754,7 @@ def _init_kernel(self, kernel_size, strides):
         for j_i in range(kernel_size[1]):
             init_kernel[i, j_i] = 1. / overlap if kernel_size[1] % 2 == 0 else 1.
 
-        return init_kernel * (self.NN_scaler)**(1/3)
+        return init_kernel * (self.NN_scaler)**(1/self.up_layers)
 
 
 def _conv1x1_forward(conv, x, is_incremental):
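The recurring (1/3) to (1/self.up_layers) fix: each upsampling layer's init kernel is scaled so that the whole cascade multiplies out to NN_scaler, and the hard-coded exponent only achieved that for exactly three layers. A quick check:

NN_scaler = 0.3

for up_layers in (2, 3, 4):
    per_layer = NN_scaler ** (1 / up_layers)  #scale applied by each layer's init kernel
    product = per_layer ** up_layers          #effective scale of the whole stack
    print(up_layers, round(per_layer, 4), round(product, 4))  #product is 0.3 every time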
16 changes: 8 additions & 8 deletions wavenet_vocoder/models/wavenet.py
@@ -165,25 +165,25 @@ def __init__(self, hparams, init):
         for i, s in enumerate(hparams.upsample_scales):
             with tf.variable_scope('local_conditioning_upsampling_{}'.format(i+1)):
                 if hparams.upsample_type == '2D':
-                    convt = ConvTranspose2D(1, (hparams.freq_axis_kernel_size, 2*s),
+                    convt = ConvTranspose2D(1, (hparams.freq_axis_kernel_size, s),
                         padding='same', strides=(1, s), NN_init=hparams.NN_init, NN_scaler=hparams.NN_scaler,
-                        name='ConvTranspose2D_layer_{}'.format(i))
+                        up_layers=len(hparams.upsample_scales), name='ConvTranspose2D_layer_{}'.format(i))
 
                 elif hparams.upsample_type == '1D':
-                    convt = ConvTranspose1D(hparams.cin_channels, (2*s, ),
+                    convt = ConvTranspose1D(hparams.cin_channels, (s, ),
                         padding='same', strides=(s, ), NN_init=hparams.NN_init, NN_scaler=hparams.NN_scaler,
-                        name='ConvTranspose1D_layer_{}'.format(i))
+                        up_layers=len(hparams.upsample_scales), name='ConvTranspose1D_layer_{}'.format(i))
 
                 elif hparams.upsample_type == 'Resize':
-                    convt = ResizeConvolution(1, (hparams.freq_axis_kernel_size, 2*s),
+                    convt = ResizeConvolution(1, (hparams.freq_axis_kernel_size, s),
                         padding='same', strides=(1, s), NN_init=hparams.NN_init, NN_scaler=hparams.NN_scaler,
-                        name='ResizeConvolution_layer_{}'.format(i))
+                        up_layers=len(hparams.upsample_scales), name='ResizeConvolution_layer_{}'.format(i))
 
                 else:
                     assert hparams.upsample_type == 'SubPixel'
-                    convt = SubPixelConvolution(1, (hparams.freq_axis_kernel_size, 2*s),
+                    convt = SubPixelConvolution(1, (hparams.freq_axis_kernel_size, 2),
                         padding='same', strides=(1, s), NN_init=hparams.NN_init, NN_scaler=hparams.NN_scaler,
-                        name='SubPixelConvolution_layer_{}'.format(i))
+                        up_layers=len(hparams.upsample_scales), name='SubPixelConvolution_layer_{}'.format(i))
 
                 self.upsample_conv.append(maybe_Normalize_weights(convt,
                     hparams.wavenet_weight_normalization, init, hparams.wavenet_init_scale))
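The kernel-width changes (2*s to s, and a fixed width of 2 for SubPixel) go hand in hand with the nearest-neighbor initialization in modules.py above. Independent of kernel size, the conditioning upsampling must multiply out to exactly one mel frame per hop of audio; a sketch of the invariant the hparams comment states (hop_size = 275 matches this repo's 22050 Hz settings, but treat it as an assumption and substitute your own hparams):

from functools import reduce
from operator import mul

upsample_scales = [5, 5, 11]
hop_size = 275  #0.0125 * 22050

#prod(upsample_scales) must equal hop_size so each mel frame expands to one hop of samples
assert reduce(mul, upsample_scales, 1) == hop_size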
2 changes: 1 addition & 1 deletion wavenet_vocoder/train.py
@@ -308,7 +308,7 @@ def train(log_dir, args, hparams, input_path):
                     step, time_window.average, loss, loss_window.average)
                 log(message, end='\r', slack=(step % args.checkpoint_interval == 0))
 
-                if np.isnan(loss): #or loss > 1000.:
+                if np.isnan(loss) or loss > 100:
                     log('Loss exploded to {:.5f} at step {}'.format(loss, step))
                     raise Exception('Loss exploded')
