Skip to content

Commit

Permalink
Update
Browse files: browse the repository at this point in the history
Signed-off-by: begeekmyfriend <[email protected]>
  • Loading branch information
begeekmyfriend committed May 29, 2019
1 parent eac71b9 commit 359cf0f
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 25 deletions.
2 changes: 1 addition & 1 deletion datasets/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def save_wav(wav, path, hparams):
f1 = 0.5 * 32767 / max(0.01, np.max(np.abs(wav)))
f2 = np.sign(wav) * np.power(np.abs(wav), 0.95)
wav = f1 * f2
wav = signal.convolve(wav, signal.firwin(hparams.num_freq, [hparams.fmin, hparams.fmax], pass_zero=False, fs=hparams.sample_rate))
wav = signal.convolve(wav, signal.firwin(256, [hparams.fmin, hparams.fmax], pass_zero=False, fs=hparams.sample_rate))
#proposed by @dsmiller
wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))

Expand Down
1 change: 0 additions & 1 deletion hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@
mask_encoder = False, #whether to mask encoder padding while computing attention. Set to True for better prosody but slower convergence.
mask_decoder = False, #Whether to use loss mask for padded sequences (if False, <stop_token> loss function will not be weighted, else recommended pos_weight = 20)
cross_entropy_pos_weight = 1, #Use class weights to reduce the stop token classes imbalance (by adding more penalty on False Negatives (FN)) (1 = disabled)
predict_linear = True, #Whether to add a post-processing network to the Tacotron to predict linear spectrograms (True mode Not tested!!)
###########################################################################################################################################

#Tacotron Training
Expand Down
17 changes: 11 additions & 6 deletions tacotron/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import re
import time
import numpy as np
from time import sleep
from datasets import audio
import tensorflow as tf
Expand Down Expand Up @@ -61,7 +62,12 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
batch_sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), delta_size)]
start = time.time()
for i, batch in enumerate(tqdm(batch_sentences)):
synth.eval(batch, log_dir)
mel_filename = os.path.join(eval_dir, '{:03d}.npy'.format(i))
mel = synth.eval(batch)
np.save(mel_filename, mel.T, allow_pickle=False)
wav = audio.inv_mel_spectrogram(mel.T, hparams)
audio.save_wav(wav, os.path.join(eval_dir, '{:03d}.wav'.format(i)), hparams)

log('\nGenerated total batch of {} in {:.3f} sec'.format(delta_size, time.time() - start))

return eval_dir
Expand All @@ -87,7 +93,7 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
with open(metadata_filename, encoding='utf-8') as f:
metadata = [line.strip().split('|') for line in f]
frame_shift_ms = hparams.hop_size / hparams.sample_rate
hours = sum([int(x[4]) for x in metadata]) * frame_shift_ms / (3600)
hours = sum([int(x[2]) for x in metadata]) * frame_shift_ms / (3600)
log('Loaded metadata for {} examples ({:.2f} hours)'.format(len(metadata), hours))

metadata = [metadata[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(metadata), hparams.tacotron_synthesis_batch_size)]
Expand All @@ -97,13 +103,12 @@ def run_synthesis(args, checkpoint_path, output_dir, hparams):
wav_dir = os.path.join(args.input_dir, 'audio')
with open(os.path.join(synth_dir, 'map.txt'), 'w') as file:
for i, meta in enumerate(tqdm(metadata)):
texts = [m[5] for m in meta]
mel_filenames = [os.path.join(mel_dir, m[1]) for m in meta]
wav_filenames = [os.path.join(wav_dir, m[0]) for m in meta]
texts = [m[3] for m in meta]
mel_filenames = [os.path.join(mel_dir, m[0]) for m in meta]
basenames = [os.path.basename(m).replace('.npy', '').replace('mel-', '') for m in mel_filenames]
mel_output_filenames, speaker_ids = synth.synthesize(texts, basenames, synth_dir, None, mel_filenames)

for elems in zip(wav_filenames, mel_filenames, mel_output_filenames, speaker_ids, texts):
for elems in zip(mel_filenames, mel_output_filenames, speaker_ids, texts):
file.write('|'.join([str(x) for x in elems]) + '\n')
log('synthesized mel spectrograms at {}'.format(synth_dir))
return os.path.join(synth_dir, 'map.txt')
Expand Down
19 changes: 2 additions & 17 deletions tacotron/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames):

return saved_mels_paths, speaker_ids

def eval(self, batch, out_dir):
def eval(self, batch):
hparams = self._hparams
cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in batch]
Expand All @@ -120,22 +120,7 @@ def eval(self, batch, out_dir):
mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
assert len(mels) == len(batch)

#save wav (mel -> wav)
mel_filenames = []
for i, mel in enumerate(mels):
mel_filename = os.path.join(out_dir, '{:03d}.npy'.format(i))
np.save(mel_filename, mel.T, allow_pickle=False)
wav = audio.inv_mel_spectrogram(mel.T, self._hparams)
audio.save_wav(wav, os.path.join(out_dir, '{:03d}.wav'.format(i)), self._hparams)
mel_filenames.append(mel_filename)
return mel_filenames

results = []
for i, linear in enumerate(linears):
linear_wav = self.session.run(self.linear_wav_outputs, feed_dict={self.linear_spectrograms: linear})
wav = audio.inv_preemphasis(linear_wav, hparams.preemphasis)
results.append(wav)
return np.concatenate(results)
return np.concatenate(mels)

def _round_up(self, x, multiple):
remainder = x % multiple
Expand Down

0 comments on commit 359cf0f

Please sign in to comment.