forked from Rayhane-mamah/Tacotron-2
Wavenet prep/correction, Global cond, GTA train
- It is now possible to run wavenet preprocessing on its own, to use WaveNet as a standalone model (this omits GTA training)
- Wavenet synthesis has been fixed: Rayhane-mamah#106
- Added global conditioning, provided you write the speaker_id rules during preprocessing
- Added a GTA training function
1 parent e2f9780, commit 87bedae
Showing 18 changed files with 529 additions and 178 deletions.
@@ -0,0 +1,134 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from datasets import audio
import os
import numpy as np
from wavenet_vocoder.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize


def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
	"""
	Preprocesses the speech dataset from a given input path to the given output directories

	Args:
		- hparams: hyper parameters
		- input_dir: input directory that contains the files to preprocess
		- mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
		- wav_dir: output directory of the preprocessed speech audio dataset
		- n_jobs: Optional, number of worker processes to parallelize across
		- tqdm: Optional, provides a nice progress bar

	Returns:
		- A list of tuples describing the training examples. This should be written to train.txt
	"""
	# We use ProcessPoolExecutor to parallelize across processes. This is just
	# an optimization and it can be omitted
	executor = ProcessPoolExecutor(max_workers=n_jobs)
	futures = []
	for file in os.listdir(input_dir):
		wav_path = os.path.join(input_dir, file)
		basename = os.path.basename(wav_path).replace('.wav', '')
		futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams)))

	return [future.result() for future in tqdm(futures) if future.result() is not None]


def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
	"""
	Preprocesses a single utterance wav file.
	This writes the mel scale spectrogram and the preprocessed audio to disk
	and returns a tuple to write to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the filename base (the wav basename) to use in the output filenames
		- wav_path: path to the audio file containing the speech input
		- hparams: hyper parameters

	Returns:
		- A tuple: (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
	"""
	try:
		# Load the audio as a numpy array
		wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError: #catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
		return None
	#rescale wav
	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	#M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav, hparams)
	#Mu-law quantize
	if is_mulaw_quantize(hparams.input_type):
		#[0, quantize_channels)
		out = mulaw_quantize(wav, hparams.quantize_channels)

		#Trim silences
		start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
		wav = wav[start: end]
		out = out[start: end]

		constant_values = mulaw_quantize(0, hparams.quantize_channels)
		out_dtype = np.int16

	elif is_mulaw(hparams.input_type):
		#[-1, 1]
		out = mulaw(wav, hparams.quantize_channels)
		constant_values = mulaw(0., hparams.quantize_channels)
		out_dtype = np.float32

	else:
		#[-1, 1]
		out = wav
		constant_values = 0.
		out_dtype = np.float32
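	#For reference: mulaw applies the standard mu-law companding
	#f(x) = sign(x) * log(1 + mu*|x|) / log(1 + mu), compressing large amplitudes
	#so small ones keep finer resolution; mulaw_quantize then maps the companded
	#[-1, 1] signal to the integer range noted above, [0, quantize_channels).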
	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
		return None
	#Ensure time resolution adjustment between audio and mel-spectrogram
	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))
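	#(l, r) are the left and right pad widths; padding "out" below should make
	#len(out) at least mel_frames * hop_size (the assert that follows checks this)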
	#Zero pad for quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

	#time resolution adjustment
	#ensure length of raw audio is multiple of hop size so that we can use
	#transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size(hparams)]
	assert len(out) % audio.get_hop_size(hparams) == 0
	time_steps = len(out)
	# Write the spectrogram and audio to disk
	audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
	mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
	np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
	np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
	#global condition features
	if hparams.gin_channels > 0:
		raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 128 of datasets/wavenet_preprocessor.py to use them during training')
		speaker_id = '<no_g>' #put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in "index" variable)
	else:
		speaker_id = '<no_g>'
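	#A hypothetical speaker_id rule, assuming wav basenames like 'p225_001'
	#where the prefix names the speaker (the format is illustrative only):
	#	speaker_id = index.split('_')[0]
	#Any deterministic basename-to-speaker mapping works, provided it stays
	#consistent with the global conditioning embedding lookup at train time.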
	# Return a tuple describing this training example
	return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
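For illustration, a minimal driver sketch (hypothetical, not part of this commit) showing how the returned metadata could be written to train.txt. It assumes the file above lives at datasets/wavenet_preprocessor.py (as the in-code error message suggests), that hparams comes from the repository's hparams.py, and that the directory names are placeholders:

import os
from hparams import hparams
from datasets import wavenet_preprocessor

input_dir = 'wavs'                              # folder of input .wav files (assumed)
mel_dir = os.path.join('wavenet_data', 'mels')  # output mel directory (assumed)
wav_dir = os.path.join('wavenet_data', 'audio') # output audio directory (assumed)
os.makedirs(mel_dir, exist_ok=True)
os.makedirs(wav_dir, exist_ok=True)

metadata = wavenet_preprocessor.build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=4)

# Each entry is (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
with open(os.path.join('wavenet_data', 'train.txt'), 'w', encoding='utf-8') as f:
	for m in metadata:
		f.write('|'.join([str(x) for x in m]) + '\n')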