# datasets/wavenet_preprocessor.py
# Forked from Rayhane-mamah/Tacotron-2
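"""Builds the WaveNet training set from raw .wav files.

Each utterance in the input directory is loaded, optionally rescaled and trimmed,
encoded according to hparams.input_type (raw, mu-law, or mu-law quantized), paired
with its mel spectrogram, and written to disk as .npy files along with the metadata
needed for train.txt.
"""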
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
from datasets import audio
from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize


def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
	"""
	Preprocesses the speech dataset from a given input path to the given output directories

	Args:
		- hparams: hyper parameters
		- input_dir: input directory that contains the files to preprocess
		- mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
		- wav_dir: output directory of the preprocessed speech audio dataset
		- n_jobs: Optional, number of worker processes to parallelize across
		- tqdm: Optional, provides a nice progress bar

	Returns:
		- A list of tuples describing the training examples. This should be written to train.txt
	"""
	# We use ProcessPoolExecutor to parallelize across processes; this is just an
	# optimization and can be omitted
	executor = ProcessPoolExecutor(max_workers=n_jobs)
	futures = []
	for file in os.listdir(input_dir):
		wav_path = os.path.join(input_dir, file)
		basename = os.path.basename(wav_path).replace('.wav', '')
		futures.append(executor.submit(partial(_process_utterance, mel_dir, wav_dir, basename, wav_path, hparams)))

	return [future.result() for future in tqdm(futures) if future.result() is not None]
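
# Sketch of how the returned metadata is typically consumed (the paths and the
# '|'-separated format below are illustrative, not mandated by this module):
#
#   metadata = build_from_path(hparams, 'wavs', 'training_data/mels',
#                              'training_data/audio', n_jobs=4)
#   with open('training_data/train.txt', 'w', encoding='utf-8') as f:
#       for m in metadata:
#           f.write('|'.join([str(x) for x in m]) + '\n')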


def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
	"""
	Preprocesses a single utterance wav file

	This writes the mel scale spectrogram to disk and returns a tuple to write
	to the train.txt file

	Args:
		- mel_dir: the directory to write the mel spectrograms into
		- wav_dir: the directory to write the preprocessed wav into
		- index: the file basename to use in the output filenames
		- wav_path: path to the audio file containing the speech input
		- hparams: hyper parameters

	Returns:
		- A tuple (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames), or None if the utterance is skipped
	"""
	try:
		# Load the audio as a numpy array
		wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
	except FileNotFoundError:
		# Catch missing wav exception
		print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
		return None

	# Rescale wav
	if hparams.rescale:
		wav = wav / np.abs(wav).max() * hparams.rescaling_max

	# M-AILABS extra silence specific
	if hparams.trim_silence:
		wav = audio.trim_silence(wav, hparams)

	# Mu-law quantize
	if is_mulaw_quantize(hparams.input_type):
		# [0, quantize_channels)
		out = mulaw_quantize(wav, hparams.quantize_channels)

		# Trim silences
		start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
		wav = wav[start: end]
		out = out[start: end]

		constant_values = mulaw_quantize(0, hparams.quantize_channels)
		out_dtype = np.int16

	elif is_mulaw(hparams.input_type):
		# [-1, 1]
		out = mulaw(wav, hparams.quantize_channels)
		constant_values = mulaw(0., hparams.quantize_channels)
		out_dtype = np.float32

	else:
		# [-1, 1]
		out = wav
		constant_values = 0.
		out_dtype = np.float32
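
	# For reference: mulaw() applies the companding transform
	#   f(x) = sign(x) * log(1 + mu * |x|) / log(1 + mu)
	# which allocates more resolution to low-amplitude samples, while mulaw_quantize()
	# additionally maps the companded signal to integer bins in [0, quantize_channels).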

	# Compute the mel scale spectrogram from the wav
	mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
	mel_frames = mel_spectrogram.shape[1]

	if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
		return None

	# Ensure time resolution adjustment between audio and mel-spectrogram
	fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
	l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

	# Zero pad the quantized signal
	out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
	assert len(out) >= mel_frames * audio.get_hop_size(hparams)

	# Time resolution adjustment:
	# ensure the length of the raw audio is a multiple of the hop size so that we can
	# use transposed convolution to upsample
	out = out[:mel_frames * audio.get_hop_size(hparams)]
	assert len(out) % audio.get_hop_size(hparams) == 0
	time_steps = len(out)
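
	# After trimming, time_steps == mel_frames * hop_size, so every mel frame aligns with
	# a fixed-size block of samples. For instance, if audio.get_hop_size(hparams) returned
	# 275, a 400-frame mel spectrogram would pair with exactly 400 * 275 = 110000 samples.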

	# Write the spectrogram and audio to disk
	audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
	mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
	np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
	np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
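	# Note that mel_spectrogram has shape [num_mels, mel_frames]; saving the transpose
	# stores it as [mel_frames, num_mels], with time frames along the first axis.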

	# Global condition features
	if hparams.gin_channels > 0:
		raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
		speaker_id = '<no_g>'  # Put the rule that determines how to assign speaker ids here (using file names, maybe? file basenames are available in the "index" variable); see the hypothetical sketch below
	else:
		speaker_id = '<no_g>'
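
	# A hypothetical example of such a rule (an assumption, not part of the original
	# pipeline): if basenames looked like "<speaker>_<utterance>.wav", one could use
	#   speaker_id = index.split('_')[0]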

	# Return a tuple describing this training example
	return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)