
Sort and Optimise imports
h-meru committed Aug 7, 2018
1 parent 76fcab0 commit 5b2e7db
Showing 32 changed files with 190 additions and 190 deletions.
12 changes: 6 additions & 6 deletions datasets/audio.py
@@ -1,16 +1,16 @@
import librosa
import librosa.filters
-import numpy as np 
+import numpy as np
+import tensorflow as tf
from scipy import signal
-import tensorflow as tf
from scipy.io import wavfile


def load_wav(path, sr):
	return librosa.core.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
-	wav *= 32767 / max(0.01, np.max(np.abs(wav))) 
+	wav *= 32767 / max(0.01, np.max(np.abs(wav)))
	#proposed by @dsmiller
	wavfile.write(path, sr, wav.astype(np.int16))

@@ -75,7 +75,7 @@ def inv_linear_spectrogram(linear_spectrogram, hparams):
		return y
	else:
		return _griffin_lim(S ** hparams.power, hparams)


def inv_mel_spectrogram(mel_spectrogram, hparams):
	'''Converts mel spectrogram to waveform using librosa'''
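As the hunk above shows, spectrogram inversion falls back to _griffin_lim for phase reconstruction. A minimal sketch of Griffin-Lim built on librosa primitives (parameter values and names here are illustrative, not necessarily the repository's hparams):

import librosa
import numpy as np

def griffin_lim_sketch(S, n_fft=2048, hop_length=275, win_length=1100, n_iters=60):
	# Start from the target magnitude with random phase.
	angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
	S_complex = np.abs(S).astype(np.complex128)
	y = librosa.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
	for _ in range(n_iters):
		# Keep the target magnitude, re-estimate phase from the current waveform.
		angles = np.exp(1j * np.angle(librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)))
		y = librosa.istft(S_complex * angles, hop_length=hop_length, win_length=win_length)
	return y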
@@ -186,12 +186,12 @@ def _denormalize(D, hparams):
	if hparams.allow_clipping_in_normalization:
		if hparams.symmetric_mels:
			return (((np.clip(D, -hparams.max_abs_value,
-				hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) 
+				hparams.max_abs_value) + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value))
				+ hparams.min_level_db)
		else:
			return ((np.clip(D, 0, hparams.max_abs_value) * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)

	if hparams.symmetric_mels:
		return (((D + hparams.max_abs_value) * -hparams.min_level_db / (2 * hparams.max_abs_value)) + hparams.min_level_db)
	else:
-		return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
+		return ((D * -hparams.min_level_db / hparams.max_abs_value) + hparams.min_level_db)
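As a quick sanity check on the symmetric branch above, _denormalize inverts a normalization of the form D = 2 * max_abs * (S - min_db) / (-min_db) - max_abs, which maps dB values in [min_level_db, 0] onto [-max_abs_value, max_abs_value]. A round-trip sketch with the default hparams values (illustrative, not repository code):

import numpy as np

max_abs, min_db = 4., -100.
S = np.array([-100., -50., 0.])  # spectrogram values in dB
D = np.clip(2 * max_abs * ((S - min_db) / -min_db) - max_abs, -max_abs, max_abs)
S_back = ((D + max_abs) * -min_db / (2 * max_abs)) + min_db
assert np.allclose(S, S_back)  # the denormalization is the exact inverse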
15 changes: 8 additions & 7 deletions datasets/preprocessor.py
@@ -1,9 +1,10 @@
+import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial
+
+import numpy as np
from datasets import audio
-import os
-import numpy as np
-from wavenet_vocoder.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize
+from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize


def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
@@ -23,7 +24,7 @@ def build_from_path(hparams, input_dirs, mel_dir, linear_dir, wav_dir, n_jobs=12
		- A list of tuples describing the train examples; these should be written to train.txt
"""

-	# We use ProcessPoolExecutor to parallelize across processes, this is just for 
+	# We use ProcessPoolExecutor to parallelize across processes, this is just for
	# optimization purposes and it can be omitted
executor = ProcessPoolExecutor(max_workers=n_jobs)
futures = []
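The comment above refers to the standard submit-then-collect pattern; a self-contained sketch of how ProcessPoolExecutor and partial combine here (worker function and sizes are illustrative):

from concurrent.futures import ProcessPoolExecutor
from functools import partial

def _work(x):
	return x * x

if __name__ == '__main__':
	executor = ProcessPoolExecutor(max_workers=4)
	futures = [executor.submit(partial(_work, n)) for n in range(8)]
	print([future.result() for future in futures])  # results in submission order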
@@ -94,7 +95,7 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
out = mulaw(wav, hparams.quantize_channels)
constant_values = mulaw(0., hparams.quantize_channels)
out_dtype = np.float32

else:
#[-1, 1]
out = wav
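For reference, the mulaw/mulaw_quantize helpers imported from wavenet_vocoder.util implement mu-law companding, which typically follows the G.711 formula f(x) = sign(x) * ln(1 + mu*|x|) / ln(1 + mu). A hedged numpy sketch (the library's exact signatures and choice of mu may differ):

import numpy as np

def mulaw_sketch(x, mu=255):
	# Compress a float signal in [-1, 1]; output stays in [-1, 1].
	return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def mulaw_quantize_sketch(x, mu=255):
	# Map companded values onto integer classes in [0, mu].
	y = mulaw_sketch(x, mu)
	return np.floor((y + 1) / 2 * mu + 0.5).astype(np.int64)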
@@ -110,7 +111,7 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar

#Compute the linear scale spectrogram from the wav
linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
-	linear_frames = linear_spectrogram.shape[1] 
+	linear_frames = linear_spectrogram.shape[1]

#sanity check
assert linear_frames == mel_frames
@@ -139,4 +140,4 @@ def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hpar
np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

# Return a tuple describing this training example
-	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
+	return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
13 changes: 7 additions & 6 deletions datasets/wavenet_preprocessor.py
@@ -1,9 +1,10 @@
+import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial
+
+import numpy as np
from datasets import audio
-import os
-import numpy as np
-from wavenet_vocoder.util import mulaw_quantize, mulaw, is_mulaw, is_mulaw_quantize
+from wavenet_vocoder.util import is_mulaw, is_mulaw_quantize, mulaw, mulaw_quantize


def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda x: x):
@@ -23,7 +24,7 @@ def build_from_path(hparams, input_dir, mel_dir, wav_dir, n_jobs=12, tqdm=lambda
		- A list of tuples describing the train examples; these should be written to train.txt
"""

-	# We use ProcessPoolExecutor to parallelize across processes, this is just for 
+	# We use ProcessPoolExecutor to parallelize across processes, this is just for
	# optimization purposes and it can be omitted
executor = ProcessPoolExecutor(max_workers=n_jobs)
futures = []
@@ -88,7 +89,7 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
out = mulaw(wav, hparams.quantize_channels)
constant_values = mulaw(0., hparams.quantize_channels)
out_dtype = np.float32

else:
#[-1, 1]
out = wav
@@ -131,4 +132,4 @@ def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams):
speaker_id = '<no_g>'

# Return a tuple describing this training example
-	return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
+	return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
21 changes: 10 additions & 11 deletions hparams.py
@@ -1,6 +1,5 @@
-import tensorflow as tf
-import numpy as np
-
+import numpy as np
+import tensorflow as tf

# Default hyperparameters
hparams = tf.contrib.training.HParams(
@@ -44,17 +43,17 @@
signal_normalization = True,
allow_clipping_in_normalization = True, #Only relevant if mel_normalization = True
symmetric_mels = False, #Whether to scale the data to be symmetric around 0
-	max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max] 
+	max_abs_value = 4., #max absolute value of data. If symmetric, data will be [-max, max] else [0, max]
normalize_for_wavenet = True, #whether to rescale to [0, 1] for wavenet.

#Limits
min_level_db = -100,
ref_level_db = 20,
	fmin = 0, #Set this to 75 if your speaker is male; if female, 125 should help remove noise (to be tuned per dataset)
-	fmax = 7600, 
+	fmax = 7600,

#Griffin Lim
-	power = 1.5, 
+	power = 1.5,
griffin_lim_iters = 60,
###########################################################################################################################################

@@ -69,7 +68,7 @@
enc_conv_channels = 512, #number of encoder convolutions filters for each layer
encoder_lstm_units = 256, #number of lstm units for each direction (forward and backward)

-	smoothing = False, #Whether to smooth the attention normalization function 
+	smoothing = False, #Whether to smooth the attention normalization function
attention_dim = 128, #dimension of attention space
attention_filters = 32, #number of attention convolution filters
attention_kernel = (31, ), #kernel size of attention convolution
@@ -228,10 +227,10 @@
'it appears that oswald had only one caller in response to all of his fpcc activities,',
'he relied on the absence of the strychnia.',
'scoggins thought it was lighter.',
-	'''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least, 
+	'''would, it is probable, have eventually overcome the reluctance of some of the prisoners at least,
	and would have possessed so much moral dignity''',
-	'''Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization. 
-	This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that 
+	'''Sequence to sequence models have enjoyed great success in a variety of tasks such as machine translation, speech recognition, and text summarization.
+	This project covers a sequence to sequence model trained to predict a speech representation from an input sequence of characters. We show that
the adopted architecture is able to perform this task with wild success.''',
'Thank you so much for your support!',
]
@@ -241,4 +240,4 @@
def hparams_debug_string():
values = hparams.values()
hp = [' %s: %s' % (name, values[name]) for name in sorted(values) if name != 'sentences']
-	return 'Hyperparameters:\n' + '\n'.join(hp)
+	return 'Hyperparameters:\n' + '\n'.join(hp)
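tf.contrib.training.HParams (TensorFlow 1.x) also supports string overrides via its parse method, which is how a --hparams 'name=value,...' flag is typically applied to this object. A sketch:

import tensorflow as tf

hp = tf.contrib.training.HParams(power=1.5, griffin_lim_iters=60)
hp.parse('power=1.2,griffin_lim_iters=30')  # comma-separated name=value overrides
print(hp.power, hp.griffin_lim_iters)  # 1.2 30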
7 changes: 3 additions & 4 deletions infolog.py
@@ -1,10 +1,9 @@
import atexit
-from datetime import datetime
import json
-from threading import Thread
+from datetime import datetime
+from threading import Thread
from urllib.request import Request, urlopen


_format = '%Y-%m-%d %H:%M:%S.%f'
_file = None
_run_name = None
@@ -48,4 +47,4 @@ def _send_slack(msg):
}).encode())


-atexit.register(_close_logfile)
+atexit.register(_close_logfile)
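atexit.register, as used above, schedules _close_logfile to run at normal interpreter shutdown, so the log file is closed without explicit cleanup calls. Minimal sketch:

import atexit

def _close():
	print('closing log file')

atexit.register(_close)  # invoked automatically when the process exits cleanly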
13 changes: 7 additions & 6 deletions preprocess.py
@@ -1,9 +1,10 @@
import argparse
-from multiprocessing import cpu_count
import os
-from tqdm import tqdm
+from multiprocessing import cpu_count
+
from datasets import preprocessor
from hparams import hparams
+from tqdm import tqdm


def preprocess(args, input_folders, out_dir, hparams):
@@ -43,9 +44,9 @@ def norm_data(args):
if args.dataset.startswith('LJSpeech'):
return [os.path.join(args.base_dir, args.dataset)]


if args.dataset == 'M-AILABS':
-		supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU', 
+		supported_languages = ['en_US', 'en_UK', 'fr_FR', 'it_IT', 'de_DE', 'es_ES', 'ru_RU',
'uk_UK', 'pl_PL', 'nl_NL', 'pt_PT', 'fi_FI', 'se_SE', 'tr_TR', 'ar_SA']
if args.language not in supported_languages:
raise ValueError('Please enter a supported language to use from M-AILABS dataset! \n{}'.format(
@@ -86,7 +87,7 @@ def main():
print('initializing preprocessing..')
parser = argparse.ArgumentParser()
parser.add_argument('--base_dir', default='')
-	parser.add_argument('--hparams', default='', 
+	parser.add_argument('--hparams', default='',
help='Hyperparameter overrides as a comma-separated list of name=value pairs')
parser.add_argument('--dataset', default='LJSpeech-1.1')
parser.add_argument('--language', default='en_US')
@@ -106,4 +107,4 @@ def main():


if __name__ == '__main__':
-	main()
+	main()
10 changes: 5 additions & 5 deletions tacotron/feeder.py
@@ -1,13 +1,13 @@
-import numpy as np
import os
import threading
import time
import traceback
-from tacotron.utils.text import text_to_sequence
+
+import numpy as np
+import tensorflow as tf
from infolog import log
from sklearn.model_selection import train_test_split
-import tensorflow as tf
+
+from tacotron.utils.text import text_to_sequence

_batches_per_group = 32

@@ -37,7 +37,7 @@ def __init__(self, coordinator, metadata_filename, hparams):
if hparams.tacotron_test_size is None:
assert hparams.tacotron_test_batches is not None

-		test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
+		test_size = (hparams.tacotron_test_size if hparams.tacotron_test_size is not None
else hparams.tacotron_test_batches * hparams.tacotron_batch_size)
indices = np.arange(len(self._metadata))
train_indices, test_indices = train_test_split(indices,
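train_test_split, imported from scikit-learn above, performs the held-out split on the metadata indices; test_size may be a fraction or an absolute count. A sketch with illustrative sizes:

import numpy as np
from sklearn.model_selection import train_test_split

indices = np.arange(1000)
train_idx, test_idx = train_test_split(indices, test_size=0.05, random_state=42)
print(len(train_idx), len(test_idx))  # 950 50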
25 changes: 11 additions & 14 deletions tacotron/models/Architecture_wrappers.py
@@ -2,17 +2,14 @@
All notations and variable names are used in concordance with the original TensorFlow implementation
"""
import collections
+
import numpy as np
import tensorflow as tf
+from tacotron.models.attention import _compute_attention
from tensorflow.contrib.rnn import RNNCell
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import rnn_cell_impl
-from tensorflow.python.ops import check_ops
+from tensorflow.python.framework import ops, tensor_shape
+from tensorflow.python.ops import array_ops, check_ops, rnn_cell_impl, tensor_array_ops
from tensorflow.python.util import nest
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import tensor_array_ops
-from tensorflow.python.framework import tensor_shape
-from tacotron.models.attention import _compute_attention

_zero_state_tensors = rnn_cell_impl._zero_state_tensors

@@ -83,7 +80,7 @@ class TacotronDecoderCell(RNNCell):
* : This is typically taking a vanilla LSTM, wrapping it using tensorflow's attention wrapper,
and wrap that with the prenet before doing an input feeding, and with the prediction layer
-	that uses RNN states to project on output space. Actions marked with (*) can be replaced with 
+	that uses RNN states to project on output space. Actions marked with (*) can be replaced with
tensorflow's attention wrapper call if it was using cumulative alignments instead of previous alignments only.
"""

@@ -92,11 +89,11 @@ def __init__(self, prenet, attention_mechanism, rnn_cell, frame_projection, stop
Args:
prenet: A tensorflow fully connected layer acting as the decoder pre-net
-		attention_mechanism: A _BaseAttentionMechanism instance, usefull to 
+		attention_mechanism: A _BaseAttentionMechanism instance, useful to
learn encoder-decoder alignments
rnn_cell: Instance of RNNCell, main body of the decoder
frame_projection: tensorflow fully connected layer with r * num_mels output units
-		stop_projection: tensorflow fully connected layer, expected to project to a scalar 
+		stop_projection: tensorflow fully connected layer, expected to project to a scalar
and through a sigmoid activation
mask_finished: Boolean, Whether to mask decoder frames after the <stop_token>
"""
@@ -135,7 +132,7 @@ def state_size(self):

def zero_state(self, batch_size, dtype):
"""Return an initial (zero) state tuple for this `AttentionWrapper`.
Args:
batch_size: `0D` integer tensor: the batch size.
dtype: The internal state data type.
@@ -179,14 +176,14 @@ def __call__(self, inputs, state):


		#Compute the attention (context) vector and alignments using
-		#the new decoder cell hidden state as query vector 
+		#the new decoder cell hidden state as query vector
		#and cumulative alignments to extract location features
		#The choice of the new cell hidden state (s_{i}) of the last
		#decoder RNN Cell is based on Luong et al. (2015):
		#https://arxiv.org/pdf/1508.04025.pdf
previous_alignments = state.alignments
previous_alignment_history = state.alignment_history
-		context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism,
+		context_vector, alignments, cumulated_alignments = _compute_attention(self._attention_mechanism,
LSTM_output,
previous_alignments,
attention_layer=None)
Expand All @@ -209,4 +206,4 @@ def __call__(self, inputs, state):
alignments=cumulated_alignments,
alignment_history=alignment_history)

-		return (cell_outputs, stop_tokens), next_state
+		return (cell_outputs, stop_tokens), next_state
