Skip to content

Commit

Permalink
Use the first mel frame of each reduction-factor group as the decoder feed
Browse files Browse the repository at this point in the history
Signed-off-by: begeekmyfriend <[email protected]>
  • Loading branch information
begeekmyfriend committed Nov 26, 2019
1 parent 812ca50 commit a271f60
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 7 deletions.
10 changes: 9 additions & 1 deletion datasets/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,23 @@
from scipy.io import wavfile


def dc_notch_filter(wav):
    """Suppress DC offset in a waveform with the Speex DC-notch IIR filter.

    A second-order notch centered at 0 Hz removes any constant bias from
    the signal while leaving audible frequencies essentially untouched.
    """
    # Pole radius controls notch narrowness; value taken from the Speex codec.
    radius = 0.982
    denom = radius * radius + 0.7 * (1.0 - radius) ** 2
    numerator = radius * np.array([1.0, -2.0, 1.0])
    denominator = np.array([1.0, -2.0 * radius, denom])
    return signal.lfilter(numerator, denominator, wav)

def load_wav(path, sr):
    """Read the audio file at *path* as a float waveform resampled to *sr* Hz."""
    audio, _ = librosa.core.load(path, sr=sr)
    return audio

def save_wav(wav, path, hparams):
    """Post-process a synthesized waveform and write it to *path* as 16-bit PCM.

    Pipeline: DC-notch filter -> peak normalization -> mild companding ->
    band-pass FIR (hparams.fmin..hparams.fmax) -> int16 WAV file.

    Args:
        wav: 1-D float waveform.
        path: Output .wav file path.
        hparams: Must provide fmin, fmax and sample_rate.
    """
    wav = dc_notch_filter(wav)
    # Guard against an all-silent signal: dividing by a zero peak would
    # produce NaNs that astype(np.int16) turns into garbage samples.
    peak = np.abs(wav).max()
    if peak > 0:
        wav = wav / peak * 0.999
    # Compand toward int16 range: sign-preserving power law flattens peaks,
    # 0.5 * 32767 leaves headroom below full scale.
    f1 = 0.5 * 32767 / max(0.01, np.max(np.abs(wav)))
    f2 = np.sign(wav) * np.power(np.abs(wav), 0.95)
    wav = f1 * f2
    # Band-pass to the mel analysis band; proposed by @dsmiller.
    wav = signal.convolve(wav, signal.firwin(256, [hparams.fmin, hparams.fmax], pass_zero=False, fs=hparams.sample_rate))
    wavfile.write(path, hparams.sample_rate, wav.astype(np.int16))

Expand Down
5 changes: 3 additions & 2 deletions hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
clip_mels_length = True, #For cases of OOM (Not really recommended, working on a workaround)
max_mel_frames = 900, #Only relevant when clip_mels_length = True
max_text_length = 300, #Only relevant when clip_mels_length = True
sentence_span = 20, # Number of mel hops for each sentence interval

#Mel spectrogram
n_fft = 2048, #Extra window size is filled with 0 paddings to match this parameter
Expand All @@ -30,7 +31,7 @@
preemphasis = 0.97, # preemphasis coefficient

#Multi-speaker batch_size should be integer multiplies number of speakers.
anchor_dirs = ['xmly_fanfanli_22050', 'xmly_xiaoya_22050', 'xmly_jinhua_22050', 'xmly_qiuyixin_22050'],
anchor_dirs = ['tts_fanfanli_22050', 'tts_xiaoya_22050', 'tts_yangluzhuo_22050', 'tts_qiuyixin_22050'],

#M-AILABS (and other datasets) trim params
trim_fft_size = 512,
Expand Down Expand Up @@ -221,7 +222,7 @@
"rang4 wu2 shu4 ren2 dui4 xi1 zang4 qing2 gen1 shen1 zhong4 .",
"shi2 ge2 liang3 nian2 , you2 yuan2 ban1 ren2 ma3 da3 zao4 de jie3 mei4 pian1 ,",
"ji2 di4 , qiao1 ran2 shang4 xian4 !",
"mei3 yi4 zheng4 dou1 shi4 bi4 zhi3 , mei3 yi2 mu4 dou1 shi4 ren2 jian1 xian1 jing4 .",
"mei3 yi4 zheng1 dou1 shi4 bi4 zhi3 , mei3 yi2 mu4 dou1 shi4 ren2 jian1 xian1 jing4 .",
"zi4 ying3 pian1 bo1 chu1 zhi1 lai2 , hao3 ping2 ru2 chao2 ,",
"jiu4 lian2 yi2 xiang4 yi3 yan2 jin3 chu1 ming2 de dou4 ban4 ping2 fen1 ye3 shi4 hen3 gao1 .",
"zao3 zai4 er4 ling2 yi1 wu3 nian2 ,",
Expand Down
6 changes: 3 additions & 3 deletions tacotron/models/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, n
finished = tf.reduce_all(tf.reduce_all(finished, axis=0)) #Safer option

# Feed last output frame as next input. outputs is [N, output_dim * r]
next_inputs = outputs[:, -self._output_dim:]
next_inputs = outputs[:, :self._output_dim]
next_state = state
return (finished, next_inputs, next_state)

Expand All @@ -74,7 +74,7 @@ def __init__(self, batch_size, targets, hparams, gta, evaluating, global_step):

r = self._reduction_factor
# Feed every r-th target frame as input
self._targets = targets[:, r-1::r, :]
self._targets = targets[:, ::r, :]

#Maximal sequence length
self._lengths = tf.tile([tf.shape(self._targets)[1]], [self._batch_size])
Expand Down Expand Up @@ -121,7 +121,7 @@ def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, n
next_inputs = tf.cond(
tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], #Teacher-forcing: return true frame
lambda: outputs[:,-self._output_dim:])
lambda: outputs[:,:self._output_dim])

#Pass on state
next_state = state
Expand Down
3 changes: 2 additions & 1 deletion tacotron/synthesizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ def eval(self, batch, speaker_id):
target_lengths = self._get_output_lengths(stop_tokens)

#Take off the batch wise padding
mels = [mel[:target_length, :] for mel, target_length in zip(mels, target_lengths)]
padding = np.full((self._hparams.sentence_span, self._hparams.num_mels), self._target_pad)
mels = [np.concatenate([mel[:target_length, :], padding]) for mel, target_length in zip(mels, target_lengths)]
assert len(mels) == len(batch)

return np.concatenate(mels)
Expand Down

0 comments on commit a271f60

Please sign in to comment.