
Monitoring metric #284

Open · wants to merge 10 commits into base: master
Applying monitoring metrics to TensorBoard:
- attention alignment diagonality
- average max attention weight
- f0 RMSE
- MCD
chmenet authored and Yeongtae committed Aug 26, 2019
commit d2a3fe77bad385ea272f676e54a12e70268ba8c9
17 changes: 10 additions & 7 deletions logger.py
@@ -9,18 +9,21 @@ class Tacotron2Logger(SummaryWriter):
     def __init__(self, logdir):
         super(Tacotron2Logger, self).__init__(logdir)

-    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
+    def log_training(self, reduced_loss, grad_norm, learning_rate, duration, alignment_len_rate, avg_prob,
                      iteration):
-        self.add_scalar("training.loss", reduced_loss, iteration)
-        self.add_scalar("grad.norm", grad_norm, iteration)
-        self.add_scalar("learning.rate", learning_rate, iteration)
-        self.add_scalar("duration", duration, iteration)
+        self.add_scalar("training.loss", reduced_loss, iteration)
+        self.add_scalar("grad.norm", grad_norm, iteration)
+        self.add_scalar("learning.rate", learning_rate, iteration)
+        self.add_scalar("duration", duration, iteration)
+        self.add_scalar("training attention alignment similarity", alignment_len_rate, iteration)
+        self.add_scalar("training attention alignment average probability", avg_prob, iteration)

-    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
+    def log_validation(self, reduced_loss, model, y, y_pred, alignment_len_rate, avg_prob, iteration):
         self.add_scalar("validation.loss", reduced_loss, iteration)
         _, mel_outputs, gate_outputs, alignments = y_pred
         mel_targets, gate_targets = y

+        self.add_scalar("validation attention alignment similarity", alignment_len_rate, iteration)
+        self.add_scalar("validation attention alignment average probability", avg_prob, iteration)
         # plot distribution of parameters
         for tag, value in model.named_parameters():
             tag = tag.replace('.', '/')
114 changes: 114 additions & 0 deletions metric.py
@@ -0,0 +1,114 @@
import torch
from torch.autograd import Variable
import numpy as np

import wave
from scipy.io.wavfile import read
from layers import cepstral
from parabolic import parabolic
from scipy.signal import blackmanharris


def alignment_metric(alignments):
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # alignments[0]: (decoder steps x encoder steps) attention matrix
    x_len = torch.from_numpy(np.array(alignments[0].shape[1])).float()
    y_len = torch.from_numpy(np.array(alignments[0].shape[0])).float()

    # Length of a perfectly diagonal alignment path
    optimum = np.array((x_len.pow(2) + y_len.pow(2)).pow(0.5))
    dist = torch.zeros(1)
    val_sum = torch.zeros(1)
    for i in range(int(y_len)):
        # Strongest attention weight and its encoder index at decoder step i
        value, cur_idx = torch.max(alignments[0][i], 0)
        val_sum += value
        if i == 0:
            prev_idx = cur_idx
            continue
        # Euclidean length of the step between consecutive attention peaks
        dist += (1 + (cur_idx - prev_idx).pow(2)).float().pow(0.5)
        prev_idx = cur_idx

    avg_prob = Variable(val_sum / y_len).float()
    optimum = torch.from_numpy(optimum)
    rate = Variable(dist / optimum)

    return rate, avg_prob
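As a quick sanity check (not part of this commit), a one-hot, perfectly diagonal alignment should give an avg_prob of 1.0 and a rate approaching 1.0 as the sequence grows; the list-of-tensors input shape below is an assumption about how alignments arrive from the decoder:

import torch
from metric import alignment_metric

# Hypothetical 100x100 one-hot diagonal alignment (decoder steps x encoder steps)
align = torch.eye(100)
rate, avg_prob = alignment_metric([align])
print(rate, avg_prob)  # rate ~0.99 (99 unit steps vs. a 100-step diagonal), avg_prob = 1.0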


def MCD(source_sound, syn_sound):
    src_cep = cepstral(source_sound)
    syn_cep = cepstral(syn_sound)

    # Mel-cepstral distortion: (10 / ln 10) * sqrt(2 * sum of squared coefficient differences)
    mcd = 10 * (2 * torch.sum((src_cep - syn_cep).pow(2))).pow(0.5) / np.log(10)

    return mcd
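A minimal check of the distortion formula, assuming cepstral() returns equal-length coefficient tensors: identical inputs must give an MCD of zero, and any difference is scaled by the usual 10/ln(10) factor:

import torch
import numpy as np

src_cep = torch.tensor([1.0, 0.5, 0.25])
syn_cep = torch.tensor([1.0, 0.4, 0.25])
mcd = 10 * (2 * torch.sum((src_cep - syn_cep).pow(2))).pow(0.5) / np.log(10)
print(mcd)  # tensor(0.6141...), i.e. 10 * sqrt(2 * 0.01) / ln(10)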


def freq_from_fft(sig, fs):
    """
    Estimate frequency from peak of FFT
    """
    # Compute Fourier transform of windowed signal
    windowed = sig * blackmanharris(len(sig))
    f = np.fft.rfft(windowed)

    # Find the peak and interpolate to get a more accurate peak
    i = np.argmax(abs(f))  # Just use this for a less accurate, naive version
    true_i = parabolic(np.log(abs(f)), i)[0]

    # Convert to equivalent frequency
    return torch.from_numpy(np.array(fs * true_i / len(windowed))).float()
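For reference, a synthetic pure tone should recover its own frequency through this path; the exact value depends on the repo's parabolic implementation, so treat the printed output as approximate:

import numpy as np

fs = 16000
t = np.arange(fs) / fs
sig = np.sin(2 * np.pi * 440.0 * t)
print(freq_from_fft(sig, fs))  # expected: ~440.0 Hz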


def f0(wav):
    nchannels, sampwidth, framerate, nframes, comptype, compname = wav.getparams()

    # Map sample width in bytes to a numpy dtype for decoding raw frames
    types = {1: np.int8, 2: np.int16, 4: np.int32}

    # Initialize the fundamental-frequency accumulator
    freqs = torch.tensor([])

    # Cepstral search band for pitch between 80 Hz and 270 Hz
    up = framerate // 80
    down = framerate // 270
    d = framerate / 270.0

    # Number of frames per window
    window_size = 1024

    # Create a window function
    window = np.hamming(window_size)

    # Iterate over the wave file frames
    for i in range(nframes // window_size):
        # Read n=window_size frames from the wave file
        content = wav.readframes(window_size)

        # Convert the byte string to integers according to sampwidth;
        # if stereo, only the first channel is picked
        samples = np.frombuffer(content, dtype=types[sampwidth])[0::nchannels]

        # Apply the window function to the samples
        samples = torch.from_numpy(window * samples)

        # Cepstrum, i.e. ifft(log(abs(fft(samples))))
        cepstrum = cepstral(samples)

        # Peak quefrency within the 80-270 Hz search band
        _, idx = torch.max(cepstrum[down:up], 0)

        # Fundamental frequency from the peak quefrency;
        # idx is relative to the start of the band, hence the + d offset
        fund_freq = torch.from_numpy(np.array(framerate)).float() / (idx.float() + d)
        freqs = torch.cat((freqs, fund_freq.view(1)))

    return freqs


def cal_fft(src_sound, syn_sound):
    # RMSE between the source and synthesized f0 contours
    src_f0 = f0(src_sound)
    syn_f0 = f0(syn_sound)
    return Variable(torch.sum((src_f0 - syn_f0).pow(2) / src_f0.shape[0]).pow(0.5))

#src_sound = wave.open("C:/Users/chme/Desktop/Voice_AI/wavenet-audio-mel_wiener_.wav", mode='r')
#syn_sound = wave.open("C:/Users/chme/Desktop/Voice_AI/wavenet-audio-mel_.wav", mode='r')
#print(cal_fft(src_sound, syn_sound)) #, MCD(source_sound, syn_sound))
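The f0 RMSE that cal_fft reports reduces two pitch contours to a single scalar; a tiny hand-checkable case (bypassing the wave-file plumbing) shows the arithmetic:

import torch

src_f0 = torch.tensor([100.0, 200.0, 300.0])
syn_f0 = torch.tensor([110.0, 190.0, 310.0])
rmse = torch.sum((src_f0 - syn_f0).pow(2) / src_f0.shape[0]).pow(0.5)
print(rmse)  # tensor(10.), i.e. sqrt((100 + 100 + 100) / 3)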
18 changes: 15 additions & 3 deletions train.py
@@ -15,7 +15,7 @@
 from loss_function import Tacotron2Loss
 from logger import Tacotron2Logger
 from hparams import create_hparams
-
+from metric import alignment_metric

 def reduce_tensor(tensor, n_gpus):
     rt = tensor.clone()
@@ -129,21 +129,30 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
                                 pin_memory=False, collate_fn=collate_fn)

         val_loss = 0.0
+        alignment_len_rate = torch.zeros(1)
+        avg_prob = torch.zeros(1)
         for i, batch in enumerate(val_loader):
             x, y = model.parse_batch(batch)
             y_pred = model(x)
+            _, _, _, alignments = y_pred
             loss = criterion(y_pred, y)
             if distributed_run:
                 reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
             else:
                 reduced_val_loss = loss.item()
             val_loss += reduced_val_loss
+
+            rate, prob = alignment_metric(alignments)
+            alignment_len_rate += rate
+            avg_prob += prob
+        alignment_len_rate = alignment_len_rate / (i + 1)
+        avg_prob = avg_prob / (i + 1)
         val_loss = val_loss / (i + 1)

     model.train()
     if rank == 0:
         print("Validation loss {}: {:9f}  ".format(iteration, reduced_val_loss))
-        logger.log_validation(reduced_val_loss, model, y, y_pred, iteration)
+        logger.log_validation(reduced_val_loss, model, y, y_pred, alignment_len_rate, avg_prob, iteration)


def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
@@ -233,14 +242,17 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
             grad_norm = torch.nn.utils.clip_grad_norm_(
                 model.parameters(), hparams.grad_clip_thresh)

+            _, _, _, alignments = y_pred
+            alignment_len_rate, avg_prob = alignment_metric(alignments)
+
             optimizer.step()

             if not is_overflow and rank == 0:
                 duration = time.perf_counter() - start
                 print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                     iteration, reduced_loss, grad_norm, duration))
                 logger.log_training(
-                    reduced_loss, grad_norm, learning_rate, duration, iteration)
+                    reduced_loss, grad_norm, learning_rate, duration, alignment_len_rate, avg_prob, iteration)

             if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                 validate(model, criterion, valset, iteration,