Commit 0142d37

Better VUV estimation

Scarfmonster committed Nov 20, 2023
1 parent 59f0504 commit 0142d37

Showing 4 changed files with 170 additions and 68 deletions.
3 changes: 2 additions & 1 deletion configs/vuv.yaml
@@ -18,7 +18,7 @@ dataset:
     path: "dataset/train"
     segment_length: 32768
     return_vuv: True
-    pitch_shift: [-6, 6]
+    pitch_shift: [-12, 12]
     loudness_shift: [0.25, 1.0]
   valid:
     path: "dataset/valid"
@@ -47,6 +47,7 @@ model:
   channels: 512
   layers: 4
 preprocessing:
+  threads: 8
   f0_min: 40
   f0_max: 1400
   pitch_extractor:
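The widened pitch_shift: [-12, 12] extends pitch augmentation to a full octave in each direction, and threads: 8 makes the preprocessing worker count configurable. For reference, preproc.py reads these keys through OmegaConf; a minimal sketch of consuming them (only the keys visible in these hunks are taken from the diff, and the load path is the file shown above):

import numpy as np
from omegaconf import OmegaConf

config = OmegaConf.load("configs/vuv.yaml")

print(config.preprocessing.threads)  # 8, number of preprocessing workers
print(config.preprocessing.f0_min)   # 40 Hz pitch floor
print(config.preprocessing.f0_max)   # 1400 Hz pitch ceiling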
7 changes: 3 additions & 4 deletions pitch.py
@@ -1,11 +1,11 @@
 import abc
 from typing import Optional
 
+import librosa
 import numpy as np
 import parselmouth
 import pyworld
 import torch
-import librosa
 
 
 class BasePE(abc.ABC):
@@ -48,7 +48,7 @@ def __call__(self, x: torch.Tensor, pad_to=None):
         if self.keep_zeros:
             return f0, vuv, f0
 
-        org_f0 = f0
+        org_f0 = torch.clone(f0)
 
         # Remove zero frequencies and linearly interpolate
         nzindex = torch.nonzero(f0).squeeze()
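The torch.clone change fixes an aliasing hazard: plain assignment only binds a second name to the same tensor, so any later in-place edit to f0 would show up in org_f0 as well. A standalone sketch of the failure mode (illustrative values, not the repo's exact code path):

import torch

f0 = torch.tensor([0.0, 100.0, 0.0, 120.0])

alias = f0              # old behaviour: org_f0 = f0 shares storage with f0
copy = torch.clone(f0)  # new behaviour: independent copy

f0[f0 == 0] = 110.0     # e.g. filling unvoiced frames in place

print(alias)  # tensor([110., 100., 110., 120.]) -- mutated along with f0
print(copy)   # tensor([  0., 100.,   0., 120.]) -- preserves the raw contour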
@@ -193,10 +193,9 @@ def process(self, x: torch.Tensor):
         )
         x2 = np.pad(x2, (l_pad, r_pad))
 
-        # noinspection PyArgumentList
         s = parselmouth.Sound(x2, sampling_frequency=self.sample_rate).to_pitch_ac(
             time_step=self.hop_length / self.sample_rate,
-            voicing_threshold=0.48,
+            voicing_threshold=0.45,
             pitch_floor=self.f0_min,
             pitch_ceiling=self.f0_max,
             very_accurate=self.very_accurate,
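Lowering voicing_threshold from 0.48 to 0.45 makes Praat's autocorrelation tracker call borderline frames voiced more readily; 0.45 is also Praat's standard value for this parameter. A minimal sketch of the same call outside the class (the sample rate, hop, and random input are assumptions):

import numpy as np
import parselmouth

sr = 44100               # assumed sample rate
hop = 512                # assumed hop length
x = np.random.randn(sr)  # 1 s placeholder signal; the repo passes the padded frame x2

pitch = parselmouth.Sound(x, sampling_frequency=sr).to_pitch_ac(
    time_step=hop / sr,          # one F0 estimate per hop
    voicing_threshold=0.45,      # lower value => borderline frames count as voiced
    pitch_floor=40,
    pitch_ceiling=1400,
)
f0 = pitch.selected_array["frequency"]  # 0.0 marks frames judged unvoiced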
100 changes: 37 additions & 63 deletions preproc.py
@@ -1,27 +1,26 @@
 import argparse
 import os
+from multiprocessing import Pool, RLock, current_process, freeze_support
 from pathlib import Path
+from random import shuffle
 
 import librosa
 import numpy as np
 import torch
-from omegaconf import OmegaConf, DictConfig
+from omegaconf import DictConfig, OmegaConf
 from torchaudio.transforms import MelSpectrogram
-
 from tqdm import tqdm
 
 from pitch import BasePE
-import pyworld
-from multiprocessing import Pool, freeze_support, RLock
-from multiprocessing import current_process
-from random import shuffle
+from vuv import VUVEstimator
 
 
 def process(
     config: DictConfig,
     audio_path: Path,
     pitch_extractor: BasePE,
     spectogram_extractor: MelSpectrogram,
+    vuv_extractor: VUVEstimator,
 ):
     save_path = audio_path.with_suffix(".npy")
     if save_path.exists():
@@ -41,58 +40,24 @@ def process(
     else:
         pad_to = None
 
-    f0, _, f0_0 = pitch_extractor(audio, pad_to)
-
-    data["pitch"] = f0.cpu().numpy()
-    if config.preprocessing.vuv:
-        vuv = get_vuv(config, audio, f0_0)
-        data["vuv"] = vuv
-
-    np.save(save_path, data)
-
-
-def get_vuv(config: DictConfig, audio, f0):
-    audio = audio.cpu().numpy().astype(np.float64)[0]
-    f0 = f0.cpu().numpy().astype(np.float64)
-    f0_len = f0.shape[0]
-
-    time_step = config.hop_length / config.sample_rate
-    wav_frames = (audio.shape[-1] + config.hop_length - 1) // config.hop_length
-    t = np.arange(0, wav_frames) * time_step
-
-    if f0.shape[0] < wav_frames - 1:
-        f0 = np.pad(
-            f0,
-            (0, wav_frames - f0.shape[0]),
-            mode="constant",
-            constant_values=(f0[0], f0[-1]),
-        )
-    elif f0.shape[0] > wav_frames - 1:
-        f0 = f0[:wav_frames]
-
-    ap = pyworld.d4c(audio, f0, t, config.sample_rate, fft_size=config.n_fft)
-
-    avg = 1 - ap[:, 0]
-
-    avg = np.ones_like(avg) * (avg > 0.01)
-
-    for s in range(1, config.preprocessing.vuv_smoothing + 1):
-        smooth(avg, s)
-
-    # avg = np.mean(ap[:, 0 : ap.shape[-1] // 2], axis=-1)
-
-    return avg.astype(np.float32)[:f0_len]
-
-
-def smooth(arr, s):
-    for i in range(s - 1, len(arr) - s):
-        m = np.mean(np.concatenate((arr[i - s : i], arr[i + 1 : i + s + 1])))
-        if m < 0.5:
-            arr[i] = 0
-        elif m > 0.5:
-            arr[i] = 1
+    if config.preprocessing.vuv:
+        pad_to = None
+
+    f0, _, f0_0 = pitch_extractor(audio, pad_to)
+    f0 = f0.cpu().numpy()
+
+    if config.preprocessing.vuv:
+        vuv = vuv_extractor.get_vuv(audio, f0_0)
+        data["vuv"] = vuv
+
+    f0 = np.interp(
+        np.linspace(np.min(f0), np.max(f0), pad_to if pad_to else len(f0) // 4),
+        np.linspace(np.min(f0), np.max(f0), len(f0)),
+        f0,
+    )
+
+    data["pitch"] = f0
+
+    np.save(save_path, data)
 
 
 def chunks(lst, n):
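With VUV enabled, pad_to is forced to None and the pitch extractor (constructed with hop_length // 4 in the run() hunk below) emits four F0 frames per output frame; the added np.interp then resamples the contour back down to the final frame count. The diff passes np.min/np.max of f0 as the coordinate endpoints, which behaves the same as the unit coordinates in this standalone sketch (array lengths assumed):

import numpy as np

f0_hi = np.abs(np.random.randn(400)) + 100.0  # placeholder contour at hop_length // 4

target_len = len(f0_hi) // 4  # back down to the normal frame rate
f0 = np.interp(
    np.linspace(0.0, 1.0, target_len),  # query positions
    np.linspace(0.0, 1.0, len(f0_hi)),  # source positions
    f0_hi,
)
assert f0.shape == (100,)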
@@ -105,17 +70,6 @@ def run(config, files):
     current = current_process()
     pos = current._identity[0] - 1
 
-    pitch_extractor_cls = getattr(
-        __import__("pitch", fromlist=[config.preprocessing.pitch_extractor.name]),
-        config.preprocessing.pitch_extractor.name,
-    )
-    pitch_extractor = pitch_extractor_cls(
-        sample_rate=config.sample_rate,
-        keep_zeros=config.preprocessing.pitch_extractor.keep_zeros,
-        f0_min=config.preprocessing.f0_min,
-        f0_max=config.preprocessing.f0_max,
-    )
-
     if config.preprocessing.spectogram:
         spectogram_extractor = MelSpectrogram(
             sample_rate=config.sample_rate,
@@ -129,8 +83,28 @@ def run(config, files):
     else:
         spectogram_extractor = None
 
+    hop_length = config.hop_length
+
+    if config.preprocessing.vuv:
+        vuv_extractor = VUVEstimator(config)
+        hop_length = hop_length // 4
+    else:
+        vuv_extractor = None
+
+    pitch_extractor_cls = getattr(
+        __import__("pitch", fromlist=[config.preprocessing.pitch_extractor.name]),
+        config.preprocessing.pitch_extractor.name,
+    )
+    pitch_extractor = pitch_extractor_cls(
+        sample_rate=config.sample_rate,
+        hop_length=hop_length,
+        keep_zeros=config.preprocessing.pitch_extractor.keep_zeros,
+        f0_min=config.preprocessing.f0_min,
+        f0_max=config.preprocessing.f0_max,
+    )
+
     for af in tqdm(files, position=pos):
-        process(config, af, pitch_extractor, spectogram_extractor)
+        process(config, af, pitch_extractor, spectogram_extractor, vuv_extractor)
 
 
 if __name__ == "__main__":
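The getattr(__import__(...)) pair is a small plugin lookup: the config names a class defined in pitch.py and run() resolves it at runtime, now passing the possibly-quartered hop_length. An equivalent standalone sketch using importlib (the example class name is hypothetical, not taken from the repo):

import importlib


def resolve_pitch_extractor(name: str):
    # Same effect as getattr(__import__("pitch", fromlist=[name]), name).
    return getattr(importlib.import_module("pitch"), name)


# cls = resolve_pitch_extractor("ParselmouthPE")  # hypothetical name from config
# extractor = cls(sample_rate=44100, hop_length=128, keep_zeros=False,
#                 f0_min=40, f0_max=1400)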
@@ -163,7 +137,7 @@ def run(config, files):
 
     shuffle(audio_files)
 
-    splits = np.array_split(np.array(audio_files), 8)
+    splits = np.array_split(np.array(audio_files), config.preprocessing.threads)
     splits = [(config, files) for files in splits]
 
     with Pool(8, initializer=tqdm.set_lock, initargs=(RLock(),)) as pool:
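The split count now follows config.preprocessing.threads, while the Pool in this hunk's context still hardcodes 8 workers. For illustration, a standalone sketch of the fan-out pattern used here (the worker function and file list are placeholders, not the repo's):

from multiprocessing import Pool, RLock

import numpy as np
from tqdm import tqdm


def run(args):  # placeholder worker; the repo's run(config, files) does the real work
    config, files = args
    return len(files)


if __name__ == "__main__":
    threads = 8  # would come from config.preprocessing.threads
    audio_files = [f"clip_{i}.wav" for i in range(100)]  # placeholder list

    splits = np.array_split(np.array(audio_files), threads)
    splits = [(None, files) for files in splits]

    with Pool(threads, initializer=tqdm.set_lock, initargs=(RLock(),)) as pool:
        pool.map(run, splits)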
128 changes: 128 additions & 0 deletions vuv.py
@@ -0,0 +1,128 @@
+import librosa
+import numpy as np
+import pyworld
+import torch
+from omegaconf import DictConfig
+from torchaudio.functional import highpass_biquad, lowpass_biquad
+
+
+class VUVEstimator:
+    def __init__(self, config: DictConfig) -> None:
+        self.sample_rate = config.sample_rate
+        self.win_length = config.win_length // 4
+        self.hop_length = config.hop_length // 4
+        self.f0_min = config.preprocessing.f0_min
+        self.f0_max = config.preprocessing.f0_max
+        self.f_max = config.f_max
+        self.vuv_smoothing = config.preprocessing.vuv_smoothing
+        self.zcr_uv = 0.25
+        self.zcr_v = 0.03
+        self.rms_uv = 0.01
+
+    def get_vuv(self, audio, f0):
+        audio = highpass_biquad(audio, self.sample_rate, self.f0_min)
+        audio = lowpass_biquad(audio, self.sample_rate, self.f_max)
+
+        max_loudness = torch.max(torch.abs(audio))
+        if max_loudness > 0:
+            audio /= max_loudness
+
+        ap = self.get_world(audio, f0)
+        ap = ap[:, 0]
+
+        audio = audio.cpu().numpy()[0].astype(np.float64)
+
+        zcr = librosa.feature.zero_crossing_rate(
+            audio,
+            frame_length=self.win_length,
+            hop_length=self.hop_length,
+            threshold=0.001,
+        )
+        zcr = zcr[0]
+        zcr = np.convolve(zcr, np.hanning(7) / 3, "same")
+
+        rms = self.get_rms(
+            audio, win_length=self.win_length, hop_length=self.hop_length
+        )
+        rms = rms[0]
+
+        vuv = 1 - (np.ones_like(ap) * (ap > 0.01))
+
+        for i in range(len(vuv)):
+            if zcr[i] > self.zcr_uv:
+                vuv[i] = 0
+            elif zcr[i] < self.zcr_v and rms[i] > self.rms_uv:
+                vuv[i] = 1
+            elif rms[i] <= self.rms_uv:
+                vuv[i] = 0
+
+        vuv = np.convolve(
+            vuv,
+            np.hanning(self.vuv_smoothing) / (self.vuv_smoothing / 2),
+            "same",
+        )
+        vuv = np.interp(
+            np.linspace(0, np.max(vuv), len(vuv) // 4),
+            np.linspace(0, np.max(vuv), len(vuv)),
+            vuv,
+        )
+
+        vuv = np.ones_like(vuv) * (vuv >= 0.5)
+
+        for s in range(1, self.vuv_smoothing + 1):
+            self.smooth(vuv, s)
+
+        return vuv.astype(np.float32)
+
+    def get_world(self, audio, f0):
+        time_step = self.hop_length / self.sample_rate
+        wav_frames = (audio.shape[-1] + self.hop_length - 1) // self.hop_length
+        t = np.arange(0, wav_frames) * time_step
+
+        f0 = f0.cpu().numpy().astype(np.float64)
+
+        if f0.shape[0] < wav_frames - 1:
+            f0 = np.pad(
+                f0,
+                (0, wav_frames - f0.shape[0]),
+                mode="constant",
+                constant_values=(f0[0], f0[-1]),
+            )
+        elif f0.shape[0] > wav_frames - 1:
+            f0 = f0[:wav_frames]
+        ap = pyworld.d4c(
+            audio.cpu().numpy().astype(np.float64)[0],
+            f0,
+            t,
+            self.sample_rate,
+            fft_size=self.hop_length * 4,
+        )
+
+        return ap
+
+    def get_rms(self, audio, win_length=2048, hop_length=512):
+        S = librosa.magphase(
+            librosa.stft(
+                audio,
+                hop_length=hop_length,
+                win_length=win_length,
+                window="hann",
+                center=True,
+                pad_mode="reflect",
+            )
+        )[0]
+        rms = librosa.feature.rms(S=S)
+
+        return rms
+
+    @staticmethod
+    def smooth(arr, s):
+        org_len = len(arr)
+        arr = np.pad(arr, s, "reflect")
+        for i in range(s - 1, org_len - s):
+            m = np.mean(np.concatenate((arr[i - s : i], arr[i + 1 : i + s + 1])))
+            if m > 0.5:
+                arr[i] = 1
+            elif m < 0.5:
+                arr[i] = 0
+        return arr[s:-s]
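Taken together, get_vuv band-limits the audio to [f0_min, f_max], takes WORLD's D4C band-0 aperiodicity as the initial voiced mask, overrides it with zero-crossing-rate and RMS rules (high ZCR forces unvoiced, low ZCR with sufficient energy forces voiced, low energy forces unvoiced), then smooths the mask and downsamples it by 4. A hypothetical usage sketch (all config values are placeholders, not the repo's):

import torch
from omegaconf import OmegaConf

from vuv import VUVEstimator

config = OmegaConf.create(
    {
        "sample_rate": 44100,
        "win_length": 2048,
        "hop_length": 512,
        "f_max": 16000,
        "preprocessing": {"f0_min": 40, "f0_max": 1400, "vuv_smoothing": 4},
    }
)

estimator = VUVEstimator(config)

audio = torch.randn(1, 44100)       # 1 s of audio, shape (channels, samples)
f0 = torch.full((345,), 220.0)      # F0 at hop_length // 4 resolution (~345 frames here)
vuv = estimator.get_vuv(audio, f0)  # float32 mask: 1 = voiced, 0 = unvoiced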
