Commit 0142d37

Better VUV estimation

Scarfmonster committed Nov 20, 2023
1 parent 59f0504 commit 0142d37

Showing 4 changed files with 170 additions and 68 deletions.
3 changes: 2 additions & 1 deletion configs/vuv.yaml
@@ -18,7 +18,7 @@ dataset:
     path: "dataset/train"
     segment_length: 32768
     return_vuv: True
-    pitch_shift: [-6, 6]
+    pitch_shift: [-12, 12]
     loudness_shift: [0.25, 1.0]
   valid:
     path: "dataset/valid"
@@ -47,6 +47,7 @@ model:
   channels: 512
   layers: 4
 preprocessing:
+  threads: 8
   f0_min: 40
   f0_max: 1400
   pitch_extractor:
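The widened pitch_shift: [-12, 12] extends pitch augmentation to a full octave in each direction, and threads: 8 makes the preprocessing worker count configurable. For reference, preproc.py reads these keys through OmegaConf; a minimal sketch of consuming them (only the keys visible in these hunks are taken from the diff, and the load path is the file shown above):

import numpy as np
from omegaconf import OmegaConf

config = OmegaConf.load("configs/vuv.yaml")

print(config.preprocessing.threads)  # 8, number of preprocessing workers
print(config.preprocessing.f0_min)   # 40 Hz pitch floor
print(config.preprocessing.f0_max)   # 1400 Hz pitch ceiling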
7 changes: 3 additions & 4 deletions pitch.py
@@ -1,11 +1,11 @@
 import abc
 from typing import Optional
 
+import librosa
 import numpy as np
 import parselmouth
 import pyworld
 import torch
-import librosa
 
 
 class BasePE(abc.ABC):
@@ -48,7 +48,7 @@ def __call__(self, x: torch.Tensor, pad_to=None):
         if self.keep_zeros:
             return f0, vuv, f0
 
-        org_f0 = f0
+        org_f0 = torch.clone(f0)
 
         # Remove zero frequencies and linearly interpolate
         nzindex = torch.nonzero(f0).squeeze()
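The torch.clone change fixes an aliasing hazard: plain assignment only binds a second name to the same tensor, so any later in-place edit to f0 would show up in org_f0 as well. A standalone sketch of the failure mode (illustrative values, not the repo's exact code path):

import torch

f0 = torch.tensor([0.0, 100.0, 0.0, 120.0])

alias = f0              # old behaviour: org_f0 = f0 shares storage with f0
copy = torch.clone(f0)  # new behaviour: independent copy

f0[f0 == 0] = 110.0     # e.g. filling unvoiced frames in place

print(alias)  # tensor([110., 100., 110., 120.]) -- mutated along with f0
print(copy)   # tensor([  0., 100.,   0., 120.]) -- preserves the raw contour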
@@ -193,10 +193,9 @@ def process(self, x: torch.Tensor):
         )
         x2 = np.pad(x2, (l_pad, r_pad))
 
-        # noinspection PyArgumentList
         s = parselmouth.Sound(x2, sampling_frequency=self.sample_rate).to_pitch_ac(
             time_step=self.hop_length / self.sample_rate,
-            voicing_threshold=0.48,
+            voicing_threshold=0.45,
             pitch_floor=self.f0_min,
             pitch_ceiling=self.f0_max,
             very_accurate=self.very_accurate,
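Lowering voicing_threshold from 0.48 to 0.45 makes Praat's autocorrelation tracker call borderline frames voiced more readily; 0.45 is also Praat's standard value for this parameter. A minimal sketch of the same call outside the class (the sample rate, hop, and random input are assumptions):

import numpy as np
import parselmouth

sr = 44100               # assumed sample rate
hop = 512                # assumed hop length
x = np.random.randn(sr)  # 1 s placeholder signal; the repo passes the padded frame x2

pitch = parselmouth.Sound(x, sampling_frequency=sr).to_pitch_ac(
    time_step=hop / sr,          # one F0 estimate per hop
    voicing_threshold=0.45,      # lower value => borderline frames count as voiced
    pitch_floor=40,
    pitch_ceiling=1400,
)
f0 = pitch.selected_array["frequency"]  # 0.0 marks frames judged unvoiced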
100 changes: 37 additions & 63 deletions preproc.py
@@ -1,27 +1,26 @@
 import argparse
 import os
+from multiprocessing import Pool, RLock, current_process, freeze_support
 from pathlib import Path
+from random import shuffle
 
 import librosa
 import numpy as np
 import torch
-from omegaconf import OmegaConf, DictConfig
+from omegaconf import DictConfig, OmegaConf
 from torchaudio.transforms import MelSpectrogram
-
 from tqdm import tqdm
 
 from pitch import BasePE
-import pyworld
-from multiprocessing import Pool, freeze_support, RLock
-from multiprocessing import current_process
-from random import shuffle
+from vuv import VUVEstimator
 
 
 def process(
     config: DictConfig,
     audio_path: Path,
     pitch_extractor: BasePE,
     spectogram_extractor: MelSpectrogram,
+    vuv_extractor: VUVEstimator,
 ):
     save_path = audio_path.with_suffix(".npy")
     if save_path.exists():
@@ -41,58 +40,24 @@ def process(
     else:
         pad_to = None
 
-    f0, _, f0_0 = pitch_extractor(audio, pad_to)
-
-    data["pitch"] = f0.cpu().numpy()
-    if config.preprocessing.vuv:
-        vuv = get_vuv(config, audio, f0_0)
-        data["vuv"] = vuv
-
-    np.save(save_path, data)
-
-
-def get_vuv(config: DictConfig, audio, f0):
-    audio = audio.cpu().numpy().astype(np.float64)[0]
-    f0 = f0.cpu().numpy().astype(np.float64)
-    f0_len = f0.shape[0]
-
-    time_step = config.hop_length / config.sample_rate
-    wav_frames = (audio.shape[-1] + config.hop_length - 1) // config.hop_length
-    t = np.arange(0, wav_frames) * time_step
-
-    if f0.shape[0] < wav_frames - 1:
-        f0 = np.pad(
-            f0,
-            (0, wav_frames - f0.shape[0]),
-            mode="constant",
-            constant_values=(f0[0], f0[-1]),
-        )
-    elif f0.shape[0] > wav_frames - 1:
-        f0 = f0[:wav_frames]
-
-    ap = pyworld.d4c(audio, f0, t, config.sample_rate, fft_size=config.n_fft)
-
-    avg = 1 - ap[:, 0]
-
-    avg = np.ones_like(avg) * (avg > 0.01)
-
-    for s in range(1, config.preprocessing.vuv_smoothing + 1):
-        smooth(avg, s)
-
-    # avg = np.mean(ap[:, 0 : ap.shape[-1] // 2], axis=-1)
-
-    return avg.astype(np.float32)[:f0_len]
-
-
-def smooth(arr, s):
-    for i in range(s - 1, len(arr) - s):
-        m = np.mean(np.concatenate((arr[i - s : i], arr[i + 1 : i + s + 1])))
-        if m < 0.5:
-            arr[i] = 0
-        elif m > 0.5:
-            arr[i] = 1
+    if config.preprocessing.vuv:
+        pad_to = None
+
+    f0, _, f0_0 = pitch_extractor(audio, pad_to)
+    f0 = f0.cpu().numpy()
+
+    if config.preprocessing.vuv:
+        vuv = vuv_extractor.get_vuv(audio, f0_0)
+        data["vuv"] = vuv
+
+    f0 = np.interp(
+        np.linspace(np.min(f0), np.max(f0), pad_to if pad_to else len(f0) // 4),
+        np.linspace(np.min(f0), np.max(f0), len(f0)),
+        f0,
+    )
+
+    data["pitch"] = f0
+
+    np.save(save_path, data)
 
 
 def chunks(lst, n):
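With VUV enabled, pad_to is forced to None and the pitch extractor (constructed with hop_length // 4 in the run() hunk below) emits four F0 frames per output frame; the added np.interp then resamples the contour back down to the final frame count. The diff passes np.min/np.max of f0 as the coordinate endpoints, which behaves the same as the unit coordinates in this standalone sketch (array lengths assumed):

import numpy as np

f0_hi = np.abs(np.random.randn(400)) + 100.0  # placeholder contour at hop_length // 4

target_len = len(f0_hi) // 4  # back down to the normal frame rate
f0 = np.interp(
    np.linspace(0.0, 1.0, target_len),  # query positions
    np.linspace(0.0, 1.0, len(f0_hi)),  # source positions
    f0_hi,
)
assert f0.shape == (100,)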
@@ -105,17 +70,6 @@ def run(config, files):
     current = current_process()
     pos = current._identity[0] - 1
 
-    pitch_extractor_cls = getattr(
-        __import__("pitch", fromlist=[config.preprocessing.pitch_extractor.name]),
-        config.preprocessing.pitch_extractor.name,
-    )
-    pitch_extractor = pitch_extractor_cls(
-        sample_rate=config.sample_rate,
-        keep_zeros=config.preprocessing.pitch_extractor.keep_zeros,
-        f0_min=config.preprocessing.f0_min,
-        f0_max=config.preprocessing.f0_max,
-    )
-
     if config.preprocessing.spectogram:
         spectogram_extractor = MelSpectrogram(
             sample_rate=config.sample_rate,
@@ -129,8 +83,28 @@ def run(config, files):
     else:
         spectogram_extractor = None
 
+    hop_length = config.hop_length
+
+    if config.preprocessing.vuv:
+        vuv_extractor = VUVEstimator(config)
+        hop_length = hop_length // 4
+    else:
+        vuv_extractor = None
+
+    pitch_extractor_cls = getattr(
+        __import__("pitch", fromlist=[config.preprocessing.pitch_extractor.name]),
+        config.preprocessing.pitch_extractor.name,
+    )
+    pitch_extractor = pitch_extractor_cls(
+        sample_rate=config.sample_rate,
+        hop_length=hop_length,
+        keep_zeros=config.preprocessing.pitch_extractor.keep_zeros,
+        f0_min=config.preprocessing.f0_min,
+        f0_max=config.preprocessing.f0_max,
+    )
+
     for af in tqdm(files, position=pos):
-        process(config, af, pitch_extractor, spectogram_extractor)
+        process(config, af, pitch_extractor, spectogram_extractor, vuv_extractor)
 
 
 if __name__ == "__main__":
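The getattr(__import__(...)) pair is a small plugin lookup: the config names a class defined in pitch.py and run() resolves it at runtime, now passing the possibly-quartered hop_length. An equivalent standalone sketch using importlib (the example class name is hypothetical, not taken from the repo):

import importlib


def resolve_pitch_extractor(name: str):
    # Same effect as getattr(__import__("pitch", fromlist=[name]), name).
    return getattr(importlib.import_module("pitch"), name)


# cls = resolve_pitch_extractor("ParselmouthPE")  # hypothetical name from config
# extractor = cls(sample_rate=44100, hop_length=128, keep_zeros=False,
#                 f0_min=40, f0_max=1400)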
@@ -163,7 +137,7 @@ def run(config, files):
 
     shuffle(audio_files)
 
-    splits = np.array_split(np.array(audio_files), 8)
+    splits = np.array_split(np.array(audio_files), config.preprocessing.threads)
     splits = [(config, files) for files in splits]
 
     with Pool(8, initializer=tqdm.set_lock, initargs=(RLock(),)) as pool:
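The split count now follows config.preprocessing.threads, while the Pool in this hunk's context still hardcodes 8 workers. For illustration, a standalone sketch of the fan-out pattern used here (the worker function and file list are placeholders, not the repo's):

from multiprocessing import Pool, RLock

import numpy as np
from tqdm import tqdm


def run(args):  # placeholder worker; the repo's run(config, files) does the real work
    config, files = args
    return len(files)


if __name__ == "__main__":
    threads = 8  # would come from config.preprocessing.threads
    audio_files = [f"clip_{i}.wav" for i in range(100)]  # placeholder list

    splits = np.array_split(np.array(audio_files), threads)
    splits = [(None, files) for files in splits]

    with Pool(threads, initializer=tqdm.set_lock, initargs=(RLock(),)) as pool:
        pool.map(run, splits)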
128 changes: 128 additions & 0 deletions vuv.py
@@ -0,0 +1,128 @@
+import librosa
+import numpy as np
+import pyworld
+import torch
+from omegaconf import DictConfig
+from torchaudio.functional import highpass_biquad, lowpass_biquad
+
+
+class VUVEstimator:
+    def __init__(self, config: DictConfig) -> None:
+        self.sample_rate = config.sample_rate
+        self.win_length = config.win_length // 4
+        self.hop_length = config.hop_length // 4
+        self.f0_min = config.preprocessing.f0_min
+        self.f0_max = config.preprocessing.f0_max
+        self.f_max = config.f_max
+        self.vuv_smoothing = config.preprocessing.vuv_smoothing
+        self.zcr_uv = 0.25
+        self.zcr_v = 0.03
+        self.rms_uv = 0.01
+
+    def get_vuv(self, audio, f0):
+        audio = highpass_biquad(audio, self.sample_rate, self.f0_min)
+        audio = lowpass_biquad(audio, self.sample_rate, self.f_max)
+
+        max_loudness = torch.max(torch.abs(audio))
+        if max_loudness > 0:
+            audio /= max_loudness
+
+        ap = self.get_world(audio, f0)
+        ap = ap[:, 0]
+
+        audio = audio.cpu().numpy()[0].astype(np.float64)
+
+        zcr = librosa.feature.zero_crossing_rate(
+            audio,
+            frame_length=self.win_length,
+            hop_length=self.hop_length,
+            threshold=0.001,
+        )
+        zcr = zcr[0]
+        zcr = np.convolve(zcr, np.hanning(7) / 3, "same")
+
+        rms = self.get_rms(
+            audio, win_length=self.win_length, hop_length=self.hop_length
+        )
+        rms = rms[0]
+
+        vuv = 1 - (np.ones_like(ap) * (ap > 0.01))
+
+        for i in range(len(vuv)):
+            if zcr[i] > self.zcr_uv:
+                vuv[i] = 0
+            elif zcr[i] < self.zcr_v and rms[i] > self.rms_uv:
+                vuv[i] = 1
+            elif rms[i] <= self.rms_uv:
+                vuv[i] = 0
+
+        vuv = np.convolve(
+            vuv,
+            np.hanning(self.vuv_smoothing) / (self.vuv_smoothing / 2),
+            "same",
+        )
+        vuv = np.interp(
+            np.linspace(0, np.max(vuv), len(vuv) // 4),
+            np.linspace(0, np.max(vuv), len(vuv)),
+            vuv,
+        )
+
+        vuv = np.ones_like(vuv) * (vuv >= 0.5)
+
+        for s in range(1, self.vuv_smoothing + 1):
+            self.smooth(vuv, s)
+
+        return vuv.astype(np.float32)
+
+    def get_world(self, audio, f0):
+        time_step = self.hop_length / self.sample_rate
+        wav_frames = (audio.shape[-1] + self.hop_length - 1) // self.hop_length
+        t = np.arange(0, wav_frames) * time_step
+
+        f0 = f0.cpu().numpy().astype(np.float64)
+
+        if f0.shape[0] < wav_frames - 1:
+            f0 = np.pad(
+                f0,
+                (0, wav_frames - f0.shape[0]),
+                mode="constant",
+                constant_values=(f0[0], f0[-1]),
+            )
+        elif f0.shape[0] > wav_frames - 1:
+            f0 = f0[:wav_frames]
+        ap = pyworld.d4c(
+            audio.cpu().numpy().astype(np.float64)[0],
+            f0,
+            t,
+            self.sample_rate,
+            fft_size=self.hop_length * 4,
+        )
+
+        return ap
+
+    def get_rms(self, audio, win_length=2048, hop_length=512):
+        S = librosa.magphase(
+            librosa.stft(
+                audio,
+                hop_length=hop_length,
+                win_length=win_length,
+                window="hann",
+                center=True,
+                pad_mode="reflect",
+            )
+        )[0]
+        rms = librosa.feature.rms(S=S)
+
+        return rms
+
+    @staticmethod
+    def smooth(arr, s):
+        org_len = len(arr)
+        arr = np.pad(arr, s, "reflect")
+        for i in range(s - 1, org_len - s):
+            m = np.mean(np.concatenate((arr[i - s : i], arr[i + 1 : i + s + 1])))
+            if m > 0.5:
+                arr[i] = 1
+            elif m < 0.5:
+                arr[i] = 0
+        return arr[s:-s]
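Taken together, get_vuv band-limits the audio to [f0_min, f_max], takes WORLD's D4C band-0 aperiodicity as the initial voiced mask, overrides it with zero-crossing-rate and RMS rules (high ZCR forces unvoiced, low ZCR with sufficient energy forces voiced, low energy forces unvoiced), then smooths the mask and downsamples it by 4. A hypothetical usage sketch (all config values are placeholders, not the repo's):

import torch
from omegaconf import OmegaConf

from vuv import VUVEstimator

config = OmegaConf.create(
    {
        "sample_rate": 44100,
        "win_length": 2048,
        "hop_length": 512,
        "f_max": 16000,
        "preprocessing": {"f0_min": 40, "f0_max": 1400, "vuv_smoothing": 4},
    }
)

estimator = VUVEstimator(config)

audio = torch.randn(1, 44100)       # 1 s of audio, shape (channels, samples)
f0 = torch.full((345,), 220.0)      # F0 at hop_length // 4 resolution (~345 frames here)
vuv = estimator.get_vuv(audio, f0)  # float32 mask: 1 = voiced, 0 = unvoiced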
