Feature/evaluation #146

Open · wants to merge 22 commits into main

Changes from 1 commit
add loudness threshold to filter silent regions from pitched data
BWagener committed Sep 14, 2023
commit da434684844c0757373a3ed11a211b8ea3d456e9
1 change: 1 addition & 0 deletions LICENSE
@@ -1,6 +1,7 @@
MIT License

Copyright (c) 2023 Vadim Rangnau
Copyright (c) 2020 Max Morrison (torchcrepe code adapted for crepe output filtering and thresholding)
Owner Author: maybe fork crepe and make a separate package for it? Or PR the changes to the crepe project?

Owner Author: Why not fork crepe, move the changes there and release a package?

Contributor: I wasn't sure how to give proper credit here. I would say it's up to you how to handle this and whether you want to accept the code in question. The code at https://github.com/rakuri255/UltraSinger/pull/146/files#diff-7bda13ea2689179c7952b68174b8b8ea2cc2250f9b9699c5cf55939fb6c4ac7d is copied and adapted from https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/loudness.py


Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
2 changes: 1 addition & 1 deletion pytest/modules/Pitcher/test_pitcher.py
@@ -8,7 +8,7 @@


class PitcherTest(unittest.TestCase):
    @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
    # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
Owner Author: commented skip

    def test_get_pitch_with_crepe_file(self):
        # Arrange
        test_dir = os.path.dirname(os.path.abspath(__file__))
76 changes: 74 additions & 2 deletions src/UltraSinger.py
@@ -55,7 +55,8 @@
from modules.musicbrainz_client import get_music_infos

settings = Settings()

SYLLABLE_SEGMENT_SIZE = 0.1
Owner Author: Move to module?

SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1

def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]:
"""Convert midi notes to ultrastar notes"""
@@ -255,6 +256,73 @@ def print_support() -> None:
    )


def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]:
    """Split every syllable into sub-segments"""
    segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1])
    new_data = []

    for i, data in enumerate(transcribed_data):

        duration = data.end - data.start
        if duration <= SYLLABLE_SEGMENT_SIZE:
            new_data.append(data)
            continue

        has_space = str(data.word).endswith(" ")
        first_segment = copy.deepcopy(data)
        filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE
        remainder = data.end - (filler_words_start)
Owner Author: why () ?

        first_segment.end = filler_words_start
        if has_space:
            first_segment.word = first_segment.word[:-1]

        new_data.append(first_segment)

        full_segments, partial_segment = divmod(remainder, SYLLABLE_SEGMENT_SIZE)

        if full_segments >= 1:
            for i in range(int(full_segments)):
                segment = TranscribedData()
                segment.word = "~"
                segment.start = filler_words_start + round(i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
                segment.end = segment.start + SYLLABLE_SEGMENT_SIZE
                new_data.append(segment)

        if partial_segment >= 0.01:
            segment = TranscribedData()
            segment.word = "~"
            segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
            segment.end = segment.start + partial_segment
            new_data.append(segment)

        if has_space:
            new_data[-1].word += " "
    return new_data


def merge_syllable_segments(
        transcribed_data: list[TranscribedData],
        midi_notes: list[str],
        us_notes: list[int]
) -> tuple[list[TranscribedData], list[str], list[int]]:
    """Merge sub-segments of a syllable where the pitch is the same"""
    new_data = []
    new_midi_notes = []
    new_us_notes = []

    previous_data = None

    for i, data in enumerate(transcribed_data):
        if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE:
            new_data[-1].end = data.end
        else:
            new_data.append(data)
            new_midi_notes.append(midi_notes[i])
            new_us_notes.append(us_notes[i])
        previous_data = data
    return new_data, new_midi_notes, new_us_notes


def run() -> None:
    """The processing function of this program"""
    is_audio = ".txt" not in settings.input_file_path
@@ -330,6 +398,8 @@ def run() -> None:
    # lyric = 'input/faber_lyric.txt'
    # --corrected_words = correct_words(vosk_speech, lyric)

    transcribed_data = split_syllables_into_segments(transcribed_data)

    # Create audio chunks
    if settings.create_audio_chunks:
        create_audio_chunks(
@@ -345,6 +415,8 @@
            is_audio, transcribed_data, ultrastar_class
        )

    transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers)

    # Create plot
    if settings.create_plot:
        plot(pitched_data, song_output, transcribed_data, midi_notes)
@@ -706,7 +778,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast
        settings.mono_audio_path,
        settings.crepe_model_capacity,
        settings.crepe_step_size,
        settings.tensorflow_device,
        settings.tensorflow_device
    )
    if is_audio:
        start_times = []
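A minimal sketch of how the two new helpers compose. `Segment` is an illustrative stand-in for `TranscribedData` so the sketch runs standalone, and `split_into_segments` / `merge_equal_pitch` are simplified restatements of the functions above (the PR versions additionally preserve trailing spaces, round segment boundaries, and carry the UltraStar note numbers through the merge):

```python
from dataclasses import dataclass

SYLLABLE_SEGMENT_SIZE = 0.1
SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1


@dataclass
class Segment:
    """Illustrative stand-in for TranscribedData."""
    word: str
    start: float
    end: float


def split_into_segments(data: list[Segment]) -> list[Segment]:
    """Cut every segment longer than SYLLABLE_SEGMENT_SIZE into '~' fillers."""
    out = []
    for d in data:
        if d.end - d.start <= SYLLABLE_SEGMENT_SIZE:
            out.append(d)
            continue
        cut = d.start + SYLLABLE_SEGMENT_SIZE
        out.append(Segment(d.word, d.start, cut))
        while cut < d.end - 1e-9:
            nxt = min(cut + SYLLABLE_SEGMENT_SIZE, d.end)
            out.append(Segment("~", cut, nxt))
            cut = nxt
    return out


def merge_equal_pitch(data: list[Segment], midi_notes: list[str]) -> tuple[list[Segment], list[str]]:
    """Fold a '~' filler back into its predecessor when the pitch repeats."""
    merged, notes = [], []
    prev = None
    for i, (d, note) in enumerate(zip(data, midi_notes)):
        if (prev is not None and d.word.startswith("~")
                and note == midi_notes[i - 1]
                and d.start - prev.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE):
            merged[-1].end = d.end  # extend the previous segment
        else:
            merged.append(d)
            notes.append(note)
        prev = d
    return merged, notes


# A 0.35 s syllable becomes the word plus three fillers...
segments = split_into_segments([Segment("la", 0.0, 0.35)])
# ...and fillers with the same detected note collapse back into one segment.
merged, notes = merge_equal_pitch(segments, ["A4", "A4", "A4", "B4"])
print([(s.word, round(s.start, 2), round(s.end, 2)) for s in merged])
# [('la', 0.0, 0.3), ('~', 0.3, 0.35)]
```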
1 change: 1 addition & 0 deletions src/modules/Pitcher/core.py
@@ -0,0 +1 @@
CREPE_MODEL_SAMPLE_RATE = 16000
69 changes: 69 additions & 0 deletions src/modules/Pitcher/loudness.py
@@ -0,0 +1,69 @@
import warnings

import librosa
import numpy as np
from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE

###############################################################################
# Constants
###############################################################################

WINDOW_SIZE = 1024
TIMES_DECIMAL_PLACES: int = 3
# Minimum decibel level
MIN_DB = -100.

# Reference decibel level
REF_DB = 20.

def set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=-60, step_size=10, pad=True):
    # Don't modify in-place (slicing a NumPy array would only return a view)
    confidence = np.copy(confidence)

    # Compute loudness
    loudness = a_weighted(audio, step_size, pad)

    # Threshold silence
    confidence[loudness < threshold] = 0.

    return confidence, loudness


def a_weighted(audio, step_size=10, pad=True):
    """Retrieve the per-frame loudness"""
    step_size_seconds = round(step_size / 1000, TIMES_DECIMAL_PLACES)
    steps_per_second = 1 / step_size_seconds
    hop_length = int(CREPE_MODEL_SAMPLE_RATE // steps_per_second)

    a_perceptual_weights = perceptual_weights()

    # Take stft
    stft = librosa.stft(audio,
                        n_fft=WINDOW_SIZE,
                        hop_length=hop_length,
                        win_length=WINDOW_SIZE,
                        center=pad,
                        pad_mode='constant')

    # Compute magnitude on db scale
    db = librosa.amplitude_to_db(np.abs(stft))

    # Apply A-weighting
    weighted = db + a_perceptual_weights

    # Threshold
    weighted[weighted < MIN_DB] = MIN_DB

    # Average over weighted frequencies
    return weighted.mean(axis=0)


def perceptual_weights():
    """A-weighted frequency-dependent perceptual loudness weights"""
    frequencies = librosa.fft_frequencies(sr=CREPE_MODEL_SAMPLE_RATE,
                                          n_fft=WINDOW_SIZE)

    # A warning is raised for nearly inaudible frequencies, but it ends up
    # defaulting to -100 db. That default is fine for our purposes.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        return librosa.A_weighting(frequencies)[:, None] - REF_DB
1 change: 1 addition & 0 deletions src/modules/Pitcher/pitched_data.py
@@ -9,3 +9,4 @@ class PitchedData:
    times: list[float]
    frequencies: list[float]
    confidence: list[float]
    perceived_loudness_db: list[float]
34 changes: 24 additions & 10 deletions src/modules/Pitcher/pitcher.py
@@ -1,10 +1,13 @@
"""Pitcher module"""

import crepe
from scipy.io import wavfile
import librosa

from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted
from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE
from modules.Pitcher.loudness import set_confidence_to_zero_in_silent_regions
from modules.Pitcher.pitched_data import PitchedData
import modules.timer as timer


def get_pitch_with_crepe_file(
@@ -15,26 +18,37 @@ def get_pitch_with_crepe_file(
    print(
        f"{ULTRASINGER_HEAD} Pitching with {blue_highlighted('crepe')} and model {blue_highlighted(model_capacity)} and {red_highlighted(device)} as worker"
    )
    sample_rate, audio = wavfile.read(filename)
    timer.log('Load file for pitch detection start')
    audio, sample_rate = librosa.load(filename)
    timer.log('Load file for pitch detection end')

    return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size)


def get_pitch_with_crepe(
    audio, sample_rate: int, model_capacity: str, step_size: int = 10
) -> PitchedData:
def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10) -> PitchedData:
    """Pitch with crepe"""
    times, frequencies, confidence, activation = crepe.predict(
        audio, sample_rate, model_capacity, step_size=step_size, viterbi=True
    )
    return PitchedData(times, frequencies, confidence)

    if sample_rate != CREPE_MODEL_SAMPLE_RATE:
        from resampy import resample
        audio = resample(audio, sample_rate, CREPE_MODEL_SAMPLE_RATE)
        sample_rate = CREPE_MODEL_SAMPLE_RATE

    timer.log('Crepe pitch detection start')
    times, frequencies, confidence, activation = crepe.predict(audio, sample_rate, model_capacity, step_size=step_size, viterbi=True)
    timer.log('Crepe pitch detection end')

    timer.log('Computing loudness start')
    confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size)
    timer.log('Computing loudness end')

    return PitchedData(times, frequencies, confidence, perceived_loudness)


def get_pitched_data_with_high_confidence(
        pitched_data: PitchedData, threshold=0.4
) -> PitchedData:
    """Get frequency with high confidence"""
    new_pitched_data = PitchedData([], [], [])
    new_pitched_data = PitchedData([], [], [], [])
    for i, conf in enumerate(pitched_data.confidence):
        if conf > threshold:
            new_pitched_data.times.append(pitched_data.times[i])
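A minimal sketch of the updated pitch flow; the `vocals.wav` path, `full` model capacity and `cpu` device are illustrative, and the argument order follows the call site in `UltraSinger.py`:

```python
from modules.Pitcher.pitcher import (
    get_pitch_with_crepe_file,
    get_pitched_data_with_high_confidence,
)

# Arguments as at the UltraSinger.py call site:
# (mono audio path, crepe model capacity, step size in ms, tensorflow device).
pitched_data = get_pitch_with_crepe_file("vocals.wav", "full", 10, "cpu")

# Frames in silent regions now arrive with zero confidence, so the existing
# confidence filter drops them together with genuinely uncertain frames.
confident = get_pitched_data_with_high_confidence(pitched_data)
print(f"kept {len(confident.times)} of {len(pitched_data.times)} frames")
```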
26 changes: 14 additions & 12 deletions src/modules/Speech_Recognition/TranscribedData.py
@@ -4,15 +4,17 @@
class TranscribedData:
    """Transcribed data from json file"""

    def __init__(self, transcribed_json):
        # Vosk = conf, Whisper = confidence
        self.conf = transcribed_json.get(
            "conf", transcribed_json.get("confidence", None)
        )
        # Vosk = word, Whisper = text
        self.word = transcribed_json.get(
            "word", transcribed_json.get("text", None)
        )
        self.end = transcribed_json.get("end", None)
        self.start = transcribed_json.get("start", None)
        self.is_hyphen = None
    def __init__(self, transcribed_json = None):

        if transcribed_json:
            # Vosk = conf, Whisper = confidence
            self.conf = transcribed_json.get(
                "conf", transcribed_json.get("confidence", None)
            )
            # Vosk = word, Whisper = text
            self.word = transcribed_json.get(
                "word", transcribed_json.get("text", None)
            )
            self.end = transcribed_json.get("end", None)
            self.start = transcribed_json.get("start", None)
            self.is_hyphen = None
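The new optional argument is what lets `split_syllables_into_segments` construct empty `~` filler segments. A short sketch of both construction paths (the JSON fragment is illustrative; note the bare form leaves all attributes unset until the caller assigns them):

```python
from modules.Speech_Recognition.TranscribedData import TranscribedData

# As before: built from a Whisper/Vosk-style JSON fragment.
word = TranscribedData({"text": "la ", "confidence": 0.9, "start": 1.0, "end": 1.5})

# New: the bare form used for "~" filler segments; attributes are
# assigned afterwards by the caller.
filler = TranscribedData()
filler.word = "~"
filler.start = 1.1
filler.end = 1.2
```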
2 changes: 1 addition & 1 deletion src/modules/plot.py
@@ -187,7 +187,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> PitchedData:
    This way the graph is only continuous where it should be.

    """
    pitched_data_with_gaps = PitchedData([], [], [])
    pitched_data_with_gaps = PitchedData([], [], [], [])

    previous_time = 0
    for i, time in enumerate(pitched_data.times):