Feature/evaluation #146

Open · wants to merge 22 commits into main

Changes from 1 commit
add loudness threshold to filter silent regions from pitched data
BWagener committed Sep 14, 2023
commit da434684844c0757373a3ed11a211b8ea3d456e9
1 change: 1 addition & 0 deletions LICENSE
@@ -1,6 +1,7 @@
MIT License

Copyright (c) 2023 Vadim Rangnau
Copyright (c) 2020 Max Morrison (torchcrepe code adapted for crepe output filtering and thresholding)
Owner Author: maybe fork crepe and make a separate package for it? Or PR the changes to the crepe project?

Owner Author: Why not fork crepe, move the changes there and release a package?

Contributor: I wasn't sure how to give proper credit here. I would say it's up to you how to handle this and whether you want to accept the code in question. The code at https://github.com/rakuri255/UltraSinger/pull/146/files#diff-7bda13ea2689179c7952b68174b8b8ea2cc2250f9b9699c5cf55939fb6c4ac7d is copied and adapted from https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/loudness.py


Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
2 changes: 1 addition & 1 deletion pytest/modules/Pitcher/test_pitcher.py
@@ -8,7 +8,7 @@


class PitcherTest(unittest.TestCase):
    @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
    # @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests")
Owner Author: commented skip

    def test_get_pitch_with_crepe_file(self):
        # Arrange
        test_dir = os.path.dirname(os.path.abspath(__file__))
76 changes: 74 additions & 2 deletions src/UltraSinger.py
@@ -55,7 +55,8 @@
from modules.musicbrainz_client import get_music_infos

settings = Settings()

SYLLABLE_SEGMENT_SIZE = 0.1
Owner Author: Move to module?

SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1

def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]:
"""Convert midi notes to ultrastar notes"""
@@ -255,6 +256,73 @@ def print_support() -> None:
    )


def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]:
    """Split every syllable into sub-segments"""
    segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1])
    new_data = []

    for i, data in enumerate(transcribed_data):

        duration = data.end - data.start
        if duration <= SYLLABLE_SEGMENT_SIZE:
            new_data.append(data)
            continue

        has_space = str(data.word).endswith(" ")
        first_segment = copy.deepcopy(data)
        filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE
        remainder = data.end - (filler_words_start)
Owner Author: why () ?

        first_segment.end = filler_words_start
        if has_space:
            first_segment.word = first_segment.word[:-1]

        new_data.append(first_segment)

        full_segments, partial_segment = divmod(remainder, SYLLABLE_SEGMENT_SIZE)

        if full_segments >= 1:
            for i in range(int(full_segments)):
                segment = TranscribedData()
                segment.word = "~"
                segment.start = filler_words_start + round(i * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
                segment.end = segment.start + SYLLABLE_SEGMENT_SIZE
                new_data.append(segment)

        if partial_segment >= 0.01:
            segment = TranscribedData()
            segment.word = "~"
            segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
            segment.end = segment.start + partial_segment
            new_data.append(segment)

        if has_space:
            new_data[-1].word += " "
    return new_data


def merge_syllable_segments(
        transcribed_data: list[TranscribedData],
        midi_notes: list[str],
        us_notes: list[int]
) -> tuple[list[TranscribedData], list[str], list[int]]:
    """Merge sub-segments of a syllable where the pitch is the same"""
    new_data = []
    new_midi_notes = []
    new_us_notes = []

    previous_data = None

    for i, data in enumerate(transcribed_data):
        if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE:
            new_data[-1].end = data.end
        else:
            new_data.append(data)
            new_midi_notes.append(midi_notes[i])
            new_us_notes.append(us_notes[i])
        previous_data = data
    return new_data, new_midi_notes, new_us_notes


def run() -> None:
    """The processing function of this program"""
    is_audio = ".txt" not in settings.input_file_path
@@ -330,6 +398,8 @@ def run() -> None:
    # lyric = 'input/faber_lyric.txt'
    # --corrected_words = correct_words(vosk_speech, lyric)

    transcribed_data = split_syllables_into_segments(transcribed_data)

    # Create audio chunks
    if settings.create_audio_chunks:
        create_audio_chunks(
@@ -345,6 +415,8 @@
            is_audio, transcribed_data, ultrastar_class
        )

    transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers)

    # Create plot
    if settings.create_plot:
        plot(pitched_data, song_output, transcribed_data, midi_notes)
@@ -706,7 +778,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast
        settings.mono_audio_path,
        settings.crepe_model_capacity,
        settings.crepe_step_size,
        settings.tensorflow_device,
        settings.tensorflow_device
    )
    if is_audio:
        start_times = []
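A minimal sketch of how the two new helpers compose. `Segment` is an illustrative stand-in for `TranscribedData` so the sketch runs standalone, and `split_into_segments` / `merge_equal_pitch` are simplified restatements of the functions above (the PR versions additionally preserve trailing spaces, round segment boundaries, and carry the UltraStar note numbers through the merge):

```python
from dataclasses import dataclass

SYLLABLE_SEGMENT_SIZE = 0.1
SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1


@dataclass
class Segment:
    """Illustrative stand-in for TranscribedData."""
    word: str
    start: float
    end: float


def split_into_segments(data: list[Segment]) -> list[Segment]:
    """Cut every segment longer than SYLLABLE_SEGMENT_SIZE into '~' fillers."""
    out = []
    for d in data:
        if d.end - d.start <= SYLLABLE_SEGMENT_SIZE:
            out.append(d)
            continue
        cut = d.start + SYLLABLE_SEGMENT_SIZE
        out.append(Segment(d.word, d.start, cut))
        while cut < d.end - 1e-9:
            nxt = min(cut + SYLLABLE_SEGMENT_SIZE, d.end)
            out.append(Segment("~", cut, nxt))
            cut = nxt
    return out


def merge_equal_pitch(data: list[Segment], midi_notes: list[str]) -> tuple[list[Segment], list[str]]:
    """Fold a '~' filler back into its predecessor when the pitch repeats."""
    merged, notes = [], []
    prev = None
    for i, (d, note) in enumerate(zip(data, midi_notes)):
        if (prev is not None and d.word.startswith("~")
                and note == midi_notes[i - 1]
                and d.start - prev.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE):
            merged[-1].end = d.end  # extend the previous segment
        else:
            merged.append(d)
            notes.append(note)
        prev = d
    return merged, notes


# A 0.35 s syllable becomes the word plus three fillers...
segments = split_into_segments([Segment("la", 0.0, 0.35)])
# ...and fillers with the same detected note collapse back into one segment.
merged, notes = merge_equal_pitch(segments, ["A4", "A4", "A4", "B4"])
print([(s.word, round(s.start, 2), round(s.end, 2)) for s in merged])
# [('la', 0.0, 0.3), ('~', 0.3, 0.35)]
```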
1 change: 1 addition & 0 deletions src/modules/Pitcher/core.py
@@ -0,0 +1 @@
CREPE_MODEL_SAMPLE_RATE = 16000
69 changes: 69 additions & 0 deletions src/modules/Pitcher/loudness.py
@@ -0,0 +1,69 @@
import warnings

import librosa
import numpy as np
from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE

###############################################################################
# Constants
###############################################################################

WINDOW_SIZE = 1024
TIMES_DECIMAL_PLACES: int = 3
# Minimum decibel level
MIN_DB = -100.

# Reference decibel level
REF_DB = 20.

def set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=-60, step_size=10, pad=True):
    # Don't modify in-place (slicing a NumPy array would only return a view)
    confidence = np.copy(confidence)

    # Compute loudness
    loudness = a_weighted(audio, step_size, pad)

    # Threshold silence
    confidence[loudness < threshold] = 0.

    return confidence, loudness


def a_weighted(audio, step_size=10, pad=True):
    """Retrieve the per-frame loudness"""
    step_size_seconds = round(step_size / 1000, TIMES_DECIMAL_PLACES)
    steps_per_second = 1 / step_size_seconds
    hop_length = int(CREPE_MODEL_SAMPLE_RATE // steps_per_second)

    a_perceptual_weights = perceptual_weights()

    # Take stft
    stft = librosa.stft(audio,
                        n_fft=WINDOW_SIZE,
                        hop_length=hop_length,
                        win_length=WINDOW_SIZE,
                        center=pad,
                        pad_mode='constant')

    # Compute magnitude on db scale
    db = librosa.amplitude_to_db(np.abs(stft))

    # Apply A-weighting
    weighted = db + a_perceptual_weights

    # Threshold
    weighted[weighted < MIN_DB] = MIN_DB

    # Average over weighted frequencies
    return weighted.mean(axis=0)


def perceptual_weights():
    """A-weighted frequency-dependent perceptual loudness weights"""
    frequencies = librosa.fft_frequencies(sr=CREPE_MODEL_SAMPLE_RATE,
                                          n_fft=WINDOW_SIZE)

    # A warning is raised for nearly inaudible frequencies, but it ends up
    # defaulting to -100 db. That default is fine for our purposes.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        return librosa.A_weighting(frequencies)[:, None] - REF_DB
1 change: 1 addition & 0 deletions src/modules/Pitcher/pitched_data.py
@@ -9,3 +9,4 @@ class PitchedData:
    times: list[float]
    frequencies: list[float]
    confidence: list[float]
    perceived_loudness_db: list[float]
34 changes: 24 additions & 10 deletions src/modules/Pitcher/pitcher.py
@@ -1,10 +1,13 @@
"""Pitcher module"""

import crepe
from scipy.io import wavfile
import librosa

from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted
from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE
from modules.Pitcher.loudness import set_confidence_to_zero_in_silent_regions
from modules.Pitcher.pitched_data import PitchedData
import modules.timer as timer


def get_pitch_with_crepe_file(
@@ -15,26 +18,37 @@ def get_pitch_with_crepe_file(
    print(
        f"{ULTRASINGER_HEAD} Pitching with {blue_highlighted('crepe')} and model {blue_highlighted(model_capacity)} and {red_highlighted(device)} as worker"
    )
    sample_rate, audio = wavfile.read(filename)
    timer.log('Load file for pitch detection start')
    audio, sample_rate = librosa.load(filename)
    timer.log('Load file for pitch detection end')

    return get_pitch_with_crepe(audio, sample_rate, model_capacity, step_size)


def get_pitch_with_crepe(
    audio, sample_rate: int, model_capacity: str, step_size: int = 10
) -> PitchedData:
def get_pitch_with_crepe(audio, sample_rate: int, model_capacity: str, step_size: int = 10) -> PitchedData:
    """Pitch with crepe"""
    times, frequencies, confidence, activation = crepe.predict(
        audio, sample_rate, model_capacity, step_size=step_size, viterbi=True
    )
    return PitchedData(times, frequencies, confidence)

    if sample_rate != CREPE_MODEL_SAMPLE_RATE:
        from resampy import resample
        audio = resample(audio, sample_rate, CREPE_MODEL_SAMPLE_RATE)
        sample_rate = CREPE_MODEL_SAMPLE_RATE

    timer.log('Crepe pitch detection start')
    times, frequencies, confidence, activation = crepe.predict(audio, sample_rate, model_capacity, step_size=step_size, viterbi=True)
    timer.log('Crepe pitch detection end')

    timer.log('Computing loudness start')
    confidence, perceived_loudness = set_confidence_to_zero_in_silent_regions(confidence, audio, step_size=step_size)
    timer.log('Computing loudness end')

    return PitchedData(times, frequencies, confidence, perceived_loudness)


def get_pitched_data_with_high_confidence(
        pitched_data: PitchedData, threshold=0.4
) -> PitchedData:
    """Get frequency with high confidence"""
    new_pitched_data = PitchedData([], [], [])
    new_pitched_data = PitchedData([], [], [], [])
    for i, conf in enumerate(pitched_data.confidence):
        if conf > threshold:
            new_pitched_data.times.append(pitched_data.times[i])
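A minimal sketch of the updated pitch flow; the `vocals.wav` path, `full` model capacity and `cpu` device are illustrative, and the argument order follows the call site in `UltraSinger.py`:

```python
from modules.Pitcher.pitcher import (
    get_pitch_with_crepe_file,
    get_pitched_data_with_high_confidence,
)

# Arguments as at the UltraSinger.py call site:
# (mono audio path, crepe model capacity, step size in ms, tensorflow device).
pitched_data = get_pitch_with_crepe_file("vocals.wav", "full", 10, "cpu")

# Frames in silent regions now arrive with zero confidence, so the existing
# confidence filter drops them together with genuinely uncertain frames.
confident = get_pitched_data_with_high_confidence(pitched_data)
print(f"kept {len(confident.times)} of {len(pitched_data.times)} frames")
```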
26 changes: 14 additions & 12 deletions src/modules/Speech_Recognition/TranscribedData.py
@@ -4,15 +4,17 @@
class TranscribedData:
    """Transcribed data from json file"""

    def __init__(self, transcribed_json):
        # Vosk = conf, Whisper = confidence
        self.conf = transcribed_json.get(
            "conf", transcribed_json.get("confidence", None)
        )
        # Vosk = word, Whisper = text
        self.word = transcribed_json.get(
            "word", transcribed_json.get("text", None)
        )
        self.end = transcribed_json.get("end", None)
        self.start = transcribed_json.get("start", None)
        self.is_hyphen = None
    def __init__(self, transcribed_json = None):

        if transcribed_json:
            # Vosk = conf, Whisper = confidence
            self.conf = transcribed_json.get(
                "conf", transcribed_json.get("confidence", None)
            )
            # Vosk = word, Whisper = text
            self.word = transcribed_json.get(
                "word", transcribed_json.get("text", None)
            )
            self.end = transcribed_json.get("end", None)
            self.start = transcribed_json.get("start", None)
            self.is_hyphen = None
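The new optional argument is what lets `split_syllables_into_segments` construct empty `~` filler segments. A short sketch of both construction paths (the JSON fragment is illustrative; note the bare form leaves all attributes unset until the caller assigns them):

```python
from modules.Speech_Recognition.TranscribedData import TranscribedData

# As before: built from a Whisper/Vosk-style JSON fragment.
word = TranscribedData({"text": "la ", "confidence": 0.9, "start": 1.0, "end": 1.5})

# New: the bare form used for "~" filler segments; attributes are
# assigned afterwards by the caller.
filler = TranscribedData()
filler.word = "~"
filler.start = 1.1
filler.end = 1.2
```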
2 changes: 1 addition & 1 deletion src/modules/plot.py
@@ -187,7 +187,7 @@ def create_gaps(pitched_data: PitchedData, step_size: float) -> PitchedData:
    This way the graph is only continuous where it should be.

    """
    pitched_data_with_gaps = PitchedData([], [], [])
    pitched_data_with_gaps = PitchedData([], [], [], [])

    previous_time = 0
    for i, time in enumerate(pitched_data.times):