-
-
Notifications
You must be signed in to change notification settings - Fork 35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feature/evaluation #146
base: main
Are you sure you want to change the base?
Feature/evaluation #146
Changes from 1 commit
da43468
6ffdfc5
b52945b
442c5b1
63ad200
cb4aa4e
7d57225
3bf7a06
eae51e7
c139c74
4339c35
e2c2209
c3bd2d0
4729daf
945581d
0c04e02
582b644
10e063a
75a00d2
8f59771
76a4956
051290c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
MIT License | ||
|
||
Copyright (c) 2023 Vadim Rangnau | ||
Copyright (c) 2020 Max Morrison (torchcrepe code adapted for crepe output filtering and thresholding) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not fork crepe, move the changes there and release an package? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wasn't sure how to give proper credit here. I would say it's up to you how to handle this and if you want to accept the code in question. The code here https://github.com/rakuri255/UltraSinger/pull/146/files#diff-7bda13ea2689179c7952b68174b8b8ea2cc2250f9b9699c5cf55939fb6c4ac7d is copied and adapted from here https://github.com/maxrmorrison/torchcrepe/blob/master/torchcrepe/loudness.py |
||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ | |
|
||
|
||
class PitcherTest(unittest.TestCase): | ||
@pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") | ||
# @pytest.mark.skip(reason="Skipping this FUNCTION level test, can be used for manual tests") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. commented skip |
||
def test_get_pitch_with_crepe_file(self): | ||
# Arrange | ||
test_dir = os.path.dirname(os.path.abspath(__file__)) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,7 +55,8 @@ | |
from modules.musicbrainz_client import get_music_infos | ||
|
||
settings = Settings() | ||
|
||
SYLLABLE_SEGMENT_SIZE = 0.1 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move to module? |
||
SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE = 0.1 | ||
|
||
def convert_midi_notes_to_ultrastar_notes(midi_notes: list[str]) -> list[int]: | ||
"""Convert midi notes to ultrastar notes""" | ||
|
@@ -255,6 +256,73 @@ def print_support() -> None: | |
) | ||
|
||
|
||
def split_syllables_into_segments(transcribed_data: list[TranscribedData]) -> list[TranscribedData]:
    """Split every syllable longer than SYLLABLE_SEGMENT_SIZE into sub-segments.

    The first sub-segment keeps the original word (minus any trailing space);
    every following sub-segment becomes a "~" continuation note. A trailing
    space, if present, is re-attached to the last sub-segment so word spacing
    is preserved in the output.

    Args:
        transcribed_data: Transcribed syllables with ``word``/``start``/``end``.

    Returns:
        New list with long syllables replaced by their sub-segments.
    """
    # Decimal places of the segment size, used so rounded segment boundaries
    # stay aligned with the configured step (assumes SYLLABLE_SEGMENT_SIZE is
    # a float literal with a fractional part).
    segment_size_decimal_points = len(str(SYLLABLE_SEGMENT_SIZE).split(".")[1])
    new_data = []

    for data in transcribed_data:
        duration = data.end - data.start
        if duration <= SYLLABLE_SEGMENT_SIZE:
            # Short enough already — keep as-is.
            new_data.append(data)
            continue

        has_space = str(data.word).endswith(" ")
        first_segment = copy.deepcopy(data)
        filler_words_start = data.start + SYLLABLE_SEGMENT_SIZE
        remainder = data.end - filler_words_start

        first_segment.end = filler_words_start
        if has_space:
            # Defer the word-separating space to the last sub-segment.
            first_segment.word = first_segment.word[:-1]
        new_data.append(first_segment)

        full_segments, partial_segment = divmod(remainder, SYLLABLE_SEGMENT_SIZE)

        # Emit one "~" continuation note per full segment (range() is empty
        # when full_segments is 0, so no extra guard is needed).
        for segment_index in range(int(full_segments)):
            segment = TranscribedData()
            segment.word = "~"
            segment.start = filler_words_start + round(segment_index * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
            segment.end = segment.start + SYLLABLE_SEGMENT_SIZE
            new_data.append(segment)

        # Keep a final shorter segment only if it is audible (>= 10 ms).
        if partial_segment >= 0.01:
            segment = TranscribedData()
            segment.word = "~"
            segment.start = filler_words_start + round(full_segments * SYLLABLE_SEGMENT_SIZE, segment_size_decimal_points)
            segment.end = segment.start + partial_segment
            new_data.append(segment)

        if has_space:
            new_data[-1].word += " "
    return new_data
|
||
|
||
def merge_syllable_segments( | ||
transcribed_data: list[TranscribedData], | ||
midi_notes: list[str], | ||
us_notes = list[int] | ||
) -> tuple[list[TranscribedData], list[str], list[int]]: | ||
"""Merge sub-segments of a syllable where the pitch is the same""" | ||
new_data = [] | ||
new_midi_notes = [] | ||
new_us_notes = [] | ||
|
||
previous_data = None | ||
|
||
for i, data in enumerate(transcribed_data): | ||
if str(data.word).startswith("~") and previous_data is not None and midi_notes[i] == midi_notes[i-1] and data.start - previous_data.end <= SYLLABLE_SEGMENT_MAX_GAP_FOR_MERGE: | ||
new_data[-1].end = data.end | ||
else: | ||
new_data.append(data) | ||
new_midi_notes.append(midi_notes[i]) | ||
new_us_notes.append(us_notes[i]) | ||
previous_data = data | ||
return new_data, new_midi_notes, new_us_notes | ||
|
||
|
||
def run() -> None: | ||
"""The processing function of this program""" | ||
is_audio = ".txt" not in settings.input_file_path | ||
|
@@ -330,6 +398,8 @@ def run() -> None: | |
# lyric = 'input/faber_lyric.txt' | ||
# --corrected_words = correct_words(vosk_speech, lyric) | ||
|
||
transcribed_data = split_syllables_into_segments(transcribed_data) | ||
|
||
# Create audio chunks | ||
if settings.create_audio_chunks: | ||
create_audio_chunks( | ||
|
@@ -345,6 +415,8 @@ def run() -> None: | |
is_audio, transcribed_data, ultrastar_class | ||
) | ||
|
||
transcribed_data, midi_notes, ultrastar_note_numbers = merge_syllable_segments(transcribed_data, midi_notes, ultrastar_note_numbers) | ||
|
||
# Create plot | ||
if settings.create_plot: | ||
plot(pitched_data, song_output, transcribed_data, midi_notes) | ||
|
@@ -706,7 +778,7 @@ def pitch_audio(is_audio: bool, transcribed_data: list[TranscribedData], ultrast | |
settings.mono_audio_path, | ||
settings.crepe_model_capacity, | ||
settings.crepe_step_size, | ||
settings.tensorflow_device, | ||
settings.tensorflow_device | ||
) | ||
if is_audio: | ||
start_times = [] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
CREPE_MODEL_SAMPLE_RATE = 16000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import warnings | ||
|
||
import librosa | ||
import numpy as np | ||
from modules.Pitcher.core import CREPE_MODEL_SAMPLE_RATE | ||
|
||
############################################################################### | ||
# Constants | ||
############################################################################### | ||
|
||
WINDOW_SIZE = 1024 | ||
TIMES_DECIMAL_PLACES: int = 3 | ||
# Minimum decibel level | ||
MIN_DB = -100. | ||
|
||
# Reference decibel level | ||
REF_DB = 20. | ||
|
||
def set_confidence_to_zero_in_silent_regions(confidence, audio, threshold=-60, step_size=10, pad=True):
    """Zero out pitch confidence for frames whose loudness is below threshold.

    Args:
        confidence: Per-frame crepe confidence values (numpy array).
        audio: Mono audio samples at CREPE_MODEL_SAMPLE_RATE.
        threshold: Loudness (dB) below which a frame counts as silence.
        step_size: Frame hop in milliseconds (must match the pitch step size).
        pad: Whether to center-pad the STFT (must match the pitch framing).

    Returns:
        Tuple of (thresholded confidence copy, per-frame loudness in dB).
    """
    # Don't modify the caller's array. NOTE: the previous ``confidence[:]``
    # produced a numpy *view*, not a copy, so the mask assignment below
    # mutated the input in place — np.copy makes a real copy.
    confidence = np.copy(confidence)

    # Compute per-frame A-weighted loudness
    loudness = a_weighted(audio, step_size, pad)

    # Threshold silence
    confidence[loudness < threshold] = 0.

    return confidence, loudness
|
||
def a_weighted(audio, step_size=10, pad=True):
    """Retrieve the per-frame A-weighted loudness (dB) of ``audio``.

    ``step_size`` is the hop between frames in milliseconds; ``pad`` controls
    whether the STFT is center-padded (should match the pitch framing).
    """
    # Translate the millisecond step size into an STFT hop length in samples.
    seconds_per_step = round(step_size / 1000, TIMES_DECIMAL_PLACES)
    hop_length = int(CREPE_MODEL_SAMPLE_RATE // (1 / seconds_per_step))

    weights = perceptual_weights()

    # Short-time Fourier transform of the signal.
    spectrum = librosa.stft(
        audio,
        n_fft=WINDOW_SIZE,
        hop_length=hop_length,
        win_length=WINDOW_SIZE,
        center=pad,
        pad_mode='constant')

    # Magnitude on a dB scale with the A-weighting curve applied per frequency.
    weighted = librosa.amplitude_to_db(np.abs(spectrum)) + weights

    # Clamp everything below the minimum representable level.
    weighted[weighted < MIN_DB] = MIN_DB

    # Collapse the frequency axis: mean weighted level per frame.
    return weighted.mean(axis=0)
|
||
|
||
def perceptual_weights():
    """A-weighted frequency-dependent perceptual loudness weights."""
    bin_frequencies = librosa.fft_frequencies(
        sr=CREPE_MODEL_SAMPLE_RATE, n_fft=WINDOW_SIZE)

    # librosa raises a RuntimeWarning for nearly inaudible frequencies but
    # defaults them to -100 dB, which is fine here — silence the warning.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        weights = librosa.A_weighting(bin_frequencies) - REF_DB

    # Add a trailing axis so the weights broadcast over STFT frames.
    return weights[:, None]
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe fork crepe and make a separate package for it? Or PR the changes to the crepe project?