Skip to content

Commit

Permalink
feat: Pitch detection algorithm option for mangio-crepe
Browse files Browse the repository at this point in the history
  • Loading branch information
SociallyIneptWeeb committed Aug 26, 2023
1 parent de72508 commit 46a5ff7
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 17 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ WebUI is under constant development and testing, but you can try it out right no
- Extra RVC options - filter_radius, rms_mix_rate, protect
- Local file upload via file browser option
- Upload of locally trained RVC v2 models via WebUI
- Pitch detection method control, e.g. rmvpe/mangio-crepe

## Colab notebook

Expand Down Expand Up @@ -137,7 +138,7 @@ The directory structure should look something like this:
To run the AI cover generation pipeline using the command line, run the following command.

```
python src/main.py [-h] -i SONG_INPUT -dir RVC_DIRNAME -p PITCH_CHANGE [-k | --keep-files | --no-keep-files] [-ir INDEX_RATE] [-fr FILTER_RADIUS] [-rms RMS_MIX_RATE] [-pro PROTECT] [-mv MAIN_VOL] [-bv BACKUP_VOL] [-iv INST_VOL] [-rsize REVERB_SIZE] [-rwet REVERB_WETNESS] [-rdry REVERB_DRYNESS] [-rdamp REVERB_DAMPING]
python src/main.py [-h] -i SONG_INPUT -dir RVC_DIRNAME -p PITCH_CHANGE [-k | --keep-files | --no-keep-files] [-ir INDEX_RATE] [-fr FILTER_RADIUS] [-rms RMS_MIX_RATE] [-palgo PITCH_DETECTION_ALGO] [-hop CREPE_HOP_LENGTH] [-pro PROTECT] [-mv MAIN_VOL] [-bv BACKUP_VOL] [-iv INST_VOL] [-rsize REVERB_SIZE] [-rwet REVERB_WETNESS] [-rdry REVERB_DRYNESS] [-rdamp REVERB_DAMPING]
```

| Flag | Description |
Expand All @@ -150,6 +151,8 @@ python src/main.py [-h] -i SONG_INPUT -dir RVC_DIRNAME -p PITCH_CHANGE [-k | --k
| `-ir INDEX_RATE` | Optional. Default 0.5. Control how much of the AI's accent to leave in the vocals. 0 <= INDEX_RATE <= 1. |
| `-fr FILTER_RADIUS` | Optional. Default 3. If >=3: apply median filtering to the harvested pitch results. 0 <= FILTER_RADIUS <= 7. |
| `-rms RMS_MIX_RATE` | Optional. Default 0.25. Control how much to use the original vocal's loudness (0) or a fixed loudness (1). 0 <= RMS_MIX_RATE <= 1. |
| `-palgo PITCH_DETECTION_ALGO` | Optional. Default rmvpe. Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals). |
| `-hop CREPE_HOP_LENGTH` | Optional. Default 128. Only used with the mangio-crepe algorithm: controls how often it checks for pitch changes, in milliseconds. Lower values lead to longer conversions and a higher risk of voice cracks, but better pitch accuracy. |
| `-pro PROTECT` | Optional. Default 0.33. Control how much of the original vocals' breath and voiceless consonants to leave in the AI vocals. Set 0.5 to disable. 0 <= PROTECT <= 0.5. |
| `-mv MAIN_VOCALS_VOLUME_CHANGE` | Optional. Default 0. Control volume of main AI vocals. Use -3 to decrease the volume by 3 decibels, or 3 to increase the volume by 3 decibels. |
| `-bv BACKUP_VOCALS_VOLUME_CHANGE` | Optional. Default 0. Control volume of backup AI vocals. |
Expand Down
17 changes: 10 additions & 7 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,15 +176,15 @@ def preprocess_song(song_input, mdx_model_params, song_id, is_webui, input_type,
return orig_song_path, vocals_path, instrumentals_path, main_vocals_path, backup_vocals_path, main_vocals_dereverb_path


def voice_change(voice_model, vocals_path, output_path, pitch_change, index_rate, filter_radius, rms_mix_rate, protect, is_webui):
def voice_change(voice_model, vocals_path, output_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui):
rvc_model_path, rvc_index_path = get_rvc_model(voice_model, is_webui)
device = 'cuda:0'
config = Config(device, True)
hubert_model = load_hubert(device, config.is_half, os.path.join(rvc_models_dir, 'hubert_base.pt'))
cpt, version, net_g, tgt_sr, vc = get_vc(device, config.is_half, config, rvc_model_path)

# convert main vocals
rvc_infer(rvc_index_path, index_rate, vocals_path, output_path, pitch_change, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, vc, hubert_model)
rvc_infer(rvc_index_path, index_rate, vocals_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model)
del hubert_model, cpt
gc.collect()

Expand Down Expand Up @@ -221,8 +221,8 @@ def combine_audio(audio_paths, output_path, main_gain, backup_gain, inst_gain):

def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
is_webui=0, main_gain=0, backup_gain=0, inst_gain=0, index_rate=0.5, filter_radius=3,
rms_mix_rate=0.25, protect=0.33, reverb_rm_size=0.15, reverb_wet=0.2, reverb_dry=0.8,
reverb_damping=0.7, progress=gr.Progress()):
rms_mix_rate=0.25, f0_method='rmvpe', crepe_hop_length=128, protect=0.33, reverb_rm_size=0.15,
reverb_wet=0.2, reverb_dry=0.8, reverb_damping=0.7, progress=gr.Progress()):
try:
if not song_input or not voice_model:
raise_exception('Ensure that the song input field and voice model field is filled.', is_webui)
Expand Down Expand Up @@ -267,12 +267,12 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
else:
orig_song_path, instrumentals_path, main_vocals_dereverb_path, backup_vocals_path = paths

ai_vocals_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}.wav')
ai_vocals_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]}_{voice_model}_p{pitch_change}_i{index_rate}_fr{filter_radius}_rms{rms_mix_rate}_pro{protect}_{f0_method}{"" if f0_method != "mangio-crepe" else f"_{crepe_hop_length}"}.wav')
ai_cover_path = os.path.join(song_dir, f'{os.path.splitext(os.path.basename(orig_song_path))[0]} ({voice_model} Ver).mp3')

if not os.path.exists(ai_vocals_path):
display_progress('[~] Converting voice using RVC...', 0.5, is_webui, progress)
voice_change(voice_model, main_vocals_dereverb_path, ai_vocals_path, pitch_change, index_rate, filter_radius, rms_mix_rate, protect, is_webui)
voice_change(voice_model, main_vocals_dereverb_path, ai_vocals_path, pitch_change, f0_method, index_rate, filter_radius, rms_mix_rate, protect, crepe_hop_length, is_webui)

display_progress('[~] Applying audio effects to vocals...', 0.8, is_webui, progress)
ai_vocals_mixed_path = add_audio_effects(ai_vocals_path, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping)
Expand Down Expand Up @@ -302,6 +302,8 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
parser.add_argument('-ir', '--index-rate', type=float, default=0.5, help='A decimal number e.g. 0.5, used to reduce/resolve the timbre leakage problem. If set to 1, more biased towards the timbre quality of the training dataset')
parser.add_argument('-fr', '--filter-radius', type=int, default=3, help='A number between 0 and 7. If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness.')
parser.add_argument('-rms', '--rms-mix-rate', type=float, default=0.25, help="A decimal number e.g. 0.25. Control how much to use the original vocal's loudness (0) or a fixed loudness (1).")
parser.add_argument('-palgo', '--pitch-detection-algo', type=str, default='rmvpe', help='Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals).')
parser.add_argument('-hop', '--crepe-hop-length', type=int, default=128, help='If pitch detection algo is mangio-crepe, controls how often it checks for pitch changes in milliseconds. The higher the value, the faster the conversion and less risk of voice cracks, but there is less pitch accuracy. Recommended: 128.')
parser.add_argument('-pro', '--protect', type=float, default=0.33, help='A decimal number e.g. 0.33. Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy.')
parser.add_argument('-mv', '--main-vol', type=int, default=0, help='Volume change for AI main vocals in decibels. Use -3 to decrease by 3 decibels and 3 to increase by 3 decibels')
parser.add_argument('-bv', '--backup-vol', type=int, default=0, help='Volume change for backup vocals in decibels')
Expand All @@ -319,7 +321,8 @@ def song_cover_pipeline(song_input, voice_model, pitch_change, keep_files,
cover_path = song_cover_pipeline(args.song_input, rvc_dirname, args.pitch_change, args.keep_files,
main_gain=args.main_vol, backup_gain=args.backup_vol, inst_gain=args.inst_vol,
index_rate=args.index_rate, filter_radius=args.filter_radius,
rms_mix_rate=args.rms_mix_rate, protect=args.protect,
rms_mix_rate=args.rms_mix_rate, f0_method=args.pitch_detection_algo,
crepe_hop_length=args.crepe_hop_length, protect=args.protect,
reverb_rm_size=args.reverb_size, reverb_wet=args.reverb_wetness,
reverb_dry=args.reverb_dryness, reverb_damping=args.reverb_damping)
print(f'[+] Cover generated at {cover_path}')
4 changes: 2 additions & 2 deletions src/rvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,9 @@ def get_vc(device, is_half, config, model_path):
return cpt, version, net_g, tgt_sr, vc


def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, vc, hubert_model):
def rvc_infer(index_path, index_rate, input_path, output_path, pitch_change, f0_method, cpt, version, net_g, filter_radius, tgt_sr, rms_mix_rate, protect, crepe_hop_length, vc, hubert_model):
audio = load_audio(input_path, 16000)
times = [0, 0, 0]
if_f0 = cpt.get('f0', 1)
audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, input_path, times, pitch_change, 'rmvpe', index_path, index_rate, if_f0, filter_radius, tgt_sr, 0, rms_mix_rate, version, protect, None)
audio_opt = vc.pipeline(hubert_model, net_g, 0, audio, input_path, times, pitch_change, f0_method, index_path, index_rate, if_f0, filter_radius, tgt_sr, 0, rms_mix_rate, version, protect, crepe_hop_length)
wavfile.write(output_path, tgt_sr, audio_opt)
24 changes: 17 additions & 7 deletions src/webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,13 @@ def process_file_upload(file):
return file.name, gr.update(value=file.name)


def show_hop_slider(pitch_detection_algo):
    """Return a Gradio update toggling the crepe hop-length slider.

    The slider is made visible only when the selected pitch detection
    algorithm is 'mangio-crepe'; any other selection hides it.
    """
    uses_crepe = pitch_detection_algo == 'mangio-crepe'
    return gr.update(visible=uses_crepe)


if __name__ == '__main__':
parser = ArgumentParser(description='Generate a AI cover song in the song_output/id directory.', add_help=True)
parser.add_argument("--share", action="store_true", dest="share_enabled", default=False, help="Enable sharing")
Expand Down Expand Up @@ -190,10 +197,13 @@ def process_file_upload(file):
with gr.Row():
index_rate = gr.Slider(0, 1, value=0.5, label='Index Rate', info="Controls how much of the AI voice's accent to keep in the vocals")
filter_radius = gr.Slider(0, 7, value=3, step=1, label='Filter radius', info='If >=3: apply median filtering median filtering to the harvested pitch results. Can reduce breathiness')
rms_mix_rate = gr.Slider(0, 1, value=0.25, label='RMS mix rate', info="Control how much to use the original vocal's loudness (0) or a fixed loudness (1)")
rms_mix_rate = gr.Slider(0, 1, value=0.25, label='RMS mix rate', info="Control how much to mimic the original vocal's loudness (0) or a fixed loudness (1)")
protect = gr.Slider(0, 0.5, value=0.33, label='Protect rate', info='Protect voiceless consonants and breath sounds. Set to 0.5 to disable.')
keep_files = gr.Checkbox(label='Keep intermediate files',
info='Keep all audio files generated in the song_output/id directory, e.g. Isolated Vocals/Instrumentals. Leave unchecked to save space')
with gr.Column():
f0_method = gr.Dropdown(['rmvpe', 'mangio-crepe'], value='rmvpe', label='Pitch detection algorithm', info='Best option is rmvpe (clarity in vocals), then mangio-crepe (smoother vocals)')
crepe_hop_length = gr.Slider(32, 320, value=128, step=1, visible=False, label='Crepe hop length', info='Lower values leads to longer conversions and higher risk of voice cracks, but better pitch accuracy.')
f0_method.change(show_hop_slider, inputs=f0_method, outputs=crepe_hop_length)
keep_files = gr.Checkbox(label='Keep intermediate files', info='Keep all audio files generated in the song_output/id directory, e.g. Isolated Vocals/Instrumentals. Leave unchecked to save space')

with gr.Accordion('Audio mixing options', open=False):
gr.Markdown('### Volume Change (decibels)')
Expand All @@ -218,12 +228,12 @@ def process_file_upload(file):
is_webui = gr.Number(value=1, visible=False)
generate_btn.click(song_cover_pipeline,
inputs=[song_input, rvc_model, pitch, keep_files, is_webui, main_gain, backup_gain,
inst_gain, index_rate, filter_radius, rms_mix_rate, protect, reverb_rm_size,
reverb_wet, reverb_dry, reverb_damping],
inst_gain, index_rate, filter_radius, rms_mix_rate, f0_method, crepe_hop_length,
protect, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping],
outputs=[ai_cover])
clear_btn.click(lambda: [0, 0, 0, 0, 0.5, 3, 0.25, 0.33, 0.15, 0.2, 0.8, 0.7, None],
clear_btn.click(lambda: [0, 0, 0, 0, 0.5, 3, 0.25, 0.33, 'rmvpe', 128, 0.15, 0.2, 0.8, 0.7, None],
outputs=[pitch, main_gain, backup_gain, inst_gain, index_rate, filter_radius, rms_mix_rate,
protect, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping, ai_cover])
protect, f0_method, crepe_hop_length, reverb_rm_size, reverb_wet, reverb_dry, reverb_damping, ai_cover])

# Download tab
with gr.Tab('Download model'):
Expand Down

0 comments on commit 46a5ff7

Please sign in to comment.