forked from babysor/MockingBird
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Upgrade to new web service (babysor#529)
* Init new GUI
* Remove unused codes
* Reset layout
* Add samples
* Make framework to support multiple pages
* Add vc mode
* Add preprocessing mode
* Add training mode
* Remove text input in vc mode
* Add entry for GUI and revise readme
* Move requirement together
* Add error raise when no model folder found
* Add readme
- Loading branch information
Showing
35 changed files
with
1,966 additions
and
71 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,6 @@ | |
*.bbl | ||
*.bcf | ||
*.toc | ||
*.wav | ||
*.sh | ||
*/saved_models | ||
!vocoder/saved_models/pretrained/** | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
from asyncio.windows_events import NULL | ||
from synthesizer.inference import Synthesizer | ||
from pydantic import BaseModel, Field | ||
from encoder import inference as speacker_encoder | ||
import torch | ||
import os | ||
from pathlib import Path | ||
from enum import Enum | ||
import ppg_extractor as Extractor | ||
import ppg2mel as Convertor | ||
import librosa | ||
from scipy.io.wavfile import write | ||
import re | ||
import numpy as np | ||
from mkgui.base.components.types import FileContent | ||
from vocoder.hifigan import inference as gan_vocoder | ||
from typing import Any | ||
import matplotlib.pyplot as plt | ||
|
||
|
||
# Constants
# Folder locations are assembled with os.path.join so they use the running
# platform's separator; on Windows the resulting strings are identical to the
# previously hard-coded backslash versions.
AUDIO_SAMPLES_DIR = os.path.join("samples", "")  # keeps the trailing separator
EXT_MODELS_DIRT = os.path.join("ppg_extractor", "saved_models")
CONV_MODELS_DIRT = os.path.join("ppg2mel", "saved_models")
VOC_MODELS_DIRT = os.path.join("vocoder", "saved_models")
# Scratch wav files used to hand the source, target and converted audio back
# to the UI.
TEMP_SOURCE_AUDIO = "wavs/temp_source.wav"
TEMP_TARGET_AUDIO = "wavs/temp_target.wav"
TEMP_RESULT_AUDIO = "wavs/temp_result.wav"
|
||
def _enum_from_folder(enum_name, folder, pattern, label="Model"):
    # Build an Enum whose members map each matching file's name to its Path.
    # Raises Exception when `folder` is not an existing directory, so a missing
    # folder is reported by name instead of failing later with a NameError.
    if not os.path.isdir(folder):
        raise Exception(f"{label} folder {folder} doesn't exist.")
    return Enum(enum_name, [(file.name, file) for file in Path(folder).glob(pattern)])


# Load local sample audio as options TODO: load dataset
audio_input_selection = _enum_from_folder('samples', AUDIO_SAMPLES_DIR, "*.wav", label="Sample")

# Pre-Load models
extractors = _enum_from_folder('extractors', EXT_MODELS_DIRT, "**/*.pt")
print("Loaded extractor models: " + str(len(extractors)))

convertors = _enum_from_folder('convertors', CONV_MODELS_DIRT, "**/*.pth")
print("Loaded convertor models: " + str(len(convertors)))

# Only files with "gan" in their name are offered as vocoders.
vocoders = _enum_from_folder('vocoders', VOC_MODELS_DIRT, "**/*gan*.pt")
print("Loaded vocoders models: " + str(len(vocoders)))
|
||
# Form model for the voice-conversion page. Field order is the declaration
# order below. `alias` and `description` are runtime strings and are kept
# exactly as they were. Plain comments are used instead of a class docstring
# so the generated schema is not given a new description.
class Input(BaseModel):
    # Source utterance picked from the local sample wav files.
    local_audio_file: audio_input_selection = Field(
        ...,
        description="选择本地语音文件.",
        alias="输入语音(本地wav)",
    )
    # Optional uploaded source utterance; convert() prefers it when present.
    upload_audio_file: FileContent = Field(
        None,
        description="拖拽或点击上传.",
        alias="或上传语音",
        mime_type="audio/wav",
    )
    # Target (reference) utterance picked from the local sample wav files.
    local_audio_file_target: audio_input_selection = Field(
        ...,
        description="选择本地语音文件.",
        alias="目标语音(本地wav)",
    )
    # Optional uploaded target utterance; convert() prefers it when present.
    upload_audio_file_target: FileContent = Field(
        None,
        description="拖拽或点击上传.",
        alias="或上传目标语音",
        mime_type="audio/wav",
    )
    # Checkpoint handed to the PPG extractor loader.
    extractor: extractors = Field(
        ...,
        description="选择语音编码模型文件.",
        alias="编码模型",
    )
    # Checkpoint handed to the PPG-to-mel convertor loader.
    convertor: convertors = Field(
        ...,
        description="选择语音转换模型文件.",
        alias="转换模型",
    )
    # Checkpoint handed to the HiFi-GAN vocoder loader.
    # NOTE(review): the alias reads "encoding model" while the description
    # says "decoding model" — looks like a label typo; left unchanged because
    # the alias is also the field's external name.
    vocoder: vocoders = Field(
        ...,
        description="选择语音解码模型文件(目前只支持HifiGan类型).",
        alias="语音编码模型",
    )
|
||
# One audio clip handed to the output renderer.
class AudioEntity(BaseModel):
    # Raw bytes of the clip's wav file; played back with format "audio/wav".
    content: bytes
    # Spectrogram data drawn with imshow by the output renderer.
    mel: Any
|
||
# Result model of the voice-conversion page: a (source, target, result)
# triple of audio entities stored as the model's root value.
class Output(BaseModel):
    __root__: tuple[AudioEntity, AudioEntity, AudioEntity]

    def render_output_ui(self, streamlit_app, input) -> None:  # type: ignore
        """Custom output UI.

        If this method is implemented, it will be used instead of the default
        Output UI renderer.

        Plays the result audio and draws one mel-spectrogram figure each for
        the source, target and result entities, in that order.
        """
        src, target, result = self.__root__

        streamlit_app.subheader("Synthesized Audio")
        streamlit_app.audio(result.content, format="audio/wav")

        panels = (
            (src.mel, "mel spectrogram(Source Audio)"),
            (target.mel, "mel spectrogram(Target Audio)"),
            (result.mel, "mel spectrogram(Result Audio)"),
        )
        for mel, title in panels:
            fig, ax = plt.subplots()
            try:
                ax.imshow(mel, aspect="equal", interpolation="none")
                ax.set_title(title)
                streamlit_app.pyplot(fig)
            finally:
                # pyplot keeps every figure it creates alive until it is
                # closed; release it so repeated renders do not accumulate.
                plt.close(fig)
|
||
def _ensure_parent_dir(file_path):
    # Create the folder that will contain file_path when it does not exist yet,
    # so the first write of a temp wav does not fail on a fresh checkout.
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _load_input_wav(uploaded, local_choice, temp_path):
    # Load one input utterance and leave a wav copy of it at temp_path.
    # An uploaded file takes precedence over the local sample selection.
    # Returns the (waveform, sample_rate) pair produced by librosa.load.
    _ensure_parent_dir(temp_path)
    if uploaded is not None:
        with open(temp_path, "wb") as f:
            f.write(uploaded.as_bytes())
        return librosa.load(temp_path)
    wav, sample_rate = librosa.load(local_choice.value)
    write(temp_path, sample_rate, wav)  # Make sure we get the correct wav
    return wav, sample_rate


def convert(input: Input) -> Output:
    """convert(转换)"""
    # NOTE(review): the docstring above is kept byte-for-byte; the GUI
    # framework presumably surfaces it as the operation's description, which
    # would make it runtime text — confirm before rewording it.
    #
    # Converts the source utterance towards the target speaker's voice.
    #   input: form values naming the source/target audio and the extractor,
    #          convertor and vocoder checkpoints to load.
    #   returns: Output holding (source, target, result) audio entities, each
    #            with its wav file bytes and a spectrogram for display.
    # Side effects: (re)writes the three TEMP_*_AUDIO files.

    # load models
    extractor = Extractor.load_model(Path(input.extractor.value))
    convertor = Convertor.load_model(Path(input.convertor.value))
    gan_vocoder.load_model(Path(input.vocoder.value))

    # load files
    src_wav, _ = _load_input_wav(
        input.upload_audio_file, input.local_audio_file, TEMP_SOURCE_AUDIO
    )
    ref_wav, _ = _load_input_wav(
        input.upload_audio_file_target, input.local_audio_file_target, TEMP_TARGET_AUDIO
    )

    ppg = extractor.extract_from_wav(src_wav)
    # Import necessary dependency of Voice Conversion
    from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
    # The speaker encoder checkpoint is fixed rather than chosen in the form.
    speacker_encoder.load_model(Path("encoder/saved_models/pretrained_bak_5805000.pt"))
    embed = speacker_encoder.embed_utterance(ref_wav)
    lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)

    # Trim both feature streams to their common length before inference.
    min_len = min(ppg.shape[1], len(lf0_uv))
    ppg = ppg[:, :min_len]
    lf0_uv = lf0_uv[:min_len]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    _, mel_pred, _ = convertor.inference(
        ppg,
        logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
        spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
    )
    mel_pred = mel_pred.transpose(0, 1).detach().cpu().numpy()

    # synthesize and vocode
    wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)

    # write and output
    _ensure_parent_dir(TEMP_RESULT_AUDIO)
    write(TEMP_RESULT_AUDIO, sample_rate, wav)  # Make sure we get the correct wav
    source_file = Path(TEMP_SOURCE_AUDIO).read_bytes()
    target_file = Path(TEMP_TARGET_AUDIO).read_bytes()
    result_file = Path(TEMP_RESULT_AUDIO).read_bytes()

    return Output(__root__=(
        AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)),
        AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)),
        AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav)),
    ))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
|
||
from .core import Opyrator |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .fastapi_app import create_api |
Oops, something went wrong.