Upgrade to new web service (babysor#529)
* Init new GUI

* Remove unused code

* Reset layout

* Add samples

* Make framework to support multiple pages

* Add vc mode

* Add preprocessing mode

* Add training mode

* Remove text input in vc mode

* Add entry for GUI and revise readme

* Move requirements together

* Raise an error when no model folder is found

* Add readme
babysor authored May 9, 2022
1 parent 7f799d3 commit c5d03fb
Showing 35 changed files with 1,966 additions and 71 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -13,7 +13,6 @@
*.bbl
*.bcf
*.toc
*.wav
*.sh
*/saved_models
!vocoder/saved_models/pretrained/**
8 changes: 8 additions & 0 deletions .vscode/launch.json
@@ -61,5 +61,13 @@
"-m", ".\\ppg2mel\\saved_models\\best_loss_step_304000.pth", "--wav_dir", ".\\wavs\\input", "--ref_wav_path", ".\\wavs\\pkq.mp3", "-o", ".\\wavs\\output\\"
]
},
{
"name": "GUI",
"type": "python",
"request": "launch",
"program": "mkgui\\base\\_cli.py",
"console": "integratedTerminal",
"args": []
},
]
}
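The new `GUI` entry launches `mkgui\base\_cli.py` in an integrated terminal. Outside VS Code, the same entry point can presumably be started by hand; a minimal sketch, assuming `mkgui/base/_cli.py` runs as a plain script exactly as the launch config implies:

```python
# Start the mkgui CLI from the repository root, mirroring the "GUI"
# launch configuration above (the script path is taken from that config).
import subprocess
import sys

subprocess.run([sys.executable, "mkgui/base/_cli.py"], check=True)
```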
16 changes: 10 additions & 6 deletions README-CN.md
@@ -18,6 +18,15 @@

🌍 **Webserver Ready** Serve your trained models so they can be called remotely

### Work in Progress
* Major upgrade and unification of the GUI/client
  - [x] Init framework `./mkgui` (based on streamlit + fastapi) and [tech design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee)
  - [x] Add demo pages for Voice Cloning and Conversion
  - [x] Add preprocessing and training pages for Voice Conversion
  - [ ] Add preprocessing and training pages for the other models
* Upgrade the model backend to ESPnet2


## Getting Started
### 1. Install Requirements
> Follow the original repository to check that your environment is fully set up.
@@ -82,15 +91,10 @@
### 3. Launch the Program or Toolbox
You can try the following commands:

### 3.1 Launch the web program:
### 3.1 Launch the web program (v2)
`python web.py`
Once it is running, open the address in your browser; the default is `http://localhost:8080`
![123](https://user-images.githubusercontent.com/12797292/135494044-ae59181c-fe3a-406f-9c7d-d21d12fdb4cb.png)
> Note: the current UI is fairly buggy,
> * The first time you click `录制` (Record), wait a few seconds for the browser to start recording properly; otherwise the audio will be doubled
> * When you finish recording, click `停止` (Stop) rather than `录制` (Record) again
> * Only new manual recordings (16 kHz) are supported; recordings over 4 MB are not, and the ideal length is 5–15 seconds
> * The first model found is used by default; if you are comfortable editing code, see `web\__init__.py`
### 3.2 Launch the toolbox:
`python demo_toolbox.py -d <datasets_root>`
8 changes: 8 additions & 0 deletions README.md
@@ -18,6 +18,14 @@

### [DEMO VIDEO](https://www.bilibili.com/video/BV17Q4y1B7mY/)

### Ongoing Work (Help Needed)
* Major upgrade of the GUI/client, unifying the web service and the toolbox
  - [x] Init framework `./mkgui` and [tech design](https://vaj2fgg8yn.feishu.cn/docs/doccnvotLWylBub8VJIjKzoEaee) (see the sketch below)
  - [x] Add demo pages for Voice Cloning and Conversion
  - [x] Add preprocessing and training for Voice Conversion
  - [ ] Add preprocessing and training for Encoder/Synthesizer/Vocoder
* Major upgrade of the model backend based on ESPnet2 (not yet started)
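A minimal sketch of what a `./mkgui` page looks like, distilled from the `mkgui/app.py` changes in this commit: a pydantic `Input`/`Output` pair plus a typed function whose docstring serves as the page description. How the function is mounted into the web app is not shown in this excerpt, so treat the shape below as illustrative rather than the framework's exact API:

```python
# Illustrative mkgui-style page, following the pattern in mkgui/app.py.
from pydantic import BaseModel, Field

class Input(BaseModel):
    message: str = Field(..., example="Hello", alias="Text")

class Output(BaseModel):
    result: str

    def render_output_ui(self, streamlit_app, input) -> None:
        # Optional hook: if present, it replaces the default output renderer.
        streamlit_app.write(self.result)

def run(input: Input) -> Output:
    """Shown as the page description."""
    return Output(result=input.message.upper())
```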

## Quick Start

### 1. Install Requirements
File renamed without changes.
70 changes: 48 additions & 22 deletions gui/app.py → mkgui/app.py
@@ -8,9 +8,11 @@
from scipy.io.wavfile import write
import re
import numpy as np
from opyrator.components.types import FileContent
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from synthesizer.inference import Synthesizer
from typing import Any
import matplotlib.pyplot as plt

# Constants
AUDIO_SAMPLES_DIR = 'samples\\'
@@ -27,58 +27,80 @@
if os.path.isdir(SYN_MODELS_DIRT):
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
print("Loaded synthesizer models: " + str(len(synthesizers)))
else:
raise Exception(f"Model folder {SYN_MODELS_DIRT} doesn't exist.")

if os.path.isdir(ENC_MODELS_DIRT):
encoders = Enum('encoders', list((file.name, file) for file in Path(ENC_MODELS_DIRT).glob("**/*.pt")))
print("Loaded encoders models: " + str(len(encoders)))
else:
raise Exception(f"Model folder {ENC_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
print("Loaded vocoders models: " + str(len(synthesizers)))
else:
raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")


class Input(BaseModel):
message: str = Field(
..., example="欢迎使用工具箱, 现已支持中文输入!", alias="文本内容"
)
local_audio_file: audio_input_selection = Field(
..., alias="输入语音(本地wav)",
description="选择本地语音文件."
)
upload_audio_file: FileContent = Field(..., alias="或上传语音",
upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
description="拖拽或点击上传.", mime_type="audio/wav")
encoder: encoders = Field(
..., alias="编码模型",
description="选择语音编码模型文件."
)
synthesizer: synthesizers = Field(
..., alias="合成模型",
description="选择语音编码模型文件."
description="选择语音合成模型文件."
)
vocoder: vocoders = Field(
..., alias="语音编码模型",
description="选择语音编码模型文件(目前只支持HifiGan类型)."
)
message: str = Field(
..., example="欢迎使用工具箱, 现已支持中文输入!", alias="输出文本内容"
..., alias="语音解码模型",
description="选择语音解码模型文件(目前只支持HifiGan类型)."
)

class AudioEntity(BaseModel):
content: bytes
mel: Any

class Output(BaseModel):
result_file: FileContent = Field(
...,
mime_type="audio/wav",
description="输出音频",
)
source_file: FileContent = Field(
...,
mime_type="audio/wav",
description="原始音频.",
)
__root__: tuple[AudioEntity, AudioEntity]

def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
"""Custom output UI.
If this method is implemented, it will be used instead of the default Output UI renderer.
"""
src, result = self.__root__

streamlit_app.subheader("Synthesized Audio")
streamlit_app.audio(result.content, format="audio/wav")

def mocking_bird(input: Input) -> Output:
"""欢迎使用MockingBird Web 2"""
fig, ax = plt.subplots()
ax.imshow(src.mel, aspect="equal", interpolation="none")
ax.set_title("mel spectrogram(Source Audio)")
streamlit_app.pyplot(fig)
fig, ax = plt.subplots()
ax.imshow(result.mel, aspect="equal", interpolation="none")
ax.set_title("mel spectrogram(Result Audio)")
streamlit_app.pyplot(fig)


def synthesize(input: Input) -> Output:
"""synthesize(合成)"""
# load models
encoder.load_model(Path(input.encoder.value))
current_synt = Synthesizer(Path(input.synthesizer.value))
gan_vocoder.load_model(Path(input.vocoder.value))

# load file
if input.upload_audio_file != NULL:
if input.upload_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.upload_audio_file.as_bytes())
f.seek(0)
@@ -87,6 +111,8 @@ def mocking_bird(input: Input) -> Output:
wav, sample_rate = librosa.load(input.local_audio_file.value)
write(TEMP_SOURCE_AUDIO, sample_rate, wav) #Make sure we get the correct wav

source_spec = Synthesizer.make_spectrogram(wav)

# preprocess
encoder_wav = encoder.preprocess_wav(wav, sample_rate)
embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
@@ -114,4 +140,4 @@ def mocking_bird(input: Input) -> Output:
source_file = f.read()
with open(TEMP_RESULT_AUDIO, "rb") as f:
result_file = f.read()
return Output(source_file=source_file, result_file=result_file)
return Output(__root__=(AudioEntity(content=source_file, mel=source_spec), AudioEntity(content=result_file, mel=spec)))
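One pattern worth noting in the file above: the saved-model choices are exposed as dynamically built `Enum`s, so an Enum-typed pydantic field can (presumably) be rendered as a dropdown by the UI layer. A minimal sketch of the construction, using the synthesizer directory from the code above:

```python
from enum import Enum
from pathlib import Path

SYN_MODELS_DIRT = "synthesizer\\saved_models"  # same constant as mkgui/app.py

# Each member is named after a checkpoint file and carries its Path as the
# value, which is why the handler later reads input.synthesizer.value.
synthesizers = Enum('synthesizers', list((file.name, file) for file in Path(SYN_MODELS_DIRT).glob("**/*.pt")))
```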
167 changes: 167 additions & 0 deletions mkgui/app_vc.py
@@ -0,0 +1,167 @@
from asyncio.windows_events import NULL
from synthesizer.inference import Synthesizer
from pydantic import BaseModel, Field
from encoder import inference as speacker_encoder
import torch
import os
from pathlib import Path
from enum import Enum
import ppg_extractor as Extractor
import ppg2mel as Convertor
import librosa
from scipy.io.wavfile import write
import re
import numpy as np
from mkgui.base.components.types import FileContent
from vocoder.hifigan import inference as gan_vocoder
from typing import Any
import matplotlib.pyplot as plt


# Constants
AUDIO_SAMPLES_DIR = 'samples\\'
EXT_MODELS_DIRT = "ppg_extractor\\saved_models"
CONV_MODELS_DIRT = "ppg2mel\\saved_models"
VOC_MODELS_DIRT = "vocoder\\saved_models"
TEMP_SOURCE_AUDIO = "wavs/temp_source.wav"
TEMP_TARGET_AUDIO = "wavs/temp_target.wav"
TEMP_RESULT_AUDIO = "wavs/temp_result.wav"

# Load local sample audio as options TODO: load dataset
if os.path.isdir(AUDIO_SAMPLES_DIR):
audio_input_selection = Enum('samples', list((file.name, file) for file in Path(AUDIO_SAMPLES_DIR).glob("*.wav")))
# Pre-Load models
if os.path.isdir(EXT_MODELS_DIRT):
extractors = Enum('extractors', list((file.name, file) for file in Path(EXT_MODELS_DIRT).glob("**/*.pt")))
print("Loaded extractor models: " + str(len(extractors)))
else:
raise Exception(f"Model folder {EXT_MODELS_DIRT} doesn't exist.")

if os.path.isdir(CONV_MODELS_DIRT):
convertors = Enum('convertors', list((file.name, file) for file in Path(CONV_MODELS_DIRT).glob("**/*.pth")))
print("Loaded convertor models: " + str(len(convertors)))
else:
raise Exception(f"Model folder {CONV_MODELS_DIRT} doesn't exist.")

if os.path.isdir(VOC_MODELS_DIRT):
vocoders = Enum('vocoders', list((file.name, file) for file in Path(VOC_MODELS_DIRT).glob("**/*gan*.pt")))
print("Loaded vocoders models: " + str(len(vocoders)))
else:
raise Exception(f"Model folder {VOC_MODELS_DIRT} doesn't exist.")

class Input(BaseModel):
local_audio_file: audio_input_selection = Field(
..., alias="输入语音(本地wav)",
description="选择本地语音文件."
)
upload_audio_file: FileContent = Field(default=None, alias="或上传语音",
description="拖拽或点击上传.", mime_type="audio/wav")
local_audio_file_target: audio_input_selection = Field(
..., alias="目标语音(本地wav)",
description="选择本地语音文件."
)
upload_audio_file_target: FileContent = Field(default=None, alias="或上传目标语音",
description="拖拽或点击上传.", mime_type="audio/wav")
extractor: extractors = Field(
..., alias="编码模型",
description="选择语音编码模型文件."
)
convertor: convertors = Field(
..., alias="转换模型",
description="选择语音转换模型文件."
)
vocoder: vocoders = Field(
..., alias="语音编码模型",
description="选择语音解码模型文件(目前只支持HifiGan类型)."
)

class AudioEntity(BaseModel):
content: bytes
mel: Any

class Output(BaseModel):
__root__: tuple[AudioEntity, AudioEntity, AudioEntity]

def render_output_ui(self, streamlit_app, input) -> None: # type: ignore
"""Custom output UI.
If this method is implemented, it will be used instead of the default Output UI renderer.
"""
src, target, result = self.__root__

streamlit_app.subheader("Synthesized Audio")
streamlit_app.audio(result.content, format="audio/wav")

fig, ax = plt.subplots()
ax.imshow(src.mel, aspect="equal", interpolation="none")
ax.set_title("mel spectrogram(Source Audio)")
streamlit_app.pyplot(fig)
fig, ax = plt.subplots()
ax.imshow(target.mel, aspect="equal", interpolation="none")
ax.set_title("mel spectrogram(Target Audio)")
streamlit_app.pyplot(fig)
fig, ax = plt.subplots()
ax.imshow(result.mel, aspect="equal", interpolation="none")
ax.set_title("mel spectrogram(Result Audio)")
streamlit_app.pyplot(fig)

def convert(input: Input) -> Output:
"""convert(转换)"""
# load models
extractor = Extractor.load_model(Path(input.extractor.value))
convertor = Convertor.load_model(Path(input.convertor.value))
# current_synt = Synthesizer(Path(input.synthesizer.value))
gan_vocoder.load_model(Path(input.vocoder.value))

# load file
if input.upload_audio_file != None:
with open(TEMP_SOURCE_AUDIO, "w+b") as f:
f.write(input.upload_audio_file.as_bytes())
f.seek(0)
src_wav, sample_rate = librosa.load(TEMP_SOURCE_AUDIO)
else:
src_wav, sample_rate = librosa.load(input.local_audio_file.value)
write(TEMP_SOURCE_AUDIO, sample_rate, src_wav) #Make sure we get the correct wav

if input.upload_audio_file_target != None:
with open(TEMP_TARGET_AUDIO, "w+b") as f:
f.write(input.upload_audio_file_target.as_bytes())
f.seek(0)
ref_wav, _ = librosa.load(TEMP_TARGET_AUDIO)
else:
ref_wav, _ = librosa.load(input.local_audio_file_target.value)
write(TEMP_TARGET_AUDIO, sample_rate, ref_wav) #Make sure we get the correct wav

ppg = extractor.extract_from_wav(src_wav)
# Import necessary dependency of Voice Conversion
from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv
ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
speacker_encoder.load_model(Path("encoder/saved_models/pretrained_bak_5805000.pt"))
embed = speacker_encoder.embed_utterance(ref_wav)
lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
min_len = min(ppg.shape[1], len(lf0_uv))
ppg = ppg[:, :min_len]
lf0_uv = lf0_uv[:min_len]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_, mel_pred, att_ws = convertor.inference(
ppg,
logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
spembs=torch.from_numpy(embed).unsqueeze(0).to(device),
)
mel_pred= mel_pred.transpose(0, 1)
breaks = [mel_pred.shape[1]]
mel_pred= mel_pred.detach().cpu().numpy()

# synthesize and vocode
wav, sample_rate = gan_vocoder.infer_waveform(mel_pred)

# write and output
write(TEMP_RESULT_AUDIO, sample_rate, wav) #Make sure we get the correct wav
with open(TEMP_SOURCE_AUDIO, "rb") as f:
source_file = f.read()
with open(TEMP_TARGET_AUDIO, "rb") as f:
target_file = f.read()
with open(TEMP_RESULT_AUDIO, "rb") as f:
result_file = f.read()


return Output(__root__=(AudioEntity(content=source_file, mel=Synthesizer.make_spectrogram(src_wav)), AudioEntity(content=target_file, mel=Synthesizer.make_spectrogram(ref_wav)), AudioEntity(content=result_file, mel=Synthesizer.make_spectrogram(wav))))
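The conversion step above shifts the source speaker's log-F0 toward the target speaker's statistics before vocoding. `get_converted_lf0uv` comes from `utils.f0_utils`, whose internals are not part of this diff; the following is only a sketch of the usual Gaussian statistics-matching idea, not the verified implementation:

```python
import numpy as np

def convert_lf0(src_lf0: np.ndarray, ref_mean: float, ref_std: float) -> np.ndarray:
    # Normalize the source log-F0 track, then rescale it to the
    # reference (target) speaker's mean and standard deviation.
    src_mean, src_std = src_lf0.mean(), src_lf0.std()
    return (src_lf0 - src_mean) / src_std * ref_std + ref_mean
```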
2 changes: 2 additions & 0 deletions mkgui/base/__init__.py
@@ -0,0 +1,2 @@

from .core import Opyrator
1 change: 1 addition & 0 deletions mkgui/base/api/__init__.py
@@ -0,0 +1 @@
from .fastapi_app import create_api
