TTS新增VALL-E-X的接入

Mintycats · Aug 29, 2023 · 17ec290 · 17ec290
1 parent 05f187a
commit 17ec290
Show file tree

Hide file tree

Showing 11 changed files with 265 additions and 13 deletions.
diff --git a/README.md b/README.md
@@ -1029,6 +1029,12 @@ for id, (key, name) in enumerate(obj.get_speakers().items()):
 训练用整合包（原作者整合包备份）：[https://pan.quark.cn/s/da52e78983da](https://pan.quark.cn/s/da52e78983da)  
 整合包视频教程：[AI声音克隆又进化了，10分钟学会声音克隆！一键启动包发布！](https://www.bilibili.com/video/BV1K94y1k7Bw)  
 
+### VALL-E-X
+官方仓库：[VALL-E-X](https://github.com/Plachtaa/VALL-E-X)  
+个人提供的已训练好的预设（preset）：[点我跳转](https://github.com/Ikaros-521/VALL-E-X/releases/tag/preset)  
+官方视频讲解：[仅需3秒音频实现声音克隆！我开源了微软的最新语音合成模型VALL-E X](https://www.bilibili.com/video/av617666708)  
+整合包下载：[https://pan.quark.cn/s/b817e285ab41](https://pan.quark.cn/s/b817e285ab41)  
+
 ### DDSP-SVC
 官方仓库：[DDSP-SVC](https://github.com/yxlllc/DDSP-SVC)  
 个人提供的已训练好的模型：[点我跳转](https://github.com/Ikaros-521/DDSP-SVC/releases)  
@@ -1192,6 +1198,7 @@ cmd运行`npm i docsify-cli -g`
 - [x] 支持关闭聊天（LLM等）
 - [x] langchain-chatglm的接入
 - [ ] Edge-TTS在合成音频时会出现合成成功但是系统找不到文件的bug
+- [ ] 抖音关注事件可以触发关注感谢话术
 
 ## 📝更新日志
 
@@ -1493,6 +1500,8 @@ cmd运行`npm i docsify-cli -g`
 - 2023-08-27
   - 新增web字幕打印机的接入
 
+- 2023-08-29
+  - TTS新增VALL-E-X的接入
 
 </details>
 

diff --git a/UI_main.py b/UI_main.py
@@ -86,7 +86,7 @@ def setupUi(self, MainWindow):
         self.scrollArea.setWidgetResizable(True)
         self.scrollArea.setObjectName("scrollArea")
         self.scrollAreaWidgetContents = QtWidgets.QWidget()
-        self.scrollAreaWidgetContents.setGeometry(QtCore.QRect(0, -8830, 984, 9572))
+        self.scrollAreaWidgetContents.setGeometry(QtCore.QRect(0, -6514, 984, 9648))
         self.scrollAreaWidgetContents.setObjectName("scrollAreaWidgetContents")
         self.verticalLayout = QtWidgets.QVBoxLayout(self.scrollAreaWidgetContents)
         self.verticalLayout.setContentsMargins(35, 20, 35, 20)
@@ -1086,6 +1086,14 @@ def setupUi(self, MainWindow):
         self.gridLayout_56.addWidget(self.checkBox_bark_gui_quick_generation, 5, 1, 1, 1)
         self.gridLayout_57.addLayout(self.gridLayout_56, 0, 0, 1, 1)
         self.verticalLayout.addWidget(self.groupBox_bark_gui)
+        self.groupBox_vall_e_x = QtWidgets.QGroupBox(self.scrollAreaWidgetContents)
+        self.groupBox_vall_e_x.setObjectName("groupBox_vall_e_x")
+        self.gridLayout_70 = QtWidgets.QGridLayout(self.groupBox_vall_e_x)
+        self.gridLayout_70.setObjectName("gridLayout_70")
+        self.gridLayout_vall_e_x = QtWidgets.QGridLayout()
+        self.gridLayout_vall_e_x.setObjectName("gridLayout_vall_e_x")
+        self.gridLayout_70.addLayout(self.gridLayout_vall_e_x, 0, 0, 1, 1)
+        self.verticalLayout.addWidget(self.groupBox_vall_e_x)
         self.groupBox_live2d = QtWidgets.QGroupBox(self.scrollAreaWidgetContents)
         self.groupBox_live2d.setObjectName("groupBox_live2d")
         self.gridLayout_30 = QtWidgets.QGridLayout(self.groupBox_live2d)
@@ -2048,6 +2056,7 @@ def retranslateUi(self, MainWindow):
         self.label_bark_gui_waveform_temperature.setText(_translate("MainWindow", "波形温度"))
         self.label_bark_gui_spk.setText(_translate("MainWindow", "说话人"))
         self.checkBox_bark_gui_quick_generation.setText(_translate("MainWindow", "启用"))
+        self.groupBox_vall_e_x.setTitle(_translate("MainWindow", "VALL-E-X"))
         self.groupBox_live2d.setTitle(_translate("MainWindow", "Live2D"))
         self.checkBox_live2d_enable.setText(_translate("MainWindow", "是"))
         self.label_live2d_port.setText(_translate("MainWindow", "端口"))

diff --git a/config.json b/config.json
@@ -126,7 +126,7 @@
     "history_enable": true,
     "history_max_len": 500
   },
-  "audio_synthesis_type": "edge-tts",
+  "audio_synthesis_type": "vall_e_x",
   "audio_random_speed": {
     "normal": {
       "enable": false,
@@ -200,6 +200,13 @@
     "seed": -1.0,
     "batch_count": 1
   },
+  "vall_e_x": {
+    "api_ip_port": "http://127.0.0.1:7860",
+    "language": "auto-detect",
+    "accent": "no-accent",
+    "voice_preset": "ikaros",
+    "voice_preset_file_path": "D:\\GitHub_pro\\AI-Vtuber\\tests\\test_VALL-E-X\\ikaros.npz"
+  },
   "chatterbot": {
     "name": "bot",
     "db_path": "db.sqlite3"
@@ -353,7 +360,7 @@
       }
     ],
     "random_play": true,
-    "play_interval": 5000
+    "play_interval": 30000
   },
   "web_captions_printer": {
     "enable": false,

diff --git a/config.json.bak b/config.json.bak
@@ -200,6 +200,13 @@
     "seed": -1.0,
     "batch_count": 1
   },
+  "vall_e_x": {
+    "api_ip_port": "http://127.0.0.1:7860",
+    "language": "auto-detect",
+    "accent": "no-accent",
+    "voice_preset": "ikaros",
+    "voice_preset_file_path": "D:\\GitHub_pro\\AI-Vtuber\\tests\\test_VALL-E-X\\ikaros.npz"
+  },
   "chatterbot": {
     "name": "bot",
     "db_path": "db.sqlite3"

diff --git a/main.py b/main.py
@@ -941,7 +941,7 @@ def init_config(self):
             self.ui.lineEdit_sparkdesk_api_key.setText(self.sparkdesk_config['api_key'])
 
             self.ui.comboBox_audio_synthesis_type.clear()
-            self.ui.comboBox_audio_synthesis_type.addItems(["Edge-TTS", "VITS", "VITS-Fast", "elevenlabs", "genshinvoice_top", "bark_gui"])
+            self.ui.comboBox_audio_synthesis_type.addItems(["Edge-TTS", "VITS", "VITS-Fast", "elevenlabs", "genshinvoice_top", "bark_gui", "VALL-E-X"])
             audio_synthesis_type_index = 0
             if self.audio_synthesis_type == "edge-tts":
                 audio_synthesis_type_index = 0
@@ -955,6 +955,8 @@ def init_config(self):
                 audio_synthesis_type_index = 4
             elif self.audio_synthesis_type == "bark_gui":
                 audio_synthesis_type_index = 5
+            elif self.audio_synthesis_type == "vall_e_x":
+                audio_synthesis_type_index = 6
             self.ui.comboBox_audio_synthesis_type.setCurrentIndex(audio_synthesis_type_index)
 
             self.ui.lineEdit_vits_fast_config_path.setText(self.vits_fast_config['config_path'])
@@ -1115,6 +1117,9 @@ def init_config(self):
                 talk_google_tgt_lang_index = 2 
             self.ui.comboBox_talk_google_tgt_lang.setCurrentIndex(talk_google_tgt_lang_index)
 
+            """
+            GUI部分 动态生成的widget
+            """
             # 定时任务动态加载
             data_json = []
             for index, tmp in enumerate(config.get("schedule")):
@@ -1596,6 +1601,77 @@ def web_captions_printer_gui_create():
 
             web_captions_printer_gui_create()
 
+            # VALL-E-X
+            def vall_e_x_gui_create():
+                data_json = []
+                vall_e_x_config = config.get("vall_e_x")
+
+                tmp_json = {
+                    "label_text": "API地址",
+                    "label_tip": "VALL-E-X启动后监听的ip端口地址",
+                    "data": vall_e_x_config["api_ip_port"],
+                    "main_obj_name": "vall_e_x",
+                    "index": 1
+                }
+                data_json.append(tmp_json)
+
+                tmp_json = {
+                    "label_text": "language",
+                    "label_tip": "VALL-E-X language",
+                    "widget_type": "combo_box",
+                    "combo_data_list": ['auto-detect', 'English', '中文', '日本語', 'Mix'],
+                    "data": vall_e_x_config["language"],
+                    "main_obj_name": "vall_e_x",
+                    "index": 1
+                }
+                data_json.append(tmp_json)
+
+                tmp_json = {
+                    "label_text": "accent",
+                    "label_tip": "VALL-E-X accent",
+                    "widget_type": "combo_box",
+                    "combo_data_list": ['no-accent', 'English', '中文', '日本語'],
+                    "data": vall_e_x_config["accent"],
+                    "main_obj_name": "vall_e_x",
+                    "index": 1
+                }
+                data_json.append(tmp_json)
+
+                tmp_json = {
+                    "label_text": "voice preset",
+                    "label_tip": "VALL-E-X说话人预设名（Prompt name）",
+                    "data": vall_e_x_config["voice_preset"],
+                    "main_obj_name": "vall_e_x",
+                    "index": 1
+                }
+                data_json.append(tmp_json)
+
+                tmp_json = {
+                    "label_text": "voice_preset_file_path",
+                    "label_tip": "VALL-E-X说话人预设文件路径（npz）",
+                    "data": vall_e_x_config["voice_preset_file_path"],
+                    "main_obj_name": "vall_e_x",
+                    "index": 1
+                }
+                data_json.append(tmp_json)
+
+                widgets = self.create_widgets_from_json(data_json)
+
+                # 动态添加widget到对应的gridLayout
+                row = 0
+                # 分2列，左边就是label说明，右边就是输入框等
+                for i in range(0, len(widgets), 2):
+                    self.ui.gridLayout_vall_e_x.addWidget(widgets[i], row, 0)
+                    self.ui.gridLayout_vall_e_x.addWidget(widgets[i + 1], row, 1)
+                    row += 1
+
+            vall_e_x_gui_create()
+
+
+            """
+            ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
+            -------------------------------------------------------------------------------------------------------------
+            """
 
             # 显隐各板块
             self.oncomboBox_chat_type_IndexChanged(chat_type_index)
@@ -1994,6 +2070,8 @@ def common_textEdit_handle(content):
                 config_data["audio_synthesis_type"] = "genshinvoice_top"
             elif audio_synthesis_type == "bark_gui":
                 config_data["audio_synthesis_type"] = "bark_gui"
+            elif audio_synthesis_type == "VALL-E-X":
+                config_data["audio_synthesis_type"] = "vall_e_x"
 
             # 音频随机变速
             config_data["audio_random_speed"]["normal"]["enable"] = self.ui.checkBox_audio_random_speed_normal_enable.isChecked()
@@ -2101,6 +2179,10 @@ def common_textEdit_handle(content):
 
             schedule_data = self.update_data_from_gridLayout(self.ui.gridLayout_schedule)
 
+            """
+            动态读取GUI内数据到配置变量
+            """
+
             def reorganize_schedule_data(schedule_data):
                 tmp_json = []
                 keys = list(schedule_data.keys())
@@ -2293,6 +2375,31 @@ def reorganize_web_captions_printer_data(web_captions_printer_data):
             # 写回json
             config_data["web_captions_printer"] = reorganize_web_captions_printer_data(web_captions_printer_data)
 
+            # VALL-E-X
+            def reorganize_vall_e_x_data(vall_e_x_data):
+                keys = list(vall_e_x_data.keys())
+
+                tmp_json = {
+                    "api_ip_port": vall_e_x_data[keys[0]],
+                    "language": vall_e_x_data[keys[1]],
+                    "accent": vall_e_x_data[keys[2]],
+                    "voice_preset": vall_e_x_data[keys[3]],
+                    "voice_preset_file_path": vall_e_x_data[keys[4]]
+                }
+
+                logging.debug(f"tmp_json={tmp_json}")
+
+                return tmp_json
+
+            vall_e_x_data = self.update_data_from_gridLayout(self.ui.gridLayout_vall_e_x)
+            # 写回json
+            config_data["vall_e_x"] = reorganize_vall_e_x_data(vall_e_x_data)
+
+            """
+            ↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑↑
+            -------------------------------------------------------------------------------------------------------------
+            """
+
             # 获取自定义板块显隐的数据
             show_box_data = self.update_data_from_gridLayout(self.ui.gridLayout_show_box, "show_box")
             show_box_json = {}
@@ -2962,25 +3069,28 @@ def oncomboBox_chat_type_IndexChanged(self, index):
 
 
     # 语音合成类型改变 加载显隐不同groupBox
+    # 当你新增TTS时，你需要同步修改此处的TTS配置的显隐
     def oncomboBox_audio_synthesis_type_IndexChanged(self, index):
         # 各index对应的groupbox的显隐值
         visibility_map = {
-            0: (1, 0, 0, 0, 0, 0),
-            1: (0, 1, 0, 0, 0, 0),
-            2: (0, 0, 1, 0, 0, 0),
-            3: (0, 0, 0, 1, 0, 0),
-            4: (0, 0, 0, 0, 1, 0),
-            5: (0, 0, 0, 0, 0, 1)
+            0: (1, 0, 0, 0, 0, 0, 0),
+            1: (0, 1, 0, 0, 0, 0, 0),
+            2: (0, 0, 1, 0, 0, 0, 0),
+            3: (0, 0, 0, 1, 0, 0, 0),
+            4: (0, 0, 0, 0, 1, 0, 0),
+            5: (0, 0, 0, 0, 0, 1, 0),
+            6: (0, 0, 0, 0, 0, 0, 1)
         }
 
-        visibility_values = visibility_map.get(index, (0, 0, 0, 0, 0, 0))
+        visibility_values = visibility_map.get(index, (0, 0, 0, 0, 0, 0, 0))
 
         self.ui.groupBox_edge_tts.setVisible(visibility_values[0])
         self.ui.groupBox_vits.setVisible(visibility_values[1])
         self.ui.groupBox_vits_fast.setVisible(visibility_values[2])
         self.ui.groupBox_elevenlabs.setVisible(visibility_values[3])
         self.ui.groupBox_genshinvoice_top.setVisible(visibility_values[4])
         self.ui.groupBox_bark_gui.setVisible(visibility_values[5])
+        self.ui.groupBox_vall_e_x.setVisible(visibility_values[6])
 
 
     # 语音识别类型改变 加载显隐不同groupBox

diff --git a/tests/test_VALL-E-X/api5.py b/tests/test_VALL-E-X/api5.py
@@ -0,0 +1,13 @@
+from gradio_client import Client
+
+client = Client("http://127.0.0.1:7860/")
+result = client.predict(
+				"こんにちは",	# str in 'Text' Textbox component
+				"auto-detect",	# str (Option from: ['auto-detect', 'English', '中文', '日本語', 'Mix']) in 'language' Dropdown component
+				"no-accent",	# str (Option from: ['no-accent', 'English', '中文', '日本語']) in 'accent' Dropdown component
+				"ikaros",	# str (Option from: ['astraea', 'cafe', 'dingzhen', 'esta', 'ikaros', 'MakiseKurisu', 'mikako', 'nymph', 'rosalia', 'seel', 'sohara', 'sukata', 'tomoki', 'tomoko', 'yaesakura', '早见沙织', '神里绫华-日语']) in 'Voice preset' Dropdown component
+				"ikaros.npz",	# str (filepath or URL to file) in 'parameter_46' File component
+				fn_index=5
+)
+print(type(result))
+print(result)
diff --git a/tests/test_VALL-E-X/ikaros.npz b/tests/test_VALL-E-X/ikaros.npz
diff --git a/ui/main.ui b/ui/main.ui
@@ -136,9 +136,9 @@ background-color: rgba(255, 255, 255, 50);
            <property name="geometry">
             <rect>
              <x>0</x>
-             <y>-8830</y>
+             <y>-6514</y>
              <width>984</width>
-             <height>9572</height>
+             <height>9648</height>
             </rect>
            </property>
            <layout class="QVBoxLayout" name="verticalLayout">
@@ -2032,6 +2032,18 @@ background-color: rgba(255, 255, 255, 50);
               </layout>
              </widget>
             </item>
+            <item>
+             <widget class="QGroupBox" name="groupBox_vall_e_x">
+              <property name="title">
+               <string>VALL-E-X</string>
+              </property>
+              <layout class="QGridLayout" name="gridLayout_70">
+               <item row="0" column="0">
+                <layout class="QGridLayout" name="gridLayout_vall_e_x"/>
+               </item>
+              </layout>
+             </widget>
+            </item>
             <item>
              <widget class="QGroupBox" name="groupBox_live2d">
               <property name="title">