接入clone-voice（新TTS）

xuxubaob · Feb 9, 2024 · ff8a5af · ff8a5af
1 parent db53421
commit ff8a5af
Show file tree

Hide file tree

Showing 7 changed files with 152 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@
 
 `Luna AI` 的外观由 `Live2D、Vtube Studio、xuniren 和 UE5 结合 Audio2Face` 技术打造，为用户提供了一个生动、互动的虚拟形象。这使得 `Luna AI` 能够在各大直播平台，如 `Bilibili、抖音、快手、微信视频号、斗鱼、YouTube、Twitch 和 TikTok`，进行实时互动直播。当然，它也可以在本地环境中与您进行个性化对话。
 
-为了使交流更加自然，`Luna AI` 使用了先进的自然语言处理技术，结合文本转语音系统，如 `Edge-TTS、VITS-Fast、elevenlabs、bark-gui、VALL-E-X、睿声AI、genshinvoice.top、tts.ai-lab.top、OpenVoice 和 GPT_SoVITS`。这不仅让它能够生成流畅的回答，还可以通过 `so-vits-svc 和 DDSP-SVC` 实现声音的变化，以适应不同的场景和角色。
+为了使交流更加自然，`Luna AI` 使用了先进的自然语言处理技术，结合文本转语音系统，如 `Edge-TTS、VITS-Fast、elevenlabs、bark-gui、VALL-E-X、睿声AI、genshinvoice.top、tts.ai-lab.top、OpenVoice、GPT_SoVITS 和 clone-voice`。这不仅让它能够生成流畅的回答，还可以通过 `so-vits-svc 和 DDSP-SVC` 实现声音的变化，以适应不同的场景和角色。
 
 此外，`Luna AI` 还能够通过特定指令与 `Stable Diffusion` 协作，展示画作。用户还可以自定义文案，让 Luna AI 循环播放，以满足不同场合的需求。
 

diff --git a/config.json b/config.json
@@ -490,7 +490,7 @@
     "voiceId": "b4b885c3-89a7-46d4-badb-015a55bb3a91"
   },
   "gradio_tts": {
-    "request_parameters": "{{\"url\": \"https://v2.genshinvoice.top/\", \"fn_index\": 0, \"data_analysis\": 1, \"text_input\": \"{content}\", \"speaker_option\": \"派蒙_ZH\", \"sdp_ratio\": 0.5, \"noise\": 0.6, \"noise_w\": 0.9, \"length\": 1, \"language\": \"ZH\", \"audio_prompt_url\": null, \"text_prompt\": \"Happy\", \"prompt_mode\": \"Text prompt\", \"auxiliary_text\": \"\", \"weight\": 0.7}}"
+    "request_parameters": "{{\"url\": \"https://xzjosh-nana7mi-bert-vits2.hf.space/--replicas/b9be4/\", \"fn_index\": 0, \"data_analysis\": 1, \"text_input\": \"{content}\", \"speaker_option\": \"Nana7mi\", \"sdp_ratio\": 0.5, \"noise\": 0.6, \"noise_w\": 0.9, \"length\": 1}}"
   },
   "gpt_sovits": {
     "type": "gradio",
@@ -509,6 +509,13 @@
       "emotion": "正常"
     }
   },
+  "clone_voice": {
+    "type": "tts",
+    "api_ip_port": "http://127.0.0.1:9988",
+    "voice": "cn-nan.wav",
+    "language": "zh-cn",
+    "speed": 1
+  },
   "choose_song": {
     "enable": true,
     "similarity": 0.5,
@@ -1240,7 +1247,8 @@
         "openai_tts": true,
         "reecho_ai": true,
         "gradio_tts": true,
-        "gpt_sovits": true
+        "gpt_sovits": true,
+        "clone_voice": true
       },
       "svc": {
         "ddsp_svc": true,

diff --git a/config.json.bak b/config.json.bak
@@ -490,7 +490,7 @@
     "voiceId": "b4b885c3-89a7-46d4-badb-015a55bb3a91"
   },
   "gradio_tts": {
-    "request_parameters": "{{\"url\": \"https://v2.genshinvoice.top/\", \"fn_index\": 0, \"data_analysis\": 1, \"text_input\": \"{content}\", \"speaker_option\": \"派蒙_ZH\", \"sdp_ratio\": 0.5, \"noise\": 0.6, \"noise_w\": 0.9, \"length\": 1, \"language\": \"ZH\", \"audio_prompt_url\": null, \"text_prompt\": \"Happy\", \"prompt_mode\": \"Text prompt\", \"auxiliary_text\": \"\", \"weight\": 0.7}}"
+    "request_parameters": "{{\"url\": \"https://xzjosh-nana7mi-bert-vits2.hf.space/--replicas/b9be4/\", \"fn_index\": 0, \"data_analysis\": 1, \"text_input\": \"{content}\", \"speaker_option\": \"Nana7mi\", \"sdp_ratio\": 0.5, \"noise\": 0.6, \"noise_w\": 0.9, \"length\": 1}}"
   },
   "gpt_sovits": {
     "type": "gradio",
@@ -509,6 +509,13 @@
       "emotion": "正常"
     }
   },
+  "clone_voice": {
+    "type": "tts",
+    "api_ip_port": "http://127.0.0.1:9988",
+    "voice": "cn-nan.wav",
+    "language": "zh-cn",
+    "speed": 1
+  },
   "choose_song": {
     "enable": true,
     "similarity": 0.5,
@@ -1240,7 +1247,8 @@
         "openai_tts": true,
         "reecho_ai": true,
         "gradio_tts": true,
-        "gpt_sovits": true
+        "gpt_sovits": true,
+        "clone_voice": true
       },
       "svc": {
         "ddsp_svc": true,

diff --git a/tests/test_clone_voice/api.py b/tests/test_clone_voice/api.py
@@ -0,0 +1,40 @@
+import json, logging, asyncio
+import aiohttp, requests, ssl
+from urllib.parse import urlencode
+import traceback
+from urllib.parse import urljoin
+
+async def clone_voice_api(text):
+    """Ask the local clone-voice server to synthesize `text`.
+
+    POSTs the form fields to http://127.0.0.1:9988/tts and returns the
+    "filename" entry of the JSON reply. Every failure is logged and
+    reported to the caller as None.
+    """
+    endpoint = 'http://127.0.0.1:9988/tts'
+
+    # Same fields as the query-string form of the request:
+    # voice=cn-nan.wav&text=%E4%BD%A0%E5%A5%BD&language=zh-cn&speed=1
+    form_fields = dict(voice="cn-nan.wav", language="zh-cn", speed=1, text=text)
+    print(f"params={form_fields}")
+
+    try:
+        async with aiohttp.ClientSession() as session, \
+                session.post(endpoint, data=form_fields) as response:
+            reply = await response.json()
+            print(reply)
+            return reply["filename"]
+    except aiohttp.ClientError as e:
+        # Transport-level failure (connection refused, bad response, ...).
+        logging.error(traceback.format_exc())
+        logging.error(f'clone_voice请求失败: {e}')
+    except Exception as e:
+        # Anything else, e.g. a reply that has no "filename" key.
+        logging.error(traceback.format_exc())
+        logging.error(f'clone_voice未知错误: {e}')
+
+    return None
+
+
+# Run the manual check only when this file is executed as a script, so that
+# merely importing the module does not send a request to the local server.
+if __name__ == "__main__":
+    asyncio.run(clone_voice_api("你好"))
diff --git a/utils/audio.py b/utils/audio.py
@@ -736,6 +736,17 @@ async def voice_change_and_put_to_queue(message, voice_tmp_path):
                 }
 
                 voice_tmp_path = await self.my_tts.gpt_sovits_api(data)  
+            elif message["tts_type"] == "clone_voice":
+                data = {
+                    "type": message["data"]["type"],
+                    "api_ip_port": message["data"]["api_ip_port"],
+                    "voice": message["data"]["voice"],
+                    "language": message["data"]["language"],
+                    "speed": message["data"]["speed"],
+                    "content": message["content"]
+                }
+
+                voice_tmp_path = await self.my_tts.clone_voice_api(data) 
             elif message["tts_type"] == "none":
                 pass
         except Exception as e:
@@ -1497,7 +1508,20 @@ async def voice_change_and_put_to_queue(voice_tmp_path):
 
                             # 调用接口合成语音
                             voice_tmp_path = await self.my_tts.gpt_sovits_api(content)
-
+
+                        elif audio_synthesis_type == "clone_voice":
+                            # Request settings are read from the "clone_voice" config
+                            # section; the text to synthesize travels under "content".
+                            data = {
+                                "type": self.config.get("clone_voice", "type"),
+                                "api_ip_port": self.config.get("clone_voice", "api_ip_port"),
+                                "voice": self.config.get("clone_voice", "voice"),
+                                "language": self.config.get("clone_voice", "language"),
+                                "speed": self.config.get("clone_voice", "speed"),
+                                "content": content
+                            }
+
+                            # Call the API to synthesize the speech. clone_voice_api
+                            # indexes its argument by key (data["api_ip_port"], ...),
+                            # so it must receive the dict, not the bare text.
+                            voice_tmp_path = await self.my_tts.clone_voice_api(data)
+
                         if voice_tmp_path is None:
                             raise Exception(f"{audio_synthesis_type}合成失败")
 

diff --git a/utils/audio_handle/my_tts.py b/utils/audio_handle/my_tts.py
@@ -667,3 +667,36 @@ async def websocket_client_logic(websocket, data_json):
             logging.error(f'gpt_sovits未知错误，请检查您的gpt_sovits推理是否启动/配置是否正确，报错内容: {e}')
 
         return None
+
+
+    async def clone_voice_api(self, data):
+        """Synthesize speech through a clone-voice server's /tts endpoint.
+
+        Args:
+            data (dict): request settings. Keys read here: "api_ip_port"
+                (base URL the "/tts" path is joined onto), "voice",
+                "language", "speed" and "content" (the text to synthesize).
+
+        Returns:
+            The "filename" value of the server's JSON reply (callers use it
+            as the path of the generated audio), or None when the request
+            fails or the reply carries no "filename".
+        """
+        API_URL = urljoin(data["api_ip_port"], '/tts')
+
+        # Form fields; equivalent query-string form of the request:
+        # voice=cn-nan.wav&text=%E4%BD%A0%E5%A5%BD&language=zh-cn&speed=1
+        params = {
+            "voice": data["voice"],
+            "language": data["language"],
+            "speed": data["speed"],
+            "text": data["content"]
+        }
+
+        logging.debug(f"params={params}")
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(API_URL, data=params) as response:
+                    ret = await response.json()
+                    logging.debug(ret)
+
+                    # A reply without "filename" means no audio was produced.
+                    # Log the reply itself instead of failing with an opaque
+                    # KeyError, so the server's own message reaches the log.
+                    if not isinstance(ret, dict) or "filename" not in ret:
+                        logging.error(f'clone_voice合成失败，接口返回内容: {ret}')
+                        return None
+
+                    return ret["filename"]
+
+        except aiohttp.ClientError as e:
+            # Transport-level failure (server not running, bad response, ...).
+            logging.error(traceback.format_exc())
+            logging.error(f'clone_voice请求失败: {e}')
+        except Exception as e:
+            logging.error(traceback.format_exc())
+            logging.error(f'clone_voice未知错误: {e}')
+
+        return None
diff --git a/webui.py b/webui.py
@@ -1272,7 +1272,14 @@ def common_textarea_handle(content):
                     config_data["gpt_sovits"]["webtts"]["lang"] = select_gpt_sovits_webtts_lang.value
                     config_data["gpt_sovits"]["webtts"]["speed"] = input_gpt_sovits_webtts_speed.value
                     config_data["gpt_sovits"]["webtts"]["emotion"] = input_gpt_sovits_webtts_emotion.value
-
+
+                if config.get("webui", "show_card", "tts", "clone_voice"):
+                    config_data["clone_voice"]["type"] = select_clone_voice_type.value
+                    config_data["clone_voice"]["api_ip_port"] = input_clone_voice_api_ip_port.value
+                    config_data["clone_voice"]["voice"] = input_clone_voice_voice.value
+                    config_data["clone_voice"]["language"] = select_clone_voice_language.value
+                    config_data["clone_voice"]["speed"] = float(input_clone_voice_speed.value)
+
             """
             SVC
             """
@@ -1541,6 +1548,7 @@ def common_textarea_handle(content):
                 config_data["webui"]["show_card"]["tts"]["reecho_ai"] = switch_webui_show_card_tts_reecho_ai.value
                 config_data["webui"]["show_card"]["tts"]["gradio_tts"] = switch_webui_show_card_tts_gradio_tts.value
                 config_data["webui"]["show_card"]["tts"]["gpt_sovits"] = switch_webui_show_card_tts_gpt_sovits.value
+                config_data["webui"]["show_card"]["tts"]["clone_voice"] = switch_webui_show_card_tts_clone_voice.value
 
                 config_data["webui"]["show_card"]["svc"]["ddsp_svc"] = switch_webui_show_card_svc_ddsp_svc.value
                 config_data["webui"]["show_card"]["svc"]["so_vits_svc"] = switch_webui_show_card_svc_so_vits_svc.value                
@@ -1694,6 +1702,7 @@ def common_textarea_handle(content):
                         'reecho_ai': '睿声AI',
                         'gradio_tts': 'Gradio',
                         'gpt_sovits': 'GPT_SoVITS',
+                        'clone_voice': 'clone-voice'
                     }, 
                     value=config.get("audio_synthesis_type")
                 ).style("width:200px;")
@@ -2902,6 +2911,27 @@ def common_textarea_handle(content):
                             ).style("width:200px;")
                             input_gpt_sovits_webtts_speed = ui.input(label='语速', value=config.get("gpt_sovits", "webtts", "speed"), placeholder='语速').style("width:200px;")
                             input_gpt_sovits_webtts_emotion = ui.input(label='情感', value=config.get("gpt_sovits", "webtts", "emotion"), placeholder='情感').style("width:200px;")
+
+            if config.get("webui", "show_card", "tts", "clone_voice"): 
+                with ui.card().style(card_css):
+                    ui.label("clone-voice")
+                    with ui.row():
+                        select_clone_voice_type = ui.select(
+                            label='API接口类型', 
+                            options={'tts':'tts'}, 
+                            value=config.get("clone_voice", "type")
+                        ).style("width:100px;")
+                        input_clone_voice_api_ip_port = ui.input(label='API地址', value=config.get("clone_voice", "api_ip_port"), placeholder='官方程序启动后监听的地址').style("width:200px;")
+                    with ui.row():
+                        input_clone_voice_voice = ui.input(label='参考音频路径', value=config.get("clone_voice", "voice"), placeholder='参考音频路径，建议填绝对路径').style("width:200px;")
+                        select_clone_voice_language = ui.select(
+                            label='需要合成的语种', 
+                            options={'zh-cn':'中文', 'ja':'日文', 'en':'英文',"ko":'ko',"es":'es',"de":'de',
+                                     "fr":'fr',"it":'it',"tr":'tr',"ru":'ru',"pt":'pt',"pl":'pl',"nl":'nl',"ar":'ar',"hu":'hu',"cs":'cs'}, 
+                            value=config.get("clone_voice", "language")
+                        ).style("width:200px;")
+                        input_clone_voice_speed = ui.input(label='语速', value=config.get("clone_voice", "speed"), placeholder='语速').style("width:100px;")
+
         with ui.tab_panel(svc_page).style(tab_panel_css):
             if config.get("webui", "show_card", "svc", "ddsp_svc"):
                 with ui.card().style(card_css):
@@ -3472,6 +3502,8 @@ def update_echart_gift():
                         switch_webui_show_card_tts_reecho_ai = ui.switch('reecho_ai', value=config.get("webui", "show_card", "tts", "reecho_ai")).style(switch_internal_css)
                         switch_webui_show_card_tts_gradio_tts = ui.switch('gradio', value=config.get("webui", "show_card", "tts", "gradio_tts")).style(switch_internal_css)
                         switch_webui_show_card_tts_gpt_sovits = ui.switch('gpt_sovits', value=config.get("webui", "show_card", "tts", "gpt_sovits")).style(switch_internal_css)
+                        switch_webui_show_card_tts_clone_voice = ui.switch('clone_voice', value=config.get("webui", "show_card", "tts", "clone_voice")).style(switch_internal_css)
+
                 with ui.card().style(card_css):
                     ui.label("变声")
                     with ui.row():