新增openai tts接入

Meeweston · Nov 9, 2023 · 525e703 · 525e703
1 parent acd96fd
commit 525e703
Show file tree

Hide file tree

Showing 7 changed files with 140 additions and 3 deletions.
diff --git a/config.json b/config.json
@@ -185,7 +185,7 @@
     "history_enable": true,
     "history_max_len": 500
   },
-  "audio_synthesis_type": "genshinvoice_top",
+  "audio_synthesis_type": "edge-tts",
   "audio_random_speed": {
     "normal": {
       "enable": false,
@@ -269,6 +269,13 @@
     "voice_preset": "ikaros",
     "voice_preset_file_path": "D:\\GitHub_pro\\AI-Vtuber\\tests\\test_VALL-E-X\\ikaros.npz"
   },
+  "openai_tts": {
+    "type": "api",
+    "api_ip_port": "https://ysharma-openai-tts-new.hf.space/--replicas/zcq5n/",
+    "model": "tts-1",
+    "voice": "nova",
+    "api_key": "你的openai api key"
+  },
   "chatterbot": {
     "name": "bot",
     "db_path": "db.sqlite3"

diff --git a/config.json.bak b/config.json.bak
@@ -185,7 +185,7 @@
     "history_enable": true,
     "history_max_len": 500
   },
-  "audio_synthesis_type": "genshinvoice_top",
+  "audio_synthesis_type": "edge-tts",
   "audio_random_speed": {
     "normal": {
       "enable": false,
@@ -269,6 +269,13 @@
     "voice_preset": "ikaros",
     "voice_preset_file_path": "D:\\GitHub_pro\\AI-Vtuber\\tests\\test_VALL-E-X\\ikaros.npz"
   },
+  "openai_tts": {
+    "type": "api",
+    "api_ip_port": "https://ysharma-openai-tts-new.hf.space/--replicas/zcq5n/",
+    "model": "tts-1",
+    "voice": "nova",
+    "api_key": "你的openai api key"
+  },
   "chatterbot": {
     "name": "bot",
     "db_path": "db.sqlite3"

diff --git a/requirements_common.txt b/requirements_common.txt
@@ -48,7 +48,7 @@ multidict==6.0.4
 murmurhash==1.0.9
 mypy-extensions==1.0.0
 numexpr==2.8.4
-openai==0.27.2
+openai==1.2.0
 openapi-schema-pydantic==1.2.4
 packaging==23.1
 parso==0.8.3

diff --git a/tests/test_openai_tts/hf.py b/tests/test_openai_tts/hf.py
@@ -0,0 +1,11 @@
+from gradio_client import Client
+
+# Manual smoke test: synthesize one short phrase through the OpenAI-TTS Gradio Space.
+SPACE_URL = "https://ysharma-openai-tts-new.hf.space/--replicas/zcq5n/"
+# Positional inputs of /tts_enter_key, in order: input text, model (tts-1 | tts-1-hd),
+# voice (alloy | echo | fable | onyx | nova | shimmer), OpenAI API key
+tts_inputs = ("你好", "tts-1", "nova", "sk-")
+
+client = Client(SPACE_URL)
+result = client.predict(*tts_inputs, api_name="/tts_enter_key")
+print(f"音频合成成功，输出到={result}")
diff --git a/utils/audio.py b/utils/audio.py
@@ -581,6 +581,28 @@ async def voice_change_and_put_to_queue(message, voice_tmp_path):
             except Exception as e:
                 logging.error(traceback.format_exc())
                 return
+        elif message["tts_type"] == "openai_tts":
+            try:
+                data = {
+                    "type": message["data"]["type"],
+                    "api_ip_port": message["data"]["api_ip_port"],
+                    "model": message["data"]["model"],
+                    "voice": message["data"]["voice"],
+                    "api_key": message["data"]["api_key"],
+                    "content": message["content"]
+                }
+
+                # 调用接口合成语音
+                voice_tmp_path = self.my_tts.openai_tts_api(data)
+                logging.info(f"openai_tts合成成功，合成内容：【{message['content']}】，输出到={voice_tmp_path}")
+
+                if voice_tmp_path is None:
+                    return
+
+                await voice_change_and_put_to_queue(message, voice_tmp_path)  
+            except Exception as e:
+                logging.error(traceback.format_exc())
+                return
 
 
     # 音频变速
@@ -950,6 +972,7 @@ async def copywriting_synthesis_audio(self, file_path, out_audio_path="out/"):
             edge_tts_config = self.config.get("edge-tts")
             bark_gui = self.config.get("bark_gui")
             vall_e_x = self.config.get("vall_e_x")
+            openai_tts = self.config.get("openai_tts")
             genshinvoice_top = self.config.get("genshinvoice_top")
             file_path = os.path.join(file_path)
 
@@ -1163,6 +1186,28 @@ async def voice_change_and_put_to_queue(voice_tmp_path):
                         voice_tmp_path = await self.my_tts.genshinvoice_top_api(content)
                         logging.info(f"genshinvoice_top合成成功，合成内容：【{content}】，输出到={voice_tmp_path}")
 
+                        if voice_tmp_path is None:
+                            return
+
+                        await voice_change_and_put_to_queue(voice_tmp_path)
+                    except Exception as e:
+                        logging.error(traceback.format_exc())
+                        return
+                elif audio_synthesis_type == "openai_tts":
+                    try:
+                        data = {
+                            "type": openai_tts["type"],
+                            "api_ip_port": openai_tts["api_ip_port"],
+                            "model": openai_tts["model"],
+                            "voice": openai_tts["voice"],
+                            "api_key": openai_tts["api_key"],
+                            "content": content
+                        }
+
+                        # 调用接口合成语音
+                        voice_tmp_path = self.my_tts.openai_tts_api(data)
+                        logging.info(f"openai_tts合成成功，合成内容：【{content}】，输出到={voice_tmp_path}")
+
                         if voice_tmp_path is None:
                             return
 

diff --git a/utils/audio_handle/my_tts.py b/utils/audio_handle/my_tts.py
@@ -245,3 +245,41 @@ async def genshinvoice_top_api(self, text):
             logging.error(f'genshinvoice.top未知错误: {e}')
 
         return None
+
+
+    # Request speech synthesis from OpenAI TTS (via a Gradio Space or the OpenAI SDK)
+    def openai_tts_api(self, data):
+        """Synthesize data["content"] to an mp3 and return the resulting path (None on failure).
+
+        data keys read here: type ("huggingface" or "api"), api_ip_port, model, voice, api_key, content.
+        """
+        try:
+            # Build the name once so the target path and the rename can never diverge
+            file_stem = 'openai_tts_' + self.common.get_bj_time(4)
+            if data["type"] == "huggingface":
+                client = Client(data["api_ip_port"])
+                result = client.predict(
+                    data["content"],  # str in 'Text' Textbox component
+                    data["model"],  # Literal[tts-1, tts-1-hd] in 'Model' Dropdown component
+                    data["voice"],  # Literal[alloy, echo, fable, onyx, nova, shimmer] in 'Voice Options'
+                    data["api_key"],  # str in 'OpenAI API Key' Textbox component
+                    api_name="/tts_enter_key"
+                )
+                return self.common.move_file(result, os.path.join(self.audio_out_path, file_stem), file_stem, "mp3")
+            elif data["type"] == "api":
+                from openai import OpenAI
+                client = OpenAI(api_key=data["api_key"])
+                response = client.audio.speech.create(
+                    model=data["model"],
+                    voice=data["voice"],
+                    input=data["content"]
+                )
+                voice_tmp_path = self.common.get_new_audio_path(self.audio_out_path, file_stem + '.mp3')
+                response.stream_to_file(voice_tmp_path)
+                return voice_tmp_path
+            # Unknown type: report it instead of silently falling through to an implicit None
+            logging.error(f'OpenAI_TTS请求失败: 未知的type={data["type"]}')
+            return None
+        except Exception as e:
+            logging.error(f'OpenAI_TTS请求失败: {e}')
+            return None
diff --git a/webui.py b/webui.py
@@ -535,6 +535,12 @@ def common_textarea_handle(content):
                 config_data["vall_e_x"]["accent"] = select_vall_e_x_accent.value
                 config_data["vall_e_x"]["voice_preset"] = input_vall_e_x_voice_preset.value
                 config_data["vall_e_x"]["voice_preset_file_path"] = input_vall_e_x_voice_preset_file_path.value
+
+                config_data["openai_tts"]["type"] = select_openai_tts_type.value
+                config_data["openai_tts"]["api_ip_port"] = input_openai_tts_api_ip_port.value
+                config_data["openai_tts"]["model"] = select_openai_tts_model.value
+                config_data["openai_tts"]["voice"] = select_openai_tts_voice.value
+                config_data["openai_tts"]["api_key"] = input_openai_tts_api_key.value
 
             """
             SVC
@@ -740,6 +746,7 @@ def common_textarea_handle(content):
                         'genshinvoice_top': 'genshinvoice_top',
                         'bark_gui': 'bark_gui',
                         'vall_e_x': 'VALL-E-X',
+                        'openai_tts': 'OpenAI TTS',
                     }, 
                     value=config.get("audio_synthesis_type")
                 ).style("width:200px;")
@@ -1377,6 +1384,28 @@ def common_textarea_handle(content):
 
                     input_vall_e_x_voice_preset = ui.input(label='voice preset', placeholder='VALL-E-X说话人预设名（Prompt name）', value=config.get("vall_e_x", "voice_preset")).style("width:300px;")
                     input_vall_e_x_voice_preset_file_path = ui.input(label='voice_preset_file_path', placeholder='VALL-E-X说话人预设文件路径（npz）', value=config.get("vall_e_x", "voice_preset_file_path")).style("width:300px;")
+            with ui.card().style("margin:10px 0px;background: linear-gradient(45deg, #3494E6, #EC6EAD);"):
+                ui.label("OpenAI TTS")
+                with ui.row():
+                    select_openai_tts_type = ui.select(
+                        label='类型', 
+                        options={'api': 'api', 'huggingface': 'huggingface'}, 
+                        value=config.get("openai_tts", "type")
+                    ).style("width:200px;")
+                    input_openai_tts_api_ip_port = ui.input(label='API地址', value=config.get("openai_tts", "api_ip_port"), placeholder='huggingface上对应项目的API地址').style("width:200px;")
+                with ui.row():
+                    select_openai_tts_model = ui.select(
+                        label='模型', 
+                        options={'tts-1': 'tts-1', 'tts-1-hd': 'tts-1-hd'}, 
+                        value=config.get("openai_tts", "model")
+                    ).style("width:200px;")
+                    select_openai_tts_voice = ui.select(
+                        label='说话人', 
+                        options={'alloy': 'alloy', 'echo': 'echo', 'fable': 'fable', 'onyx': 'onyx', 'nova': 'nova', 'shimmer': 'shimmer'}, 
+                        value=config.get("openai_tts", "voice")
+                    ).style("width:200px;")
+                    input_openai_tts_api_key = ui.input(label='api key', value=config.get("openai_tts", "api_key"), placeholder='OpenAI API KEY').style("width:200px;")
+
         with ui.tab_panel(svc_page).style("background: linear-gradient(45deg, #3494E6, #EC6EAD);"):
             with ui.card().style("margin:10px 0px;background: linear-gradient(45deg, #3494E6, #EC6EAD);"):
                 ui.label("DDSP-SVC")