对接 fish-speech

molihuan · Mar 12, 2024 · 344006f · 344006f
1 parent dee9065
commit 344006f
Show file tree

Hide file tree

Showing 7 changed files with 313 additions and 6 deletions.
diff --git a/config.json b/config.json
@@ -569,6 +569,37 @@
     "region": "japanwest",
     "voice_name": "zh-CN-XiaoyanNeural"
   },
+  "fish_speech": {
+    "api_ip_port": "http://127.0.0.1:8000",
+    "model_name": "default",
+    "model_config": {
+      "device": "cuda",
+      "llama": {
+        "config_name": "text2semantic_finetune",
+        "checkpoint_path": "checkpoints/text2semantic-400m-v0.2-4k.pth",
+        "precision": "bfloat16",
+        "tokenizer": "fishaudio/speech-lm-v1",
+        "compile": true
+      },
+      "vqgan": {
+        "config_name": "vqgan_pretrain",
+        "checkpoint_path": "checkpoints/vqgan-v1.pth"
+      }
+    },
+    "tts_config": {
+      "prompt_text": "",
+      "prompt_tokens": "",
+      "max_new_tokens": 0,
+      "top_k": 3,
+      "top_p": 0.5,
+      "repetition_penalty": 1.5,
+      "temperature": 0.7,
+      "order": "zh,jp,en",
+      "use_g2p": true,
+      "seed": 1,
+      "speaker": ""
+    }
+  },
   "choose_song": {
     "enable": true,
     "similarity": 0.5,
@@ -1324,7 +1355,8 @@
         "gradio_tts": true,
         "gpt_sovits": true,
         "clone_voice": true,
-        "azure_tts": true
+        "azure_tts": true,
+        "fish_speech": true
       },
       "svc": {
         "ddsp_svc": true,

diff --git a/config.json.bak b/config.json.bak
@@ -569,6 +569,37 @@
     "region": "japanwest",
     "voice_name": "zh-CN-XiaoyanNeural"
   },
+  "fish_speech": {
+    "api_ip_port": "http://127.0.0.1:8000",
+    "model_name": "default",
+    "model_config": {
+      "device": "cuda",
+      "llama": {
+        "config_name": "text2semantic_finetune",
+        "checkpoint_path": "checkpoints/text2semantic-400m-v0.2-4k.pth",
+        "precision": "bfloat16",
+        "tokenizer": "fishaudio/speech-lm-v1",
+        "compile": true
+      },
+      "vqgan": {
+        "config_name": "vqgan_pretrain",
+        "checkpoint_path": "checkpoints/vqgan-v1.pth"
+      }
+    },
+    "tts_config": {
+      "prompt_text": "",
+      "prompt_tokens": "",
+      "max_new_tokens": 0,
+      "top_k": 3,
+      "top_p": 0.5,
+      "repetition_penalty": 1.5,
+      "temperature": 0.7,
+      "order": "zh,jp,en",
+      "use_g2p": true,
+      "seed": 1,
+      "speaker": ""
+    }
+  },
   "choose_song": {
     "enable": true,
     "similarity": 0.5,
@@ -1324,7 +1355,8 @@
         "gradio_tts": true,
         "gpt_sovits": true,
         "clone_voice": true,
-        "azure_tts": true
+        "azure_tts": true,
+        "fish_speech": true
       },
       "svc": {
         "ddsp_svc": true,

diff --git a/tests/test_fish_speech/1.wav b/tests/test_fish_speech/1.wav
diff --git a/tests/test_fish_speech/api.py b/tests/test_fish_speech/api.py
@@ -0,0 +1,106 @@
+import json, logging
+import aiohttp, asyncio
+from urllib.parse import urljoin
+
+async def fish_speech_load_model(data):
+    """Ask the fish-speech server to load a model.
+
+    Sends ``data["model_config"]`` as the JSON body of a PUT request to
+    ``/v1/models/<model_name>`` resolved against ``data["api_ip_port"]``.
+
+    Args:
+        data: dict providing the keys ``api_ip_port``, ``model_name`` and
+            ``model_config``.
+
+    Returns:
+        The decoded JSON response when the server answers 200 and the
+        ``name`` field of the response equals ``data["model_name"]``;
+        ``None`` in every other case (failures are printed).
+    """
+    API_URL = urljoin(data["api_ip_port"], f'/v1/models/{data["model_name"]}')
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.put(API_URL, json=data["model_config"]) as response:
+                if response.status != 200:
+                    # Report the HTTP status instead of failing silently.
+                    print(f'fish_speech模型加载失败: {response.status}')
+                    return None
+
+                ret = await response.json()
+                print(ret)
+
+                if ret["name"] == data["model_name"]:
+                    print(f'fish_speech模型加载成功: {ret["name"]}')
+                    return ret
+
+                # A 200 answer naming a different model is treated as a failure.
+                print(f'fish_speech模型加载失败: 返回的模型名称不匹配: {ret["name"]}')
+                return None
+
+    except aiohttp.ClientError as e:
+        print(f'fish_speech请求失败: {e}')
+    except Exception as e:
+        # Also reached when the response body is not the expected JSON object.
+        print(f'fish_speech未知错误: {e}')
+
+    return None
+
+async def fish_speech_api(data):
+    """Request speech synthesis from the fish-speech server and save the audio.
+
+    Posts the entries of ``data["tts_config"]`` as JSON to
+    ``/v1/models/<model_name>/invoke`` resolved against
+    ``data["api_ip_port"]`` and writes the response body to ``1.wav`` in the
+    current working directory.
+
+    Args:
+        data: dict providing the keys ``api_ip_port``, ``model_name`` and
+            ``tts_config``. The dict is not modified.
+
+    Returns:
+        The path of the written file (``"1.wav"``) on success, ``None`` on
+        any failure (failures are printed).
+    """
+    API_URL = urljoin(data["api_ip_port"], f'/v1/models/{data["model_name"]}/invoke')
+
+    print(f"data={data}")
+
+    # Empty strings are sent as JSON null. A new dict is built so the
+    # caller's configuration keeps its original values.
+    payload = {
+        key: (None if value == "" else value)
+        for key, value in data["tts_config"].items()
+    }
+
+    print(f"payload={payload}")
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(API_URL, json=payload) as response:
+                if response.status != 200:
+                    print(f'fish_speech下载音频失败: {response.status}')
+                    return None
+
+                content = await response.read()
+
+                # Fixed output name: this script is a manual smoke test.
+                voice_tmp_path = "1.wav"
+                with open(voice_tmp_path, 'wb') as file:
+                    file.write(content)
+
+                return voice_tmp_path
+    except aiohttp.ClientError as e:
+        print(f'fish_speech请求失败: {e}')
+    except Exception as e:
+        print(f'fish_speech未知错误: {e}')
+
+    return None
+
+
+# Body that fish_speech_load_model sends with its PUT request.
+_model_config = {
+    "device": "cuda",
+    "llama": {
+        "config_name": "text2semantic_finetune",
+        "checkpoint_path": "checkpoints/text2semantic-400m-v0.2-4k.pth",
+        "precision": "bfloat16",
+        "tokenizer": "fishaudio/speech-lm-v1",
+        "compile": True,
+    },
+    "vqgan": {
+        "config_name": "vqgan_pretrain",
+        "checkpoint_path": "checkpoints/vqgan-v1.pth",
+    },
+}
+
+# Body that fish_speech_api sends with its POST request; the text to speak
+# is added just before the call below.
+_tts_config = {
+    "prompt_text": "",
+    "prompt_tokens": "",
+    "max_new_tokens": 0,
+    "top_k": 3,
+    "top_p": 0.5,
+    "repetition_penalty": 1.5,
+    "temperature": 0.7,
+    "order": "zh,jp,en",
+    "use_g2p": True,
+    "seed": 1,
+    "speaker": "",
+}
+
+# Same layout as the "fish_speech" section of config.json.
+data = {
+    "fish_speech": {
+        "api_ip_port": "http://127.0.0.1:8000",
+        "model_name": "default",
+        "model_config": _model_config,
+        "tts_config": _tts_config,
+    }
+}
+
+_settings = data["fish_speech"]
+
+# Step 1: have the server load the model.
+asyncio.run(fish_speech_load_model(_settings))
+
+# Step 2: synthesize a short sample sentence.
+_settings["tts_config"]["text"] = "你好"
+asyncio.run(fish_speech_api(_settings))
diff --git a/utils/audio.py b/utils/audio.py
@@ -863,6 +863,11 @@ async def voice_change_and_put_to_queue(message, voice_tmp_path):
                 }
 
                 voice_tmp_path = self.my_tts.azure_tts_api(data) 
+            elif message["tts_type"] == "fish_speech":
+                data = message["data"]
+                data["tts_config"]["text"] = message["content"]
+
+                voice_tmp_path = await self.my_tts.fish_speech_api(data) 
             elif message["tts_type"] == "none":
                 pass
         except Exception as e:
@@ -1596,6 +1601,14 @@ async def audio_synthesis_use_local_config(self, content, audio_synthesis_type="
             logging.debug(f"data={data}")
 
             voice_tmp_path = self.my_tts.azure_tts_api(data) 
+        elif audio_synthesis_type == "fish_speech":
+            data = self.config.get("fish_speech")
+            data["tts_config"]["text"] = content
+
+            logging.debug(f"data={data}")
+
+            voice_tmp_path = await self.my_tts.fish_speech_api(data) 
+
 
         return voice_tmp_path
 

diff --git a/utils/audio_handle/my_tts.py b/utils/audio_handle/my_tts.py
@@ -746,4 +746,67 @@ def azure_tts_api(self, data):
             logging.error(traceback.format_exc())
             logging.error(f'azure_tts未知错误: {e}')
 
-            return None
+            return None
+
+
+    async def fish_speech_load_model(self, data):
+        """Ask the fish-speech server to load a model.
+
+        Sends ``data["model_config"]`` as the JSON body of a PUT request to
+        ``/v1/models/<model_name>`` resolved against ``data["api_ip_port"]``.
+
+        Args:
+            data: dict providing the keys ``api_ip_port``, ``model_name``
+                and ``model_config``.
+
+        Returns:
+            The decoded JSON response when the server answers 200 and the
+            ``name`` field of the response equals ``data["model_name"]``;
+            ``None`` in every other case (failures are logged).
+        """
+        API_URL = urljoin(data["api_ip_port"], f'/v1/models/{data["model_name"]}')
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.put(API_URL, json=data["model_config"]) as response:
+                    if response.status != 200:
+                        # Log the HTTP status instead of failing silently.
+                        logging.error(f'fish_speech模型加载失败: {response.status}')
+                        return None
+
+                    ret = await response.json()
+                    logging.debug(ret)
+
+                    if ret["name"] == data["model_name"]:
+                        logging.info(f'fish_speech模型加载成功: {ret["name"]}')
+                        return ret
+
+                    # A 200 answer naming a different model is treated as a failure.
+                    logging.error(f'fish_speech模型加载失败: 返回的模型名称不匹配: {ret["name"]}')
+                    return None
+
+        except aiohttp.ClientError as e:
+            logging.error(f'fish_speech请求失败: {e}')
+        except Exception as e:
+            # Also reached when the response body is not the expected JSON object.
+            logging.error(traceback.format_exc())
+            logging.error(f'fish_speech未知错误: {e}')
+
+        return None
+
+    async def fish_speech_api(self, data):
+        """Request speech synthesis from the fish-speech server and save the audio.
+
+        Posts the entries of ``data["tts_config"]`` as JSON to
+        ``/v1/models/<model_name>/invoke`` resolved against
+        ``data["api_ip_port"]`` and writes the response body to a new file
+        whose path comes from ``self.common.get_new_audio_path``.
+
+        Args:
+            data: dict providing the keys ``api_ip_port``, ``model_name``
+                and ``tts_config``. The dict is not modified.
+
+        Returns:
+            The path of the written audio file on success, ``None`` on any
+            failure (failures are logged).
+        """
+        API_URL = urljoin(data["api_ip_port"], f'/v1/models/{data["model_name"]}/invoke')
+
+        # Empty strings are sent as JSON null. A new dict is built because
+        # callers pass in the shared configuration section, which must keep
+        # its original values.
+        payload = {
+            key: (None if value == "" else value)
+            for key, value in data["tts_config"].items()
+        }
+
+        logging.debug(f"data={data}")
+        logging.debug(f"payload={payload}")
+
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.post(API_URL, json=payload) as response:
+                    if response.status != 200:
+                        logging.error(f'fish_speech下载音频失败: {response.status}')
+                        return None
+
+                    content = await response.read()
+
+                    file_name = 'fish_speech_' + self.common.get_bj_time(4) + '.wav'
+                    voice_tmp_path = self.common.get_new_audio_path(self.audio_out_path, file_name)
+
+                    with open(voice_tmp_path, 'wb') as file:
+                        file.write(content)
+
+                    return voice_tmp_path
+        except aiohttp.ClientError as e:
+            logging.error(f'fish_speech请求失败: {e}')
+        except Exception as e:
+            logging.error(traceback.format_exc())
+            logging.error(f'fish_speech未知错误: {e}')
+
+        return None
+