Skip to content

Commit

Permalink
初步实现文案音频合成功能
Browse files Browse the repository at this point in the history
  • Loading branch information
Ikaros-521 committed Jul 6, 2023
1 parent d72135b commit 20e8590
Show file tree
Hide file tree
Showing 12 changed files with 243 additions and 17 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -684,6 +684,7 @@ ChatterBot 的核心思想是:基于历史对话数据,使用机器学习和
- 本地的用户拉黑机制
- 礼物互动机制(跳舞,唱歌)
- 弹幕跳过机制
- 点歌后的弹幕触发问题(如:只取最新的几个)

## 📝 更新日志

Expand Down
12 changes: 6 additions & 6 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"max_new_tokens": 250
},
"chat_with_file": {
"chat_mode": "",
"chat_mode": "claude",
"data_path": "data/伊卡洛斯百度百科.zip",
"separator": "\n",
"chunk_size": 100,
Expand All @@ -48,12 +48,12 @@
"local_max_query": 3,
"local_vector_embedding_model": "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
},
"audio_synthesis_type": "edge-tts",
"audio_synthesis_type": "vits",
"vits": {
"config_path": "D:\\GitHub_pro\\VITS-fast-fine-tuning\\inference\\finetune_speaker.json",
"api_ip_port": "http://127.0.0.1:7860",
"character": "ikaros",
"speed": 1
"speed": 1.0
},
"so_vits_svc": {
"enable": false,
Expand Down Expand Up @@ -107,7 +107,7 @@
"ip": "127.0.0.1",
"port": 7860,
"negative_prompt": "ufsw, longbody, lowres, bad anatomy, bad hands, missing fingers, pubic hair,extra digit, fewer digits, cropped, worst quality, low quality",
"seed": -1,
"seed": -1.0,
"styles": [],
"cfg_scale": 7,
"steps": 30,
Expand All @@ -119,9 +119,9 @@
"denoising_strength": 0.4
},
"copywriting": {
"file_path": "data/copywriting",
"file_path": "data/copywriting/",
"list": [],
"audio_path": "out/copywriting"
"audio_path": "out/copywriting/"
},
"header": {
"userAgent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42"
Expand Down
8 changes: 3 additions & 5 deletions data/copywriting/测试文案2.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
你好,我是测试文案2。
我要开始复读了喵2。
复读12
复读22
复读32
这里编辑文案。
整体逻辑和音频合成基本一致。
合成的时候请耐心等待。
22 changes: 21 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys, os, json, subprocess, importlib, re
import logging
import time
import asyncio
# from functools import partial

from utils.config import Config
Expand All @@ -16,6 +17,7 @@

from utils.common import Common
from utils.logger import Configure_logger
from utils.audio import Audio



Expand Down Expand Up @@ -66,6 +68,7 @@ def __init__(self):
self.init_ui()



# 设置实例
def CreateItems(self):
# 定时器
Expand Down Expand Up @@ -491,12 +494,14 @@ def init_config(self):
self.ui.lineEdit_sd_denoising_strength.setText(str(self.sd_config['denoising_strength']))

# 文案数据回显到UI
tmp_str = ""
copywriting_file_names = self.get_dir_txt_filename(self.copywriting_config['file_path'])
for tmp in copywriting_file_names:
tmp_str = tmp_str + tmp + "\n"
self.ui.textEdit_copywriting_list.setText(tmp_str)

# 文案音频数据回显到UI
tmp_str = ""
copywriting_audio_file_names = self.get_dir_audio_filename(self.copywriting_config['audio_path'])
for tmp in copywriting_audio_file_names:
tmp_str = tmp_str + tmp + "\n"
Expand Down Expand Up @@ -559,9 +564,11 @@ def init_ui(self):
self.ui.pushButton_copywriting_page.disconnect()
self.ui.pushButton_copywriting_select.disconnect()
self.ui.pushButton_copywriting_save.disconnect()
self.ui.pushButton_copywriting_synthetic_audio.disconnect()
self.ui.pushButton_copywriting_page.clicked.connect(self.on_pushButton_copywriting_page_clicked)
self.ui.pushButton_copywriting_select.clicked.connect(self.on_pushButton_copywriting_select_clicked)
self.ui.pushButton_copywriting_save.clicked.connect(self.on_pushButton_copywriting_save_clicked)
self.ui.pushButton_copywriting_synthetic_audio.clicked.connect(self.on_pushButton_copywriting_synthetic_audio_clicked)

# 下拉框相关槽函数
self.ui.comboBox_chat_type.disconnect()
Expand All @@ -583,6 +590,7 @@ def init_ui(self):
self.throttled_copywriting_page = self.throttle(self.copywriting_page, 1)
self.throttled_copywriting_select = self.throttle(self.copywriting_select, 1)
self.throttled_copywriting_save = self.throttle(self.copywriting_save, 1)
self.throttled_copywriting_synthetic_audio = self.throttle(self.copywriting_synthetic_audio, 1)



Expand Down Expand Up @@ -1005,7 +1013,17 @@ def copywriting_save(self):
# 保存文案按钮
def on_pushButton_copywriting_save_clicked(self):
self.throttled_copywriting_save()



# 合成音频
def copywriting_synthetic_audio(self):
    """Synthesize audio for the copywriting file currently named in the UI.

    Reads the file name from the ``lineEdit_copywriting_select`` widget and
    runs the module-level ``audio`` object's ``copywriting_synthesis_audio``
    coroutine to completion with ``asyncio.run``.
    """
    selected_name = self.ui.lineEdit_copywriting_select.text()
    synthesis_job = audio.copywriting_synthesis_audio(selected_name)
    # asyncio.run blocks the calling thread until the coroutine finishes.
    asyncio.run(synthesis_job)


# 合成音频按钮
def on_pushButton_copywriting_synthetic_audio_clicked(self):
    """Slot for the "synthesize audio" button: invoke the throttled handler."""
    throttled_handler = self.throttled_copywriting_synthetic_audio
    throttled_handler()


'''
Expand Down Expand Up @@ -1309,6 +1327,8 @@ def run(self):
# 配置文件路径
config_path = os.path.join(file_relative_path, 'config.json')

audio = Audio(config_path)

# 日志文件路径
file_path = "./log/log-" + common.get_bj_time(1) + ".txt"
Configure_logger(file_path)
Expand Down
Binary file added out/copywriting/测试文案2.wav
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements_bilibili.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ pyvirtualcam
numpy
faiss-cpu
sentence_transformers
pydub
send2trash
bilibili-api==9.1.0
bilibili-api-python
langchain==0.0.142
4 changes: 3 additions & 1 deletion requirements_common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,4 +117,6 @@ pypdf==3.11.1
faiss-cpu==1.7.4
webuiapi==0.9.3
pyvirtualcam==0.10.2
numpy==1.25.0
numpy==1.25.0
pydub==0.25.1
send2trash==1.8.2
4 changes: 3 additions & 1 deletion requirements_dy.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@ webuiapi
pyvirtualcam
numpy
faiss-cpu
sentence_transformers
sentence_transformers
pydub
send2trash
4 changes: 3 additions & 1 deletion requirements_ks.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ webuiapi
pyvirtualcam
numpy
faiss-cpu
sentence_transformers
sentence_transformers
pydub
send2trash
6 changes: 5 additions & 1 deletion ui/main.ui
Original file line number Diff line number Diff line change
Expand Up @@ -1686,7 +1686,11 @@ p, li { white-space: pre-wrap; }
</widget>
</item>
<item row="1" column="5" colspan="2">
<widget class="QTextEdit" name="textEdit_copywriting_audio_list"/>
<widget class="QTextEdit" name="textEdit_copywriting_audio_list">
<property name="readOnly">
<bool>true</bool>
</property>
</widget>
</item>
</layout>
</widget>
Expand Down
161 changes: 160 additions & 1 deletion utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from elevenlabs import generate, play, set_api_key

from pydub import AudioSegment

from .common import Common
from .logger import Configure_logger
from .config import Config
Expand Down Expand Up @@ -275,7 +277,7 @@ async def my_play_voice(self, message):
return


# 只进行音频合成
# 只进行音频播放
def only_play_audio(self):
try:
pygame.mixer.init()
Expand All @@ -300,3 +302,160 @@ def only_play_audio(self):
# 停止当前播放的音频
def stop_current_audio(self):
pygame.mixer.music.fadeout(1000)


# 合并文案音频文件
def merge_audio_files(self, directory, base_filename, last_index, pause_duration=1, format="wav"):
    """Merge numbered audio fragments into a single WAV file.

    Looks in ``directory`` for files named ``{base_filename}-{i}.{format}``
    with ``i`` running from 1 to ``last_index``; indices whose file does not
    exist are skipped. The fragments found are concatenated in index order,
    with ``pause_duration`` seconds of silence between consecutive fragments,
    and exported as ``{base_filename}.wav`` in the same directory.

    The source fragments are removed only after the merged file has been
    exported, so a failure while decoding or exporting leaves them on disk.

    Args:
        directory: Folder that holds the fragments and receives the result.
        base_filename: Common file-name prefix of the fragments (no extension).
        last_index: Highest fragment index to look for (inclusive).
        pause_duration: Seconds of silence inserted between fragments;
            values <= 0 disable the pause.
        format: File extension of the input fragments. The merged output is
            always written as WAV.

    Returns:
        None. An error is logged when no fragment is found.
    """
    # Collect existing fragments first so nothing is deleted before the
    # merged file is safely written.
    fragment_paths = []
    for i in range(1, last_index + 1):
        filepath = os.path.join(directory, f"{base_filename}-{i}.{format}")
        if os.path.isfile(filepath):
            fragment_paths.append(filepath)

    if not fragment_paths:
        logging.error("没有找到要合并的音频文件")
        return

    merged_audio = None
    for filepath in fragment_paths:
        audio_segment = AudioSegment.from_file(filepath)

        if merged_audio is None:
            merged_audio = audio_segment
            continue

        if pause_duration > 0:
            # pydub durations are expressed in milliseconds.
            merged_audio += AudioSegment.silent(duration=pause_duration * 1000)
        merged_audio += audio_segment

    merged_filepath = os.path.join(directory, f"{base_filename}.wav")
    merged_audio.export(merged_filepath, format="wav")
    logging.info(f"音频文件合并成功:{merged_filepath}")

    # Export succeeded: the individual fragments are no longer needed.
    for filepath in fragment_paths:
        os.remove(filepath)


# 只进行文案音频合成
async def copywriting_synthesis_audio(self, file_path):
    """Synthesize one copywriting text file into a single merged audio file.

    The text file is read from the configured ``copywriting.file_path``
    folder, filtered, split into sentences, and each sentence is synthesized
    with the backend named by ``audio_synthesis_type`` ("vits" or
    "edge-tts"). Each fragment is handed to ``self.common.move_file`` with
    the project's ``out/`` folder and the name ``<file_name>-<index>``
    (presumably a move-and-rename; confirm against the helper). The
    fragments are then merged with ``merge_audio_files`` and the merged
    ``<file_name>.wav`` is moved to the configured ``copywriting.audio_path``.

    Failure handling differs per backend, as in the original code: a failure
    in the "vits" branch aborts the whole run, while a failure in the
    "edge-tts" branch is logged and the remaining sentences are still
    processed.
    NOTE(review): confirm whether this asymmetry is intended.

    Args:
        file_path: Name of the copywriting file, relative to the configured
            ``copywriting.file_path`` folder.

    Returns:
        The path ``<copywriting.audio_path>/<file_name>.wav`` on success,
        otherwise ``None`` (unsupported backend, synthesis aborted, no merged
        file produced, or any unexpected error, all of which are logged).
    """
    try:
        max_len = self.config.get("filter", "max_len")
        max_char_len = self.config.get("filter", "max_char_len")
        audio_synthesis_type = self.config.get("audio_synthesis_type")
        vits = self.config.get("vits")
        copywriting = self.config.get("copywriting")
        edge_tts_config = self.config.get("edge-tts")

        # elevenlabs is not implemented for copywriting synthesis; bail out
        # before doing any work instead of returning silently mid-loop.
        if audio_synthesis_type == "elevenlabs":
            logging.error("文案音频合成暂不支持 elevenlabs")
            return None

        file_path = os.path.join(copywriting["file_path"], file_path)
        # Base name used for the fragments and for the merged file.
        file_name = self.common.extract_filename(file_path)
        content = self.common.read_file_return_content(file_path)

        logging.debug(f"合成音频前的原始数据:{content}")
        content = self.common.remove_extra_words(content, max_len, max_char_len)

        # Treat every line break as a sentence boundary.
        content = content.replace('\n', '。')

        # Display names the vits API expects for each detected language code.
        language_name_dict = {"en": "英语", "zh": "中文", "jp": "日语"}
        # Folder where the per-sentence fragments are collected.
        fragment_dir = os.path.join(os.getcwd(), "out/")

        # Running index appended to each fragment name; it fixes the order
        # in which the fragments are merged afterwards.
        file_index = 0

        sentences = self.common.split_sentences(content)
        for sentence in sentences:
            file_index = file_index + 1
            fragment_name = file_name + "-" + str(file_index)

            if audio_synthesis_type == "vits":
                try:
                    # Unrecognised language codes fall back to Japanese.
                    language = language_name_dict.get(self.common.lang_check(sentence), "日语")

                    data_json = self.vits_fast_api(vits["api_ip_port"], vits["character"], language, sentence, vits["speed"])

                    voice_tmp_path = data_json["data"][1]["name"]
                    logging.info(f"vits-fast合成成功,输出到={voice_tmp_path}")

                    if self.config.get("so_vits_svc", "enable") is True:
                        voice_tmp_path = await self.so_vits_svc_api(audio_path=voice_tmp_path)
                        logging.info(f"so-vits-svc合成成功,输出到={voice_tmp_path}")

                    logging.info(f"out_file_path={fragment_dir}")
                    self.common.move_file(voice_tmp_path, fragment_dir, fragment_name)
                except Exception as e:
                    # A vits failure aborts the whole synthesis run.
                    logging.error(e)
                    return None
            elif audio_synthesis_type == "edge-tts":
                try:
                    voice_tmp_path = './out/' + self.common.get_bj_time(4) + '.wav'
                    # Strip quote characters and turn spaces into commas
                    # before handing the text to edge-tts.
                    sentence = sentence.replace('"', '').replace("'", '').replace(" ", ',')
                    communicate = edge_tts.Communicate(
                        text=sentence,
                        voice=edge_tts_config["voice"],
                        rate=edge_tts_config["rate"],
                        volume=edge_tts_config["volume"],
                    )
                    await communicate.save(voice_tmp_path)

                    logging.info(f"edge-tts合成成功,输出到={voice_tmp_path}")

                    if self.config.get("so_vits_svc", "enable") is True:
                        voice_tmp_path = await self.so_vits_svc_api(audio_path=os.path.abspath(voice_tmp_path))
                        logging.info(f"so-vits-svc合成成功,输出到={voice_tmp_path}")

                    self.common.move_file(voice_tmp_path, fragment_dir, fragment_name)
                except Exception as e:
                    # An edge-tts failure only skips this sentence.
                    logging.error(e)

        # Merge the numbered fragments into out/<file_name>.wav.
        merge_dir = os.path.join(os.getcwd(), "out")
        self.merge_audio_files(merge_dir, file_name, file_index)

        merged_path = os.path.join(os.getcwd(), "out/", file_name + ".wav")
        logging.info(f"file_path={merged_path}")

        # Nothing was merged (e.g. every sentence failed): do not pretend
        # an audio file exists.
        if not os.path.isfile(merged_path):
            logging.error(f"文案音频合成失败,未生成合并后的音频文件:{merged_path}")
            return None

        # Move the merged audio into the configured copywriting audio folder.
        out_file_path = os.path.join(os.getcwd(), copywriting["audio_path"])
        logging.info(f"out_file_path={out_file_path}")
        self.common.move_file(merged_path, out_file_path)

        return os.path.join(copywriting["audio_path"], file_name + ".wav")
    except Exception as e:
        logging.error(e)
        return None
Loading

0 comments on commit 20e8590

Please sign in to comment.