
Commit

update app
Kedreamix committed Feb 4, 2024
1 parent ed5d129 commit 9ea4719
Showing 4 changed files with 115 additions and 142 deletions.
83 changes: 38 additions & 45 deletions app.py
@@ -1,12 +1,13 @@
import os
import random
import gradio as gr
from zhconv import convert
from src.LLM import *
from src.Asr import OpenAIASR
from src.SadTalker import SadTalker
from src.TTS import EdgeTTS
import time
import random
from LLM import LLM
from ASR import WhisperASR
from TFG import SadTalker
from TTS import EdgeTTS
from src.cost_time import calculate_time

from configs import *
os.environ["GRADIO_TEMP_DIR"]= './temp'

@@ -32,9 +33,7 @@
facerender = 'facevid2vid'
enhancer = False
is_still_mode = False
# pose_style = gr.Slider(minimum=0, maximum=45, step=1, label="Pose style", value=0)

# exp_weight = gr.Slider(minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
exp_weight = 1

use_ref_video = False
@@ -43,17 +42,18 @@
use_idle_mode = False
length_of_audio = 5

def asr(audio):
#sr, data = audio
#audio = "audio_output.wav"
#sf.write(audio, data, samplerate=sr) # save as a WAV file at a 44100 Hz sample rate
#print(audio)
question = openaiasr.transcribe(audio)
question = convert(question, 'zh-cn')
@calculate_time
def Asr(audio):
try:
question = asr.transcribe(audio)
question = convert(question, 'zh-cn')
except Exception as e:
print("ASR Error: ", e)
question = 'Gradio 的麦克风有时候可能音频还未传入,请重试一下'
return question
def llm_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0):
#answer = llm.predict(question)

@calculate_time
def LLM_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0):
answer = llm.generate(question)
print(answer)
try:
@@ -62,17 +62,14 @@ def llm_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0,
os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav')
return 'answer.wav', 'answer.vtt', answer

def text_response(text, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100, pitch = 0, batch_size = 2):
@calculate_time
def Talker_response(text, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100, pitch = 0, batch_size = 2):
voice = 'zh-CN-XiaoxiaoNeural' if voice not in tts.SUPPORTED_VOICE else voice
print(voice , rate , volume , pitch)
s = time.time()
llm_response(text, voice, rate, volume, pitch)
e = time.time()
print("Using Time", e-s)
# print(voice , rate , volume , pitch)
LLM_response(text, voice, rate, volume, pitch)
pose_style = random.randint(0, 45)
s = time.time()
driven_audio = 'answer.wav'
video = sad_talker.test(source_image,
video = talker.test(source_image,
driven_audio,
preprocess_type,
is_still_mode,
@@ -87,30 +84,28 @@ def text_response(text, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 100,
ref_info,
use_idle_mode,
length_of_audio,
blink_every)
e = time.time()
print("Using Time", e-s)
blink_every,
fps=20)
if os.path.exists('answer.vtt'):
return video, './answer.vtt'
else:
return video

def main():

with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
gr.HTML(description)
with gr.Row().style(equal_height=False):
with gr.Row(equal_height=False):
with gr.Column(variant='panel'):
with gr.Tabs(elem_id="question_audio"):
with gr.TabItem('对话'):
with gr.Column(variant='panel'):
question_audio = gr.Audio(source="microphone", type="filepath")
question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
input_text = gr.Textbox(label="Input Text", lines=3)

with gr.Accordion("Advanced Settings(高级设置语音参数) ",
open=False) as parameter_article:
open=False):
voice = gr.Dropdown(tts.SUPPORTED_VOICE,
values='zh-CN-XiaoxiaoNeural',
value='zh-CN-XiaoxiaoNeural',
label="Voice")
rate = gr.Slider(minimum=-100,
maximum=100,
@@ -133,22 +128,20 @@ def main():
step=1,
label='Talker Batch size')
asr_text = gr.Button('语音识别(语音对话后点击)')
asr_text.click(fn=asr,inputs=[question_audio],outputs=[input_text])
asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])

# with gr.Column(variant='panel'):
# input_text = gr.Textbox(label="Input Text", lines=3)
# text_button = gr.Button("文字对话", variant='primary')


with gr.Column(variant='panel'):

with gr.Tabs(elem_id="sadtalker_genearted"):
with gr.Tabs():
with gr.TabItem('数字人问答'):
gen_video = gr.Video(label="Generated video", format="mp4", scale=1)
video_button = gr.Button("提交",variant='primary')
video_button.click(fn=text_response,inputs=[input_text,voice, rate, volume, pitch, batch_size],outputs=[gen_video])

# text_button.click(fn=text_response,inputs=[input_text],outputs=[gen_video])
gen_video = gr.Video(label="Generated video", format="mp4", scale=1, autoplay=True)
video_button = gr.Button("提交", variant='primary')
video_button.click(fn=Talker_response,inputs=[input_text,voice, rate, volume, pitch, batch_size],outputs=[gen_video])

with gr.Row():
with gr.Column(variant='panel'):
gr.Markdown("## Text Examples")
@@ -162,7 +155,7 @@ def main():
]
gr.Examples(
examples = examples,
fn = text_response,
fn = Talker_response,
inputs = [input_text],
outputs=[gen_video],
# cache_examples = True,
@@ -176,8 +169,8 @@ def main():
# llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key")
# llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
llm = LLM(mode=mode).init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
sad_talker = SadTalker(lazy_load=True)
openaiasr = OpenAIASR('base')
talker = SadTalker(lazy_load=True)
asr = WhisperASR('base')
tts = EdgeTTS()
gr.close_all()
demo = main()
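A note on the app.py changes above: the commit removes the manual `time.time()` / "Using Time" bookkeeping and instead decorates `Asr`, `LLM_response`, and `Talker_response` with `@calculate_time` from `src.cost_time`. That decorator's implementation is not part of this diff; the sketch below is only an assumption of what such a timing decorator typically looks like, not the repository's actual code.

```python
import time
from functools import wraps

def calculate_time(func):
    """Hypothetical sketch of src.cost_time.calculate_time: report how long each call takes."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)  # run the wrapped function unchanged
        print(f"{func.__name__} took {time.time() - start:.2f}s")
        return result
    return wrapper
```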
94 changes: 35 additions & 59 deletions app_img.py
@@ -1,12 +1,12 @@
import os
import os
import gradio as gr
from zhconv import convert
from src.LLM import *
from src.Asr import OpenAIASR
from src.SadTalker import SadTalker
from src.TTS import EdgeTTS
import time
import random
from LLM import LLM
from ASR import WhisperASR
from TFG import SadTalker
from TTS import EdgeTTS

from src.cost_time import calculate_time
from configs import *
description = """<p style="text-align: center; font-weight: bold;">
<span style="font-size: 28px;">Linly 智能对话系统 (Linly-Talker)</span>
@@ -28,16 +28,18 @@
use_idle_mode = False
length_of_audio = 5

def asr(audio):
#sr, data = audio
#audio = "audio_output.wav"
#sf.write(audio, data, samplerate=sr) # save as a WAV file at a 44100 Hz sample rate
#print(audio)
question = openaiasr.transcribe(audio)
question = convert(question, 'zh-cn')
@calculate_time
def Asr(audio):
try:
question = asr.transcribe(audio)
question = convert(question, 'zh-cn')
except Exception as e:
print("ASR Error: ", e)
question = 'Gradio 的麦克风有时候可能音频还未传入,请重试一下'
return question

def llm_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0):
@calculate_time
def LLM_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0, pitch = 0):
#answer = llm.predict(question)
answer = llm.generate(question)
print(answer)
@@ -47,7 +49,8 @@ def llm_response(question, voice = 'zh-CN-XiaoxiaoNeural', rate = 0, volume = 0,
os.system(f'edge-tts --text "{answer}" --voice {voice} --write-media answer.wav')
return 'answer.wav', 'answer.vtt', answer

def text_response(text, voice, rate, volume, pitch, source_image,
@calculate_time
def Talker_response(text, voice, rate, volume, pitch, source_image,
preprocess_type,
is_still_mode,
enhancer,
@@ -59,14 +62,9 @@ def text_response(text, voice, rate, volume, pitch, source_image,
blink_every,
fps):
voice = 'zh-CN-XiaoxiaoNeural' if voice not in tts.SUPPORTED_VOICE else voice
s = time.time()
llm_response(text, voice, rate, volume, pitch)
e = time.time()
print("Using Time", e-s)
pose_style = random.randint(0, 45)
s = time.time()
LLM_response(text, voice, rate, volume, pitch)
driven_audio = 'answer.wav'
video = sad_talker.test2(source_image,
video = talker.test2(source_image,
driven_audio,
preprocess_type,
is_still_mode,
@@ -82,10 +80,7 @@ def text_response(text, voice, rate, volume, pitch, source_image,
use_idle_mode,
length_of_audio,
blink_every,
fps)
e = time.time()
print("Using Time", e-s)
# print(video)
fps=20)
if os.path.exists('answer.vtt'):
return video, './answer.vtt'
else:
@@ -95,23 +90,24 @@ def main():

with gr.Blocks(analytics_enabled=False, title = 'Linly-Talker') as inference:
gr.HTML(description)
with gr.Row().style(equal_height=False):
with gr.Row(equal_height=False):
with gr.Column(variant='panel'):
with gr.Tabs(elem_id="sadtalker_source_image"):
with gr.TabItem('Source image'):
with gr.Row():
source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512)

source_image = gr.Image(label="Source image", type="filepath", elem_id="img2img_image", width=512)

with gr.Tabs(elem_id="question_audio"):
with gr.TabItem('对话'):
with gr.Column(variant='panel'):
question_audio = gr.Audio(source="microphone",type="filepath", info = '语音对话')
question_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label = '语音对话')
input_text = gr.Textbox(label="Input Text", lines=3, info = '文字对话')
with gr.Accordion("Advanced Settings",
open=False,
visible=True) as parameter_article:
voice = gr.Dropdown(tts.SUPPORTED_VOICE, values='zh-CN-XiaoxiaoNeural', label="Voice")
voice = gr.Dropdown(tts.SUPPORTED_VOICE,
value='zh-CN-XiaoxiaoNeural',
label="Voice")
rate = gr.Slider(minimum=-100,
maximum=100,
value=0,
Expand All @@ -129,7 +125,7 @@ def main():
label='Pitch')

asr_text = gr.Button('语音识别(语音对话后点击)')
asr_text.click(fn=asr,inputs=[question_audio],outputs=[input_text])
asr_text.click(fn=Asr,inputs=[question_audio],outputs=[input_text])

# with gr.Tabs(elem_id="response_audio"):
# with gr.TabItem("语音选择"):
@@ -158,7 +154,7 @@ def main():
with gr.Tabs(elem_id="sadtalker_checkbox"):
with gr.TabItem('Settings'):
with gr.Accordion("Advanced Settings",
open=False) as parameter_article:
open=False):
gr.Markdown("SadTalker: need help? please visit our [[best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md)] for more detials")
with gr.Column(variant='panel'):
# width = gr.Slider(minimum=64, elem_id="img2img_width", maximum=2048, step=8, label="Manually Crop Width", value=512) # img2img_width
@@ -178,7 +174,7 @@ def main():

with gr.Row():
batch_size = gr.Slider(label="batch size in generation", step=1, maximum=10, value=1)
fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =25)
fps = gr.Slider(label='fps in generation', step=1, maximum=30, value =20)
enhancer = gr.Checkbox(label="GFPGAN as Face enhancer(slow)")


@@ -188,7 +184,7 @@ def main():

submit = gr.Button('Generate', elem_id="sadtalker_generate", variant='primary')
submit.click(
fn=text_response,
fn=Talker_response,
inputs=[input_text,
voice, rate, volume, pitch,
source_image,
@@ -221,13 +217,6 @@ def main():
False,
False
],
[
'examples/source_image/full_body_1.png',
'crop',
False,
False
],

[
'examples/source_image/full3.png',
'crop',
@@ -240,13 +229,6 @@ def main():
False,
False
],
[
'examples/source_image/full4.jpeg',
'crop',
False,
False
],

[
'examples/source_image/art_13.png',
'crop',
@@ -259,15 +241,9 @@ def main():
False,
False
],
[
'examples/source_image/art_5.png',
'crop',
False,
False
],
]
gr.Examples(examples=examples,
fn=text_response,
fn=Talker_response,
inputs=[
source_image,
preprocess_type,
@@ -285,8 +261,8 @@ def main():
# llm = LLM(mode='offline').init_model('Gemini', 'gemini-pro', api_key = "your api key")
# llm = LLM(mode='offline').init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
llm = LLM(mode=mode).init_model('Qwen', 'Qwen/Qwen-1_8B-Chat')
sad_talker = SadTalker(lazy_load=True)
openaiasr = OpenAIASR('base')
talker = SadTalker(lazy_load=True)
asr = WhisperASR('base')
tts = EdgeTTS()
gr.close_all()
demo = main()
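Taken together, the two files make the same three kinds of changes: the imports move from the old `src.LLM` / `src.Asr` / `src.SadTalker` / `src.TTS` paths to the new top-level `LLM`, `ASR`, `TFG`, and `TTS` modules; timing is delegated to the `@calculate_time` decorator; and the UI code is migrated to the newer Gradio constructor-keyword API (`.style(...)` removed, `gr.Audio(sources=[...])`, `gr.Row(equal_height=False)`, the `values=` typo on `gr.Dropdown` corrected to `value=`, and `width=` passed directly to `gr.Image`). The snippet below is a minimal, self-contained sketch of that newer Gradio style; the component names and labels are illustrative rather than taken from the repository, and the exact Gradio version the project pins is not shown in this diff.

```python
import gradio as gr

# Illustrative sketch of the Gradio 4-style constructor-keyword API this commit adopts.
with gr.Blocks(title="demo") as demo:
    with gr.Row(equal_height=False):  # keyword argument replaces .style(equal_height=False)
        audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Input audio")
        image_in = gr.Image(type="filepath", width=512)  # width kwarg replaces .style(width=512)
        voice = gr.Dropdown(["zh-CN-XiaoxiaoNeural"], value="zh-CN-XiaoxiaoNeural", label="Voice")

# demo.launch()  # uncomment to serve the interface locally
```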
