Skip to content

Commit

Permalink
Replaced encodec with vocos decoder
Browse files Browse the repository at this point in the history
  • Loading branch information
Plachtaa committed Aug 30, 2023
1 parent 17aadb8 commit 350e5fc
Show file tree
Hide file tree
Showing 38 changed files with 37 additions and 23 deletions.
8 changes: 4 additions & 4 deletions README-ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ VALL-E X 是一个强大而创新的多语言文本转语音(TTS)模型,
* [🧠 TODO](#-todo)

## 🚀 Updates
**2023.08.24**
- 如果无法访问Google Drive,请从[Hugging Face链接](https://huggingface.co/Plachta/VALL-E-X/resolve/main/vallex-checkpoint.pt)下载模型权重,并将其放在`./checkpoints/`文件夹中。
**2023.08.30**
- 将EnCodec解码器替换成了Vocos解码器,提升了音质。 (感谢[@v0xie](https://github.com/v0xie))

**2023.08.23**
- 加入了长文本生成功能
Expand Down Expand Up @@ -313,10 +313,10 @@ VALL-E X 与 [Bark](https://github.com/suno-ai/bark), [VALL-E](https://arxiv.org

## 🧠 待办事项
- [x] 添加中文 README
- [ ] 给非python用户的`.bat`脚本
- [x] 长文本生成
- [x] 用Vocos解码器替换Encodec解码器
- [ ] 微调以实现更好的语音自适应
- [ ] 用Vocos解码器替换Encodec解码器
- [ ] 给非python用户的`.bat`脚本
- [ ] 更多...

## 🙏 感谢
Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ More details about the model are presented in [model card](./model-card.md).
* [🧠 TODO](#-todo)

## 🚀 Updates
**2023.08.30**
- Replaced EnCodec decoder with Vocos decoder, improved audio quality. (Thanks to [@v0xie](https://github.com/v0xie))

**2023.08.23**
- Added long text generation.

Expand Down Expand Up @@ -320,10 +323,10 @@ to ensure acceptable performance.

## 🧠 TODO
- [x] Add Chinese README
- [ ] `.bat` scripts for non-python users
- [x] Long text generation
- [x] Replace Encodec decoder with Vocos decoder
- [ ] Fine-tuning for better voice adaptation
- [ ] Replace Encodec decoder with Vocos decoder
- [ ] `.bat` scripts for non-python users
- [ ] To be added...

## 🙏 Appreciation
Expand Down
45 changes: 28 additions & 17 deletions launch-ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

import gradio as gr
import whisper
from vocos import Vocos
import multiprocessing

thread_count = multiprocessing.cpu_count()
Expand Down Expand Up @@ -95,6 +96,9 @@
# Encodec model
audio_tokenizer = AudioTokenizer(device)

# Vocos decoder
vocos = Vocos.from_pretrained('charactr/vocos-encodec-24khz').to(device)

# ASR
if not os.path.exists("./whisper/"): os.mkdir("./whisper/")
try:
Expand Down Expand Up @@ -281,16 +285,17 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
prompt_language=lang_pr,
text_language=langs if accent == "no-accent" else lang,
)
samples = audio_tokenizer.decode(
[(encoded_frames.transpose(2, 1), None)]
)
# Decode with Vocos
frames = encoded_frames.permute(2,0,1)
features = vocos.codes_to_features(frames)
samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

# offload model
model.to('cpu')
torch.cuda.empty_cache()

message = f"text prompt: {text_pr}\nsythesized text: {text}"
return message, (24000, samples[0][0].cpu().numpy())
return message, (24000, samples.squeeze(0).cpu().numpy())

@torch.no_grad()
def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
Expand Down Expand Up @@ -340,14 +345,16 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
prompt_language=lang_pr,
text_language=langs if accent == "no-accent" else lang,
)
samples = audio_tokenizer.decode(
[(encoded_frames.transpose(2, 1), None)]
)
# Decode with Vocos
frames = encoded_frames.permute(2,0,1)
features = vocos.codes_to_features(frames)
samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

model.to('cpu')
torch.cuda.empty_cache()

message = f"sythesized text: {text}"
return message, (24000, samples[0][0].cpu().numpy())
return message, (24000, samples.squeeze(0).cpu().numpy())


from utils.sentence_cutter import split_text_into_sentences
Expand Down Expand Up @@ -429,12 +436,14 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
text_language=langs if accent == "no-accent" else lang,
)
complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
samples = audio_tokenizer.decode(
[(complete_tokens, None)]
)
# Decode with Vocos
frames = encoded_frames.permute(2, 0, 1)
features = vocos.codes_to_features(frames)
samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

model.to('cpu')
message = f"Cut into {len(sentences)} sentences"
return message, (24000, samples[0][0].cpu().numpy())
return message, (24000, samples.squeeze(0).cpu().numpy())
elif mode == "sliding-window":
complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
original_audio_prompts = audio_prompts
Expand Down Expand Up @@ -476,18 +485,20 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
else:
audio_prompts = original_audio_prompts
text_prompts = original_text_prompts
samples = audio_tokenizer.decode(
[(complete_tokens, None)]
)
# Decode with Vocos
frames = encoded_frames.permute(2, 0, 1)
features = vocos.codes_to_features(frames)
samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))

model.to('cpu')
message = f"Cut into {len(sentences)} sentences"
return message, (24000, samples[0][0].cpu().numpy())
return message, (24000, samples.squeeze(0).cpu().numpy())
else:
raise ValueError(f"No such mode {mode}")


def main():
app = gr.Blocks(title="VALL-E-X")
app = gr.Blocks(title="VALL-E X")
with app:
gr.Markdown(top_md)
with gr.Tab("Infer from audio"):
Expand Down
Binary file added presets/acou_1.npz
Binary file not shown.
Binary file added presets/acou_2.npz
Binary file not shown.
Binary file added presets/acou_3.npz
Binary file not shown.
Binary file added presets/acou_4.npz
Binary file not shown.
Binary file added presets/alan.npz
Binary file not shown.
Binary file added presets/amused.npz
Binary file not shown.
Binary file added presets/anger.npz
Binary file not shown.
Binary file added presets/babara.npz
Binary file not shown.
Binary file added presets/bronya.npz
Binary file not shown.
Binary file added presets/disgust.npz
Binary file not shown.
Binary file added presets/emo_amused.npz
Binary file not shown.
Binary file added presets/emo_anger.npz
Binary file not shown.
Binary file added presets/emo_neutral.npz
Binary file not shown.
Binary file added presets/emo_sleepy.npz
Binary file not shown.
Binary file added presets/emotion_sleepiness.npz
Binary file not shown.
Binary file added presets/en2zh_tts_1.npz
Binary file not shown.
Binary file added presets/en2zh_tts_2.npz
Binary file not shown.
Binary file added presets/en2zh_tts_3.npz
Binary file not shown.
Binary file added presets/en2zh_tts_4.npz
Binary file not shown.
Binary file added presets/fuxuan.npz
Binary file not shown.
Binary file added presets/librispeech_1.npz
Binary file not shown.
Binary file added presets/librispeech_2.npz
Binary file not shown.
Binary file added presets/librispeech_3.npz
Binary file not shown.
Binary file added presets/librispeech_4.npz
Binary file not shown.
Binary file added presets/neutral.npz
Binary file not shown.
Binary file added presets/paimon.npz
Binary file not shown.
Binary file added presets/sleepiness.npz
Binary file not shown.
Binary file added presets/vctk_1.npz
Binary file not shown.
Binary file added presets/vctk_2.npz
Binary file not shown.
Binary file added presets/vctk_3.npz
Binary file not shown.
Binary file added presets/vctk_4.npz
Binary file not shown.
Binary file added presets/zh2en_tts_1.npz
Binary file not shown.
Binary file added presets/zh2en_tts_2.npz
Binary file not shown.
Binary file added presets/zh2en_tts_3.npz
Binary file not shown.
Binary file added presets/zh2en_tts_4.npz
Binary file not shown.

0 comments on commit 350e5fc

Please sign in to comment.