Replaced encodec with vocos decoder

bndzor · Aug 30, 2023 · 350e5fc · 350e5fc
1 parent 17aadb8
commit 350e5fc
Show file tree

Hide file tree

Showing 38 changed files with 37 additions and 23 deletions.
diff --git a/README-ZH.md b/README-ZH.md
@@ -21,8 +21,8 @@ VALL-E X 是一个强大而创新的多语言文本转语音（TTS）模型，
 * [🧠 TODO](#-todo)
 
 ## 🚀 Updates
-**2023.08.24**
-- 如果无法访问Google Drive，请从[Hugging Face链接](https://huggingface.co/Plachta/VALL-E-X/resolve/main/vallex-checkpoint.pt)下载模型权重，并将其放在`./checkpoints/`文件夹中。
+**2023.08.30**
+- 将EnCodec解码器替换成了Vocos解码器，提升了音质。 (感谢[@v0xie](https://github.com/v0xie))
 
 **2023.08.23**
 - 加入了长文本生成功能
@@ -313,10 +313,10 @@ VALL-E X 与 [Bark](https://github.com/suno-ai/bark), [VALL-E](https://arxiv.org
 
 ## 🧠 待办事项
 - [x] 添加中文 README
-- [ ] 给非python用户的`.bat`脚本
 - [x] 长文本生成
+- [x] 用Vocos解码器替换Encodec解码器
 - [ ] 微调以实现更好的语音自适应
-- [ ] 用Vocos解码器替换Encodec解码器
+- [ ] 给非python用户的`.bat`脚本
 - [ ] 更多...
 
 ## 🙏 感谢

diff --git a/README.md b/README.md
@@ -23,6 +23,9 @@ More details about the model are presented in [model card](./model-card.md).
 * [🧠 TODO](#-todo)
 
 ## 🚀 Updates
+**2023.08.30**
+- Replaced the EnCodec decoder with the Vocos decoder, improving audio quality. (Thanks to [@v0xie](https://github.com/v0xie))
+
 **2023.08.23**
 - Added long text generation.
 
@@ -320,10 +323,10 @@ to ensure acceptable performance.
 
 ## 🧠 TODO
 - [x] Add Chinese README
-- [ ] `.bat` scripts for non-python users
 - [x] Long text generation
+- [x] Replace Encodec decoder with Vocos decoder
 - [ ] Fine-tuning for better voice adaptation
-- [ ] Replace Encodec decoder with Vocos decoder
+- [ ] `.bat` scripts for non-python users
 - [ ] To be added...
 
 ## 🙏 Appreciation

diff --git a/launch-ui.py b/launch-ui.py
@@ -39,6 +39,7 @@
 
 import gradio as gr
 import whisper
+from vocos import Vocos
 import multiprocessing
 
 thread_count = multiprocessing.cpu_count()
@@ -95,6 +96,9 @@
 # Encodec model
 audio_tokenizer = AudioTokenizer(device)
 
+# Vocos decoder
+vocos = Vocos.from_pretrained('charactr/vocos-encodec-24khz').to(device)
+
 # ASR
 if not os.path.exists("./whisper/"): os.mkdir("./whisper/")
 try:
@@ -281,16 +285,17 @@ def infer_from_audio(text, language, accent, audio_prompt, record_audio_prompt,
  prompt_language=lang_pr,
  text_language=langs if accent == "no-accent" else lang,
  )
- samples = audio_tokenizer.decode(
- [(encoded_frames.transpose(2, 1), None)]
- )
+ # Decode with Vocos
+ frames = encoded_frames.permute(2,0,1)
+ features = vocos.codes_to_features(frames)
+ samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
 
  # offload model
  model.to('cpu')
  torch.cuda.empty_cache()
 
  message = f"text prompt: {text_pr}\nsythesized text: {text}"
- return message, (24000, samples[0][0].cpu().numpy())
+ return message, (24000, samples.squeeze(0).cpu().numpy())
 
 @torch.no_grad()
 def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
@@ -340,14 +345,16 @@ def infer_from_prompt(text, language, accent, preset_prompt, prompt_file):
  prompt_language=lang_pr,
  text_language=langs if accent == "no-accent" else lang,
  )
- samples = audio_tokenizer.decode(
- [(encoded_frames.transpose(2, 1), None)]
- )
+ # Decode with Vocos
+ frames = encoded_frames.permute(2,0,1)
+ features = vocos.codes_to_features(frames)
+ samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
+
  model.to('cpu')
  torch.cuda.empty_cache()
 
  message = f"sythesized text: {text}"
- return message, (24000, samples[0][0].cpu().numpy())
+ return message, (24000, samples.squeeze(0).cpu().numpy())
 
 
 from utils.sentence_cutter import split_text_into_sentences
@@ -429,12 +436,14 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
  text_language=langs if accent == "no-accent" else lang,
  )
  complete_tokens = torch.cat([complete_tokens, encoded_frames.transpose(2, 1)], dim=-1)
- samples = audio_tokenizer.decode(
- [(complete_tokens, None)]
- )
+ # Decode with Vocos
+ frames = encoded_frames.permute(2, 0, 1)
+ features = vocos.codes_to_features(frames)
+ samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
+
  model.to('cpu')
  message = f"Cut into {len(sentences)} sentences"
- return message, (24000, samples[0][0].cpu().numpy())
+ return message, (24000, samples.squeeze(0).cpu().numpy())
  elif mode == "sliding-window":
  complete_tokens = torch.zeros([1, NUM_QUANTIZERS, 0]).type(torch.LongTensor).to(device)
  original_audio_prompts = audio_prompts
@@ -476,18 +485,20 @@ def infer_long_text(text, preset_prompt, prompt=None, language='auto', accent='n
  else:
  audio_prompts = original_audio_prompts
  text_prompts = original_text_prompts
- samples = audio_tokenizer.decode(
- [(complete_tokens, None)]
- )
+ # Decode with Vocos
+ frames = encoded_frames.permute(2, 0, 1)
+ features = vocos.codes_to_features(frames)
+ samples = vocos.decode(features, bandwidth_id=torch.tensor([2], device=device))
+
  model.to('cpu')
  message = f"Cut into {len(sentences)} sentences"
- return message, (24000, samples[0][0].cpu().numpy())
+ return message, (24000, samples.squeeze(0).cpu().numpy())
  else:
  raise ValueError(f"No such mode {mode}")
 
 
 def main():
- app = gr.Blocks(title="VALL-E-X")
+ app = gr.Blocks(title="VALL-E X")
  with app:
  gr.Markdown(top_md)
  with gr.Tab("Infer from audio"):

diff --git a/presets/acou_1.npz b/presets/acou_1.npz
diff --git a/presets/acou_2.npz b/presets/acou_2.npz
diff --git a/presets/acou_3.npz b/presets/acou_3.npz
diff --git a/presets/acou_4.npz b/presets/acou_4.npz
diff --git a/presets/alan.npz b/presets/alan.npz
diff --git a/presets/amused.npz b/presets/amused.npz
diff --git a/presets/anger.npz b/presets/anger.npz
diff --git a/presets/babara.npz b/presets/babara.npz
diff --git a/presets/bronya.npz b/presets/bronya.npz
diff --git a/presets/disgust.npz b/presets/disgust.npz
diff --git a/presets/emo_amused.npz b/presets/emo_amused.npz
diff --git a/presets/emo_anger.npz b/presets/emo_anger.npz
diff --git a/presets/emo_neutral.npz b/presets/emo_neutral.npz
diff --git a/presets/emo_sleepy.npz b/presets/emo_sleepy.npz
diff --git a/presets/emotion_sleepiness.npz b/presets/emotion_sleepiness.npz
diff --git a/presets/en2zh_tts_1.npz b/presets/en2zh_tts_1.npz
diff --git a/presets/en2zh_tts_2.npz b/presets/en2zh_tts_2.npz
diff --git a/presets/en2zh_tts_3.npz b/presets/en2zh_tts_3.npz
diff --git a/presets/en2zh_tts_4.npz b/presets/en2zh_tts_4.npz
diff --git a/presets/fuxuan.npz b/presets/fuxuan.npz
diff --git a/presets/librispeech_1.npz b/presets/librispeech_1.npz
diff --git a/presets/librispeech_2.npz b/presets/librispeech_2.npz
diff --git a/presets/librispeech_3.npz b/presets/librispeech_3.npz
diff --git a/presets/librispeech_4.npz b/presets/librispeech_4.npz
diff --git a/presets/neutral.npz b/presets/neutral.npz
diff --git a/presets/paimon.npz b/presets/paimon.npz
diff --git a/presets/sleepiness.npz b/presets/sleepiness.npz
diff --git a/presets/vctk_1.npz b/presets/vctk_1.npz
diff --git a/presets/vctk_2.npz b/presets/vctk_2.npz
diff --git a/presets/vctk_3.npz b/presets/vctk_3.npz
diff --git a/presets/vctk_4.npz b/presets/vctk_4.npz
diff --git a/presets/zh2en_tts_1.npz b/presets/zh2en_tts_1.npz
diff --git a/presets/zh2en_tts_2.npz b/presets/zh2en_tts_2.npz
diff --git a/presets/zh2en_tts_3.npz b/presets/zh2en_tts_3.npz
diff --git a/presets/zh2en_tts_4.npz b/presets/zh2en_tts_4.npz