Commit 209995f: update
lmzjms committed Apr 13, 2023 (1 parent: 34d0365)
Showing 1 changed file with 83 additions and 44 deletions: audio-chatgpt.py
@@ -4,6 +4,8 @@
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
import gradio as gr
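As an aside, the repeated dirname expression in the appends above could be factored into a loop; the following is an illustrative sketch of an equivalent form, not the committed code:

# Sketch only: equivalent to the sys.path.append calls above.
_here = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.dirname(_here))  # repository root
for _sub in ('NeuralSeq', 'text_to_audio/Make_An_Audio',
             'text_to_audio/Make_An_Audio_img', 'text_to_audio/Make_An_Audio_inpaint',
             'audio_detection', 'mono2binaural'):
    sys.path.append(os.path.join(_here, _sub))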
@@ -867,6 +869,86 @@ def inference(self, text, audio_path):
        #print(ans)
        return ans

# class Speech_Enh_SS_SC:
#     """Speech Enhancement or Separation in single-channel
#     Example usage:
#         enh_model = Speech_Enh_SS_SC("cuda")
#         enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
#     """
#     def __init__(self, device="cuda", model_name="lichenda/chime4_fasnet_dprnn_tac"):
#         self.model_name = model_name
#         self.device = device
#         print("Initializing ESPnet Enh to %s" % device)
#         self._initialize_model()

#     def _initialize_model(self):
#         from espnet_model_zoo.downloader import ModelDownloader
#         from espnet2.bin.enh_inference import SeparateSpeech

#         d = ModelDownloader()

#         cfg = d.download_and_unpack(self.model_name)
#         self.separate_speech = SeparateSpeech(
#             train_config=cfg["train_config"],
#             model_file=cfg["model_file"],
#             # for segment-wise processing of long speech
#             segment_size=2.4,
#             hop_size=0.8,
#             normalize_segment_scale=False,
#             show_progressbar=True,
#             ref_channel=None,
#             normalize_output_wav=True,
#             device=self.device,
#         )

#     def inference(self, speech_path, ref_channel=0):
#         speech, sr = soundfile.read(speech_path)
#         speech = speech[:, ref_channel]
#         # soundfile returns a numpy array, so check .ndim (not torch's .dim())
#         assert speech.ndim == 1

#         enh_speech = self.separate_speech(speech[None, :], fs=sr)
#         if len(enh_speech) == 1:
#             return enh_speech[0]
#         return enh_speech

# class Speech_Enh_SS_MC:
#     """Speech Enhancement or Separation in multi-channel"""
#     def __init__(self, device="cuda", model_name=None, ref_channel=4):
#         self.model_name = model_name
#         self.ref_channel = ref_channel
#         self.device = device
#         print("Initializing ESPnet Enh to %s" % device)
#         self._initialize_model()

#     def _initialize_model(self):
#         from espnet_model_zoo.downloader import ModelDownloader
#         from espnet2.bin.enh_inference import SeparateSpeech

#         d = ModelDownloader()

#         cfg = d.download_and_unpack(self.model_name)
#         self.separate_speech = SeparateSpeech(
#             train_config=cfg["train_config"],
#             model_file=cfg["model_file"],
#             # for segment-wise processing of long speech
#             segment_size=2.4,
#             hop_size=0.8,
#             normalize_segment_scale=False,
#             show_progressbar=True,
#             ref_channel=self.ref_channel,
#             normalize_output_wav=True,
#             device=self.device,
#         )

#     def inference(self, speech_path):
#         speech, sr = soundfile.read(speech_path)
#         speech = speech.T

#         enh_speech = self.separate_speech(speech[None, ...], fs=sr)
#         if len(enh_speech) == 1:
#             return enh_speech[0]
#         return enh_speech

class Speech_Enh_SS_SC:
    """Speech Enhancement or Separation in single-channel
    Example usage:
@@ -961,49 +1043,6 @@ def inference(self, speech_path):
        audio_filename = merge_audio(audio_filename_1, audio_filename_2)
        return audio_filename
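A minimal usage sketch for the single-channel class, following the example in its docstring; the wav path is the docstring's illustrative CHiME-4 file, and the return value is the output path written by inference above.

# Sketch only, not committed code; the test wav path comes from the class docstring.
enh_model = Speech_Enh_SS_SC(device="cuda")
out_path = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
print("enhanced audio written to:", out_path)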

class Speech_Enh_SS_MC:
    """Speech Enhancement or Separation in multi-channel"""
    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_dc_crn_mapping_snr_raw", ref_channel=4):
        self.model_name = model_name
        self.ref_channel = ref_channel
        self.device = device
        print("Initializing ESPnet Enh to %s" % device)
        self._initialize_model()

    def _initialize_model(self):
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.enh_inference import SeparateSpeech

        d = ModelDownloader()

        cfg = d.download_and_unpack(self.model_name)
        self.separate_speech = SeparateSpeech(
            train_config=cfg["train_config"],
            model_file=cfg["model_file"],
            # for segment-wise processing of long speech
            segment_size=2.4,
            hop_size=0.8,
            normalize_segment_scale=False,
            show_progressbar=True,
            ref_channel=self.ref_channel,
            normalize_output_wav=True,
            device=self.device,
        )

    def inference(self, speech_path):
        speech, sr = soundfile.read(speech_path)
        # soundfile yields (num_samples, num_channels); transpose to channels-first
        speech = speech.T
        # add a batch axis: (1, num_channels, num_samples)
        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        # SeparateSpeech returns a list of waveforms (one per separated source);
        # write the first (enhanced) source to disk
        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
        return audio_filename
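Likewise, a minimal usage sketch for the multi-channel class; "multichannel_input.wav" is a hypothetical recording with at least five channels, since the default ref_channel is 4.

# Sketch only: multi-channel enhancement with the default CHiME-4 DC-CRN model.
# "multichannel_input.wav" is a placeholder path, not a file in the repo.
enh_model = Speech_Enh_SS_MC(device="cuda")
out_path = enh_model.inference("multichannel_input.wav")
print("enhanced audio written to:", out_path)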

class ConversationBot:
    def __init__(self):
        print("Initializing AudioGPT")
@@ -1396,4 +1435,4 @@ def clear_button(self):
    clear_speech.click(lambda: [], None, state)
    clear_speech.click(bot.clear_video, None, outvideo)

demo.launch(server_name="0.0.0.0", server_port=7861, share=True)
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
