Commit 209995f: update
lmzjms committed Apr 13, 2023 (1 parent: 34d0365)
Showing 1 changed file with 83 additions and 44 deletions: audio-chatgpt.py
@@ -4,6 +4,8 @@
sys.path.append(os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'NeuralSeq'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_img'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'text_to_audio/Make_An_Audio_inpaint'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'audio_detection'))
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'mono2binaural'))
import gradio as gr
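As an aside, the repeated dirname expression in the appends above could be factored into a loop; the following is an illustrative sketch of an equivalent form, not the committed code:

# Sketch only: equivalent to the sys.path.append calls above.
_here = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.dirname(_here))  # repository root
for _sub in ('NeuralSeq', 'text_to_audio/Make_An_Audio',
             'text_to_audio/Make_An_Audio_img', 'text_to_audio/Make_An_Audio_inpaint',
             'audio_detection', 'mono2binaural'):
    sys.path.append(os.path.join(_here, _sub))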
@@ -867,6 +869,86 @@ def inference(self, text, audio_path):
        #print(ans)
        return ans

# class Speech_Enh_SS_SC:
#     """Speech Enhancement or Separation in single-channel
#     Example usage:
#         enh_model = Speech_Enh_SS_SC("cuda")
#         enh_wav = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
#     """
#     def __init__(self, device="cuda", model_name="lichenda/chime4_fasnet_dprnn_tac"):
#         self.model_name = model_name
#         self.device = device
#         print("Initializing ESPnet Enh to %s" % device)
#         self._initialize_model()

#     def _initialize_model(self):
#         from espnet_model_zoo.downloader import ModelDownloader
#         from espnet2.bin.enh_inference import SeparateSpeech

#         d = ModelDownloader()

#         cfg = d.download_and_unpack(self.model_name)
#         self.separate_speech = SeparateSpeech(
#             train_config=cfg["train_config"],
#             model_file=cfg["model_file"],
#             # for segment-wise processing of long speech
#             segment_size=2.4,
#             hop_size=0.8,
#             normalize_segment_scale=False,
#             show_progressbar=True,
#             ref_channel=None,
#             normalize_output_wav=True,
#             device=self.device,
#         )

#     def inference(self, speech_path, ref_channel=0):
#         speech, sr = soundfile.read(speech_path)
#         speech = speech[:, ref_channel]
#         # soundfile returns a numpy array, so check .ndim (not torch's .dim())
#         assert speech.ndim == 1

#         enh_speech = self.separate_speech(speech[None, :], fs=sr)
#         if len(enh_speech) == 1:
#             return enh_speech[0]
#         return enh_speech

# class Speech_Enh_SS_MC:
#     """Speech Enhancement or Separation in multi-channel"""
#     def __init__(self, device="cuda", model_name=None, ref_channel=4):
#         self.model_name = model_name
#         self.ref_channel = ref_channel
#         self.device = device
#         print("Initializing ESPnet Enh to %s" % device)
#         self._initialize_model()

#     def _initialize_model(self):
#         from espnet_model_zoo.downloader import ModelDownloader
#         from espnet2.bin.enh_inference import SeparateSpeech

#         d = ModelDownloader()

#         cfg = d.download_and_unpack(self.model_name)
#         self.separate_speech = SeparateSpeech(
#             train_config=cfg["train_config"],
#             model_file=cfg["model_file"],
#             # for segment-wise processing of long speech
#             segment_size=2.4,
#             hop_size=0.8,
#             normalize_segment_scale=False,
#             show_progressbar=True,
#             ref_channel=self.ref_channel,
#             normalize_output_wav=True,
#             device=self.device,
#         )

#     def inference(self, speech_path):
#         speech, sr = soundfile.read(speech_path)
#         speech = speech.T

#         enh_speech = self.separate_speech(speech[None, ...], fs=sr)
#         if len(enh_speech) == 1:
#             return enh_speech[0]
#         return enh_speech

class Speech_Enh_SS_SC:
    """Speech Enhancement or Separation in single-channel
    Example usage:
@@ -961,49 +1043,6 @@ def inference(self, speech_path):
        audio_filename = merge_audio(audio_filename_1, audio_filename_2)
        return audio_filename
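A minimal usage sketch for the single-channel class, following the example in its docstring; the wav path is the docstring's illustrative CHiME-4 file, and the return value is the output path written by inference above.

# Sketch only, not committed code; the test wav path comes from the class docstring.
enh_model = Speech_Enh_SS_SC(device="cuda")
out_path = enh_model.inference("./test_chime4_audio_M05_440C0213_PED_REAL.wav")
print("enhanced audio written to:", out_path)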

class Speech_Enh_SS_MC:
    """Speech Enhancement or Separation in multi-channel"""
    def __init__(self, device="cuda", model_name="espnet/Wangyou_Zhang_chime4_enh_train_enh_dc_crn_mapping_snr_raw", ref_channel=4):
        self.model_name = model_name
        self.ref_channel = ref_channel
        self.device = device
        print("Initializing ESPnet Enh to %s" % device)
        self._initialize_model()

    def _initialize_model(self):
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.enh_inference import SeparateSpeech

        d = ModelDownloader()

        cfg = d.download_and_unpack(self.model_name)
        self.separate_speech = SeparateSpeech(
            train_config=cfg["train_config"],
            model_file=cfg["model_file"],
            # for segment-wise processing of long speech
            segment_size=2.4,
            hop_size=0.8,
            normalize_segment_scale=False,
            show_progressbar=True,
            ref_channel=self.ref_channel,
            normalize_output_wav=True,
            device=self.device,
        )

    def inference(self, speech_path):
        speech, sr = soundfile.read(speech_path)
        # soundfile yields (num_samples, num_channels); transpose to channels-first
        speech = speech.T
        # add a batch axis: (1, num_channels, num_samples)
        enh_speech = self.separate_speech(speech[None, ...], fs=sr)
        audio_filename = os.path.join('audio', str(uuid.uuid4())[0:8] + ".wav")
        # SeparateSpeech returns a list of waveforms (one per separated source);
        # write the first (enhanced) source to disk
        soundfile.write(audio_filename, enh_speech[0].squeeze(), samplerate=sr)
        return audio_filename
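Likewise, a minimal usage sketch for the multi-channel class; "multichannel_input.wav" is a hypothetical recording with at least five channels, since the default ref_channel is 4.

# Sketch only: multi-channel enhancement with the default CHiME-4 DC-CRN model.
# "multichannel_input.wav" is a placeholder path, not a file in the repo.
enh_model = Speech_Enh_SS_MC(device="cuda")
out_path = enh_model.inference("multichannel_input.wav")
print("enhanced audio written to:", out_path)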

class ConversationBot:
    def __init__(self):
        print("Initializing AudioGPT")
@@ -1396,4 +1435,4 @@ def clear_button(self):
    clear_speech.click(lambda: [], None, state)
    clear_speech.click(bot.clear_video, None, outvideo)

demo.launch(server_name="0.0.0.0", server_port=7861, share=True)
demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
