diff --git a/data_init.py b/data_init.py index 5041b4b..80fbbe4 100644 --- a/data_init.py +++ b/data_init.py @@ -1,54 +1,127 @@ import os +import yaml +import shutil import sidekit import numpy as np from tqdm import tqdm - +from utils import convert_wav, safe_makedir, parse_yaml class Initializer(): """ - This class if for structure the data (training/test) into h5 files - that will be used later for training and evaluating our models - - #NOTE:All outputs of this script can be found in the directory TASK_DIR + To build speaker verification model, one needs speech data from each speaker + that is to be known by the system. The set of known speakers are in speaker + recognition known as the (enrollment speakers), and a speaker is enrolled + into the system when enrollment data from the speaker is processed to build + its model. + After the enrollment process, the performance of the speaker verification + system can be evaluated using test data, which in an open set scenario, will + consist of data from speakers in and outside the enrollment set. + The set of all speakers involved in testing the system will be referred to + as the test speakers. + + This class if for preprocessing and structure the preprocessed data + into h5 files that will be used later for training and evaluating our models + NOTE:All outputs of this script can be found in the directory self.task_dir """ - def __init__(self): - BASE_DIR = "/media/anwar/E/Voice_Biometrics/SIDEKIT-1.3/py3env" - self.AUDIO_DIR = os.path.join(BASE_DIR, "audio") - self.TASK_DIR = os.path.join(BASE_DIR, "task") - self.enrolled_speakers = self.__get_speakers() - - - def __get_speakers(self): + def __init__(self, conf_path): """ - This private method is supposed to return the unique speakers' IDs - who are enrolled in the training data. + This method parses the YAML configuration file which can be used for + initializing the member varaibles!! + Args: + conf_path (String): path of the YAML configuration file + """ + + #location of output files + self.conf = parse_yaml(conf_path) + self.task_dir = os.path.join(self.conf['outpath'], "task") + #location of audio files + self.audio_dir = os.path.join(self.conf['outpath'], "audio") + #location of all the audio data + self.data_dir = os.path.join(self.audio_dir, "data") + #location of just the enrollment audio data + self.enroll_dir = os.path.join(self.audio_dir, "enroll") + #location of just the test audio data + self.test_dir = os.path.join(self.audio_dir, "test") + + + def preprocess_audio(self): """ - enroll_dir = os.path.join(self.AUDIO_DIR, "enroll") # enrollment data directory - enroll_files = os.listdir(enroll_dir) - enroll_models = [files.split('_')[0] for files in enroll_files] # list of model IDs - return sorted(set(enroll_models)) + Copy the Merged Arabic Corpus of Isolated Words into their + associated directory. The whole audio data will be in 'data' + directory, the enrolled data only will be in 'enroll', and the + test data will be in 'test'. + """ + #remove the data directory if exists + if os.path.exists(self.data_dir): + shutil.rmtree(self.data_dir) + #iterate over speakers + speakers = sorted(os.listdir(self.conf['inpath'])) + for sp in tqdm(speakers, desc="Converting Audio"): + speaker_path = os.path.join(self.conf['inpath'], sp) + wav_filenames = os.listdir(speaker_path) + for wav in wav_filenames: + inwav = os.path.join(speaker_path, wav) + outwav = os.path.join(self.data_dir, wav) + convert_wav(inwav, + outwav, + no_channels = self.conf['no_channels'], + sampling_rate = self.conf['sampling_rate'], + bit_precision = self.conf['bit_precision']) + + #remove the enroll directory if exists + if os.path.exists(self.enroll_dir): + shutil.rmtree(self.enroll_dir) + #remove the test directory if exists + if os.path.exists(self.test_dir): + shutil.rmtree(self.test_dir) + + #create audio/enroll directory + safe_makedir(self.enroll_dir) + #create audio/test directory + safe_makedir(self.test_dir) + + #parse num of sessions from configuration + enroll_sessions = self.conf['enroll_sessions'] + test_sessions = self.conf['test_sessions'] + assert enroll_sessions+test_sessions <= 10,\ + "The summation of all sessions must be less than or equal 10!!" + #iterate over all preprocessed waves + wav_filenames = os.listdir(self.data_dir) + for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"): + _, sess, _, _ = wav.split(".") + inwav = os.path.join(self.data_dir, wav) + if int(sess) <= enroll_sessions: + outwav = os.path.join(self.enroll_dir, wav) + shutil.copyfile(inwav, outwav) + elif int(sess) <= enroll_sessions+test_sessions: + outwav = os.path.join(self.test_dir, wav) + shutil.copyfile(inwav, outwav) def create_idMap(self, group): """ IdMap are used to store two lists of strings and to map between them. - Most of the time, IdMap are used to associate names of segments (sessions) - stored in leftids; with the ID of their class (that could be a speaker ID) - stored in rightids. + Most of the time, IdMap are used to associate segments names (sessions) + stored in leftids; with the ID of their class (that could be the speaker + ID) stored in rightids. + Additionally, and in order to allow more flexibility, IdMap includes two + other vectors: 'start'and 'stop' which are float vectors used to store + boudaries of audio segments. + Args: + group (string): name of the group that we want to create idmap for NOTE: Duplicated entries are allowed in each list. - Additionally, and in order to allow more flexibility, IdMap includes two other vectors: - 'start'and 'stop' which are vectors of floats and can be used to store boudaries of - audio segments. - An IdMap object is often used to store together: speaker IDs, segment IDs, - start and stop time of the segment and to initialize a StatServer. """ + assert group in ["enroll", "test"],\ + "Invalid group name!! Choose either 'enroll', 'test'" # Make enrollment (IdMap) file list - group_dir = os.path.join(self.AUDIO_DIR, group) # enrollment data directory - group_files = os.listdir(group_dir) - group_models = [files.split('_')[0] for files in group_files] # list of model IDs + group_dir = os.path.join(self.audio_dir, group) + group_files = sorted(os.listdir(group_dir)) + # list of model IDs + group_models = [files.split('.')[0] for files in group_files] + # list of audio segments IDs group_segments = [group+"/"+f for f in group_files] # Generate IdMap @@ -58,8 +131,11 @@ def create_idMap(self, group): group_idmap.start = np.empty(group_idmap.rightids.shape, '|O') group_idmap.stop = np.empty(group_idmap.rightids.shape, '|O') if group_idmap.validate(): - #TODO: possibily adding tv_idmap.h5 and plda_idmap.h5 - group_idmap.write(os.path.join(self.TASK_DIR, group+'_idmap.h5')) + group_idmap.write(os.path.join(self.task_dir, group+'_idmap.h5')) + #generate tv_idmap and plda_idmap as well + if group == "enroll": + group_idmap.write(os.path.join(self.task_dir, 'tv_idmap.h5')) + group_idmap.write(os.path.join(self.task_dir, 'plda_idmap.h5')) else: raise RuntimeError('Problems with creating idMap file') @@ -75,18 +151,22 @@ def create_test_trials(self): is true then the score between model i and segment j will be computed. """ # Make list of test segments - test_data_dir = os.path.join(self.AUDIO_DIR, "test") # test data directory - test_files = os.listdir(test_data_dir) + test_data_dir = os.path.join(self.audio_dir, "test") #test data directory + test_files = sorted(os.listdir(test_data_dir)) test_files = ["test/"+f for f in test_files] # Make lists for trial definition, and write to file test_models = [] test_segments = [] test_labels = [] - - for model in tqdm(self.enrolled_speakers, desc="Processing Enrolled-speakers"): + # Get enroll speakers + enrolled_speakers = set([]) + for filename in os.listdir(os.path.join(self.audio_dir, "enroll")): + enrolled_speakers.add(filename.split(".")[0]) + enrolled_speakers = sorted(enrolled_speakers) + for model in tqdm(enrolled_speakers, desc="Creating Test Cases"): for segment in sorted(test_files): - test_model = segment.split("_")[0].split("/")[-1] + test_model = segment.split(".")[0].split("/")[-1] test_models.append(model) test_segments.append(segment) # Compare gender and speaker ID for each test file @@ -95,7 +175,7 @@ def create_test_trials(self): else: test_labels.append('nontarget') - with open(os.path.join(self.TASK_DIR, "test_trials.txt"), "w") as fh: + with open(os.path.join(self.task_dir, "test_trials.txt"), "w") as fh: for i in range(len(test_models)): fh.write(test_models[i]+' '+test_segments[i]+' '+test_labels[i]+'\n') @@ -107,12 +187,12 @@ def create_Ndx(self): if the test between model i and segment j is target. non(i,j) is true if the test between model i and segment j is non-target. """ - # Define Key and Ndx from text file - # SEE: https://projets-lium.univ-lemans.fr/sidekit/_modules/sidekit/bosaris/key.html - key = sidekit.Key.read_txt(os.path.join(self.TASK_DIR, "test_trials.txt")) + #Define Key and Ndx from text file + #SEE: https://projets-lium.univ-lemans.fr/sidekit/_modules/sidekit/bosaris/key.html + key = sidekit.Key.read_txt(os.path.join(self.task_dir, "test_trials.txt")) ndx = key.to_ndx() if ndx.validate(): - ndx.write(os.path.join(self.TASK_DIR, 'test_ndx.h5')) + ndx.write(os.path.join(self.task_dir, 'test_ndx.h5')) else: raise RuntimeError('Problems with creating idMap file') @@ -122,6 +202,7 @@ def structure(self): This is the main method for this class, it calls all previous methods... that's basically what it does :) """ + self.preprocess_audio() self.create_idMap("enroll") self.create_idMap("test") self.create_test_trials() @@ -132,5 +213,6 @@ def structure(self): if __name__ == "__main__": - init = Initializer() + conf_filename = "py3env/conf.yaml" + init = Initializer(conf_filename) init.structure() \ No newline at end of file