Editing data_init.py; this file is now responsible for preprocessing data and initializing files
Anwarvic · May 5, 2019 · 7d02811 · 7d02811
1 parent c6e3c19
commit 7d02811
Showing 1 changed file with 125 additions and 43 deletions.
diff --git a/data_init.py b/data_init.py
@@ -1,54 +1,127 @@
 import os
+import yaml
+import shutil
 import sidekit
 import numpy as np
 from tqdm import tqdm
-
+from utils import convert_wav, safe_makedir, parse_yaml
 
 
 
 class Initializer():
     """
-    This class if for structure the data (training/test) into h5 files
-    that will be used later for training and evaluating our models
-    
-    #NOTE:All outputs of this script can be found in the directory TASK_DIR
+    To build a speaker verification model, one needs speech data from each
+    speaker that is to be known by the system. In speaker recognition, the set
+    of known speakers is called the enrollment speakers, and a speaker is
+    enrolled into the system when enrollment data from that speaker is
+    processed to build its model.
+    After the enrollment process, the performance of the speaker verification
+    system can be evaluated using test data, which in an open set scenario, will
+    consist of data from speakers in and outside the enrollment set.
+    The set of all speakers involved in testing the system will be referred to
+    as the test speakers.
+
+    This class preprocesses the data and structures the preprocessed data
+    into h5 files that will be used later for training and evaluating our models
+    NOTE: All outputs of this script can be found in the directory self.task_dir
     """
 
-    def __init__(self):
-        BASE_DIR = "/media/anwar/E/Voice_Biometrics/SIDEKIT-1.3/py3env"
-        self.AUDIO_DIR = os.path.join(BASE_DIR, "audio")
-        self.TASK_DIR = os.path.join(BASE_DIR, "task")
-        self.enrolled_speakers = self.__get_speakers()
-
-
-    def __get_speakers(self):
+    def __init__(self, conf_path):
         """
-        This private method is supposed to return the unique speakers' IDs
-        who are enrolled in the training data.
+        This method parses the YAML configuration file, which is used for
+        initializing the member variables.
+        Args:
+            conf_path (String): path of the YAML configuration file
+        """
+
+        #location of output files
+        self.conf = parse_yaml(conf_path)
+        self.task_dir = os.path.join(self.conf['outpath'], "task")
+        #location of audio files
+        self.audio_dir = os.path.join(self.conf['outpath'], "audio")
+        #location of all the audio data
+        self.data_dir = os.path.join(self.audio_dir, "data")
+        #location of just the enrollment audio data
+        self.enroll_dir = os.path.join(self.audio_dir, "enroll")
+        #location of just the test audio data
+        self.test_dir = os.path.join(self.audio_dir, "test")
+
+
+    def preprocess_audio(self):
         """
-        enroll_dir = os.path.join(self.AUDIO_DIR, "enroll") # enrollment data directory
-        enroll_files = os.listdir(enroll_dir)
-        enroll_models = [files.split('_')[0] for files in enroll_files] # list of model IDs
-        return sorted(set(enroll_models))
+        Copy the Merged Arabic Corpus of Isolated Words into their
+        associated directory. The whole audio data will be in 'data'
+        directory, the enrolled data only will be in 'enroll', and the
+        test data will be in 'test'.
+        """
+        #remove the data directory if exists
+        if os.path.exists(self.data_dir):
+            shutil.rmtree(self.data_dir)
+        #iterate over speakers
+        speakers = sorted(os.listdir(self.conf['inpath']))
+        for sp in tqdm(speakers, desc="Converting Audio"):
+            speaker_path = os.path.join(self.conf['inpath'], sp)
+            wav_filenames = os.listdir(speaker_path)
+            for wav in wav_filenames:
+                inwav = os.path.join(speaker_path, wav)
+                outwav = os.path.join(self.data_dir, wav)
+                convert_wav(inwav,
+                            outwav,
+                            no_channels = self.conf['no_channels'],
+                            sampling_rate = self.conf['sampling_rate'],
+                            bit_precision = self.conf['bit_precision'])
+
+        #remove the enroll directory if exists
+        if os.path.exists(self.enroll_dir):
+            shutil.rmtree(self.enroll_dir)
+        #remove the test directory if exists
+        if os.path.exists(self.test_dir):
+            shutil.rmtree(self.test_dir)
+
+        #create audio/enroll directory
+        safe_makedir(self.enroll_dir)
+        #create audio/test directory
+        safe_makedir(self.test_dir)
+
+        #parse num of sessions from configuration
+        enroll_sessions = self.conf['enroll_sessions']
+        test_sessions = self.conf['test_sessions']
+        assert enroll_sessions+test_sessions <= 10,\
+            "The summation of all sessions must be less than or equal 10!!"
+        #iterate over all preprocessed waves
+        wav_filenames = os.listdir(self.data_dir)
+        for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"):
+            _, sess, _, _ = wav.split(".")
+            inwav = os.path.join(self.data_dir, wav)
+            if int(sess) <= enroll_sessions:
+                outwav = os.path.join(self.enroll_dir, wav)
+                shutil.copyfile(inwav, outwav)
+            elif int(sess) <= enroll_sessions+test_sessions:
+                outwav = os.path.join(self.test_dir, wav)
+                shutil.copyfile(inwav, outwav)
 
 
     def create_idMap(self, group):
         """
         IdMap are used to store two lists of strings and to map between them.
-        Most of the time, IdMap are used to associate names of segments (sessions)
-        stored in leftids; with the ID of their class (that could be a speaker ID)
-        stored in rightids.
+        Most of the time, IdMap are used to associate segments names (sessions)
+        stored in leftids; with the ID of their class (that could be the speaker
+        ID) stored in rightids.
+        Additionally, and in order to allow more flexibility, IdMap includes two
+        other vectors: 'start' and 'stop', which are float vectors used to store
+        the boundaries of audio segments.
+        Args:
+            group (string): name of the group that we want to create idmap for
         NOTE: Duplicated entries are allowed in each list.
-        Additionally, and in order to allow more flexibility, IdMap includes two other vectors:
-        'start'and 'stop' which are vectors of floats and can be used to store boudaries of
-        audio segments.
-        An IdMap object is often used to store together: speaker IDs, segment IDs,
-        start and stop time of the segment and to initialize a StatServer.
         """
+        assert group in ["enroll", "test"],\
+            "Invalid group name!! Choose either 'enroll', 'test'"
         # Make enrollment (IdMap) file list
-        group_dir = os.path.join(self.AUDIO_DIR, group) # enrollment data directory
-        group_files = os.listdir(group_dir)
-        group_models = [files.split('_')[0] for files in group_files] # list of model IDs
+        group_dir = os.path.join(self.audio_dir, group)
+        group_files = sorted(os.listdir(group_dir))
+        # list of model IDs
+        group_models = [files.split('.')[0] for files in group_files]
+        # list of audio segments IDs
         group_segments = [group+"/"+f for f in group_files]
 
         # Generate IdMap
@@ -58,8 +131,11 @@ def create_idMap(self, group):
         group_idmap.start = np.empty(group_idmap.rightids.shape, '|O')
         group_idmap.stop = np.empty(group_idmap.rightids.shape, '|O')
         if group_idmap.validate():
-            #TODO: possibily adding tv_idmap.h5 and plda_idmap.h5
-            group_idmap.write(os.path.join(self.TASK_DIR, group+'_idmap.h5'))
+            group_idmap.write(os.path.join(self.task_dir, group+'_idmap.h5'))
+            #generate tv_idmap and plda_idmap as well
+            if group == "enroll":
+                group_idmap.write(os.path.join(self.task_dir, 'tv_idmap.h5'))
+                group_idmap.write(os.path.join(self.task_dir, 'plda_idmap.h5'))
         else:
             raise RuntimeError('Problems with creating idMap file')
 
@@ -75,18 +151,22 @@ def create_test_trials(self):
         is true then the score between model i and segment j will be computed.
         """
         # Make list of test segments
-        test_data_dir = os.path.join(self.AUDIO_DIR, "test") # test data directory
-        test_files = os.listdir(test_data_dir)
+        test_data_dir = os.path.join(self.audio_dir, "test") #test data directory
+        test_files = sorted(os.listdir(test_data_dir))
         test_files = ["test/"+f for f in test_files]
 
         # Make lists for trial definition, and write to file
         test_models = []
         test_segments = []
         test_labels = []
-
-        for model in tqdm(self.enrolled_speakers, desc="Processing Enrolled-speakers"):
+        # Get enroll speakers
+        enrolled_speakers = set([])
+        for filename in os.listdir(os.path.join(self.audio_dir, "enroll")):
+            enrolled_speakers.add(filename.split(".")[0])
+        enrolled_speakers = sorted(enrolled_speakers)
+        for model in tqdm(enrolled_speakers, desc="Creating Test Cases"):
             for segment in sorted(test_files):
-                test_model = segment.split("_")[0].split("/")[-1]
+                test_model = segment.split(".")[0].split("/")[-1]
                 test_models.append(model)
                 test_segments.append(segment)
                 # Compare gender and speaker ID for each test file
@@ -95,7 +175,7 @@ def create_test_trials(self):
                 else:
                     test_labels.append('nontarget')
 
-        with open(os.path.join(self.TASK_DIR, "test_trials.txt"), "w") as fh:
+        with open(os.path.join(self.task_dir, "test_trials.txt"), "w") as fh:
             for i in range(len(test_models)):
                 fh.write(test_models[i]+' '+test_segments[i]+' '+test_labels[i]+'\n')
 
@@ -107,12 +187,12 @@ def create_Ndx(self):
         if the test between model i and segment j is target. non(i,j) is true
         if the test between model i and segment j is non-target.
         """
-        # Define Key and Ndx from text file
-        # SEE: https://projets-lium.univ-lemans.fr/sidekit/_modules/sidekit/bosaris/key.html
-        key = sidekit.Key.read_txt(os.path.join(self.TASK_DIR, "test_trials.txt"))
+        #Define Key and Ndx from text file
+        #SEE: https://projets-lium.univ-lemans.fr/sidekit/_modules/sidekit/bosaris/key.html
+        key = sidekit.Key.read_txt(os.path.join(self.task_dir, "test_trials.txt"))
         ndx = key.to_ndx()
         if ndx.validate():
-            ndx.write(os.path.join(self.TASK_DIR, 'test_ndx.h5'))
+            ndx.write(os.path.join(self.task_dir, 'test_ndx.h5'))
         else:
             raise RuntimeError('Problems with creating idMap file')
 
@@ -122,6 +202,7 @@ def structure(self):
         This is the main method for this class, it calls all previous
         methods... that's basically what it does :)
         """
+        self.preprocess_audio()
         self.create_idMap("enroll")
         self.create_idMap("test")
         self.create_test_trials()
@@ -132,5 +213,6 @@ def structure(self):
 
 
 if __name__ == "__main__":
-    init = Initializer()
+    conf_filename = "py3env/conf.yaml"
+    init = Initializer(conf_filename)
     init.structure()