Skip to content

Commit

Permalink
editing data_init.py; Now this file is responsible for preprocessing …
Browse files Browse the repository at this point in the history
…data and initializing files
  • Loading branch information
Anwarvic committed May 5, 2019
1 parent c6e3c19 commit 7d02811
Showing 1 changed file with 125 additions and 43 deletions.
168 changes: 125 additions & 43 deletions data_init.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,127 @@
import os
import yaml
import shutil
import sidekit
import numpy as np
from tqdm import tqdm

from utils import convert_wav, safe_makedir, parse_yaml



class Initializer():
"""
This class is for structuring the data (training/test) into h5 files
that will be used later for training and evaluating our models
#NOTE:All outputs of this script can be found in the directory TASK_DIR
To build a speaker verification model, one needs speech data from each speaker
that is to be known by the system. In speaker recognition, the set of known
speakers is called the enrollment speakers, and a speaker is enrolled
into the system when enrollment data from the speaker is processed to build
its model.
After the enrollment process, the performance of the speaker verification
system can be evaluated using test data, which in an open set scenario, will
consist of data from speakers in and outside the enrollment set.
The set of all speakers involved in testing the system will be referred to
as the test speakers.
This class is for preprocessing the data and structuring the preprocessed data
into h5 files that will be used later for training and evaluating our models.
NOTE: All outputs of this script can be found in the directory self.task_dir
"""

def __init__(self):
BASE_DIR = "/media/anwar/E/Voice_Biometrics/SIDEKIT-1.3/py3env"
self.AUDIO_DIR = os.path.join(BASE_DIR, "audio")
self.TASK_DIR = os.path.join(BASE_DIR, "task")
self.enrolled_speakers = self.__get_speakers()


def __get_speakers(self):
def __init__(self, conf_path):
"""
This private method is supposed to return the unique speakers' IDs
who are enrolled in the training data.
This method parses the YAML configuration file which can be used for
initializing the member variables.
Args:
conf_path (String): path of the YAML configuration file
"""

#location of output files
self.conf = parse_yaml(conf_path)
self.task_dir = os.path.join(self.conf['outpath'], "task")
#location of audio files
self.audio_dir = os.path.join(self.conf['outpath'], "audio")
#location of all the audio data
self.data_dir = os.path.join(self.audio_dir, "data")
#location of just the enrollment audio data
self.enroll_dir = os.path.join(self.audio_dir, "enroll")
#location of just the test audio data
self.test_dir = os.path.join(self.audio_dir, "test")


def preprocess_audio(self):
"""
enroll_dir = os.path.join(self.AUDIO_DIR, "enroll") # enrollment data directory
enroll_files = os.listdir(enroll_dir)
enroll_models = [files.split('_')[0] for files in enroll_files] # list of model IDs
return sorted(set(enroll_models))
Copy the Merged Arabic Corpus of Isolated Words into their
associated directory. The whole audio data will be in 'data'
directory, the enrolled data only will be in 'enroll', and the
test data will be in 'test'.
"""
#remove the data directory if exists
if os.path.exists(self.data_dir):
shutil.rmtree(self.data_dir)
#iterate over speakers
speakers = sorted(os.listdir(self.conf['inpath']))
for sp in tqdm(speakers, desc="Converting Audio"):
speaker_path = os.path.join(self.conf['inpath'], sp)
wav_filenames = os.listdir(speaker_path)
for wav in wav_filenames:
inwav = os.path.join(speaker_path, wav)
outwav = os.path.join(self.data_dir, wav)
convert_wav(inwav,
outwav,
no_channels = self.conf['no_channels'],
sampling_rate = self.conf['sampling_rate'],
bit_precision = self.conf['bit_precision'])

#remove the enroll directory if exists
if os.path.exists(self.enroll_dir):
shutil.rmtree(self.enroll_dir)
#remove the test directory if exists
if os.path.exists(self.test_dir):
shutil.rmtree(self.test_dir)

#create audio/enroll directory
safe_makedir(self.enroll_dir)
#create audio/test directory
safe_makedir(self.test_dir)

#parse num of sessions from configuration
enroll_sessions = self.conf['enroll_sessions']
test_sessions = self.conf['test_sessions']
assert enroll_sessions+test_sessions <= 10,\
"The summation of all sessions must be less than or equal 10!!"
#iterate over all preprocessed waves
wav_filenames = os.listdir(self.data_dir)
for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"):
_, sess, _, _ = wav.split(".")
inwav = os.path.join(self.data_dir, wav)
if int(sess) <= enroll_sessions:
outwav = os.path.join(self.enroll_dir, wav)
shutil.copyfile(inwav, outwav)
elif int(sess) <= enroll_sessions+test_sessions:
outwav = os.path.join(self.test_dir, wav)
shutil.copyfile(inwav, outwav)


def create_idMap(self, group):
"""
IdMap are used to store two lists of strings and to map between them.
Most of the time, IdMap are used to associate names of segments (sessions)
stored in leftids; with the ID of their class (that could be a speaker ID)
stored in rightids.
Most of the time, IdMap are used to associate segments names (sessions)
stored in leftids; with the ID of their class (that could be the speaker
ID) stored in rightids.
Additionally, and in order to allow more flexibility, IdMap includes two
other vectors: 'start' and 'stop', which are float vectors used to store the
boundaries of audio segments.
Args:
group (string): name of the group that we want to create idmap for
NOTE: Duplicated entries are allowed in each list.
Additionally, and in order to allow more flexibility, IdMap includes two other vectors:
'start' and 'stop', which are vectors of floats and can be used to store the boundaries of
audio segments.
An IdMap object is often used to store together: speaker IDs, segment IDs,
start and stop time of the segment and to initialize a StatServer.
"""
assert group in ["enroll", "test"],\
"Invalid group name!! Choose either 'enroll', 'test'"
# Make enrollment (IdMap) file list
group_dir = os.path.join(self.AUDIO_DIR, group) # enrollment data directory
group_files = os.listdir(group_dir)
group_models = [files.split('_')[0] for files in group_files] # list of model IDs
group_dir = os.path.join(self.audio_dir, group)
group_files = sorted(os.listdir(group_dir))
# list of model IDs
group_models = [files.split('.')[0] for files in group_files]
# list of audio segments IDs
group_segments = [group+"/"+f for f in group_files]

# Generate IdMap
Expand All @@ -58,8 +131,11 @@ def create_idMap(self, group):
group_idmap.start = np.empty(group_idmap.rightids.shape, '|O')
group_idmap.stop = np.empty(group_idmap.rightids.shape, '|O')
if group_idmap.validate():
#TODO: possibly adding tv_idmap.h5 and plda_idmap.h5
group_idmap.write(os.path.join(self.TASK_DIR, group+'_idmap.h5'))
group_idmap.write(os.path.join(self.task_dir, group+'_idmap.h5'))
#generate tv_idmap and plda_idmap as well
if group == "enroll":
group_idmap.write(os.path.join(self.task_dir, 'tv_idmap.h5'))
group_idmap.write(os.path.join(self.task_dir, 'plda_idmap.h5'))
else:
raise RuntimeError('Problems with creating idMap file')

Expand All @@ -75,18 +151,22 @@ def create_test_trials(self):
is true then the score between model i and segment j will be computed.
"""
# Make list of test segments
test_data_dir = os.path.join(self.AUDIO_DIR, "test") # test data directory
test_files = os.listdir(test_data_dir)
test_data_dir = os.path.join(self.audio_dir, "test") #test data directory
test_files = sorted(os.listdir(test_data_dir))
test_files = ["test/"+f for f in test_files]

# Make lists for trial definition, and write to file
test_models = []
test_segments = []
test_labels = []

for model in tqdm(self.enrolled_speakers, desc="Processing Enrolled-speakers"):
# Get enroll speakers
enrolled_speakers = set([])
for filename in os.listdir(os.path.join(self.audio_dir, "enroll")):
enrolled_speakers.add(filename.split(".")[0])
enrolled_speakers = sorted(enrolled_speakers)
for model in tqdm(enrolled_speakers, desc="Creating Test Cases"):
for segment in sorted(test_files):
test_model = segment.split("_")[0].split("/")[-1]
test_model = segment.split(".")[0].split("/")[-1]
test_models.append(model)
test_segments.append(segment)
# Compare gender and speaker ID for each test file
Expand All @@ -95,7 +175,7 @@ def create_test_trials(self):
else:
test_labels.append('nontarget')

with open(os.path.join(self.TASK_DIR, "test_trials.txt"), "w") as fh:
with open(os.path.join(self.task_dir, "test_trials.txt"), "w") as fh:
for i in range(len(test_models)):
fh.write(test_models[i]+' '+test_segments[i]+' '+test_labels[i]+'\n')

Expand All @@ -107,12 +187,12 @@ def create_Ndx(self):
if the test between model i and segment j is target. non(i,j) is true
if the test between model i and segment j is non-target.
"""
# Define Key and Ndx from text file
# SEE: https://projets-lium.univ-lemans.fr/sidekit/_modules/sidekit/bosaris/key.html
key = sidekit.Key.read_txt(os.path.join(self.TASK_DIR, "test_trials.txt"))
#Define Key and Ndx from text file
#SEE: https://projets-lium.univ-lemans.fr/sidekit/_modules/sidekit/bosaris/key.html
key = sidekit.Key.read_txt(os.path.join(self.task_dir, "test_trials.txt"))
ndx = key.to_ndx()
if ndx.validate():
ndx.write(os.path.join(self.TASK_DIR, 'test_ndx.h5'))
ndx.write(os.path.join(self.task_dir, 'test_ndx.h5'))
else:
raise RuntimeError('Problems with creating idMap file')

Expand All @@ -122,6 +202,7 @@ def structure(self):
This is the main method for this class, it calls all previous
methods... that's basically what it does :)
"""
self.preprocess_audio()
self.create_idMap("enroll")
self.create_idMap("test")
self.create_test_trials()
Expand All @@ -132,5 +213,6 @@ def structure(self):


if __name__ == "__main__":
init = Initializer()
conf_filename = "py3env/conf.yaml"
init = Initializer(conf_filename)
init.structure()

0 comments on commit 7d02811

Please sign in to comment.