Commit

config implementation synced with files other than the main model file; readme added
csiki committed Feb 27, 2019
1 parent f8953ea commit 8da364c
Showing 9 changed files with 148 additions and 223 deletions.
58 changes: 48 additions & 10 deletions README.md
@@ -1,10 +1,48 @@
mention where the original model comes from, which github site, which paper, wavegan:
implementing [DRAW](https://arxiv.org/abs/1502.04623)
from [Github repo](https://github.com/kvfrans/draw-color)
uses wavegan

# TODO
- [ ] finish readme
- [ ] create new config handling mechanism with a global config file
- [ ] streamline training and testing
- [ ] migrate aev2a params to config, input dimension can not be read from data
# Autoencoded sensory substitution

Visual-to-auditory (V2A) sensory substitution is the translation of images to sound,
with the aim of aiding the blind. The generated soundscapes should convey visual information,
ideally representing every detail of the given image with as short a sound sequence as possible.
Traditional V2A conversion methods apply an explicitly predefined function that transforms the input
image pixel by pixel into soundscapes, superimposing them in the final step. This repository implements
a novel conversion approach that frames sensory substitution as a compression problem.

Optimal compression is learnt and computed by a recurrent variational autoencoder, called AEV2A.
The autoencoder takes an image as input and translates it into a sequence of soundscapes, before
reconstructing the image iteratively by drawing on a canvas. The neural network implementation
is based on the [DRAW](https://arxiv.org/abs/1502.04623) model; the repository from which the code was
initially cloned can be found [here](https://github.com/kvfrans/draw-color). AEV2A further builds on
[WaveGAN](https://arxiv.org/abs/1802.04208) (repo [here](https://github.com/chrisdonahue/wavegan)).
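
As a rough illustration of this loop, here is a minimal sketch (not the actual aev2a.py code; the
`encode`, `decode` and `write` callables are placeholders for the network components):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def aev2a_sketch(image, encode, decode, write, rng, seq_len=8):
    """One possible shape of the iterative encode-listen-draw loop."""
    canvas = np.zeros_like(image)
    soundscapes = []
    for _ in range(seq_len):
        error = image - sigmoid(canvas)                   # what the canvas still misses
        mu, log_sigma = encode(image, error)              # recurrent encoder (placeholder)
        z = mu + np.exp(log_sigma) * rng.standard_normal(mu.shape)  # reparameterized latent sample
        sound_params, patch = decode(z)                   # latent -> soundscape params + patch to draw
        soundscapes.append(sound_params)
        canvas = canvas + write(patch)                    # accumulate the drawing on the canvas
    return sigmoid(canvas), soundscapes                   # reconstruction and the soundscape sequence
```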

For further details, check this [blog post](TODO) or the thesis [here](TODO).

[TODO ADD GIF HERE]

## Requirements
[TODO PYTHON LIBS, FFMPEG, PYMATLAB]

## How to run
[TODO high lvl: gen dataset/download, train network, test network, run proto, analyze image-to-sound]
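
Until the steps above are fleshed out, here is a hedged sketch of the training flow implied by
`aev2a.py` in this commit (the config id and dataset path are example values, not guaranteed defaults):

```python
from config import load_config
from aev2a import Draw

params = load_config('default')   # the config id has to be defined in configs.json
model = Draw(params, model_name_postfix='', logging=True, training=True)
model.train('data/simple_hand.hdf5',   # example dataset path; see the Datasets section
            restore=True, nepoch=10000, log_every=100, save_every=1000)
```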

## What's what
[TODO very short descr of each folder/file]

## Datasets
[TODO short description, more under /data]

[TODO image of hands dataset]

## Training

### Tensorboard analysis

## Testing

## Running live

## Image-to-sound conversion analysis

## Citation
```
[TODO bibtex]
```
13 changes: 6 additions & 7 deletions aev2a.py
@@ -14,8 +14,7 @@

class Draw:

def __init__(self, nepoch, network_params, model_name_postfix, logging=True, training=True):
self.nepoch = nepoch
def __init__(self, network_params, model_name_postfix, logging=True, training=True):
self.img_h, self.img_w, self.num_colors = network_params['input_dim']
self.grayscale = self.num_colors == 1 # else RGB
self.logging = logging
@@ -593,7 +592,7 @@ def get_batch(self, data, indices=None, batch_size=None, start_stop_index=None):
return np.array(
[get_image(data[i], self.grayscale) for i in indices]).astype(self.npdtype)

def train(self, dataset, restore=True, model_name=None, log_every=100, save_every=1000):
def train(self, dataset, restore=True, model_name=None, nepoch=10000, log_every=100, save_every=1000):
self.model_name = model_name or self.model_name

data = self.get_data(dataset)
@@ -610,7 +609,7 @@ def train(self, dataset, restore=True, model_name=None, log_every=100, save_ever
print('NEW MODEL "{}" IS BEING TRAINED'.format(self.model_name), file=sys.stderr)

start_time = time.time()
for e in range(self.nepoch):
for e in range(nepoch):
nbatch = (data_len // self.batch_size) - 2
for i in range(nbatch):

@@ -643,7 +642,7 @@ def train(self, dataset, restore=True, model_name=None, log_every=100, save_ever

if (e * nbatch + i + 1) % save_every == 0:
saver.save(self.sess, os.path.join(os.getcwd(), 'training', self.model_name, 'train'), global_step=glob_step)
print('MODEL "{}" SAVED at iteration {}'.format(self.model_name, e * self.nepoch + i), file=sys.stderr)
print('MODEL "{}" SAVED at iteration {}'.format(self.model_name, e * nepoch + i), file=sys.stderr)

cs = 1.0/(1.0+np.exp(-np.array(cs))) # x_recons=sigmoid(canvas)

@@ -779,7 +778,7 @@ def view(self, dataset, model_name=None):
params = load_config(config_id)
pprint(params, stream=sys.stderr)

model = Draw(nepoch, params, model_name_postfix=model_name_postfix, logging=logging, training=train_or_test)
model = Draw(params, model_name_postfix=model_name_postfix, logging=logging, training=train_or_test)
print('MODEL IS BUILT', file=sys.stderr)

# save config with the assigned model name updated
@@ -799,7 +798,7 @@ def view(self, dataset, model_name=None):
print('total_parameters', total_parameters, file=sys.stderr)

if train_or_test:
model.train(dataset, restore=True, log_every=log_every, save_every=save_every)
model.train(dataset, restore=True, nepoch=nepoch, log_every=log_every, save_every=save_every)
else:
model.gen_vids(dataset, output_prefix=config_id, training_path='training/')
model.view(dataset)
8 changes: 4 additions & 4 deletions audio_gen.py
@@ -226,10 +226,10 @@ def separate_params(inp_params, nsoundstream, nmodulation, varying_delta, loggin


# needed in color draw model init
def soundscape_len(stream_params, fs):
section_len = int(stream_params['section_len_msec'] / 1000. * fs)
soundstream_len = int(stream_params['nmodulation'] * section_len)
return int(soundstream_len * stream_params['soundscape_len_by_stream_len'])
def soundscape_len(audio_params, fs):
section_len = int(audio_params['section_len_msec'] / 1000. * fs)
soundstream_len = int(audio_params['nmodulation'] * section_len)
return int(soundstream_len * audio_params['soundscape_len_by_stream_len'])


def gen_single_input(f0_in, a0_in, azim0_in, phase_in, df_in, da_in, dazim_in,
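
Note: `soundscape_len` above just chains three multiplications. A quick worked example with assumed
parameter values (purely illustrative, not the repository defaults):

```python
fs = 44100                                   # assumed sampling rate in Hz
audio_params = {'section_len_msec': 10,      # assumed values for illustration only
                'nmodulation': 4,
                'soundscape_len_by_stream_len': 1.5}

section_len = int(audio_params['section_len_msec'] / 1000. * fs)                      # 441 samples
soundstream_len = int(audio_params['nmodulation'] * section_len)                      # 1764 samples
soundscape_len = int(soundstream_len * audio_params['soundscape_len_by_stream_len'])  # 2646 samples
```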
6 changes: 6 additions & 0 deletions config.py
@@ -107,6 +107,12 @@ def load_config(cfg_id):
return params # model names are not returned


def find_model(cfg_id, model_name_postfix):
with open('configs.json', 'rt') as f:
configs = json.load(f)
return [m for m in configs[cfg_id]['models'] if model_name_postfix in m][0] # select first occurrence


def save_config(cfg_id, params, model_name):
with open('configs.json', 'rt') as f:
configs = json.load(f)
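
Note: a hedged usage sketch of the new `find_model` helper, assuming `configs.json` maps each config
id to a `models` list as implied above (it raises `IndexError` if no model was saved for that config yet):

```python
from config import find_model

model_name = find_model('default', '')   # empty postfix: just take the first model of this config
print(model_name)
```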
16 changes: 8 additions & 8 deletions disentangle_anal.py
@@ -99,20 +99,20 @@ def embed_features(sound_features, embedding_name, meta_name, sprite_img_path, i
'5/draw_hand': False, '5/hist': False, '5/lame_plot': False, '6': True, '6/labeling': False,
'6/create_sprite': False, '6/embed': False, '7': False, '8': True}

# load data
test_set = sys.argv[1] == 'test' if len(sys.argv) > 1 else True
set_text = '_test' if test_set else '_train'
table_cfg = 'table3-nov1-8seq' # table3-nov1-8seq, table3-nov1-8seq-zind
hand_cfg = 'v1-extra-26seq-4mod-cheat'
config_id = sys.argv[1] if len(sys.argv) > 1 else 'default' # have to be defined in configs.json
test_set = sys.argv[2] == 'test' if len(sys.argv) > 2 else True
table_cfg = sys.argv[3] if len(sys.argv) > 3 else 'default'
hand_cfg = sys.argv[4] if len(sys.argv) > 4 else 'default'

table_data_path = '/media/viktor/0C22201D22200DF0/hand_gestures/gendata_' + table_cfg + set_text + '.hdf5'
hand_data_path = '/media/viktor/0C22201D22200DF0/hand_gestures/gendata_' + hand_cfg + set_text + '.hdf5'
set_text = '_test' if test_set else '_train'
table_data_path = 'data/gendata_' + table_cfg + set_text + '.hdf5'
hand_data_path = 'data/gendata_' + hand_cfg + set_text + '.hdf5'
table_data_file = tables.open_file(table_data_path, mode='r')
hand_data_file = tables.open_file(hand_data_path, mode='r')

print(table_data_file)
print(hand_data_file)


if ANAL['1']:
# 1)
# avg nonblack pixel pos, delta
110 changes: 36 additions & 74 deletions gen_disentangle_data.py
@@ -17,39 +17,25 @@ def img_to_uint(img):


if __name__ == '__main__':
argv = sys.argv
config_name = argv[1] if len(argv) > 1 else 'table3-nov1-8seq-zind' # table3-nov1-8seq, v1-extra-26seq-4mod-cheat
test_set = argv[2] == 'test' if len(argv) > 2 else True
rand_select = argv[3] == 'rand' if len(argv) > 3 else False

dataset = '/media/viktor/0C22201D22200DF0/hand_gestures/simple_hand.hdf5' # FIXME
if 'ap-' in config_name:
dataset = '/media/viktor/0C22201D22200DF0/hand_gestures/apartment.hdf5'
elif 'table' in config_name:
dataset = '/media/viktor/0C22201D22200DF0/hand_gestures/table3.hdf5'
model_name = CFG_TO_MODEL[config_name]
model_root = '/media/viktor/0C22201D22200DF0/triton/triton_training/training/'
sound_len = CFG_TO_SOUND_LEN[config_name]

config_id = sys.argv[1] if len(sys.argv) > 1 else 'default' # have to be defined in configs.json
dataset = sys.argv[2] if len(sys.argv) > 2 else 'data/simple_hand.hdf5' # path to dataset, default can be downloaded
test_set = sys.argv[3] == 'test' if len(sys.argv) > 3 else True # training by default
rand_select = sys.argv[4] == 'rand' if len(sys.argv) > 4 else True
model_name_postfix = sys.argv[5] if len(sys.argv) > 5 else '' # if having more models with the same config

network_params = load_config(config_id)
network_params['batch_size'] = 1
model_name = find_model(config_id, model_name_postfix)
sound_len = audio_gen.soundscape_len(network_params['audio_gen'], network_params['fs'])
model_root = 'training/'

RIGHT_BTN = ord('d')
LEFT_BTN = ord('a')

# build V2A model
nepoch = 10000
img_h = 120
img_w = 160
num_colors = 1
v1_activation = False # whether to load the matlab txt files or jpegs
crop_img = False
grayscale = True # if true, 2D image is fed, 3D otherwise; set true when feeding 1 layer of CORF3D
only_layer = None
complement = False

network_params = load_config(config_name)
pprint(network_params)

model = Draw(nepoch, img_h, img_w, num_colors, grayscale, network_params,
logging=False, log_every=1000, save_every=2000, training=False) # FIXME
model = Draw(network_params, model_name_postfix, logging=False, training=False)
model.prepare_run_single(model_root + model_name)
print('MODEL IS BUILT')

@@ -60,7 +46,6 @@ def img_to_uint(img):

# test model image by image
batch_round = 0
# data_to_save = []
nseq = network_params['sequence_length']
nsoundstream = network_params['audio_gen']['nsoundstream']
n_v1_write = model.n_v1_write
@@ -74,7 +59,7 @@ def img_to_uint(img):
img_dtype = tables.UInt8Atom()

set_text = '_test' if test_set else '_train'
hdf5_file = tables.open_file('/media/viktor/0C22201D22200DF0/hand_gestures/' + 'gendata_' + config_name + set_text + '.hdf5', mode='w')
hdf5_file = tables.open_file('data/gendata_' + config_id + set_text + '.hdf5', mode='w')
cs_storage = hdf5_file.create_earray(hdf5_file.root, 'cs', img_dtype, shape=[0, nseq, model.img_h, model.img_w])
if test_set:
ss_storage = hdf5_file.create_earray(hdf5_file.root, 'soundscapes', ss_dtype, shape=[0, soundscape_len, 2])
@@ -92,8 +77,7 @@ def img_to_uint(img):
raw_dazim_storage = hdf5_file.create_earray(hdf5_file.root, 'raw_dazim', float_dtype, shape=[0, nseq, nsoundstream, nmodulation])
if v1_gaussian:
angle_storage = hdf5_file.create_earray(hdf5_file.root, 'angle', float_dtype, shape=[0, nseq, n_v1_write])
# with open('gendata_' + config_name + '.pickle', 'wb') as f:
# pickle_f = open('gendata_' + config_name + '.pickle', 'wb')

while True:
# select image
if rand_select:
@@ -104,17 +88,15 @@ def img_to_uint(img):
break # out
indices = np.arange(batch_round * batch_size, (batch_round + 1) * batch_size)
batch = model.get_batch(dataptr, indices=indices)
# batch = np.expand_dims(batch, 0)

# run model
cs, inp_imgs, gen_imgs, soundscapes, ss_tensors, wr_tensors = model.sess.run([model.cs, model.images, model.generated_images,
model.whole_soundscape, model.soundscape_tensors,
model.wr_attn_params],
feed_dict={model.images: batch})
model.whole_soundscape, model.soundscape_tensors,
model.wr_attn_params],
feed_dict={model.images: batch})
cs = 1.0 / (1.0 + np.exp(-np.array(cs))) # nseq x batch x height x width
cs = np.reshape(cs, [model.sequence_length, batch_size, model.img_h, model.img_w])
cs = np.transpose(cs, [1, 0, 2, 3])
# soundscape, gen_img, cs = soundscapes[0], np.reshape(gen_imgs[0], [model.img_h, model.img_w]), cs

# ss_tensors: nseq x [dict] x batch x ...
# we need df, da, dazim in the form of [dict] x batch x nseq x ...
@@ -155,47 +137,27 @@ def img_to_uint(img):
gx[:, i_seq, d] = np.reshape(wr_tensors[i_seq][0], [batch_size])
gy[:, i_seq, d] = np.reshape(wr_tensors[i_seq][1], [batch_size])
delta[:, i_seq, d] = np.reshape(wr_tensors[i_seq][2], [batch_size])
# angle[:, i_seq, d] = np.reshape(wr_tensors[i_seq][3], [batch_size])

# ss_tensors_realigned = {'df': df, 'da': da, 'dazim': dazim}
# wr_tensors_realigned = {'gx': gx, 'gy': gy, 'delta': delta}

# print(ss_tensors_realigned, wr_tensors_realigned)
print('.', end='', flush=True)

# for i in range(batch_size):
if True:
# record = {}
# record['cs'] = cs[i]
# record['gen_img'] = np.reshape(gen_imgs[i], [model.img_h, model.img_w])
# record['soundscape'] = np.int16(soundscapes[i] / np.max(np.abs(soundscapes[i])) * 32767)
# record['ss_tensors'] = {k: v[i] for k, v in ss_tensors_realigned.items()}
# record['wr_tensors'] = {k: v[i] for k, v in wr_tensors_realigned.items()}

cs_storage.append(img_to_uint(cs))
img_storage.append(img_to_uint(np.reshape(gen_imgs, [batch_size, model.img_h, model.img_w])))
inp_img_storage.append(img_to_uint(np.reshape(inp_imgs, [batch_size, model.img_h, model.img_w])))
if test_set:
ss_storage.append(np.int16(soundscapes / np.max(np.abs(soundscapes)) * 32767))
df_storage.append(df)
da_storage.append(da)
dazim_storage.append(dazim)

raw_df_storage.append(raw_df)
raw_da_storage.append(raw_da)
raw_dazim_storage.append(raw_dazim)

gx_storage.append(gx)
gy_storage.append(gy)
delta_storage.append(delta)
if v1_gaussian:
angle_storage.append(angle)

# pickle.dump(record, pickle_f)
# data_to_save.append(record)
cs_storage.append(img_to_uint(cs))
img_storage.append(img_to_uint(np.reshape(gen_imgs, [batch_size, model.img_h, model.img_w])))
inp_img_storage.append(img_to_uint(np.reshape(inp_imgs, [batch_size, model.img_h, model.img_w])))
if test_set:
ss_storage.append(np.int16(soundscapes / np.max(np.abs(soundscapes)) * 32767))
df_storage.append(df)
da_storage.append(da)
dazim_storage.append(dazim)

raw_df_storage.append(raw_df)
raw_da_storage.append(raw_da)
raw_dazim_storage.append(raw_dazim)

gx_storage.append(gx)
gy_storage.append(gy)
delta_storage.append(delta)
if v1_gaussian:
angle_storage.append(angle)

batch_round += 1

# with open('gendata_' + config_name + '.pickle', 'wb') as f:
# pickle.dump(data_to_save, f)
hdf5_file.close()
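
Note: once `gen_disentangle_data.py` has run, the generated file can be inspected roughly like below
(a sketch; the file name assumes the 'default' config and the test set, and only arrays created in the
hunk above are referenced):

```python
import tables

with tables.open_file('data/gendata_default_test.hdf5', mode='r') as f:
    cs = f.root.cs[:]                        # per-step canvases: (N, nseq, img_h, img_w), uint8
    raw_dazim = f.root.raw_dazim[:]          # raw azimuth modulations: (N, nseq, nsoundstream, nmodulation)
    if 'soundscapes' in f.root:              # only written when generating the test set
        soundscapes = f.root.soundscapes[:]  # stereo soundscapes: (N, soundscape_len, 2)
```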
