Commit

config implementation synced with files other than the main model file; readme added
csiki committed Feb 27, 2019
1 parent f8953ea commit 8da364c
Showing 9 changed files with 148 additions and 223 deletions.
58 changes: 48 additions & 10 deletions README.md
@@ -1,10 +1,48 @@
mention where the original model comes from, which github site, which paper, wavegan:
implementing [DRAW](https://arxiv.org/abs/1502.04623)
from [Github repo](https://github.com/kvfrans/draw-color)
uses wavegan

# TODO
- [ ] finish readme
- [ ] create new config handling mechanism with a global config file
- [ ] streamline training and testing
- [ ] migrate aev2a params to config, input dimension can not be read from data
# Autoencoded sensory substitution

Visual-to-auditory (V2A) sensory substitution is the translation of images to sound,
with the aim of aiding the blind. The generated soundscapes should convey visual information,
ideally representing every detail of the given image with as short a sound sequence as possible.
Traditional V2A conversion methods apply an explicitly predefined function that transforms the input
image pixel by pixel into soundscapes, superimposing them in the final step. This repository implements
a novel conversion approach that frames sensory substitution as a compression problem.

Optimal compression is learnt and computed by a recurrent variational autoencoder, called AEV2A.
The autoencoder takes an image as input and translates it into a sequence of soundscapes, before
reconstructing the image iteratively by drawing on a canvas. The neural network implementation
is based on the [DRAW](https://arxiv.org/abs/1502.04623) model; the repository from which the code was
initially cloned can be found [here](https://github.com/kvfrans/draw-color). AEV2A further builds on
[WaveGAN](https://arxiv.org/abs/1802.04208) (repo [here](https://github.com/chrisdonahue/wavegan)).
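
As a rough illustration of this loop, here is a minimal sketch (not the actual aev2a.py code; the
`encode`, `decode` and `write` callables are placeholders for the network components):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def aev2a_sketch(image, encode, decode, write, rng, seq_len=8):
    """One possible shape of the iterative encode-listen-draw loop."""
    canvas = np.zeros_like(image)
    soundscapes = []
    for _ in range(seq_len):
        error = image - sigmoid(canvas)                   # what the canvas still misses
        mu, log_sigma = encode(image, error)              # recurrent encoder (placeholder)
        z = mu + np.exp(log_sigma) * rng.standard_normal(mu.shape)  # reparameterized latent sample
        sound_params, patch = decode(z)                   # latent -> soundscape params + patch to draw
        soundscapes.append(sound_params)
        canvas = canvas + write(patch)                    # accumulate the drawing on the canvas
    return sigmoid(canvas), soundscapes                   # reconstruction and the soundscape sequence
```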

For further details, check this [blog post](TODO) or the thesis [here](TODO).

[TODO ADD GIF HERE]

## Requirements
[TODO PYTHON LIBS, FFMPEG, PYMATLAB]

## How to run
[TODO high lvl: gen dataset/download, train network, test network, run proto, analyze image-to-sound]
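
Until the steps above are fleshed out, here is a hedged sketch of the training flow implied by
`aev2a.py` in this commit (the config id and dataset path are example values, not guaranteed defaults):

```python
from config import load_config
from aev2a import Draw

params = load_config('default')   # the config id has to be defined in configs.json
model = Draw(params, model_name_postfix='', logging=True, training=True)
model.train('data/simple_hand.hdf5',   # example dataset path; see the Datasets section
            restore=True, nepoch=10000, log_every=100, save_every=1000)
```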

## What's what
[TODO very short descr of each folder/file]

## Datasets
[TODO short description, more under /data]

[TODO image of hands dataset]

## Training

### Tensorboard analysis

## Testing

## Running live

## Image-to-sound conversion analysis

## Citation
```
[TODO bibtex]
```
13 changes: 6 additions & 7 deletions aev2a.py
@@ -14,8 +14,7 @@

class Draw:

def __init__(self, nepoch, network_params, model_name_postfix, logging=True, training=True):
self.nepoch = nepoch
def __init__(self, network_params, model_name_postfix, logging=True, training=True):
self.img_h, self.img_w, self.num_colors = network_params['input_dim']
self.grayscale = self.num_colors == 1 # else RGB
self.logging = logging
@@ -593,7 +592,7 @@ def get_batch(self, data, indices=None, batch_size=None, start_stop_index=None):
return np.array(
[get_image(data[i], self.grayscale) for i in indices]).astype(self.npdtype)

def train(self, dataset, restore=True, model_name=None, log_every=100, save_every=1000):
def train(self, dataset, restore=True, model_name=None, nepoch=10000, log_every=100, save_every=1000):
self.model_name = model_name or self.model_name

data = self.get_data(dataset)
@@ -610,7 +609,7 @@ def train(self, dataset, restore=True, model_name=None, log_every=100, save_ever
print('NEW MODEL "{}" IS BEING TRAINED'.format(self.model_name), file=sys.stderr)

start_time = time.time()
for e in range(self.nepoch):
for e in range(nepoch):
nbatch = (data_len // self.batch_size) - 2
for i in range(nbatch):

@@ -643,7 +642,7 @@ def train(self, dataset, restore=True, model_name=None, log_every=100, save_ever

if (e * nbatch + i + 1) % save_every == 0:
saver.save(self.sess, os.path.join(os.getcwd(), 'training', self.model_name, 'train'), global_step=glob_step)
print('MODEL "{}" SAVED at iteration {}'.format(self.model_name, e * self.nepoch + i), file=sys.stderr)
print('MODEL "{}" SAVED at iteration {}'.format(self.model_name, e * nepoch + i), file=sys.stderr)

cs = 1.0/(1.0+np.exp(-np.array(cs))) # x_recons=sigmoid(canvas)

@@ -779,7 +778,7 @@ def view(self, dataset, model_name=None):
params = load_config(config_id)
pprint(params, stream=sys.stderr)

model = Draw(nepoch, params, model_name_postfix=model_name_postfix, logging=logging, training=train_or_test)
model = Draw(params, model_name_postfix=model_name_postfix, logging=logging, training=train_or_test)
print('MODEL IS BUILT', file=sys.stderr)

# save config with the assigned model name updated
@@ -799,7 +798,7 @@ def view(self, dataset, model_name=None):
print('total_parameters', total_parameters, file=sys.stderr)

if train_or_test:
model.train(dataset, restore=True, log_every=log_every, save_every=save_every)
model.train(dataset, restore=True, nepoch=nepoch, log_every=log_every, save_every=save_every)
else:
model.gen_vids(dataset, output_prefix=config_id, training_path='training/')
model.view(dataset)
8 changes: 4 additions & 4 deletions audio_gen.py
@@ -226,10 +226,10 @@ def separate_params(inp_params, nsoundstream, nmodulation, varying_delta, loggin


# needed in color draw model init
def soundscape_len(stream_params, fs):
section_len = int(stream_params['section_len_msec'] / 1000. * fs)
soundstream_len = int(stream_params['nmodulation'] * section_len)
return int(soundstream_len * stream_params['soundscape_len_by_stream_len'])
def soundscape_len(audio_params, fs):
section_len = int(audio_params['section_len_msec'] / 1000. * fs)
soundstream_len = int(audio_params['nmodulation'] * section_len)
return int(soundstream_len * audio_params['soundscape_len_by_stream_len'])


def gen_single_input(f0_in, a0_in, azim0_in, phase_in, df_in, da_in, dazim_in,
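
Note: `soundscape_len` above just chains three multiplications. A quick worked example with assumed
parameter values (purely illustrative, not the repository defaults):

```python
fs = 44100                                   # assumed sampling rate in Hz
audio_params = {'section_len_msec': 10,      # assumed values for illustration only
                'nmodulation': 4,
                'soundscape_len_by_stream_len': 1.5}

section_len = int(audio_params['section_len_msec'] / 1000. * fs)                      # 441 samples
soundstream_len = int(audio_params['nmodulation'] * section_len)                      # 1764 samples
soundscape_len = int(soundstream_len * audio_params['soundscape_len_by_stream_len'])  # 2646 samples
```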
6 changes: 6 additions & 0 deletions config.py
@@ -107,6 +107,12 @@ def load_config(cfg_id):
return params # model names are not returned


def find_model(cfg_id, model_name_postfix):
with open('configs.json', 'rt') as f:
configs = json.load(f)
return [m for m in configs[cfg_id]['models'] if model_name_postfix in m][0] # select first occurrence


def save_config(cfg_id, params, model_name):
with open('configs.json', 'rt') as f:
configs = json.load(f)
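
Note: a hedged usage sketch of the new `find_model` helper, assuming `configs.json` maps each config
id to a `models` list as implied above (it raises `IndexError` if no model was saved for that config yet):

```python
from config import find_model

model_name = find_model('default', '')   # empty postfix: just take the first model of this config
print(model_name)
```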
16 changes: 8 additions & 8 deletions disentangle_anal.py
@@ -99,20 +99,20 @@ def embed_features(sound_features, embedding_name, meta_name, sprite_img_path, i
'5/draw_hand': False, '5/hist': False, '5/lame_plot': False, '6': True, '6/labeling': False,
'6/create_sprite': False, '6/embed': False, '7': False, '8': True}

# load data
test_set = sys.argv[1] == 'test' if len(sys.argv) > 1 else True
set_text = '_test' if test_set else '_train'
table_cfg = 'table3-nov1-8seq' # table3-nov1-8seq, table3-nov1-8seq-zind
hand_cfg = 'v1-extra-26seq-4mod-cheat'
config_id = sys.argv[1] if len(sys.argv) > 1 else 'default' # have to be defined in configs.json
test_set = sys.argv[2] == 'test' if len(sys.argv) > 2 else True
table_cfg = sys.argv[3] if len(sys.argv) > 3 else 'default'
hand_cfg = sys.argv[4] if len(sys.argv) > 4 else 'default'

table_data_path = '/media/viktor/0C22201D22200DF0/hand_gestures/gendata_' + table_cfg + set_text + '.hdf5'
hand_data_path = '/media/viktor/0C22201D22200DF0/hand_gestures/gendata_' + hand_cfg + set_text + '.hdf5'
set_text = '_test' if test_set else '_train'
table_data_path = 'data/gendata_' + table_cfg + set_text + '.hdf5'
hand_data_path = 'data/gendata_' + hand_cfg + set_text + '.hdf5'
table_data_file = tables.open_file(table_data_path, mode='r')
hand_data_file = tables.open_file(hand_data_path, mode='r')

print(table_data_file)
print(hand_data_file)


if ANAL['1']:
# 1)
# avg nonblack pixel pos, delta
110 changes: 36 additions & 74 deletions gen_disentangle_data.py
@@ -17,39 +17,25 @@ def img_to_uint(img):


if __name__ == '__main__':
argv = sys.argv
config_name = argv[1] if len(argv) > 1 else 'table3-nov1-8seq-zind' # table3-nov1-8seq, v1-extra-26seq-4mod-cheat
test_set = argv[2] == 'test' if len(argv) > 2 else True
rand_select = argv[3] == 'rand' if len(argv) > 3 else False

dataset = '/media/viktor/0C22201D22200DF0/hand_gestures/simple_hand.hdf5' # FIXME
if 'ap-' in config_name:
dataset = '/media/viktor/0C22201D22200DF0/hand_gestures/apartment.hdf5'
elif 'table' in config_name:
dataset = '/media/viktor/0C22201D22200DF0/hand_gestures/table3.hdf5'
model_name = CFG_TO_MODEL[config_name]
model_root = '/media/viktor/0C22201D22200DF0/triton/triton_training/training/'
sound_len = CFG_TO_SOUND_LEN[config_name]

config_id = sys.argv[1] if len(sys.argv) > 1 else 'default' # have to be defined in configs.json
dataset = sys.argv[2] if len(sys.argv) > 2 else 'data/simple_hand.hdf5' # path to dataset, default can be downloaded
test_set = sys.argv[3] == 'test' if len(sys.argv) > 3 else True # training by default
rand_select = sys.argv[4] == 'rand' if len(sys.argv) > 4 else True
model_name_postfix = sys.argv[5] if len(sys.argv) > 5 else '' # if having more models with the same config

network_params = load_config(config_id)
network_params['batch_size'] = 1
model_name = find_model(config_id, model_name_postfix)
sound_len = audio_gen.soundscape_len(network_params['audio_gen'], network_params['fs'])
model_root = 'training/'

RIGHT_BTN = ord('d')
LEFT_BTN = ord('a')

# build V2A model
nepoch = 10000
img_h = 120
img_w = 160
num_colors = 1
v1_activation = False # whether to load the matlab txt files or jpegs
crop_img = False
grayscale = True # if true, 2D image is fed, 3D otherwise; set true when feeding 1 layer of CORF3D
only_layer = None
complement = False

network_params = load_config(config_name)
pprint(network_params)

model = Draw(nepoch, img_h, img_w, num_colors, grayscale, network_params,
logging=False, log_every=1000, save_every=2000, training=False) # FIXME
model = Draw(network_params, model_name_postfix, logging=False, training=False)
model.prepare_run_single(model_root + model_name)
print('MODEL IS BUILT')

@@ -60,7 +46,6 @@ def img_to_uint(img):

# test model image by image
batch_round = 0
# data_to_save = []
nseq = network_params['sequence_length']
nsoundstream = network_params['audio_gen']['nsoundstream']
n_v1_write = model.n_v1_write
@@ -74,7 +59,7 @@ def img_to_uint(img):
img_dtype = tables.UInt8Atom()

set_text = '_test' if test_set else '_train'
hdf5_file = tables.open_file('/media/viktor/0C22201D22200DF0/hand_gestures/' + 'gendata_' + config_name + set_text + '.hdf5', mode='w')
hdf5_file = tables.open_file('data/gendata_' + config_id + set_text + '.hdf5', mode='w')
cs_storage = hdf5_file.create_earray(hdf5_file.root, 'cs', img_dtype, shape=[0, nseq, model.img_h, model.img_w])
if test_set:
ss_storage = hdf5_file.create_earray(hdf5_file.root, 'soundscapes', ss_dtype, shape=[0, soundscape_len, 2])
@@ -92,8 +77,7 @@ def img_to_uint(img):
raw_dazim_storage = hdf5_file.create_earray(hdf5_file.root, 'raw_dazim', float_dtype, shape=[0, nseq, nsoundstream, nmodulation])
if v1_gaussian:
angle_storage = hdf5_file.create_earray(hdf5_file.root, 'angle', float_dtype, shape=[0, nseq, n_v1_write])
# with open('gendata_' + config_name + '.pickle', 'wb') as f:
# pickle_f = open('gendata_' + config_name + '.pickle', 'wb')

while True:
# select image
if rand_select:
@@ -104,17 +88,15 @@ def img_to_uint(img):
break # out
indices = np.arange(batch_round * batch_size, (batch_round + 1) * batch_size)
batch = model.get_batch(dataptr, indices=indices)
# batch = np.expand_dims(batch, 0)

# run model
cs, inp_imgs, gen_imgs, soundscapes, ss_tensors, wr_tensors = model.sess.run([model.cs, model.images, model.generated_images,
model.whole_soundscape, model.soundscape_tensors,
model.wr_attn_params],
feed_dict={model.images: batch})
model.whole_soundscape, model.soundscape_tensors,
model.wr_attn_params],
feed_dict={model.images: batch})
cs = 1.0 / (1.0 + np.exp(-np.array(cs))) # nseq x batch x height x width
cs = np.reshape(cs, [model.sequence_length, batch_size, model.img_h, model.img_w])
cs = np.transpose(cs, [1, 0, 2, 3])
# soundscape, gen_img, cs = soundscapes[0], np.reshape(gen_imgs[0], [model.img_h, model.img_w]), cs

# ss_tensors: nseq x [dict] x batch x ...
# we need df, da, dazim in the form of [dict] x batch x nseq x ...
@@ -155,47 +137,27 @@ def img_to_uint(img):
gx[:, i_seq, d] = np.reshape(wr_tensors[i_seq][0], [batch_size])
gy[:, i_seq, d] = np.reshape(wr_tensors[i_seq][1], [batch_size])
delta[:, i_seq, d] = np.reshape(wr_tensors[i_seq][2], [batch_size])
# angle[:, i_seq, d] = np.reshape(wr_tensors[i_seq][3], [batch_size])

# ss_tensors_realigned = {'df': df, 'da': da, 'dazim': dazim}
# wr_tensors_realigned = {'gx': gx, 'gy': gy, 'delta': delta}

# print(ss_tensors_realigned, wr_tensors_realigned)
print('.', end='', flush=True)

# for i in range(batch_size):
if True:
# record = {}
# record['cs'] = cs[i]
# record['gen_img'] = np.reshape(gen_imgs[i], [model.img_h, model.img_w])
# record['soundscape'] = np.int16(soundscapes[i] / np.max(np.abs(soundscapes[i])) * 32767)
# record['ss_tensors'] = {k: v[i] for k, v in ss_tensors_realigned.items()}
# record['wr_tensors'] = {k: v[i] for k, v in wr_tensors_realigned.items()}

cs_storage.append(img_to_uint(cs))
img_storage.append(img_to_uint(np.reshape(gen_imgs, [batch_size, model.img_h, model.img_w])))
inp_img_storage.append(img_to_uint(np.reshape(inp_imgs, [batch_size, model.img_h, model.img_w])))
if test_set:
ss_storage.append(np.int16(soundscapes / np.max(np.abs(soundscapes)) * 32767))
df_storage.append(df)
da_storage.append(da)
dazim_storage.append(dazim)

raw_df_storage.append(raw_df)
raw_da_storage.append(raw_da)
raw_dazim_storage.append(raw_dazim)

gx_storage.append(gx)
gy_storage.append(gy)
delta_storage.append(delta)
if v1_gaussian:
angle_storage.append(angle)

# pickle.dump(record, pickle_f)
# data_to_save.append(record)
cs_storage.append(img_to_uint(cs))
img_storage.append(img_to_uint(np.reshape(gen_imgs, [batch_size, model.img_h, model.img_w])))
inp_img_storage.append(img_to_uint(np.reshape(inp_imgs, [batch_size, model.img_h, model.img_w])))
if test_set:
ss_storage.append(np.int16(soundscapes / np.max(np.abs(soundscapes)) * 32767))
df_storage.append(df)
da_storage.append(da)
dazim_storage.append(dazim)

raw_df_storage.append(raw_df)
raw_da_storage.append(raw_da)
raw_dazim_storage.append(raw_dazim)

gx_storage.append(gx)
gy_storage.append(gy)
delta_storage.append(delta)
if v1_gaussian:
angle_storage.append(angle)

batch_round += 1

# with open('gendata_' + config_name + '.pickle', 'wb') as f:
# pickle.dump(data_to_save, f)
hdf5_file.close()
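
Note: once `gen_disentangle_data.py` has run, the generated file can be inspected roughly like below
(a sketch; the file name assumes the 'default' config and the test set, and only arrays created in the
hunk above are referenced):

```python
import tables

with tables.open_file('data/gendata_default_test.hdf5', mode='r') as f:
    cs = f.root.cs[:]                        # per-step canvases: (N, nseq, img_h, img_w), uint8
    raw_dazim = f.root.raw_dazim[:]          # raw azimuth modulations: (N, nseq, nsoundstream, nmodulation)
    if 'soundscapes' in f.root:              # only written when generating the test set
        soundscapes = f.root.soundscapes[:]  # stereo soundscapes: (N, soundscape_len, 2)
```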
