
Commit 5c5c36a
update
jasonwei20 committed Feb 3, 2019
1 parent 609dcf1 commit 5c5c36a
Showing 7 changed files with 71 additions and 31 deletions.
Binary file modified code/__pycache__/c_config.cpython-36.pyc (binary file not shown)
Binary file modified code/__pycache__/methods.cpython-36.pyc (binary file not shown)
2 changes: 1 addition & 1 deletion code/c_2_train_eval.py
@@ -17,7 +17,7 @@ def run_cnn(train_file, test_file, num_classes, percent_dataset):
     test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)
 
     #implement early stopping
-    callbacks = [EarlyStopping(monitor='val_loss', patience=4)]
+    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
 
     #train model
     model.fit( train_x,
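For reference, the tightened patience means Keras now halts training after three consecutive epochs without a val_loss improvement instead of four. A minimal runnable sketch of the behavior, using a toy model and random data rather than the repo's actual run_cnn CNN:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# toy stand-in for the CNN that run_cnn builds
model = Sequential([Dense(16, activation='relu', input_shape=(300,)),
                    Dense(2, activation='softmax')])
model.compile(optimizer='adam', loss='categorical_crossentropy')

x = np.random.rand(100, 300)
y = np.eye(2)[np.random.randint(0, 2, 100)]

# stop once val_loss fails to improve for 3 consecutive epochs (was 4)
callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
model.fit(x, y, validation_split=0.1, epochs=100, callbacks=callbacks)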
2 changes: 1 addition & 1 deletion code/c_config.py
@@ -1,7 +1,7 @@
 #user inputs
 
 #size folders
-sizes = ['1_tiny', '2_small', '3_standard', '4_full']
+sizes = ['3_standard']#, '4_full']#['1_tiny', '2_small', '3_standard', '4_full']
 size_folders = ['size_data_f3/' + size for size in sizes]
 
 #dataset folder
13 changes: 3 additions & 10 deletions code/d_0_preprocess.py
@@ -12,24 +12,17 @@ def generate_short(input_file, output_file, alpha):
 
 #global params
 huge_word2vec = 'word2vec/glove.840B.300d.txt'
-datasets = ['pc', 'trec']
+datasets = ['pc']#, 'trec']
 
 for dataset in datasets:
 
     dataset_folder = 'special_f4/' + dataset
     test_orig = 'special_f4/' + dataset + '/test.txt'
     test_short = 'special_f4/' + dataset + '/test_short.txt'
-    word2vec_pickle = dataset_folder + '/word2vec.p'
     test_aug_short = dataset_folder + '/test_short_aug.txt'
 
-    #generate short version of test set
-    if dataset == 'pc':
-        generate_short(test_orig, test_short, 100)
-    elif dataset == 'trec':
-        generate_short(test_orig, test_short, 250)
+    word2vec_pickle = dataset_folder + '/word2vec.p'
 
     #augment the data
-    gen_standard_aug(test_short, test_aug_short)
+    gen_tsne_aug(test_short, test_aug_short)
 
     #generate the vocab dictionaries
     gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
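The deleted branch called generate_short with a per-dataset cutoff (100 for pc, 250 for trec). Its implementation is not part of this diff; purely as a hypothetical sketch, assuming alpha is the number of test lines to keep:

# hypothetical sketch only: the repo's actual generate_short is not shown
# in this diff; assumes `alpha` is the number of examples to keep
def generate_short(input_file, output_file, alpha):
    lines = open(input_file, 'r').readlines()
    with open(output_file, 'w') as writer:
        writer.writelines(lines[:alpha])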
67 changes: 48 additions & 19 deletions code/d_2_tsne.py
@@ -9,16 +9,53 @@
 #### get dense layer output ####
 ################################
 
+#getting the x and y inputs in numpy array form from the text file
+def train_x(train_txt, word2vec_len, input_size, word2vec):
+
+    #read in lines
+    train_lines = open(train_txt, 'r').readlines()
+    num_lines = len(train_lines)
+
+    x_matrix = np.zeros((num_lines, input_size, word2vec_len))
+
+    #insert values
+    for i, line in enumerate(train_lines):
+
+        parts = line[:-1].split('\t')
+        label = int(parts[0])
+        sentence = parts[1]
+
+        #insert x
+        words = sentence.split(' ')
+        words = words[:x_matrix.shape[1]] #cut off if too long
+        for j, word in enumerate(words):
+            if word in word2vec:
+                x_matrix[i, j, :] = word2vec[word]
+
+    return x_matrix
+
 def get_dense_output(model_checkpoint, file, num_classes):
 
-    x, y = get_x_y(file, num_classes, word2vec_len, input_size, word2vec, 1)
+    x = train_x(file, word2vec_len, input_size, word2vec)
 
     model = load_model(model_checkpoint)
 
     get_3rd_layer_output = K.function([model.layers[0].input], [model.layers[4].output])
     layer_output = get_3rd_layer_output([x])[0]
 
-    return layer_output, np.argmax(y, axis=1)
+    return layer_output
 
+def get_tsne_labels(file):
+    labels = []
+    alphas = []
+    lines = open(file, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        _class = int(parts[0])
+        alpha = i % 10
+        labels.append(_class)
+        alphas.append(alpha)
+    return labels, alphas
+
 def get_plot_vectors(layer_output):
@@ -91,35 +128,27 @@ def plot_tsne(tsne, labels, output_path):
 
     #load parameters
     model_checkpoint = 'outputs_f4/' + dataset + '.h5'
-    file = 'special_f4/' + dataset + '/test_short_aug_shuffle.txt'
+    file = 'special_f4/' + dataset + '/test_short_aug.txt'
     num_classes = num_classes_list[i]
     word2vec_pickle = 'special_f4/' + dataset + '/word2vec.p'
     word2vec = load_pickle(word2vec_pickle)
 
     #do tsne
-    layer_output, labels = get_dense_output(model_checkpoint, file, num_classes)
+    layer_output = get_dense_output(model_checkpoint, file, num_classes)
     print(layer_output.shape)
     t = get_plot_vectors(layer_output)
 
-    #edit labels:
-    for i in range(len(labels)):
-
-        #mark original, unaugmented data
-        if i % 10 == 9:
-            labels[i] += 100
+    labels, alphas = get_tsne_labels(file)
 
+    print(labels, alphas)
 
     output_path = 'outputs_f4/' + dataset + '_tsne.png'
-    plot_tsne(t, labels, output_path)
+    writer = open("outputs_f4/new_tsne.txt", 'w')
 
-    label_names = labels.tolist()
-    label_writers = {}
-    for label_name in label_names:
-        label_writers[label_name] = open('outputs_f4/' + str(label_name) + '.txt', 'w')
+    label_to_mark = {0:'x', 1:'o'}
 
     for i, label in enumerate(labels):
-        line = str(t[i, 0]) + ' ' + str(t[i, 1])
-        print(line)
-        label_writers[label].write(line + '\n')
+        alpha = alphas[i]
+        line = str(t[i, 0]) + ' ' + str(t[i, 1]) + ' ' + str(label_to_mark[label]) + ' ' + str(alpha/10)
+        writer.write(line + '\n')


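The extraction pattern above (a K.function from the model input to the dense layer at index 4, followed by a 2-D projection for plotting) can be exercised end to end with a toy model. A self-contained sketch, assuming get_plot_vectors wraps something like sklearn's TSNE, which this diff does not show:

import numpy as np
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from sklearn.manifold import TSNE

# toy stand-in for the loaded checkpoint; index 4 mirrors the
# model.layers[4] used in get_dense_output
model = Sequential([Dense(64, activation='relu', input_shape=(30,)),
                    Dense(64, activation='relu'),
                    Dense(64, activation='relu'),
                    Dense(64, activation='relu'),
                    Dense(20, activation='relu'),
                    Dense(2, activation='softmax')])

x = np.random.rand(50, 30)

# same extraction pattern as get_dense_output
get_layer_output = K.function([model.layers[0].input], [model.layers[4].output])
dense_vectors = get_layer_output([x])[0]  # shape (50, 20)

# 2-D embedding for plotting; perplexity must stay below the sample count
t = TSNE(n_components=2, perplexity=10).fit_transform(dense_vectors)
print(t.shape)  # (50, 2)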
18 changes: 18 additions & 0 deletions code/methods.py
@@ -154,6 +154,24 @@ def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_
 ###################################################
 ############### data augmentation #################
 
+def gen_tsne_aug(train_orig, output_file):
+
+    writer = open(output_file, 'w')
+    lines = open(train_orig, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        label = parts[0]
+        sentence = parts[1]
+        writer.write(line)
+        for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
+            aug_sentence = eda_4(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=2)[0]
+            writer.write(label + "\t" + aug_sentence + '\n')
+    writer.close()
+    print("finished eda for tsne for", train_orig, "to", output_file)
+
+
+
 #generate more data with standard augmentation
 def gen_standard_aug(train_orig, output_file, num_aug=9):
     writer = open(output_file, 'w')
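gen_tsne_aug writes each original line followed by nine augmented copies at alpha 0.1 through 0.9, so every source sentence occupies a block of ten lines, which is exactly what `alpha = i % 10` in d_2_tsne.py's get_tsne_labels relies on. A sketch of that layout with eda_4 stubbed out (the repo's real eda_4 lives in the EDA augmentation code and returns actual augmented variants):

# eda_4 stub for illustration only
def eda_4(sentence, alpha_sr, alpha_ri, alpha_rs, p_rd, num_aug):
    return [sentence + ' <aug alpha=%.1f>' % alpha_sr] * num_aug

line = '1\tthis movie was great\n'
label, sentence = line[:-1].split('\t')

block = [line]  # position i % 10 == 0: the original sentence
for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    aug = eda_4(sentence, alpha_sr=alpha, alpha_ri=alpha,
                alpha_rs=alpha, p_rd=alpha, num_aug=2)[0]
    block.append(label + '\t' + aug + '\n')  # positions i % 10 == 1..9

print(''.join(block))  # ten lines per original; alpha recoverable as i % 10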
