
Commit 5c5c36a
update
jasonwei20 committed Feb 3, 2019
1 parent 609dcf1 commit 5c5c36a
Showing 7 changed files with 71 additions and 31 deletions.
Binary file modified code/__pycache__/c_config.cpython-36.pyc (binary file not shown)
Binary file modified code/__pycache__/methods.cpython-36.pyc (binary file not shown)
2 changes: 1 addition & 1 deletion code/c_2_train_eval.py
@@ -17,7 +17,7 @@ def run_cnn(train_file, test_file, num_classes, percent_dataset):
     test_x, test_y = get_x_y(test_file, num_classes, word2vec_len, input_size, word2vec, 1)
 
     #implement early stopping
-    callbacks = [EarlyStopping(monitor='val_loss', patience=4)]
+    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
 
     #train model
     model.fit( train_x,
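For reference, the tightened patience means Keras now halts training after three consecutive epochs without a val_loss improvement instead of four. A minimal runnable sketch of the behavior, using a toy model and random data rather than the repo's actual run_cnn CNN:

import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

# toy stand-in for the CNN that run_cnn builds
model = Sequential([Dense(16, activation='relu', input_shape=(300,)),
                    Dense(2, activation='softmax')])
model.compile(optimizer='adam', loss='categorical_crossentropy')

x = np.random.rand(100, 300)
y = np.eye(2)[np.random.randint(0, 2, 100)]

# stop once val_loss fails to improve for 3 consecutive epochs (was 4)
callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
model.fit(x, y, validation_split=0.1, epochs=100, callbacks=callbacks)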
2 changes: 1 addition & 1 deletion code/c_config.py
@@ -1,7 +1,7 @@
 #user inputs
 
 #size folders
-sizes = ['1_tiny', '2_small', '3_standard', '4_full']
+sizes = ['3_standard']#, '4_full']#['1_tiny', '2_small', '3_standard', '4_full']
 size_folders = ['size_data_f3/' + size for size in sizes]
 
 #dataset folder
13 changes: 3 additions & 10 deletions code/d_0_preprocess.py
@@ -12,24 +12,17 @@ def generate_short(input_file, output_file, alpha):
 
 #global params
 huge_word2vec = 'word2vec/glove.840B.300d.txt'
-datasets = ['pc', 'trec']
+datasets = ['pc']#, 'trec']
 
 for dataset in datasets:
 
     dataset_folder = 'special_f4/' + dataset
     test_orig = 'special_f4/' + dataset + '/test.txt'
     test_short = 'special_f4/' + dataset + '/test_short.txt'
-    word2vec_pickle = dataset_folder + '/word2vec.p'
     test_aug_short = dataset_folder + '/test_short_aug.txt'
 
-    #generate short version of test set
-    if dataset == 'pc':
-        generate_short(test_orig, test_short, 100)
-    elif dataset == 'trec':
-        generate_short(test_orig, test_short, 250)
+    word2vec_pickle = dataset_folder + '/word2vec.p'
 
     #augment the data
-    gen_standard_aug(test_short, test_aug_short)
+    gen_tsne_aug(test_short, test_aug_short)
 
     #generate the vocab dictionaries
     gen_vocab_dicts(dataset_folder, word2vec_pickle, huge_word2vec)
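The deleted branch called generate_short with a per-dataset cutoff (100 for pc, 250 for trec). Its implementation is not part of this diff; purely as a hypothetical sketch, assuming alpha is the number of test lines to keep:

# hypothetical sketch only: the repo's actual generate_short is not shown
# in this diff; assumes `alpha` is the number of examples to keep
def generate_short(input_file, output_file, alpha):
    lines = open(input_file, 'r').readlines()
    with open(output_file, 'w') as writer:
        writer.writelines(lines[:alpha])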
67 changes: 48 additions & 19 deletions code/d_2_tsne.py
@@ -9,16 +9,53 @@
 #### get dense layer output ####
 ################################
 
+#getting the x and y inputs in numpy array form from the text file
+def train_x(train_txt, word2vec_len, input_size, word2vec):
+
+    #read in lines
+    train_lines = open(train_txt, 'r').readlines()
+    num_lines = len(train_lines)
+
+    x_matrix = np.zeros((num_lines, input_size, word2vec_len))
+
+    #insert values
+    for i, line in enumerate(train_lines):
+
+        parts = line[:-1].split('\t')
+        label = int(parts[0])
+        sentence = parts[1]
+
+        #insert x
+        words = sentence.split(' ')
+        words = words[:x_matrix.shape[1]] #cut off if too long
+        for j, word in enumerate(words):
+            if word in word2vec:
+                x_matrix[i, j, :] = word2vec[word]
+
+    return x_matrix
+
 def get_dense_output(model_checkpoint, file, num_classes):
 
-    x, y = get_x_y(file, num_classes, word2vec_len, input_size, word2vec, 1)
+    x = train_x(file, word2vec_len, input_size, word2vec)
 
     model = load_model(model_checkpoint)
 
     get_3rd_layer_output = K.function([model.layers[0].input], [model.layers[4].output])
     layer_output = get_3rd_layer_output([x])[0]
 
-    return layer_output, np.argmax(y, axis=1)
+    return layer_output
 
+def get_tsne_labels(file):
+    labels = []
+    alphas = []
+    lines = open(file, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        _class = int(parts[0])
+        alpha = i % 10
+        labels.append(_class)
+        alphas.append(alpha)
+    return labels, alphas
+
 def get_plot_vectors(layer_output):
@@ -91,35 +128,27 @@ def plot_tsne(tsne, labels, output_path):
 
     #load parameters
     model_checkpoint = 'outputs_f4/' + dataset + '.h5'
-    file = 'special_f4/' + dataset + '/test_short_aug_shuffle.txt'
+    file = 'special_f4/' + dataset + '/test_short_aug.txt'
     num_classes = num_classes_list[i]
     word2vec_pickle = 'special_f4/' + dataset + '/word2vec.p'
     word2vec = load_pickle(word2vec_pickle)
 
     #do tsne
-    layer_output, labels = get_dense_output(model_checkpoint, file, num_classes)
+    layer_output = get_dense_output(model_checkpoint, file, num_classes)
     print(layer_output.shape)
     t = get_plot_vectors(layer_output)
 
-    #edit labels:
-    for i in range(len(labels)):
-
-        #mark original, unaugmented data
-        if i % 10 == 9:
-            labels[i] += 100
+    labels, alphas = get_tsne_labels(file)
 
+    print(labels, alphas)
 
     output_path = 'outputs_f4/' + dataset + '_tsne.png'
-    plot_tsne(t, labels, output_path)
+    writer = open("outputs_f4/new_tsne.txt", 'w')
 
-    label_names = labels.tolist()
-    label_writers = {}
-    for label_name in label_names:
-        label_writers[label_name] = open('outputs_f4/' + str(label_name) + '.txt', 'w')
+    label_to_mark = {0:'x', 1:'o'}
 
     for i, label in enumerate(labels):
-        line = str(t[i, 0]) + ' ' + str(t[i, 1])
-        print(line)
-        label_writers[label].write(line + '\n')
+        alpha = alphas[i]
+        line = str(t[i, 0]) + ' ' + str(t[i, 1]) + ' ' + str(label_to_mark[label]) + ' ' + str(alpha/10)
+        writer.write(line + '\n')


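The extraction pattern above (a K.function from the model input to the dense layer at index 4, followed by a 2-D projection for plotting) can be exercised end to end with a toy model. A self-contained sketch, assuming get_plot_vectors wraps something like sklearn's TSNE, which this diff does not show:

import numpy as np
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense
from sklearn.manifold import TSNE

# toy stand-in for the loaded checkpoint; index 4 mirrors the
# model.layers[4] used in get_dense_output
model = Sequential([Dense(64, activation='relu', input_shape=(30,)),
                    Dense(64, activation='relu'),
                    Dense(64, activation='relu'),
                    Dense(64, activation='relu'),
                    Dense(20, activation='relu'),
                    Dense(2, activation='softmax')])

x = np.random.rand(50, 30)

# same extraction pattern as get_dense_output
get_layer_output = K.function([model.layers[0].input], [model.layers[4].output])
dense_vectors = get_layer_output([x])[0]  # shape (50, 20)

# 2-D embedding for plotting; perplexity must stay below the sample count
t = TSNE(n_components=2, perplexity=10).fit_transform(dense_vectors)
print(t.shape)  # (50, 2)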
18 changes: 18 additions & 0 deletions code/methods.py
@@ -154,6 +154,24 @@ def get_x_y(train_txt, num_classes, word2vec_len, input_size, word2vec, percent_
 ###################################################
 ############### data augmentation #################
 
+def gen_tsne_aug(train_orig, output_file):
+
+    writer = open(output_file, 'w')
+    lines = open(train_orig, 'r').readlines()
+    for i, line in enumerate(lines):
+        parts = line[:-1].split('\t')
+        label = parts[0]
+        sentence = parts[1]
+        writer.write(line)
+        for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
+            aug_sentence = eda_4(sentence, alpha_sr=alpha, alpha_ri=alpha, alpha_rs=alpha, p_rd=alpha, num_aug=2)[0]
+            writer.write(label + "\t" + aug_sentence + '\n')
+    writer.close()
+    print("finished eda for tsne for", train_orig, "to", output_file)
+
+
+
 #generate more data with standard augmentation
 def gen_standard_aug(train_orig, output_file, num_aug=9):
     writer = open(output_file, 'w')
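gen_tsne_aug writes each original line followed by nine augmented copies at alpha 0.1 through 0.9, so every source sentence occupies a block of ten lines, which is exactly what `alpha = i % 10` in d_2_tsne.py's get_tsne_labels relies on. A sketch of that layout with eda_4 stubbed out (the repo's real eda_4 lives in the EDA augmentation code and returns actual augmented variants):

# eda_4 stub for illustration only
def eda_4(sentence, alpha_sr, alpha_ri, alpha_rs, p_rd, num_aug):
    return [sentence + ' <aug alpha=%.1f>' % alpha_sr] * num_aug

line = '1\tthis movie was great\n'
label, sentence = line[:-1].split('\t')

block = [line]  # position i % 10 == 0: the original sentence
for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    aug = eda_4(sentence, alpha_sr=alpha, alpha_ri=alpha,
                alpha_rs=alpha, p_rd=alpha, num_aug=2)[0]
    block.append(label + '\t' + aug + '\n')  # positions i % 10 == 1..9

print(''.join(block))  # ten lines per original; alpha recoverable as i % 10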
