|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +import tensorflow as tf |
| 4 | +import sys |
| 5 | +import os |
| 6 | +print(tf.__version__) |
| 7 | +import fire |
| 8 | +from elapsedtimer import ElapsedTimer |
| 9 | + |
class recommender:
    """RBM-based collaborative-filtering recommender (TensorFlow 1.x graph API).

    In 'train' mode the model fits a Restricted Boltzmann Machine with
    contrastive divergence over a 3-D one-hot rating tensor and writes a
    predicted rating for every (user, movie) pair to pred_all_recs.csv.
    In any other mode it joins those predictions with held-out test
    ratings, optionally enriches them with user/movie metadata, and
    reports the test RMSE.
    """

    def __init__(self, mode, train_file, outdir, test_file=None,
                 user_info_file=None, movie_info_file=None,
                 batch_size=32, epochs=500,
                 learning_rate=1e-3, num_hidden=50,
                 display_step=5):
        """Store the run configuration; no I/O happens here.

        Args:
            mode: 'train' to fit the RBM; anything else runs inference.
            train_file: .npy rating tensor in 'train' mode, otherwise the
                predictions CSV produced by a previous training run.
            outdir: directory for checkpoints and CSV outputs.
            test_file: .npy array of (userid, movieid, rating) rows;
                required only for inference.
            user_info_file: optional '|'-separated user metadata file.
            movie_info_file: optional '|'-separated movie metadata file.
            batch_size: number of users per training mini-batch.
            epochs: number of training epochs.
            learning_rate: step size for the CD parameter updates.
            num_hidden: number of RBM hidden units.
            display_step: checkpoint/log every this many epochs.
        """
        self.mode = mode
        self.train_file = train_file
        self.outdir = outdir
        self.test_file = test_file
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_hidden = num_hidden
        self.epochs = epochs
        self.display_step = display_step
        self.user_info_file = user_info_file
        self.movie_info_file = movie_info_file

    def read_data(self):
        """Load the training tensor or the test/prediction tables.

        Side effects: in 'train' mode sets train_data, num_ranks,
        num_movies and users; otherwise sets train_df, test_data and
        test_df. Optionally loads user_info_df / movie_info_df.
        """
        if self.mode == 'train':
            # Expected layout: (users, movies, ranks) one-hot ratings —
            # inferred from the shape[] reads below; TODO confirm.
            self.train_data = np.load(self.train_file)
            self.num_ranks = self.train_data.shape[2]
            self.num_movies = self.train_data.shape[1]
            self.users = self.train_data.shape[0]
        else:
            # In inference mode train_file is the predictions CSV
            # written by _train().
            self.train_df = pd.read_csv(self.train_file)
            self.test_data = np.load(self.test_file)
            self.test_df = pd.DataFrame(
                self.test_data, columns=['userid', 'movieid', 'rating'])

        if self.user_info_file is not None:
            # BUG FIX: the original re-read this file a second time with
            # an invalid `columns=` keyword to pd.read_csv, which raises
            # TypeError at runtime. Read once, then assign column names.
            self.user_info_df = pd.read_csv(self.user_info_file,
                                            sep='|', header=None)
            self.user_info_df.columns = ['userid', 'age', 'gender',
                                         'occupation', 'zipcode']

        if self.movie_info_file is not None:
            self.movie_info_df = pd.read_csv(self.movie_info_file,
                                             sep='|', encoding='latin-1',
                                             header=None)
            # Keep only the id and title columns.
            self.movie_info_df = self.movie_info_df[[0, 1]]
            self.movie_info_df.columns = ['movieid', 'movie Title']

    def next_batch(self):
        """Yield random user mini-batches of shape (batch_size, movies, ranks) forever."""
        while True:
            ix = np.random.choice(np.arange(self.train_data.shape[0]),
                                  self.batch_size)
            yield self.train_data[ix, :, :]

    def __network(self):
        """Build the RBM graph and the contrastive-divergence update ops."""
        self.x = tf.placeholder(
            tf.float32, [None, self.num_movies, self.num_ranks], name="x")
        # Flatten each user's (movies, ranks) matrix into one visible vector.
        self.xr = tf.reshape(
            self.x, [-1, self.num_movies * self.num_ranks], name="xr")
        # BUG FIX: the original passed 0.01 positionally, which sets the
        # *mean* of tf.random_normal to 0.01 and leaves stddev at 1.0;
        # a zero-mean, small-stddev init was clearly intended.
        self.W = tf.Variable(
            tf.random_normal([self.num_movies * self.num_ranks,
                              self.num_hidden], stddev=0.01),
            name="W")
        self.b_h = tf.Variable(
            tf.zeros([1, self.num_hidden], tf.float32, name="b_h"))
        self.b_v = tf.Variable(
            tf.zeros([1, self.num_movies * self.num_ranks], tf.float32,
                     name="b_v"))
        self.k = 2  # Gibbs steps per CD update at graph-build time.

        def sample_hidden(probs):
            # Bernoulli-sample binary states from activation probabilities.
            return tf.floor(probs + tf.random_uniform(tf.shape(probs), 0, 1))

        def sample_visible(logits):
            # NOTE(review): the multinomial/one-hot sample computed here is
            # discarded and the (reshaped) input logits are returned
            # instead. Downstream code (self.x_ -> softmax -> argmax)
            # depends on receiving logits, so this behavior is kept as-is.
            logits = tf.reshape(logits, [-1, self.num_ranks])
            sampled_logits = tf.multinomial(logits, 1)
            sampled_logits = tf.one_hot(sampled_logits, depth=5)
            logits = tf.reshape(logits,
                                [-1, self.num_movies * self.num_ranks])
            return logits

        def gibbs_step(x_k):
            # One visible -> hidden -> visible alternation.
            h_k = sample_hidden(
                tf.sigmoid(tf.matmul(x_k, self.W) + self.b_h))
            x_k = sample_visible(
                tf.add(tf.matmul(h_k, tf.transpose(self.W)), self.b_v))
            return x_k

        def gibbs_sample(k, x_k):
            # Run k Gibbs steps starting from x_k.
            for _ in range(k):
                x_k = gibbs_step(x_k)
            return x_k

        # Contrastive divergence:
        # 1. Gibbs-sample a model visible state x_s starting from the data.
        # 2. Sample the corresponding hidden state h_s.
        self.x_s = gibbs_sample(self.k, self.xr)
        self.h_s = sample_hidden(
            tf.sigmoid(tf.matmul(self.x_s, self.W) + self.b_h))
        # Hidden states driven by the data, and the model reconstruction.
        self.h = sample_hidden(
            tf.sigmoid(tf.matmul(self.xr, self.W) + self.b_h))
        self.x_ = sample_visible(
            tf.matmul(self.h, tf.transpose(self.W)) + self.b_v)

        # CD updates: positive phase (data) minus negative phase (model
        # sample), scaled by learning_rate / batch_size.
        lr = self.learning_rate / self.batch_size
        self.W_add = tf.multiply(
            lr,
            tf.subtract(tf.matmul(tf.transpose(self.xr), self.h),
                        tf.matmul(tf.transpose(self.x_s), self.h_s)))
        self.bv_add = tf.multiply(
            lr, tf.reduce_sum(tf.subtract(self.xr, self.x_s), 0, True))
        self.bh_add = tf.multiply(
            lr, tf.reduce_sum(tf.subtract(self.h, self.h_s), 0, True))
        self.updt = [self.W.assign_add(self.W_add),
                     self.b_v.assign_add(self.bv_add),
                     self.b_h.assign_add(self.bh_add)]

    def _train(self):
        """Fit the RBM and write per-(user, movie) predicted ratings.

        Checkpoints to outdir every display_step epochs, then writes
        pred_all_recs.csv with the argmax rating for every pair.
        """
        self.__network()
        with tf.Session() as sess:
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            sess.run(init)

            total_batches = self.train_data.shape[0] // self.batch_size
            batch_gen = self.next_batch()
            for epoch in range(self.epochs):
                # NOTE(review): the graph was already built with k=2, so
                # these reassignments do not change the number of Gibbs
                # steps actually executed — confirm original intent.
                # Boundary epochs (150, 250, 350) keep the previous k,
                # exactly as in the original condition chain.
                if epoch < 150:
                    self.k = 2
                elif 150 < epoch < 250:
                    self.k = 3
                elif 250 < epoch < 350:
                    self.k = 5
                elif 350 < epoch < 500:
                    self.k = 9

                # Run one CD weight update per mini-batch.
                for _ in range(total_batches):
                    self.X_train = next(batch_gen)
                    sess.run([self.updt], feed_dict={self.x: self.X_train})

                if epoch % self.display_step == 0:
                    print("Epoch:", '%04d' % (epoch + 1))
                    print(self.outdir)
                    self.saver.save(sess,
                                    os.path.join(self.outdir, 'model'),
                                    global_step=epoch)

            # Predict a rating for every user/movie pair, rated or not.
            self.logits_pred = tf.reshape(
                self.x_, [self.users, self.num_movies, self.num_ranks])
            self.probs = tf.nn.softmax(self.logits_pred, axis=2)
            out = sess.run(self.probs, feed_dict={self.x: self.train_data})
            # Ratings are 1-based, hence the +1 on the argmax index.
            recs = [[i, j, np.argmax(out[i, j, :]) + 1]
                    for i in range(self.users)
                    for j in range(self.num_movies)]
            df_pred = pd.DataFrame(
                np.array(recs),
                columns=['userid', 'movieid', 'predicted_rating'])
            # BUG FIX: build the path with os.path.join (as the checkpoint
            # path above already does) instead of raw concatenation, which
            # breaks when outdir lacks a trailing separator.
            df_pred.to_csv(os.path.join(self.outdir, 'pred_all_recs.csv'),
                           index=False)

        print("RBM training Completed !")

    def inference(self):
        """Join predictions with test ratings, enrich with metadata, report RMSE."""
        self.df_result = self.test_df.merge(self.train_df,
                                            on=['userid', 'movieid'])
        # Ids in the arrays are 0-based; add 1 to recover the original ids
        # BEFORE writing results or joining the metadata tables.
        # (BUG FIX: the original wrote the CSV first, so it contained the
        # 0-based ids and none of the metadata.)
        self.df_result['userid'] = self.df_result['userid'] + 1
        self.df_result['movieid'] = self.df_result['movieid'] + 1
        # BUG FIX: the original merged against non-existent attributes
        # (df_user_file / df_movie_file) and discarded both merge results;
        # the frames loaded by read_data() are user_info_df / movie_info_df.
        if self.user_info_file is not None:
            self.df_result = self.df_result.merge(self.user_info_df,
                                                  on=['userid'])
        if self.movie_info_file is not None:
            self.df_result = self.df_result.merge(self.movie_info_df,
                                                  on=['movieid'])
        out_path = os.path.join(self.outdir, 'test_results.csv')
        self.df_result.to_csv(out_path, index=False)
        print(f'output written to {out_path}')
        test_rmse = (np.mean(
            (self.df_result['rating'].values
             - self.df_result['predicted_rating'].values) ** 2)) ** 0.5
        print(f'test RMSE : {test_rmse}')

    def main_process(self):
        """Load data, then dispatch to training or inference based on mode."""
        self.read_data()
        if self.mode == 'train':
            self._train()
        else:
            self.inference()
| 204 | + |
# Command-line entry point: python-fire exposes the recommender class's
# constructor arguments as CLI flags and its methods (e.g. main_process)
# as subcommands; ElapsedTimer prints the wall-clock time of the run.
if __name__ == '__main__':
    with ElapsedTimer('process RBM'):
        fire.Fire(recommender)
0 commit comments