Skip to content

Commit 82f52b8

Browse files
Chapter 6 runs
1 parent c867687 commit 82f52b8

File tree

3 files changed

+234
-18
lines changed

3 files changed

+234
-18
lines changed

Chapter06/@

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
import numpy as np
2+
import pandas as pd
3+
import tensorflow as tf
4+
import sys
5+
import os
6+
print(tf.__version__)
7+
import fire
8+
from elapsedtimer import ElapsedTimer
9+
10+
class recommender:
    """RBM-based collaborative-filtering recommender for movie ratings.

    In ``train`` mode it fits a Restricted Boltzmann Machine on a 3-D
    ratings tensor of shape ``(users, movies, ranks)`` (one-hot rating per
    movie) and writes a predicted rating for every (user, movie) pair to
    ``<outdir>pred_all_recs.csv``. In any other mode it joins the held-out
    test ratings with those predictions and reports the test RMSE.
    """

    def __init__(self, mode, train_file, outdir, test_file=None,
                 user_info_file=None, movie_info_file=None,
                 batch_size=32, epochs=500,
                 learning_rate=1e-3, num_hidden=50,
                 display_step=5):
        # mode: 'train' fits the RBM; anything else runs inference.
        # train_file: .npy ratings tensor (train) or predictions CSV (inference).
        # outdir: directory/prefix for checkpoints and CSV outputs.
        self.mode = mode
        self.train_file = train_file
        self.outdir = outdir
        self.test_file = test_file
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_hidden = num_hidden
        self.epochs = epochs
        self.display_step = display_step
        self.user_info_file = user_info_file
        self.movie_info_file = movie_info_file

    def read_data(self):
        """Load the training tensor (train mode) or the prediction/test
        data plus optional user/movie metadata (inference mode)."""
        if self.mode == 'train':
            self.train_data = np.load(self.train_file)
            self.num_ranks = self.train_data.shape[2]
            self.num_movies = self.train_data.shape[1]
            self.users = self.train_data.shape[0]
        else:
            self.train_df = pd.read_csv(self.train_file)
            self.test_data = np.load(self.test_file)
            self.test_df = pd.DataFrame(self.test_data,
                                        columns=['userid', 'movieid', 'rating'])

            if self.user_info_file is not None:
                # Fix: the original issued a second read_csv with a bogus
                # `columns=` keyword (read_csv has no such parameter), which
                # raised TypeError and clobbered the parsed frame.
                self.user_info_df = pd.read_csv(self.user_info_file,
                                                sep='|', header=None)
                self.user_info_df.columns = ['userid', 'age', 'gender',
                                             'occupation', 'zipcode']

            if self.movie_info_file is not None:
                self.movie_info_df = pd.read_csv(self.movie_info_file, sep='|',
                                                 encoding='latin-1', header=None)
                # Keep only the id and title columns of the u.item-style file.
                self.movie_info_df = self.movie_info_df[[0, 1]]
                self.movie_info_df.columns = ['movieid', 'movie Title']

    def next_batch(self):
        """Yield random user mini-batches of shape
        (batch_size, num_movies, num_ranks) forever (sampling with replacement)."""
        while True:
            ix = np.random.choice(np.arange(self.train_data.shape[0]),
                                  self.batch_size)
            train_X = self.train_data[ix, :, :]
            yield train_X

    def __network(self):
        """Build the RBM graph: visible/hidden layers, Gibbs sampling ops
        and the contrastive-divergence parameter-update ops."""
        # Visible units: one one-hot rating vector of length num_ranks per movie.
        self.x = tf.placeholder(tf.float32,
                                [None, self.num_movies, self.num_ranks],
                                name="x")
        self.xr = tf.reshape(self.x, [-1, self.num_movies*self.num_ranks],
                             name="xr")
        # NOTE(review): the second positional argument of tf.random_normal is
        # the *mean*, so this draws from N(0.01, 1.0) — confirm stddev=0.01
        # was not intended.
        self.W = tf.Variable(
            tf.random_normal([self.num_movies*self.num_ranks, self.num_hidden],
                             0.01), name="W")
        self.b_h = tf.Variable(tf.zeros([1, self.num_hidden],
                                        tf.float32, name="b_h"))
        self.b_v = tf.Variable(tf.zeros([1, self.num_movies*self.num_ranks],
                                        tf.float32, name="b_v"))
        self.k = 2  # number of Gibbs steps baked into the graph below

        ## Converts the probability into discrete binary states i.e. 0 and 1
        def sample_hidden(probs):
            return tf.floor(probs + tf.random_uniform(tf.shape(probs), 0, 1))

        def sample_visible(logits):
            logits = tf.reshape(logits, [-1, self.num_ranks])
            sampled_logits = tf.multinomial(logits, 1)
            sampled_logits = tf.one_hot(sampled_logits, depth=5)
            # NOTE(review): sampled_logits is computed but never returned —
            # the raw logits are reshaped and passed back instead. Confirm
            # whether the one-hot sample was meant to be the return value.
            logits = tf.reshape(logits, [-1, self.num_movies*self.num_ranks])
            return logits

        ## One Gibbs step: sample hidden given visible, then visible given hidden.
        def gibbs_step(x_k):
            h_k = sample_hidden(tf.sigmoid(tf.matmul(x_k, self.W) + self.b_h))
            x_k = sample_visible(
                tf.add(tf.matmul(h_k, tf.transpose(self.W)), self.b_v))
            return x_k

        ## Run k Gibbs sampling steps starting from an initial point.
        def gibbs_sample(k, x_k):
            for _ in range(k):
                x_k = gibbs_step(x_k)
            # Returns the gibbs sample after k iterations
            return x_k

        # Contrastive Divergence:
        # 1. Through Gibbs sampling locate a new visible state x_s based on
        #    the current visible state x.
        # 2. Based on the new x, sample a new hidden state h_s.
        self.x_s = gibbs_sample(self.k, self.xr)
        self.h_s = sample_hidden(tf.sigmoid(tf.matmul(self.x_s, self.W)
                                            + self.b_h))

        # Sample hidden states given the data visible states.
        self.h = sample_hidden(tf.sigmoid(tf.matmul(self.xr, self.W)
                                          + self.b_h))
        # Sample visible states given hidden states (used for prediction).
        self.x_ = sample_visible(tf.matmul(self.h, tf.transpose(self.W))
                                 + self.b_v)

        # CD parameter updates (positive phase minus negative phase),
        # scaled by learning_rate / batch_size.
        self.W_add = tf.multiply(
            self.learning_rate/self.batch_size,
            tf.subtract(tf.matmul(tf.transpose(self.xr), self.h),
                        tf.matmul(tf.transpose(self.x_s), self.h_s)))
        self.bv_add = tf.multiply(
            self.learning_rate/self.batch_size,
            tf.reduce_sum(tf.subtract(self.xr, self.x_s), 0, True))
        self.bh_add = tf.multiply(
            self.learning_rate/self.batch_size,
            tf.reduce_sum(tf.subtract(self.h, self.h_s), 0, True))
        self.updt = [self.W.assign_add(self.W_add),
                     self.b_v.assign_add(self.bv_add),
                     self.b_h.assign_add(self.bh_add)]

    def _train(self):
        """Train the RBM with contrastive divergence, checkpointing every
        display_step epochs, then write a predicted rating for every
        (user, movie) pair to <outdir>pred_all_recs.csv."""
        self.__network()
        # TensorFlow graph execution
        with tf.Session() as sess:
            self.saver = tf.train.Saver()
            # Initialize the variables of the model.
            init = tf.global_variables_initializer()
            sess.run(init)

            total_batches = self.train_data.shape[0]//self.batch_size
            batch_gen = self.next_batch()
            # Start the training
            for epoch in range(self.epochs):
                # CD-k schedule: raise the number of Gibbs steps over time.
                # NOTE(review): the graph was already built with the initial
                # self.k, so reassigning self.k here cannot change the number
                # of Gibbs steps actually executed — confirm intent.
                if epoch < 150:
                    self.k = 2
                if (epoch > 150) & (epoch < 250):
                    self.k = 3
                if (epoch > 250) & (epoch < 350):
                    self.k = 5
                if (epoch > 350) & (epoch < 500):
                    self.k = 9

                # Loop over all batches, running the CD weight update on each.
                for i in range(total_batches):
                    self.X_train = next(batch_gen)
                    _ = sess.run([self.updt],
                                 feed_dict={self.x: self.X_train})

                # Display progress and checkpoint periodically.
                if epoch % self.display_step == 0:
                    print("Epoch:", '%04d' % (epoch+1))
                    self.saver.save(sess, os.path.join(self.outdir, 'model'),
                                    global_step=epoch)

            # Predict for all users and all items, rated or not.
            self.logits_pred = tf.reshape(
                self.x_, [self.users, self.num_movies, self.num_ranks])
            self.probs = tf.nn.softmax(self.logits_pred, axis=2)
            out = sess.run(self.probs, feed_dict={self.x: self.train_data})
            recs = []
            for i in range(self.users):
                for j in range(self.num_movies):
                    # Ratings are 1-based; argmax over the rank axis is 0-based.
                    recs.append([i, j, np.argmax(out[i, j, :]) + 1])
            recs = np.array(recs)
            df_pred = pd.DataFrame(
                recs, columns=['userid', 'movieid', 'predicted_rating'])
            df_pred.to_csv(self.outdir + 'pred_all_recs.csv', index=False)

        print("RBM training Completed !")

    def inference(self):
        """Join the hold-out test ratings with the predicted ratings, attach
        optional user/movie metadata, write test_results.csv and print the
        test RMSE."""
        self.df_result = self.test_df.merge(self.train_df,
                                            on=['userid', 'movieid'])
        # in order to get the original ids we just need to add 1
        self.df_result['userid'] = self.df_result['userid'] + 1
        self.df_result['movieid'] = self.df_result['movieid'] + 1
        # Fix: the original merged with non-existent attributes
        # (self.df_user_file / self.df_movie_file) and discarded the merge
        # results; merge the frames built in read_data and keep the result.
        if self.user_info_file is not None:
            self.df_result = self.df_result.merge(self.user_info_df,
                                                  on=['userid'])
        if self.movie_info_file is not None:
            self.df_result = self.df_result.merge(self.movie_info_df,
                                                  on=['movieid'])
        # Fix: write the CSV after the id fix-up and metadata merges, not
        # before, so the file reflects the final frame.
        self.df_result.to_csv(self.outdir + 'test_results.csv', index=False)

        print(f'output written to {self.outdir}test_results.csv')
        test_rmse = (np.mean((self.df_result['rating'].values
                              - self.df_result['predicted_rating'].values)**2))**0.5
        print(f'test RMSE : {test_rmse}')

    def main_process(self):
        """Entry point: load the data, then train or run inference by mode."""
        self.read_data()
        if self.mode == 'train':
            self._train()
        else:
            self.inference()
if __name__ == '__main__':
    # Time the end-to-end run and expose the recommender class as a
    # command-line interface via python-fire.
    with ElapsedTimer('process RBM'):
        fire.Fire(recommender)

Chapter06/preprocess_ratings.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,16 @@
55
import numpy as np
66
import pandas as pd
77
import argparse
8-
from sklearn.externals import joblib
98

10-
11-
#path = '/home/santanu/Downloads/RBM Recommender/ml-100k/'
12-
#infile = 'u.data'
13-
#infile_path = path + infile
9+
'''
10+
Ratings file preprocessing script to create training and hold out test datasets
11+
'''
1412

1513
def process_file(infile_path):
1614
infile = pd.read_csv(infile_path,sep='\t',header=None)
1715
infile.columns = ['userId','movieId','rating','timestamp']
1816
users = list(np.unique(infile.userId.values))
1917
movies = list(np.unique(infile.movieId.values))
20-
movies_dict,movies_inverse_dict = {},{}
21-
for i in range(len(movies)):
22-
movies_dict[movies[i]] = i
23-
movies_inverse_dict[i] = movies[i]
2418

2519
test_data = []
2620
ratings_matrix = np.zeros([len(users),len(movies),5])
@@ -29,7 +23,7 @@ def process_file(infile_path):
2923
for i in range(len(infile)):
3024
rec = infile[i:i+1]
3125
user_index = int(rec['userId']-1)
32-
movie_index = movies_dict[int(rec['movieId'])]
26+
movie_index = int(rec['movieId']-1)
3327
rating_index = int(rec['rating']-1)
3428
if np.random.uniform(0,1) < 0.2 :
3529
test_data.append([user_index,movie_index,int(rec['rating'])])
@@ -43,10 +37,6 @@ def process_file(infile_path):
4337

4438
np.save(path + 'train_data',ratings_matrix)
4539
np.save(path + 'test_data',np.array(test_data))
46-
joblib.dump(movies_dict,path + 'movies_dict.pkl')
47-
joblib.dump(movies_inverse_dict,path + 'movies_inverse_dict.pkl')
48-
print(movies_dict)
49-
print(movies_inverse_dict)
5040

5141

5242
if __name__ == '__main__':

Chapter06/rbm.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
class recommender:
1111

1212
def __init__(self,mode,train_file,outdir,test_file=None,
13+
user_info_file=None,movie_info_file=None,
1314
batch_size=32,epochs=500,
1415
learning_rate=1e-3,num_hidden=50,
1516
display_step=5):
@@ -24,6 +25,8 @@ def __init__(self,mode,train_file,outdir,test_file=None,
2425
self.num_hidden = num_hidden
2526
self.epochs = epochs
2627
self.display_step = display_step
28+
self.user_info_file = user_info_file
29+
self.movie_info_file = movie_info_file
2730

2831

2932
def read_data(self):
@@ -38,9 +41,17 @@ def read_data(self):
3841
self.train_df = pd.read_csv(self.train_file)
3942
self.test_data = np.load(self.test_file)
4043
self.test_df = pd.DataFrame(self.test_data,columns=['userid','movieid','rating'])
41-
42-
43-
44+
45+
if self.user_info_file != None:
46+
self.user_info_df = pd.read_csv(self.user_info_file,sep='|',header=None)
47+
self.user_info_df.columns=['userid','age','gender','occupation','zipcode']
48+
49+
if self.movie_info_file != None:
50+
self.movie_info_df = pd.read_csv(self.movie_info_file,sep='|',encoding='latin-1',header=None)
51+
self.movie_info_df = self.movie_info_df[[0,1]]
52+
self.movie_info_df.columns = ['movieid','movie Title']
53+
54+
4455

4556

4657
def next_batch(self):
@@ -147,7 +158,6 @@ def _train(self):
147158
# Display the running step
148159
if epoch % self.display_step == 0:
149160
print("Epoch:", '%04d' % (epoch+1))
150-
print(self.outdir)
151161
self.saver.save(sess,os.path.join(self.outdir,'model'), global_step=epoch)
152162
# Do the prediction for all users all items irrespective of whether they have been rated
153163
self.logits_pred = tf.reshape(self.x_,[self.users,self.num_movies,self.num_ranks])
@@ -167,7 +177,16 @@ def _train(self):
167177
def inference(self):
168178

169179
self.df_result = self.test_df.merge(self.train_df,on=['userid','movieid'])
180+
# in order to get the original ids we just need to add 1
181+
self.df_result['userid'] = self.df_result['userid'] + 1
182+
self.df_result['movieid'] = self.df_result['movieid'] + 1
183+
if self.user_info_file != None:
184+
self.df_result = self.df_result.merge(self.user_info_df,on=['userid'])
185+
if self.movie_info_file != None:
186+
self.df_result = self.df_result.merge(self.movie_info_df,on=['movieid'])
170187
self.df_result.to_csv(self.outdir + 'test_results.csv',index=False)
188+
189+
171190
print(f'output written to {self.outdir}test_results.csv')
172191
test_rmse = (np.mean((self.df_result['rating'].values - self.df_result['predicted_rating'].values)**2))**0.5
173192
print(f'test RMSE : {test_rmse}')

0 commit comments

Comments
 (0)