Skip to content

Commit c867687

Browse files
chapter 6
1 parent 6d3ccb7 commit c867687

File tree

3 files changed

+20285
-104
lines changed

3 files changed

+20285
-104
lines changed

Chapter06/preprocess_ratings.py

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,29 @@
44

55
import numpy as np
66
import pandas as pd
7+
import argparse
8+
from sklearn.externals import joblib
79

8-
path = '/home/santanu/Downloads/RBM Recommender/ml-100k/'
9-
infile = 'u.data'
1010

11-
infile_path = path + infile
11+
#path = '/home/santanu/Downloads/RBM Recommender/ml-100k/'
12+
#infile = 'u.data'
13+
#infile_path = path + infile
1214

1315
def process_file(infile_path):
1416
infile = pd.read_csv(infile_path,sep='\t',header=None)
1517
infile.columns = ['userId','movieId','rating','timestamp']
1618
users = list(np.unique(infile.userId.values))
1719
movies = list(np.unique(infile.movieId.values))
1820
movies_dict,movies_inverse_dict = {},{}
19-
for i in xrange(len(movies)):
21+
for i in range(len(movies)):
2022
movies_dict[movies[i]] = i
2123
movies_inverse_dict[i] = movies[i]
2224

2325
test_data = []
2426
ratings_matrix = np.zeros([len(users),len(movies),5])
2527
count = 0
2628
total_count = len(infile)
27-
for i in xrange(len(infile)):
29+
for i in range(len(infile)):
2830
rec = infile[i:i+1]
2931
user_index = int(rec['userId']-1)
3032
movie_index = movies_dict[int(rec['movieId'])]
@@ -37,15 +39,23 @@ def process_file(infile_path):
3739

3840
count +=1
3941
if (count % 100000 == 0) & (count>= 100000):
40-
print 'Processed ' + str(count) + ' records out of ' + str(total_count)
42+
print('Processed ' + str(count) + ' records out of ' + str(total_count))
4143

4244
np.save(path + 'train_data',ratings_matrix)
4345
np.save(path + 'test_data',np.array(test_data))
44-
np.save(path + 'movies_dict',movies_dict)
45-
np.save(path + 'movies_inverse_dict',movies_inverse_dict)
46+
joblib.dump(movies_dict,path + 'movies_dict.pkl')
47+
joblib.dump(movies_inverse_dict,path + 'movies_inverse_dict.pkl')
48+
print(movies_dict)
49+
print(movies_inverse_dict)
4650

4751

4852
if __name__ == '__main__':
    # Command-line entry point: read the data directory and the ratings file
    # name from the command line and run the preprocessing on that file.
    import os

    parser = argparse.ArgumentParser()
    # Both options are mandatory: without them the concatenation below would
    # fail with an unhelpful "NoneType + NoneType" TypeError.
    parser.add_argument('--path', required=True, help='input data path')
    parser.add_argument('--infile', required=True, help='input file name')
    args = parser.parse_args()

    # process_file() reads the module-level `path` and builds its output names
    # as `path + <name>`, so make sure the directory ends with a separator.
    # os.path.join(p, '') appends one only when it is missing.
    path = os.path.join(args.path, '')
    infile = args.infile
    process_file(path + infile)
5060

5161

Chapter06/rbm.py

Lines changed: 72 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -3,45 +3,50 @@
33
import tensorflow as tf
44
import sys
55
import os
6-
print tf.__version__
6+
print(tf.__version__)
7+
import fire
8+
from elapsedtimer import ElapsedTimer
79

810
class recommender:
911

10-
def __init__(self,infile):
11-
12-
self.train_file = '/home/santanu/Downloads/RBM Recommender/ml-100k/train_data.npy'
13-
self.data = np.load(infile)
12+
def __init__(self, mode, train_file, outdir, test_file=None,
             batch_size=32, epochs=500,
             learning_rate=1e-3, num_hidden=50,
             display_step=5):
    """Store the run configuration on the instance; nothing is loaded here.

    Args:
        mode: 'train' selects the training path; any other value selects
            the inference path (see main_process).
        train_file: file loaded with np.load in train mode and read with
            pd.read_csv otherwise (see read_data).
        outdir: location used for checkpoints and result CSV files.
        test_file: file loaded with np.load when mode is not 'train'.
        batch_size: number of rows drawn per batch in next_batch.
        epochs: number of passes of the training loop.
        learning_rate: stored as-is; presumably consumed by the graph
            construction code, which is not visible here.
        num_hidden: stored as-is; presumably the hidden layer size, to be
            confirmed against the graph construction code.
        display_step: epoch interval at which training prints and saves.
    """
    # Run selection and file locations.
    self.mode, self.train_file = mode, train_file
    self.outdir, self.test_file = outdir, test_file

    # Training hyper-parameters.
    self.batch_size, self.epochs = batch_size, epochs
    self.learning_rate, self.num_hidden = learning_rate, num_hidden
    self.display_step = display_step
1427

15-
if sys.argv[1] == 'train':
16-
self.train_file = infile
17-
self.data = np.load(infile)
18-
else:
19-
#elf.test_file = infile
20-
self.data = np.load(infile)
21-
self.user_index = list(self.data[:,0])
22-
self.movie_index = list(self.data[:,1])
23-
self.rating_index = list(self.data[:,2])
28+
29+
def read_data(self):
    """Load the inputs required by the configured mode.

    In 'train' mode, ``train_file`` is loaded with ``np.load`` and the sizes
    of its three axes are recorded as ``users``, ``num_movies`` and
    ``num_ranks`` (axis 0, 1 and 2 respectively).

    In any other mode, ``train_file`` is read as a CSV into ``train_df`` and
    ``test_file`` is loaded with ``np.load`` into ``test_data``, which is
    also wrapped in ``test_df`` with columns userid, movieid and rating.

    Raises:
        ValueError: if mode is not 'train' and no ``test_file`` was given.
    """
    if self.mode == 'train':
        self.train_data = np.load(self.train_file)
        self.num_ranks = self.train_data.shape[2]
        self.num_movies = self.train_data.shape[1]
        self.users = self.train_data.shape[0]
    else:
        # test_file defaults to None in __init__; fail with a clear message
        # instead of the opaque TypeError np.load(None) would raise.
        if self.test_file is None:
            raise ValueError(
                "test_file is required when mode is not 'train'")

        # inference() merges this frame with the test frame on
        # userid/movieid and reads its 'predicted_rating' column, so here
        # train_file is expected to be a predictions CSV with those columns.
        self.train_df = pd.read_csv(self.train_file)
        self.test_data = np.load(self.test_file)
        self.test_df = pd.DataFrame(
            self.test_data, columns=['userid', 'movieid', 'rating'])
41+
42+
2743

2844

29-
#self.data = np.load(infile)
30-
self.ranks = 5
31-
self.batch_size = 32
32-
self.epochs = 500
33-
self.learning_rate = 1e-4
34-
self.users = self.train_data.shape[0]
35-
self.num_hidden = 500
36-
self.num_movies = self.train_data.shape[1]
37-
self.num_ranks = 5
38-
self.display_step = 1
39-
self.path_save = sys.argv[3]
4045

4146
def next_batch(self):
    """Yield random training batches forever.

    Each batch holds ``batch_size`` rows of ``train_data`` picked along
    axis 0 with ``np.random.choice`` (default settings, so a row may appear
    more than once in a batch). The generator never terminates; the caller
    decides how many batches to draw.
    """
    while True:
        # Passing the row count is equivalent to passing np.arange(count).
        picked_rows = np.random.choice(self.train_data.shape[0], self.batch_size)
        yield self.train_data[picked_rows, :, :]
4651

4752

@@ -64,7 +69,7 @@ def sample_visible(logits):
6469
sampled_logits = tf.multinomial(logits,1)
6570
sampled_logits = tf.one_hot(sampled_logits,depth = 5)
6671
logits = tf.reshape(logits,[-1,self.num_movies*self.num_ranks])
67-
print logits
72+
print(logits)
6873
return logits
6974

7075

@@ -110,27 +115,27 @@ def _train(self):
110115
# TensorFlow graph execution
111116

112117
with tf.Session() as sess:
113-
saver = tf.train.Saver(max_to_keep=100,write_version=1)
118+
self.saver = tf.train.Saver()
114119
#saver = tf.train.Saver(write_version=tf.train.SaverDef.V2)
115120
# Initialize the variables of the Model
116121
init = tf.global_variables_initializer()
117122
sess.run(init)
118123

119-
total_batches = self.data.shape[0]//self.batch_size
124+
total_batches = self.train_data.shape[0]//self.batch_size
120125
batch_gen = self.next_batch()
121126
# Start the training
122127
for epoch in range(self.epochs):
123128
if epoch < 150:
124-
k = 2
129+
self.k = 2
125130

126131
if (epoch > 150) & (epoch < 250):
127-
k = 3
132+
self.k = 3
128133

129134
if (epoch > 250) & (epoch < 350):
130-
k = 5
135+
self.k = 5
131136

132137
if (epoch > 350) & (epoch < 500):
133-
k = 9
138+
self.k = 9
134139

135140
# Loop over all batches
136141
for i in range(total_batches):
@@ -142,69 +147,40 @@ def _train(self):
142147
# Display the running step
143148
if epoch % self.display_step == 0:
144149
print("Epoch:", '%04d' % (epoch+1))
145-
saver.save(sess, os.path.join(self.path_save,'model'), global_step=epoch)
150+
print(self.outdir)
151+
self.saver.save(sess,os.path.join(self.outdir,'model'), global_step=epoch)
152+
# Do the prediction for all users all items irrespective of whether they have been rated
153+
self.logits_pred = tf.reshape(self.x_,[self.users,self.num_movies,self.num_ranks])
154+
self.probs = tf.nn.softmax(self.logits_pred,axis=2)
155+
out = sess.run(self.probs,feed_dict={self.x:self.train_data})
156+
recs = []
157+
for i in range(self.users):
158+
for j in range(self.num_movies):
159+
rec = [i,j,np.argmax(out[i,j,:]) +1]
160+
recs.append(rec)
161+
recs = np.array(recs)
162+
df_pred = pd.DataFrame(recs,columns=['userid','movieid','predicted_rating'])
163+
df_pred.to_csv(self.outdir + 'pred_all_recs.csv',index=False)
146164

147-
print("RBM training Completed !")
148-
149-
165+
print("RBM training Completed !")
150166

151-
def _inference(self):
152-
153-
self.model_path = sys.argv[3]
154-
155-
#self.test_data = self.data
156-
self.__network()
157-
sess = tf.Session()
158-
159-
saver = tf.train.Saver(tf.all_variables(), reshape=True)
160-
saver.restore(sess,self.model_path)
161-
x_ = tf.matmul(self.h,tf.transpose(self.W)) + self.b_v
162-
#print x_
163-
logits = tf.reshape(x_,[-1,self.num_ranks])
164-
# print logits
165-
logits = tf.argmax(logits,axis=-1)
166-
# print logits
167-
logits = tf.reshape(logits,[-1,self.num_movies])
168-
out = sess.run(logits,feed_dict={self.x:self.test_data})
169-
ratings_pred = []
170-
i = 0
171-
for x in self.movie_index:
172-
pred = out[i,x] + 1
173-
ratings_pred.append(pred)
174-
i+=1
175-
176-
ratings_pred = np.array(ratings_pred)
177-
ratings_pred = np.reshape(ratings_pred,(-1,1))
178-
print ratings_pred.shape
179-
print self.data.shape
180-
out = np.hstack((self.data,ratings_pred))
181-
out = pd.DataFrame(out)
182-
print out
183-
out.columns=['User','Movie','Actual Rating','Predicted Rating']
184-
return out
185-
167+
def inference(self):
    """Join the test ratings with the predictions and report the test RMSE.

    ``test_df`` is inner-merged with ``train_df`` on (userid, movieid); the
    merged frame is kept on ``self.df_result`` and written to
    ``test_results.csv`` inside ``outdir``. The RMSE between the 'rating'
    and 'predicted_rating' columns of the merged frame is printed.

    When no test row matches a prediction the (empty) result file is still
    written, but the RMSE is reported as not computed instead of NaN.
    """
    self.df_result = self.test_df.merge(self.train_df, on=['userid', 'movieid'])

    # Build the path with os.path.join (as _train does for the checkpoint)
    # so an outdir given without a trailing separator still writes inside
    # the directory rather than to "<outdir>test_results.csv".
    result_path = os.path.join(self.outdir, 'test_results.csv')
    self.df_result.to_csv(result_path, index=False)
    print(f'output written to {result_path}')

    if self.df_result.empty:
        # The mean of an empty array is NaN (with a RuntimeWarning); say so
        # explicitly rather than printing a meaningless score.
        print('test RMSE : not computed (no test rows matched a prediction)')
        return

    errors = (self.df_result['rating'].values
              - self.df_result['predicted_rating'].values)
    test_rmse = (np.mean(errors ** 2)) ** 0.5
    print(f'test RMSE : {test_rmse}')
206174

207175

208-
176+
def main_process(self):
    """Load the data for the configured mode, then run that mode's step.

    Calls read_data() first. Afterwards _train() runs when mode is 'train';
    for every other mode value inference() runs.
    """
    self.read_data()
    run_step = self._train if self.mode == 'train' else self.inference
    run_step()
183+
184+
if __name__ == '__main__':
    # Hand the recommender class to fire so its constructor arguments and
    # methods are driven from the command line. The whole call runs inside
    # an ElapsedTimer context (presumably reporting the elapsed time; the
    # elapsedtimer module is not visible here).
    rbm_timer = ElapsedTimer('process RBM')
    with rbm_timer:
        fire.Fire(recommender)

0 commit comments

Comments
 (0)