杨文勇 2021.01.28
1. Completed the MLP test on the MovieLens dataset, producing recommendation results and coverage rates; the 复深蓝 data has not been tested yet.
2. Moved collaborative filtering (ItemCF) and MLP into the comparison folder.
yang100123 committed Jan 28, 2021
1 parent 679d76f commit 9d0d941
Showing 2 changed files with 266 additions and 10 deletions.
79 changes: 69 additions & 10 deletions bigraph/ItemCF.py → comparison/ItemCF.py
@@ -18,12 +18,15 @@ def getdata(flag = True):
names=['user_id', 'item_id', 'rating', 'timestamp'])
user_data = pd.read_csv('../dgl/ml-100k/u.user', sep='|', header=None, encoding='latin1')
item_data = pd.read_csv('../dgl/ml-100k/u.item', sep='|', header=None, encoding='latin1')
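        # ml-100k layout: u.user is user_id|age|gender|occupation|zip_code;
        # u.item is item_id|title|dates|IMDb URL plus 19 one-hot genre flags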
test_data = test_data[test_data['user_id'].isin(train_data['user_id']) &
test_data['item_id'].isin(train_data['item_id'])]

train_data = train_data.values.tolist()
test_data = test_data.values.tolist()
user_data = user_data.values.tolist()
item_data = item_data.values.tolist()
        # print item_data
else:
dbHandle = DatabaseIo()
if not dbHandle:
@@ -64,7 +67,7 @@ def createDict(data):
else:
user_dict[recode[0]] = [(recode[1], recode[2])]
if recode[1] in item_dict:
-            item_dict[recode[2]].append(recode[0])
            item_dict[recode[1]].append(recode[0])
else:
item_dict[recode[1]] = [recode[0]]
return user_dict, item_dict
@@ -113,15 +116,15 @@ def ItemSimilarity(user_dict):

return W

-def makeRecommend(user_dict, K):
def makeRecommend(train_user_dict, test_user_dict, K):
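    # Item-item similarities are learned from the training split; top-K
    # recommendations are then produced for each user in the test split.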
print("make recommend")
rank = {}
result = []
-    W = ItemSimilarity(user_dict)
-    for user_id in user_dict.keys():
-        for i, score in user_dict[user_id]:
    W = ItemSimilarity(train_user_dict)
    for user_id in test_user_dict.keys():
        for i, score in test_user_dict[user_id]:
for j, wj in sorted(W[i].items(), key = lambda x:x[1], reverse = True)[0 : K]:
-                if j in user_dict[user_id]:
                # compare against item IDs, not the (item, rating) tuples
                if any(j == item for item, _ in test_user_dict[user_id]):
continue
if j not in rank.keys():
rank[j] = 0
@@ -136,10 +139,66 @@ def makeRecommend(user_dict, K):
result.append(temp_list)
return result

-train_data, test_data, user_data, item_data = getdata(False)
-user_dict, item_dict = createDict(train_data)
-result = makeRecommend(user_dict, 10)
-print result
def makeClassifyDict(item_data):
print("make classify dict")
    # Example: {"course_id": [category1, category2, category3]}
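    # columns 5-23 of the item table are its 19 one-hot genre flags,
    # which is why getCoverage divides the category count by 19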
classifydict = {}
item = pd.DataFrame(item_data)
item = item.values.tolist()
for row in item:
if row[0] not in classifydict.keys():
classifydict[row[0]] = []
for i in range(5, 24):
if row[i] == 1:
classifydict[row[0]].append(i)
else:
continue
return classifydict

def getCoverage(result, item_data):
print("get coverage rate")
    # dict recording which categories appear among the recommendations
    classify_num_dict = {}
    # dict mapping each item to its list of categories
classify = makeClassifyDict(item_data)
recommend_result = pd.DataFrame(result)
    # number of distinct items in the recommendation list
recommend_length = len(recommend_result[1].value_counts())
    # value_counts() is indexed by item ID, so take the IDs from its index
    # rather than the counts
    item_id = recommend_result[1].value_counts().index.tolist()
    for row in item_id:
        for c in classify[row]:
if isinstance(c, int):
if c not in classify_num_dict.keys():
classify_num_dict[c] = 1
else:
continue
else:
for i in c:
if i not in classify_num_dict.keys():
classify_num_dict[i] = 1
else:
continue
# print classify_num_dict
item_length = len(item_data)
# print recommend_length
# print item_length
cov = (recommend_length * 1.0) / (item_length * 1.0)
classify_cov = (len(classify_num_dict) * 1.0) / 19.0
    # list coverage = distinct recommended items / all items
print("the rate of coverage: ")
print(cov)
    # category coverage = recommended categories / 19 genres
print("the rate of classify coverage: ")
print(classify_cov)



train_data, test_data, user_data, item_data = getdata(True)
train_user_dict, train_item_dict = createDict(train_data)
test_user_dict, test_item_dict = createDict(test_data)
result = makeRecommend(train_user_dict, test_user_dict, 20)
# print result
getCoverage(result, item_data)
recommend_result = pd.DataFrame(result)
recommend_result.to_csv("../file_saved/itemCF.csv", header = False, index = False)

197 changes: 197 additions & 0 deletions comparison/MLP.py
@@ -0,0 +1,197 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Multiply, Concatenate
# from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# import tensorflow.keras.backend as K
import pandas as pd
# import numpy as np
from globalConst import DataBaseQuery, DataBaseOperateType, SetType
from utils.databaseIo import DatabaseIo
from utils.extends import formatDataByType
import random


def loadData(flag = True):
if flag == True:
all_data = pd.read_csv('../dgl/ml-100k/u.data', sep='\t', header=None,
names=['user_id', 'item_id', 'rating', 'timestamp'])
# test_data = pd.read_csv('../dgl/ml-100k/ua.test', sep='\t', header=None,
# names=['user_id', 'item_id', 'rating', 'timestamp'])
user_data = pd.read_csv('../dgl/ml-100k/u.user', sep='|', header=None, encoding='latin1')
item_data = pd.read_csv('../dgl/ml-100k/u.item', sep='|', header=None, encoding='latin1')
# test_data = test_data[test_data['user_id'].isin(train_data['user_id']) &
# test_data['item_id'].isin(train_data['item_id'])]
# u_data = user_data[[0,1,2,3,4]]
# u_data.columns = ['user_id','age','gender','occupation','zip_code']
# i_data = item_data
# i_data.columns = ['item_id','title','release_date','video_release_date','IMDb_URL','unknown','Action','Adventure','Animation','Children',
# 'Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller',
# 'War','Western']
else:
dbHandle = DatabaseIo()
if not dbHandle:
return None
sql_dr = DataBaseQuery["course_dr"]
sql_course = DataBaseQuery["course_info"]
sql_user = DataBaseQuery["user_id"]
result_dr = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
sql=sql_dr)
result_course = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
sql=sql_course)
dbHandle.changeCloseFlag()
result_user = dbHandle.doSql(execType=DataBaseOperateType.SearchMany,
sql=sql_user)
drList = formatDataByType(SetType.SetType_List, result_dr)
all_data = drList
user_data = formatDataByType(SetType.SetType_Set, result_user)
item_data = formatDataByType(SetType.SetType_List, result_course)

# return all_data,user_data,item_data,u_data,i_data
return all_data,user_data,item_data

def MLP(all_data, u_data, i_data,epoch,batch_size):
# all_data, user_data, item_data, u_data, i_data = loadData()
# df_items = item_data[['item_id', 'title']]
# print('Movies:')
# print(df_items.head())

# print('Ratings')
df_ratings = all_data
# print(df_ratings.head())
df_ratings = df_ratings[['user_id', 'item_id', 'rating']]

num_users = len(df_ratings.user_id.unique())
num_items = len(df_ratings.item_id.unique())
print('There are {} unique users and {} unique movies in this data set'.format(num_users, num_items))

# print(df_ratings.head())

user_maxId = df_ratings.user_id.max()
item_maxId = df_ratings.item_id.max()
print('There are {} distinct users and the max of user ID is also {}'.format(num_users, user_maxId))
print('There are {} distinct movies, however, the max of movie ID is {}'.format(num_items, item_maxId))
print('In the context of matrix factorization, the current item vector is in unnecessarily high dimensional space')
print('So we need to do some data cleaning to reduce the dimension of item vector back to {}'.format(num_items))

users = {}
k = 0
    for user in tqdm(df_ratings.user_id.unique()):
users[user] = k
k += 1

movies = {}
k = 0
    for movie in tqdm(df_ratings.item_id.unique()):
movies[movie] = k
k += 1
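    # users/movies map raw IDs to contiguous 0..n-1 indices that address
    # embedding rows directly; the same mappings must be reused at
    # prediction time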

df_ratings_f = df_ratings.copy()
df_ratings_f['user_id'] = df_ratings['user_id'].map(users)
df_ratings_f['item_id'] = df_ratings['item_id'].map(movies)

    # random_state has no effect when shuffle=False
    train, test = train_test_split(df_ratings_f, test_size=0.2, shuffle=False)

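    # NCF-style MLP: embed user and item IDs as 32-d vectors, concatenate
    # them, and regress the rating through three ReLU layers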
user = Input(shape=(1,))
item = Input(shape=(1,))

embed_user = Embedding(input_dim=num_users + 1, output_dim=32, embeddings_initializer='uniform',
name='user_embedding', input_length=1)(user)
embed_item = Embedding(input_dim=num_items + 1, output_dim=32, embeddings_initializer='uniform',
name='item_embedding', input_length=1)(item)

user2 = Flatten()(embed_user)
item2 = Flatten()(embed_item)

combine = Concatenate(axis=-1)([user2, item2])

layer1 = Dense(32, activation='relu', kernel_initializer='glorot_uniform')(combine)
layer2 = Dense(32, activation='relu', kernel_initializer='glorot_uniform')(layer1)
layer3 = Dense(32, activation='relu', kernel_initializer='glorot_uniform')(layer2)

out = Dense(1)(layer3)

model = Model([user, item], out)
model.compile(loss="mean_squared_error", optimizer="adam")
# model.summary()

model.fit([train.user_id.values, train.item_id.values], train.rating.values, epochs=epoch, batch_size=batch_size, verbose=2)

    # pred = model.predict([test.user_id.values, test.item_id.values])
    # Score every (user, item) pair; raw IDs are mapped through the same
    # index dictionaries used in training so the embedding rows looked up
    # here match the trained ones
    u_pre = u_data[0].values.tolist()
    i_pre = i_data[0].values.tolist()
    pre = pd.DataFrame([(u, i) for u in u_pre for i in i_pre],
                       columns=['user_id', 'item_id'])
    pred = model.predict([pre.user_id.map(users).values,
                          pre.item_id.map(movies).values])

pre['rating'] = pred
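    # rank each user's candidate items by predicted rating, highest first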
result = pre.groupby('user_id').apply(lambda x: x.sort_values(by="rating", ascending=False)).reset_index(drop=True)

print(result)
    result.to_csv('../file_saved/MLPresult.csv', index=False)
return result

def makeClassifyDict(item_data):
print("make classify dict")
    # Example: {"course_id": [category1, category2, category3]}
    classifydict = {}
    item = item_data.values.tolist()
for row in item:
if row[0] not in classifydict.keys():
classifydict[row[0]] = []
for i in range(5, 24):
if row[i] == 1:
classifydict[row[0]].append(i)
else:
continue
return classifydict

def recommend(item_data,topK):
result = pd.read_csv('../file_saved/MLPresult.csv')
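    # rows were saved sorted by predicted rating within each user, so
    # head(topK) keeps each user's topK best items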
result_topK = result.groupby(['user_id']).head(topK).reset_index(drop=True)

    # list coverage = distinct recommended items / all items
recommend_length = len(result_topK['item_id'].value_counts())
item_length = len(item_data)
print(recommend_length/item_length)

    # category coverage = recommended categories / 19 genres
    # dict recording which categories appear among the recommendations
classify_num_dict = {}
classify = makeClassifyDict(item_data)
    # take the item IDs from the value_counts index, not the counts
    item_id = result_topK['item_id'].value_counts().index.tolist()
for row in item_id:
for c in classify[row]:
if isinstance(c, int):
if c not in classify_num_dict.keys():
classify_num_dict[c] = 1
else:
continue
else:
for i in c:
if i not in classify_num_dict.keys():
classify_num_dict[i] = 1
else:
continue
classify_cov = (len(classify_num_dict) * 1.0) / 19.0
print(classify_cov)



all_data, user_data, item_data = loadData()
MLP(all_data, user_data, item_data, epoch=10, batch_size=32)
recommend(item_data, topK=10)
