Commit ed4c69c

sdm

littlemesie committed Nov 17, 2020
1 parent 5284662 commit ed4c69c

Showing 4 changed files with 213 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -106,6 +106,7 @@ venv.bak/
runs/
.DS_Store
.csv
*.h5
data/
graphs/
model/
Empty file added src/sdm/__init__.py
Empty file.
85 changes: 85 additions & 0 deletions src/sdm/movie_process.py
@@ -0,0 +1,85 @@
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

def load_data():
"""加载数据"""
base_path = os.path.dirname(os.path.abspath(__file__)) + "/../../data/"

    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
    user = pd.read_csv(base_path + 'ml-1m/users.dat', sep='::', header=None,
                       names=unames, engine='python')

    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_csv(base_path + 'ml-1m/ratings.dat', sep='::', header=None,
                          names=rnames, engine='python')

    mnames = ['movie_id', 'title', 'genres']
    # the multi-character '::' separator needs the python engine;
    # movies.dat contains non-UTF-8 characters, hence latin-1
    movies = pd.read_csv(base_path + 'ml-1m/movies.dat', sep='::', header=None,
                         names=mnames, engine='python', encoding='latin-1')

data = pd.merge(pd.merge(ratings, movies), user) # .iloc[:10000]

return data

def gen_data_set_sdm(data, seq_short_len=5, seq_prefer_len=50):
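    """Split each user's chronological history into SDM train/test samples.

    Each sample is a 10-tuple:
      (user_id, short_seq, prefer_seq, target_item, label,
       short_len, prefer_len, target_rating, short_genres, prefer_genres)
    short_seq holds the most recent `seq_short_len` items (newest first) and
    prefer_seq the older long-term items; the final interaction of every user
    is held out as the test sample.
    """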

data.sort_values("timestamp", inplace=True)
train_set = []
test_set = []
    for reviewerID, hist in tqdm(data.groupby('user_id')):
        pos_list = hist['movie_id'].tolist()
        genres_list = hist['genres'].tolist()
        rating_list = hist['rating'].tolist()
        for i in range(1, len(pos_list)):
            # items seen before position i, most recent first
            item_hist = pos_list[:i][::-1]
            genres_hist = genres_list[:i][::-1]
            if i <= seq_short_len and i != len(pos_list) - 1:
                # history still fits entirely in the short-term session
                train_set.append((reviewerID, item_hist, [0] * seq_prefer_len, pos_list[i], 1,
                                  len(item_hist), 0, rating_list[i], genres_hist, [0] * seq_prefer_len))
            elif i != len(pos_list) - 1:
                # split: newest seq_short_len items are short-term, the rest long-term
                train_set.append((reviewerID, item_hist[:seq_short_len], item_hist[seq_short_len:],
                                  pos_list[i], 1, seq_short_len, len(item_hist) - seq_short_len,
                                  rating_list[i], genres_hist[:seq_short_len], genres_hist[seq_short_len:]))
            elif i <= seq_short_len and i == len(pos_list) - 1:
                # each user's final interaction becomes a test sample
                test_set.append((reviewerID, item_hist, [0] * seq_prefer_len, pos_list[i], 1,
                                 len(item_hist), 0, rating_list[i], genres_hist, [0] * seq_prefer_len))
            else:
                test_set.append((reviewerID, item_hist[:seq_short_len], item_hist[seq_short_len:],
                                 pos_list[i], 1, seq_short_len, len(item_hist) - seq_short_len,
                                 rating_list[i], genres_hist[:seq_short_len], genres_hist[seq_short_len:]))

random.shuffle(train_set)
random.shuffle(test_set)

    print("fields per sample:", len(train_set[0]), len(test_set[0]))

return train_set, test_set

def gen_model_input_sdm(train_set, user_profile, seq_short_len, seq_prefer_len):
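    """Pad the sequence fields and assemble the feature dict consumed by SDM.

    The dict keys must match the feature names declared in the
    SparseFeat/VarLenSparseFeat columns of the model.
    """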

train_uid = np.array([line[0] for line in train_set])
short_train_seq = [line[1] for line in train_set]
prefer_train_seq = [line[2] for line in train_set]
train_iid = np.array([line[3] for line in train_set])
train_label = np.array([line[4] for line in train_set])
train_short_len = np.array([line[5] for line in train_set])
train_prefer_len = np.array([line[6] for line in train_set])
    # keep the ragged genre sequences as plain lists: np.array over lists of
    # unequal length creates an object array (and errors on recent numpy),
    # while pad_sequences handles lists of lists directly
    short_train_seq_genres = [line[8] for line in train_set]
    prefer_train_seq_genres = [line[9] for line in train_set]

    train_short_item_pad = pad_sequences(short_train_seq, maxlen=seq_short_len,
                                         padding='post', truncating='post', value=0)
    train_prefer_item_pad = pad_sequences(prefer_train_seq, maxlen=seq_prefer_len,
                                          padding='post', truncating='post', value=0)
    train_short_genres_pad = pad_sequences(short_train_seq_genres, maxlen=seq_short_len,
                                           padding='post', truncating='post', value=0)
    train_prefer_genres_pad = pad_sequences(prefer_train_seq_genres, maxlen=seq_prefer_len,
                                            padding='post', truncating='post', value=0)
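    # sequences were built newest-first, so 'post' padding/truncating keeps the
    # most recent items at the front and drops only the oldest ones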

    train_model_input = {"user_id": train_uid, "movie_id": train_iid,
                         "short_movie_id": train_short_item_pad,
                         "prefer_movie_id": train_prefer_item_pad,
                         "prefer_sess_length": train_prefer_len,
                         "short_sess_length": train_short_len,
                         "short_genres": train_short_genres_pad,
                         "prefer_genres": train_prefer_genres_pad}

for key in ["gender", "age", "occupation", "zip"]:
train_model_input[key] = user_profile.loc[train_model_input['user_id']][key].values

return train_model_input, train_label
127 changes: 127 additions & 0 deletions src/sdm/sdm_recommend.py
@@ -0,0 +1,127 @@
import faiss
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras import optimizers
from tensorflow.python.keras.models import Model
from deepctr.feature_column import SparseFeat, VarLenSparseFeat

from deepmatch.models import SDM
from deepmatch.utils import sampledsoftmaxloss, recall_N

from sdm.movie_process import load_data, gen_data_set_sdm, gen_model_input_sdm

if __name__ == "__main__":
data = load_data()

    SEQ_LEN_short = 5
    SEQ_LEN_prefer = 50

    # 1. Label-encode the sparse features, then build the sequence samples with
    # `gen_data_set_sdm` and `gen_model_input_sdm`

features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip', 'genres']
feature_max_idx = {}
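    # shift encoded ids by +1 so index 0 stays reserved for padding;
    # feature_max_idx records each feature's vocabulary size for the embeddings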
for feature in features:
lbe = LabelEncoder()
data[feature] = lbe.fit_transform(data[feature]) + 1
feature_max_idx[feature] = data[feature].max() + 1

user_profile = data[["user_id", "gender", "age", "occupation", "zip", "genres"]].drop_duplicates('user_id')

item_profile = data[["movie_id"]].drop_duplicates('movie_id')

user_profile.set_index("user_id", inplace=True)

train_set, test_set = gen_data_set_sdm(data, seq_short_len=SEQ_LEN_short, seq_prefer_len=SEQ_LEN_prefer)

train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)

    # 2. Count unique values per sparse field and build the feature columns,
    # including the variable-length sequence features

embedding_dim = 32

user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
SparseFeat("gender", feature_max_idx['gender'], 16),
SparseFeat("age", feature_max_idx['age'], 16),
SparseFeat("occupation", feature_max_idx['occupation'], 16),
SparseFeat("zip", feature_max_idx['zip'], 16),
VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim,
embedding_name="movie_id"), SEQ_LEN_short, 'mean',
'short_sess_length'),
VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim,
embedding_name="movie_id"), SEQ_LEN_prefer, 'mean',
'prefer_sess_length'),
VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim,
embedding_name="genres"), SEQ_LEN_short, 'mean',
'short_sess_length'),
VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim,
embedding_name="genres"), SEQ_LEN_prefer, 'mean',
'prefer_sess_length'),
]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]
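    # via embedding_name, the VarLenSparseFeat columns above reuse the movie_id
    # and genres embedding tables, so the history sequences share one embedding
    # space with the candidate items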

    K.set_learning_phase(True)

    if tf.__version__ >= '2.0.0':
        # deepmatch's sampled-softmax loss relies on TF1 graph-mode ops,
        # so eager execution is disabled when running on TF 2.x
        tf.compat.v1.disable_eager_execution()

    # 3. Define and train the model; units must equal the item embedding dim!
model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id', 'genres'],
units=embedding_dim, num_sampled=100, )
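    # num_sampled is the number of negative classes drawn per batch for the
    # sampled-softmax training objective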

    # Adam with gradient-norm clipping (clipnorm) to stabilize training
    optimizer = optimizers.Adam(lr=0.001, clipnorm=5.0)

    model.compile(optimizer=optimizer, loss=sampledsoftmaxloss)
model.summary()
    history = model.fit(train_model_input, train_label,
                        batch_size=512, epochs=20, verbose=1, validation_split=0.0)
# model.save_weights('SDM_weights.h5')

K.set_learning_phase(False)
# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values, }

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
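    # deepmatch attaches user_input/user_embedding and item_input/item_embedding
    # to the SDM model so the two towers can be exported separately for retrieval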

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
# user_embs = user_embs[:, i, :] # i in [0,k_max) if MIND
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)

    # line[3] is the held-out target movie_id in each test tuple
    test_true_label = {line[0]: [line[3]] for line in test_set}


index = faiss.IndexFlatIP(embedding_dim)
# faiss.normalize_L2(item_embs)
index.add(item_embs)
# faiss.normalize_L2(user_embs)
D, I = index.search(np.ascontiguousarray(user_embs), 50)
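    # D holds the inner-product scores, I the row indices of each user's
    # top-50 retrieved items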
    s = []
    hit = 0
    for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
        try:
            pred = [item_profile['movie_id'].values[x] for x in I[i]]
            recall_score = recall_N(test_true_label[uid], pred, N=50)
            s.append(recall_score)
            if test_true_label[uid][0] in pred:
                hit += 1
        except KeyError:
            print("no test label for user", uid)
    print("")
    print("recall", np.mean(s))
    print("hit rate", hit / len(test_user_model_input['user_id']))
