data_loader.py
'''
Created on May 29, 2020
@author: nakaizura
'''
import collections
import os
import numpy as np
# Following the preprocessing step, two files are produced: kg_final.txt and ratings_final.txt.
# The KG file stores triples in the format: h, r, t
# The rating file stores implicit feedback in the format: user_id, item_id, rating (1 = positive, 0 = negative)
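# For illustration (hypothetical ids, not actual dataset rows): a line of
# kg_final.txt might read "0 5 12" (head, relation, tail), and a line of
# ratings_final.txt might read "196 242 1" (user, item, positive label).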
def load_data(args):
train_data, eval_data, test_data, user_history_dict = load_rating(args)
n_entity, n_relation, kg = load_kg(args)
ripple_set = get_ripple_set(args, kg, user_history_dict)
return train_data, eval_data, test_data, n_entity, n_relation, ripple_set
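# Minimal usage sketch (the attribute names mirror what this module reads --
# args.dataset, args.n_hop, args.n_memory -- but the concrete values below are
# assumptions for illustration, not defaults taken from this repo):
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--dataset', type=str, default='movie')
#     parser.add_argument('--n_hop', type=int, default=2)
#     parser.add_argument('--n_memory', type=int, default=32)
#     args = parser.parse_args()
#     train_data, eval_data, test_data, n_entity, n_relation, ripple_set = load_data(args)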
def load_rating(args):
print('reading rating file ...')
    # Load the rating dataset.
rating_file = '../data/' + args.dataset + '/ratings_final'
if os.path.exists(rating_file + '.npy'):
rating_np = np.load(rating_file + '.npy')
else:
rating_np = np.loadtxt(rating_file + '.txt', dtype=np.int32)
np.save(rating_file + '.npy', rating_np)
# n_user = len(set(rating_np[:, 0]))
# n_item = len(set(rating_np[:, 1]))
    return dataset_split(rating_np)  # split the dataset into train/eval/test = 6:2:2
def dataset_split(rating_np):
print('splitting dataset ...')
# train:eval:test = 6:2:2
eval_ratio = 0.2
test_ratio = 0.2
n_ratings = rating_np.shape[0]
    # The 6:2:2 split is chosen by random sampling.
eval_indices = np.random.choice(n_ratings, size=int(n_ratings * eval_ratio), replace=False)
left = set(range(n_ratings)) - set(eval_indices)
test_indices = np.random.choice(list(left), size=int(n_ratings * test_ratio), replace=False)
train_indices = list(left - set(test_indices))
# print(len(train_indices), len(eval_indices), len(test_indices))
    # Keep only users that have at least one positive rating in the training set.
user_history_dict = dict()
    for i in train_indices:  # iterate over the training set
        # Each row has the format: user_id, item_id, rating.
user = rating_np[i][0]
item = rating_np[i][1]
rating = rating_np[i][2]
if rating == 1:
if user not in user_history_dict:
user_history_dict[user] = []
            user_history_dict[user].append(item)  # add the item to this user's history
    # Filter all three splits down to users that appear in user_history_dict.
train_indices = [i for i in train_indices if rating_np[i][0] in user_history_dict]
eval_indices = [i for i in eval_indices if rating_np[i][0] in user_history_dict]
test_indices = [i for i in test_indices if rating_np[i][0] in user_history_dict]
# print(len(train_indices), len(eval_indices), len(test_indices))
train_data = rating_np[train_indices]
eval_data = rating_np[eval_indices]
test_data = rating_np[test_indices]
return train_data, eval_data, test_data, user_history_dict
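# Sanity-check sketch for the splitting logic (the toy array below is made up
# purely for illustration):
#
#     toy = np.array([[u, i, (u + i) % 2] for u in range(10) for i in range(5)])
#     tr, ev, te, hist = dataset_split(toy)
#     # Users with no positive training interaction are dropped from all three
#     # splits, so the totals may be smaller than len(toy).
#     assert len(tr) + len(ev) + len(te) <= len(toy)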
def load_kg(args):
print('reading KG file ...')
    # Load the KG dataset.
kg_file = '../data/' + args.dataset + '/kg_final'
if os.path.exists(kg_file + '.npy'):
kg_np = np.load(kg_file + '.npy')
else:
kg_np = np.loadtxt(kg_file + '.txt', dtype=np.int32)
np.save(kg_file + '.npy', kg_np)
n_entity = len(set(kg_np[:, 0]) | set(kg_np[:, 2]))
n_relation = len(set(kg_np[:, 1]))
    kg = construct_kg(kg_np)  # build the KG adjacency structure
return n_entity, n_relation, kg
def construct_kg(kg_np):
print('constructing knowledge graph ...')
kg = collections.defaultdict(list)
for head, relation, tail in kg_np:
        kg[head].append((tail, relation))  # index by head: collect (tail, relation) pairs per head entity
return kg
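# For example (hypothetical triples): rows [0, 5, 12] and [0, 3, 7] yield
# kg[0] == [(12, 5), (7, 3)], i.e. each head maps to its (tail, relation) pairs.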
# Build each user's ripple set: the memories collected at every hop of propagation.
def get_ripple_set(args, kg, user_history_dict):
print('constructing ripple set ...')
# user -> [(hop_0_heads, hop_0_relations, hop_0_tails), (hop_1_heads, hop_1_relations, hop_1_tails), ...]
ripple_set = collections.defaultdict(list)
    for user in user_history_dict:  # for each user
        for h in range(args.n_hop):  # propagate the user's interests over multiple hops in the KG
memories_h = []
memories_r = []
memories_t = []
            if h == 0:  # at hop 0, the seed set is simply the user's click history
tails_of_last_hop = user_history_dict[user]
            else:  # otherwise, start from the tail entities of the previous hop
                tails_of_last_hop = ripple_set[user][-1][2]
            # Expand one hop: collect all triples whose head is a tail entity of the previous hop.
for entity in tails_of_last_hop:
for tail_and_relation in kg[entity]:
memories_h.append(entity)
memories_r.append(tail_and_relation[1])
memories_t.append(tail_and_relation[0])
# if the current ripple set of the given user is empty, we simply copy the ripple set of the last hop here
# this won't happen for h = 0, because only the items that appear in the KG have been selected
# this only happens on 154 users in Book-Crossing dataset (since both BX dataset and the KG are sparse)
if len(memories_h) == 0:
ripple_set[user].append(ripple_set[user][-1])
else:
                # Sample a fixed-size memory (args.n_memory) of neighbors for each user.
replace = len(memories_h) < args.n_memory
indices = np.random.choice(len(memories_h), size=args.n_memory, replace=replace)
memories_h = [memories_h[i] for i in indices]
memories_r = [memories_r[i] for i in indices]
memories_t = [memories_t[i] for i in indices]
ripple_set[user].append((memories_h, memories_r, memories_t))
return ripple_set
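# Resulting structure (illustrative, with a hypothetical user id): for every
# user, ripple_set[user] holds args.n_hop entries, and each entry is a triple
# of parallel lists of length args.n_memory, e.g.
#
#     heads, relations, tails = ripple_set[42][0]   # hop-0 memories
#
# where hop 0 expands directly from the items the user clicked.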