'Temporarily saving, in order to review the previous dataset format'
Estelle-gqy committed Feb 27, 2023
1 parent dc857cf commit f6e92d6
Showing 18 changed files with 355 additions and 244 deletions.
480 changes: 290 additions & 190 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Binary file modified base/__pycache__/recommender.cpython-37.pyc
7 changes: 4 additions & 3 deletions base/recommender.py
@@ -82,9 +82,10 @@ def execute(self):
         print('Initializing and building model...')
         self.build()
         print('Training Model...')
-        self.train_by_item()
+        self.train()
         print('Testing...')
         # rec_list = self.test()  # predict candidate items for each user
-        rec_list = self.test_by_item()  # predict candidate users for each item
+        # rec_list = self.test_by_item()  # predict candidate users for each item
         print('Evaluating...')
-        self.evaluate_by_item(rec_list)
+        self.evaluate()
+        # self.evaluate_by_item(rec_list)
Binary file modified data/__pycache__/ui_graph.cpython-37.pyc
7 changes: 4 additions & 3 deletions data/loader.py
@@ -26,11 +26,12 @@ def load_data_set(file, rec_type):
         with open(file, encoding='utf-8', errors='ignore') as f:
             for line in f:
                 items = split(' ', line.strip())
-                user_id = items[0]
+                examiner_id = items[0]
                 item_id = items[1]
                 title = items[2]
-                weight = items[3]
-                data.append([user_id, item_id, title, float(weight)])
+                user = items[3]
+                label = items[4]
+                data.append([examiner_id, item_id, title, user, label])

     if rec_type == 'sequential':
         data = {}
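For reference, each record in the flattened dataset now carries five space-separated fields rather than the old four (the field names below follow the comments in data/retweet_dataset.py). A minimal parsing sketch with made-up sample values; `split` is assumed to be `re.split`, since the import sits outside this hunk:

```python
from re import split  # assumption: loader.py imports split from re

# hypothetical record as written by retweet_dataset.flatten_record:
# examiner_id  document_id  document_title  worker_id  label
line = '17 302 annual_budget_review 45 1\n'

examiner_id, item_id, title, user, label = split(' ', line.strip())
print(examiner_id, item_id, title, user, label)  # 17 302 annual_budget_review 45 1
```

Note that a title containing spaces would produce more than five fields here; the sketch assumes single-token titles.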
4 changes: 2 additions & 2 deletions data/retweet_dataset.py
@@ -98,9 +98,9 @@ def flatten_record(self, records_save_path):
             # find every worker this examiner has ever dispatched documents to; label 1 if the worker is in next_step_worker, otherwise 0
             for worker in self.examiner[self.user[examiner]].keys():
                 if self.id2user[worker] in next_step_worker:  # at this point the worker list is already
-                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' 1\n')  # (examiner id, document id, document title, worker id, rating)
+                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' ' + '1\n')  # (examiner id, document id, document title, worker id, rating)
                 else:
-                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' 0\n')  # (examiner id, document id, document title, worker id, rating)
+                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' ' + '0\n')  # (examiner id, document id, document title, worker id, rating)


     def get_interaction_dataset(self, interaction_save_path, records_save_path, train_frac = 0.8):
25 changes: 14 additions & 11 deletions data/ui_graph.py
@@ -41,44 +41,47 @@ def __init__(self, conf, training, test, dev):

     def __generate_set(self):
         for entry in self.training_data:
-            user, item, title, rating = entry
+            examiner, item, title, user, label = entry
             if user not in self.user:
                 self.user[user] = len(self.user)
                 self.id2user[self.user[user]] = user
+            if examiner not in self.user:
+                self.user[examiner] = len(self.user)
+                self.id2user[self.user[examiner]] = examiner
             if item not in self.item:
                 if title == '':
                     print(entry)
                 self.train_titles.append(title)
                 self.item[item] = len(self.item)
                 self.id2item[self.item[item]] = item
                 # userList.append
-            self.training_set_u[user][item] = rating
-            self.training_set_i[item][user] = rating
+            self.training_set_u[user][item] = label
+            self.training_set_i[item][user] = label

         for entry in self.dev_data:
-            user, item, title, rating = entry
+            examiner, item, title, user, label = entry
             # in the by_item case, skip items that are not in the training set
             if self.by_item and item not in self.item:
                 continue
             # in the by_user case, skip users that are not in the training set
             if not self.by_item and user not in self.user:
                 continue
-            self.dev_set_u[user][item] = rating
-            self.dev_set_i[item][user] = rating
+            self.dev_set_u[user][item] = label
+            self.dev_set_i[item][user] = label
             if title not in self.dev_titles:
                 self.dev_titles.append(title)

         for entry in self.test_data:
-            user, item, title, rating = entry
+            examiner, item, title, user, label = entry
             # in the by_item case, skip items that are not in the training set
             if self.by_item and item not in self.item:
                 continue
             # in the by_user case, skip users that are not in the training set
             if not self.by_item and user not in self.user:
                 continue
-            self.test_set[user][item] = rating
+            self.test_set[user][item] = label
             self.test_set_item.add(item)
-            self.test_set_i[item][user] = rating
+            self.test_set_i[item][user] = label
             # save the titles that correspond to test_set_item
             if title not in self.test_titles:
                 self.test_titles.append(title)
@@ -114,10 +114,10 @@ def __create_sparse_interaction_matrix(self):
         """
         row, col, entries = [], [], []
         for pair in self.training_data:
-            row += [self.user[pair[0]]]
+            row += [self.user[pair[3]]]
             col += [self.item[pair[1]]]
             entries += [1.0]
-        interaction_mat = sp.csr_matrix((entries, (row, col)), shape=(self.user_num,self.item_num),dtype=np.float32)
+        interaction_mat = sp.csr_matrix((entries, (row, col)), shape=(self.user_num, self.item_num), dtype=np.float32)
         return interaction_mat

     def get_user_id(self, u):
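A toy illustration (not repository code) of what `__create_sparse_interaction_matrix` builds after the switch from `pair[0]` to `pair[3]`: rows are indexed by worker, columns by document. Every training row contributes a 1.0 entry, including label-0 pairs, since the label column is not consulted here. The index maps and rows below are made up:

```python
import numpy as np
import scipy.sparse as sp

# made-up index maps and training rows in the new five-field layout:
# [examiner, item, title, user, label]
user = {'w0': 0, 'w1': 1, 'w2': 2}   # worker id -> row index
item = {'d0': 0, 'd1': 1}            # document id -> column index
training_data = [
    ['e0', 'd0', 'budget', 'w0', '1'],
    ['e0', 'd0', 'budget', 'w1', '0'],  # label 0, but still becomes a 1.0 entry
    ['e1', 'd1', 'audit',  'w2', '1'],
]

row = [user[pair[3]] for pair in training_data]
col = [item[pair[1]] for pair in training_data]
entries = [1.0] * len(row)
interaction_mat = sp.csr_matrix((entries, (row, col)),
                                shape=(len(user), len(item)), dtype=np.float32)
print(interaction_mat.toarray())
# [[1. 0.]
#  [1. 0.]
#  [0. 1.]]
```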
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
50 changes: 27 additions & 23 deletions model/graph/MHCN.py
@@ -220,8 +220,6 @@ def channel_attention(*channel_embeddings):
         H_s = TFGraphInterface.convert_sparse_mat_to_tensor(H_s)
         H_j = M_matrices[1]
         H_j = TFGraphInterface.convert_sparse_mat_to_tensor(H_j)
-        # H_p = M_matrices[2]
-        # H_p = TFGraphInterface.convert_sparse_mat_to_tensor(H_p)
         R = TFGraphInterface.convert_sparse_mat_to_tensor(self.data.normalize_graph_mat(self.data.interaction_mat))
         # self-gating
         user_embeddings_c1 = self_gating(self.user_embeddings, 1)
@@ -247,10 +245,7 @@
             user_embeddings_c2 = tf.sparse_tensor_dense_matmul(H_j, user_embeddings_c2)
             norm_embeddings = tf.math.l2_normalize(user_embeddings_c2, axis=1)
             all_embeddings_c2 += [norm_embeddings]
-            # Channel P
-            # user_embeddings_c3 = tf.sparse_tensor_dense_matmul(H_p, user_embeddings_c3)
-            # norm_embeddings = tf.math.l2_normalize(user_embeddings_c3, axis=1)
-            # all_embeddings_c3 += [norm_embeddings]
+
             # item convolution
             new_item_embeddings = tf.sparse_tensor_dense_matmul(tf.sparse.transpose(R), mixed_embedding)
             norm_embeddings = tf.math.l2_normalize(new_item_embeddings, axis=1)
@@ -274,6 +269,8 @@
         self.ss_loss += self.hierarchical_self_supervision(self_supervised_gating(self.final_user_embeddings, 2), H_j)
         self.ss_loss = self.ss_rate * self.ss_loss

+
+
         # embedding look-up: going from one-hot ids to dense codes means looking each id up in the embedding matrix, i.e. extracting the representation of the corresponding id
         # self.final_user_embeddings is the final user representation; self.neg_idx holds the negative-sample ids
         # by_user embedding lookups below
@@ -282,22 +279,23 @@
         # self.batch_pos_item_emb = tf.nn.embedding_lookup(self.final_item_embeddings, self.v_idx)

         # by_item embedding lookups below
-        self.batch_neg_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.neg_idx)
+        # self.batch_neg_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.neg_idx)
         self.batch_item_emb = tf.nn.embedding_lookup(self.final_item_embeddings, self.v_idx)
-        self.batch_pos_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.u_idx)
+        self.batch_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.u_idx)

         # TODO: concatenate the worker and item embeddings and run binary classification on them to obtain logits
-        self.concat_embeddings = tf.concat(values=[self.user_embeddings, self.item_embeddings], axis=1)
-        self.fcn_w1 = tf.Variable(initializer([self.emb_size, 128]))
-        self.fcn_b1 = tf.Variable(initializer([1, 128]))
+        self.class_num = 2
+        self.concat_embeddings = tf.concat(values=[self.batch_user_emb, self.batch_item_emb], axis=1)
+        self.fcn_w1 = tf.Variable(initializer([self.emb_size * 2, 64]))
+        self.fcn_b1 = tf.Variable(initializer([len(self.v_idx), 64]))
         self.xw1_plus_b1 = tf.matmul(self.concat_embeddings, self.fcn_w1) + self.fcn_b1
         self.fcc_output1 = tf.sigmoid(self.xw1_plus_b1)

-        self.fcn_w2 = tf.Variable(initializer([128, 2]))
-        self.fcn_b2 = tf.Variable(initializer([1, 2]))
+        self.fcn_w2 = tf.Variable(initializer([64, self.class_num]))
+        self.fcn_b2 = tf.Variable(initializer([len(self.v_idx), self.class_num]))
         self.xw2_plus_b2 = tf.matmul(self.fcc_output1, self.fcn_w2) + self.fcn_b2
         self.fcc_output2 = tf.sigmoid(self.xw2_plus_b2)
-        self.logits = tf.argmax(input=self.fcc_output2, axis=1)  # TODO: is tf.argmax needed here?
+        self.logits = tf.argmax(input=self.fcc_output2, axis=1)

     # self-supervised loss
     def hierarchical_self_supervision(self, em, adj):
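Two details of the head above may be worth flagging. `self.logits` is the output of `tf.argmax`, which returns integer class ids and has no gradient, so feeding it to the cross-entropy in `train()` below cannot train the classifier; the loss expects the raw pre-activation scores (`self.xw2_plus_b2`). Also, biases shaped `[len(self.v_idx), ...]` tie the parameters to the batch size, whereas a per-unit bias that broadcasts over the batch is the usual pattern. A minimal standalone sketch of a conventional two-layer head under those assumptions (placeholder names are invented, not the repository's):

```python
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

emb_size, hidden, class_num = 64, 64, 2
user_emb = tf.placeholder(tf.float32, [None, emb_size], name='batch_user_emb')
item_emb = tf.placeholder(tf.float32, [None, emb_size], name='batch_item_emb')
labels = tf.placeholder(tf.int64, [None], name='batch_labels')

x = tf.concat([user_emb, item_emb], axis=1)             # [batch, 2 * emb_size]
w1 = tf.get_variable('fcn_w1', [emb_size * 2, hidden])  # glorot-initialized by default
b1 = tf.get_variable('fcn_b1', [hidden], initializer=tf.zeros_initializer())
h1 = tf.sigmoid(tf.matmul(x, w1) + b1)                  # bias broadcasts over the batch

w2 = tf.get_variable('fcn_w2', [hidden, class_num])
b2 = tf.get_variable('fcn_b2', [class_num], initializer=tf.zeros_initializer())
logits = tf.matmul(h1, w2) + b2                         # raw scores: no sigmoid, no argmax

# the sparse variant takes integer class ids directly (no one-hot labels needed)
rec_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
predictions = tf.argmax(logits, axis=1)                 # argmax only for prediction/metrics
accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, labels), tf.float32))
```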
@@ -327,29 +325,35 @@ def score(x1, x2):

     def train(self):
-        # TODO: labels need a placeholder
-        rec_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.batch_labels, logits=)
+        rec_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.batch_labels, logits=self.logits)
         # rec_loss = bpr_loss(self.batch_user_emb, self.batch_pos_item_emb, self.batch_neg_item_emb)
         reg_loss = 0
         for key in self.weights:
             reg_loss += self.reg * tf.nn.l2_loss(self.weights[key])
         reg_loss += self.reg * (tf.nn.l2_loss(self.batch_user_emb) + tf.nn.l2_loss(self.batch_neg_item_emb) + tf.nn.l2_loss(self.batch_pos_item_emb))
         total_loss = rec_loss + reg_loss + self.ss_loss
         opt = tf.compat.v1.train.AdamOptimizer(self.lRate)
         train_op = opt.minimize(total_loss)
         init = tf.compat.v1.global_variables_initializer()
         self.sess.run(init)  # launch the default session and train the network

-        # Suggested Maximum epoch Setting: LastFM 120 Douban 30 Yelp 30
-        # session.run(): runs operations and evaluates the tensors in `fetches`; fetches picks the corresponding graph elements out of the computation graph
-        # and may be a single element or any list, tuple or dict of elements (operations, tensors, sparse tensors, handles, strings, ...)
+        # accuracy
+        self.correct_predictions = tf.equal(self.logits, self.batch_labels)
+        self.accuracy = tf.reduce_mean(tf.cast(self.correct_predictions, "float"), name="accuracy")

+        # TODO: revise
         for epoch in range(self.maxEpoch):
             for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
-                user_idx, i_idx, j_idx = batch
-                _, loss1, loss2 = self.sess.run([train_op, rec_loss, self.ss_loss], feed_dict={self.u_idx: user_idx, self.batch_labels: j_idx, self.v_idx: i_idx})
-                print('training:', epoch + 1, 'batch', n, 'rec loss:', loss1, 'ssl loss', loss2)
+                user_idx, i_idx, labels = batch
+                _, loss1, loss2, acc = self.sess.run([train_op, rec_loss, self.ss_loss, self.accuracy], feed_dict={self.u_idx: user_idx, self.v_idx: i_idx, self.batch_labels: labels})
+                print('training:', epoch + 1, 'batch', n, 'rec loss:', loss1, 'ssl loss', loss2, ' acc:', acc)
             self.U, self.V = self.sess.run([self.final_user_embeddings, self.final_item_embeddings])
-            self.fast_evaluation_by_item(epoch)
+            # validation
+            dev_acc = self.sess.run(self.accuracy, feed_dict={self.batch_labels: self.data.dev_data[4],
+                                                              self.u_idx: self.data.dev_data[3], self.v_idx: self.data.dev_data[1]})
+
+            print('Validating the model...')
+            print('-' * 120)
+            print('Epoch:', str(epoch + 1) + ', dev acc:', dev_acc)
         self.U, self.V = self.best_user_emb, self.best_item_emb

     def train_by_item(self):
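One observation on the validation step added in `train()` above: `self.data.dev_data` is a list of `[examiner, item, title, user, label]` rows, so `dev_data[4]`, `dev_data[3]` and `dev_data[1]` select the fifth, fourth and second rows, not the label, user and item columns. If column-wise access is the intent, something along these lines would be needed (a sketch only; it ignores the index mapping details and the filtering that ui_graph.py applies to dev entries):

```python
# column-wise extraction from dev rows of the form [examiner, item, title, user, label]
dev_users = [self.data.user[row[3]] for row in self.data.dev_data]   # worker indices
dev_items = [self.data.item[row[1]] for row in self.data.dev_data]   # document indices
dev_labels = [int(row[4]) for row in self.data.dev_data]             # loader keeps labels as strings

dev_acc = self.sess.run(self.accuracy, feed_dict={
    self.u_idx: dev_users, self.v_idx: dev_items, self.batch_labels: dev_labels})
```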
Binary file modified model/graph/__pycache__/MHCN.cpython-37.pyc
Binary file modified util/__pycache__/sampler.cpython-37.pyc
26 changes: 14 additions & 12 deletions util/sampler.py
@@ -39,20 +39,22 @@ def next_batch_pairwise(data, batch_size, n_negs=1):
             batch_end = ptr + batch_size
         else:
             batch_end = data_size
-        users = [training_data[idx][0] for idx in range(ptr, batch_end)]  # users of the current batch
+        users = [training_data[idx][3] for idx in range(ptr, batch_end)]  # users of the current batch
         items = [training_data[idx][1] for idx in range(ptr, batch_end)]  # items of the current batch
+        labels = [training_data[idx][4] for idx in range(ptr, batch_end)]
        ptr = batch_end  # end of the current batch
-        u_idx, i_idx, j_idx = [], [], []  # user indices, item indices and negative user samples
-        user_list = list(data.user.keys())
-        for i, item in enumerate(items):
-            i_idx.append(data.item[item])  # data.item[items[i]]: index of the item for this user
-            u_idx.append(data.user[users[i]])  # index of the user
-            for m in range(n_negs):  # !draw a random user from all training users as a negative sample
-                neg_user = choice(user_list)
-                while neg_user in data.training_set_i[item]:  # if the sampled user is among those who handled this item, resample; the reason it was wrong!
-                    neg_user = choice(user_list)
-                j_idx.append(data.user[neg_user])
-        yield u_idx, i_idx, j_idx
+        # users, labels = [], []
+        # u_idx, i_idx, j_idx = [], [], []  # user indices, item indices and negative user samples
+        # user_list = list(data.user.keys())
+        # for i, item in enumerate(items):
+        #     i_idx.append(data.item[item])  # data.item[items[i]]: index of the item for this user
+        #     u_idx.append(data.user[users[i]])  # index of the user
+        #     for m in range(n_negs):  # !draw a random user from all training users as a negative sample
+        #         neg_user = choice(user_list)
+        #         while neg_user in data.training_set_i[item]:  # if the sampled user is among those who handled this item, resample; the reason it was wrong!
+        #             neg_user = choice(user_list)
+        #         j_idx.append(data.user[neg_user])
+        yield users, items, labels

 def next_batch_pointwise(data,batch_size):
     training_data = data.training_data
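Note that the reworked generator yields the raw values stored in `training_data`: `users` and `items` are still string ids and `labels` are the `'0'`/`'1'` strings produced by loader.py, while the `u_idx`/`v_idx` placeholders in `MHCN.train()` presumably expect integer indices. If so, a mapping pass on the consumer side would look roughly like this (a sketch assuming the `data.user`/`data.item` dicts built in ui_graph.py):

```python
# map raw ids from next_batch_pairwise to the integer indices the model expects
for n, (users, items, labels) in enumerate(next_batch_pairwise(data, batch_size)):
    u_idx = [data.user[u] for u in users]  # worker id -> user index
    i_idx = [data.item[i] for i in items]  # document id -> item index
    y = [int(lab) for lab in labels]       # '0'/'1' -> 0/1
    # ... feed u_idx, i_idx, y into the graph ...
```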
