'Temporarily saving, in order to review the previous dataset format'
Estelle-gqy committed Feb 27, 2023
1 parent dc857cf commit f6e92d6
Showing 18 changed files with 355 additions and 244 deletions.
480 changes: 290 additions & 190 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Binary file modified base/__pycache__/recommender.cpython-37.pyc
7 changes: 4 additions & 3 deletions base/recommender.py
@@ -82,9 +82,10 @@ def execute(self):
         print('Initializing and building model...')
         self.build()
         print('Training Model...')
-        self.train_by_item()
+        self.train()
         print('Testing...')
         # rec_list = self.test()  # predict candidate items for each user
-        rec_list = self.test_by_item()  # predict candidate users for each item
+        # rec_list = self.test_by_item()  # predict candidate users for each item
         print('Evaluating...')
-        self.evaluate_by_item(rec_list)
+        self.evaluate()
+        # self.evaluate_by_item(rec_list)
Binary file modified data/__pycache__/ui_graph.cpython-37.pyc
7 changes: 4 additions & 3 deletions data/loader.py
@@ -26,11 +26,12 @@ def load_data_set(file, rec_type):
         with open(file, encoding='utf-8', errors='ignore') as f:
             for line in f:
                 items = split(' ', line.strip())
-                user_id = items[0]
+                examiner_id = items[0]
                 item_id = items[1]
                 title = items[2]
-                weight = items[3]
-                data.append([user_id, item_id, title, float(weight)])
+                user = items[3]
+                label = items[4]
+                data.append([examiner_id, item_id, title, user, label])

     if rec_type == 'sequential':
         data = {}
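For reference, each record in the flattened dataset now carries five space-separated fields rather than the old four (the field names below follow the comments in data/retweet_dataset.py). A minimal parsing sketch with made-up sample values; `split` is assumed to be `re.split`, since the import sits outside this hunk:

```python
from re import split  # assumption: loader.py imports split from re

# hypothetical record as written by retweet_dataset.flatten_record:
# examiner_id  document_id  document_title  worker_id  label
line = '17 302 annual_budget_review 45 1\n'

examiner_id, item_id, title, user, label = split(' ', line.strip())
print(examiner_id, item_id, title, user, label)  # 17 302 annual_budget_review 45 1
```

Note that a title containing spaces would produce more than five fields here; the sketch assumes single-token titles.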
4 changes: 2 additions & 2 deletions data/retweet_dataset.py
@@ -98,9 +98,9 @@ def flatten_record(self, records_save_path):
             # find every worker this examiner has ever dispatched documents to; label 1 if the worker is in next_step_worker, otherwise 0
             for worker in self.examiner[self.user[examiner]].keys():
                 if self.id2user[worker] in next_step_worker:  # at this point the worker list is already
-                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' 1\n')  # (examiner id, document id, document title, worker id, rating)
+                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' ' + '1\n')  # (examiner id, document id, document title, worker id, rating)
                 else:
-                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' 0\n')  # (examiner id, document id, document title, worker id, rating)
+                    fp.write(str(self.user[examiner]) + ' ' + str(self.title[one_title]) + ' ' + one_title + ' ' + str(worker) + ' ' + '0\n')  # (examiner id, document id, document title, worker id, rating)


     def get_interaction_dataset(self, interaction_save_path, records_save_path, train_frac = 0.8):
25 changes: 14 additions & 11 deletions data/ui_graph.py
@@ -41,44 +41,47 @@ def __init__(self, conf, training, test, dev):

     def __generate_set(self):
         for entry in self.training_data:
-            user, item, title, rating = entry
+            examiner, item, title, user, label = entry
             if user not in self.user:
                 self.user[user] = len(self.user)
                 self.id2user[self.user[user]] = user
+            if examiner not in self.user:
+                self.user[examiner] = len(self.user)
+                self.id2user[self.user[examiner]] = examiner
             if item not in self.item:
                 if title == '':
                     print(entry)
                 self.train_titles.append(title)
                 self.item[item] = len(self.item)
                 self.id2item[self.item[item]] = item
                 # userList.append
-            self.training_set_u[user][item] = rating
-            self.training_set_i[item][user] = rating
+            self.training_set_u[user][item] = label
+            self.training_set_i[item][user] = label

         for entry in self.dev_data:
-            user, item, title, rating = entry
+            examiner, item, title, user, label = entry
             # in the by_item case, skip items that are not in the training set
             if self.by_item and item not in self.item:
                 continue
             # in the by_user case, skip users that are not in the training set
             if not self.by_item and user not in self.user:
                 continue
-            self.dev_set_u[user][item] = rating
-            self.dev_set_i[item][user] = rating
+            self.dev_set_u[user][item] = label
+            self.dev_set_i[item][user] = label
             if title not in self.dev_titles:
                 self.dev_titles.append(title)

         for entry in self.test_data:
-            user, item, title, rating = entry
+            examiner, item, title, user, label = entry
             # in the by_item case, skip items that are not in the training set
             if self.by_item and item not in self.item:
                 continue
             # in the by_user case, skip users that are not in the training set
             if not self.by_item and user not in self.user:
                 continue
-            self.test_set[user][item] = rating
+            self.test_set[user][item] = label
             self.test_set_item.add(item)
-            self.test_set_i[item][user] = rating
+            self.test_set_i[item][user] = label
             # save the titles that correspond to test_set_item
             if title not in self.test_titles:
                 self.test_titles.append(title)
@@ -114,10 +114,10 @@ def __create_sparse_interaction_matrix(self):
         """
         row, col, entries = [], [], []
         for pair in self.training_data:
-            row += [self.user[pair[0]]]
+            row += [self.user[pair[3]]]
             col += [self.item[pair[1]]]
             entries += [1.0]
-        interaction_mat = sp.csr_matrix((entries, (row, col)), shape=(self.user_num,self.item_num),dtype=np.float32)
+        interaction_mat = sp.csr_matrix((entries, (row, col)), shape=(self.user_num, self.item_num), dtype=np.float32)
         return interaction_mat

     def get_user_id(self, u):
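A toy illustration (not repository code) of what `__create_sparse_interaction_matrix` builds after the switch from `pair[0]` to `pair[3]`: rows are indexed by worker, columns by document. Every training row contributes a 1.0 entry, including label-0 pairs, since the label column is not consulted here. The index maps and rows below are made up:

```python
import numpy as np
import scipy.sparse as sp

# made-up index maps and training rows in the new five-field layout:
# [examiner, item, title, user, label]
user = {'w0': 0, 'w1': 1, 'w2': 2}   # worker id -> row index
item = {'d0': 0, 'd1': 1}            # document id -> column index
training_data = [
    ['e0', 'd0', 'budget', 'w0', '1'],
    ['e0', 'd0', 'budget', 'w1', '0'],  # label 0, but still becomes a 1.0 entry
    ['e1', 'd1', 'audit',  'w2', '1'],
]

row = [user[pair[3]] for pair in training_data]
col = [item[pair[1]] for pair in training_data]
entries = [1.0] * len(row)
interaction_mat = sp.csr_matrix((entries, (row, col)),
                                shape=(len(user), len(item)), dtype=np.float32)
print(interaction_mat.toarray())
# [[1. 0.]
#  [1. 0.]
#  [0. 1.]]
```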
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
50 changes: 27 additions & 23 deletions model/graph/MHCN.py
@@ -220,8 +220,6 @@ def channel_attention(*channel_embeddings):
         H_s = TFGraphInterface.convert_sparse_mat_to_tensor(H_s)
         H_j = M_matrices[1]
         H_j = TFGraphInterface.convert_sparse_mat_to_tensor(H_j)
-        # H_p = M_matrices[2]
-        # H_p = TFGraphInterface.convert_sparse_mat_to_tensor(H_p)
         R = TFGraphInterface.convert_sparse_mat_to_tensor(self.data.normalize_graph_mat(self.data.interaction_mat))
         # self-gating
         user_embeddings_c1 = self_gating(self.user_embeddings, 1)
@@ -247,10 +245,7 @@
             user_embeddings_c2 = tf.sparse_tensor_dense_matmul(H_j, user_embeddings_c2)
             norm_embeddings = tf.math.l2_normalize(user_embeddings_c2, axis=1)
             all_embeddings_c2 += [norm_embeddings]
-            # Channel P
-            # user_embeddings_c3 = tf.sparse_tensor_dense_matmul(H_p, user_embeddings_c3)
-            # norm_embeddings = tf.math.l2_normalize(user_embeddings_c3, axis=1)
-            # all_embeddings_c3 += [norm_embeddings]
+
             # item convolution
             new_item_embeddings = tf.sparse_tensor_dense_matmul(tf.sparse.transpose(R), mixed_embedding)
             norm_embeddings = tf.math.l2_normalize(new_item_embeddings, axis=1)
@@ -274,6 +269,8 @@
         self.ss_loss += self.hierarchical_self_supervision(self_supervised_gating(self.final_user_embeddings, 2), H_j)
         self.ss_loss = self.ss_rate * self.ss_loss

+
+
         # embedding look-up: going from one-hot ids to dense codes means looking each id up in the embedding matrix, i.e. extracting the representation of the corresponding id
         # self.final_user_embeddings is the final user representation; self.neg_idx holds the negative-sample ids
         # by_user embedding lookups below
@@ -282,22 +279,23 @@
         # self.batch_pos_item_emb = tf.nn.embedding_lookup(self.final_item_embeddings, self.v_idx)

         # by_item embedding lookups below
-        self.batch_neg_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.neg_idx)
+        # self.batch_neg_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.neg_idx)
         self.batch_item_emb = tf.nn.embedding_lookup(self.final_item_embeddings, self.v_idx)
-        self.batch_pos_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.u_idx)
+        self.batch_user_emb = tf.nn.embedding_lookup(self.final_user_embeddings, self.u_idx)

         # TODO: concatenate the worker and item embeddings and run binary classification on them to obtain logits
-        self.concat_embeddings = tf.concat(values=[self.user_embeddings, self.item_embeddings], axis=1)
-        self.fcn_w1 = tf.Variable(initializer([self.emb_size, 128]))
-        self.fcn_b1 = tf.Variable(initializer([1, 128]))
+        self.class_num = 2
+        self.concat_embeddings = tf.concat(values=[self.batch_user_emb, self.batch_item_emb], axis=1)
+        self.fcn_w1 = tf.Variable(initializer([self.emb_size * 2, 64]))
+        self.fcn_b1 = tf.Variable(initializer([len(self.v_idx), 64]))
         self.xw1_plus_b1 = tf.matmul(self.concat_embeddings, self.fcn_w1) + self.fcn_b1
         self.fcc_output1 = tf.sigmoid(self.xw1_plus_b1)

-        self.fcn_w2 = tf.Variable(initializer([128, 2]))
-        self.fcn_b2 = tf.Variable(initializer([1, 2]))
+        self.fcn_w2 = tf.Variable(initializer([64, self.class_num]))
+        self.fcn_b2 = tf.Variable(initializer([len(self.v_idx), self.class_num]))
         self.xw2_plus_b2 = tf.matmul(self.fcc_output1, self.fcn_w2) + self.fcn_b2
         self.fcc_output2 = tf.sigmoid(self.xw2_plus_b2)
-        self.logits = tf.argmax(input=self.fcc_output2, axis=1)  # TODO: is tf.argmax needed here?
+        self.logits = tf.argmax(input=self.fcc_output2, axis=1)

     # self-supervised loss
     def hierarchical_self_supervision(self, em, adj):
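Two details of the head above may be worth flagging. `self.logits` is the output of `tf.argmax`, which returns integer class ids and has no gradient, so feeding it to the cross-entropy in `train()` below cannot train the classifier; the loss expects the raw pre-activation scores (`self.xw2_plus_b2`). Also, biases shaped `[len(self.v_idx), ...]` tie the parameters to the batch size, whereas a per-unit bias that broadcasts over the batch is the usual pattern. A minimal standalone sketch of a conventional two-layer head under those assumptions (placeholder names are invented, not the repository's):

```python
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

emb_size, hidden, class_num = 64, 64, 2
user_emb = tf.placeholder(tf.float32, [None, emb_size], name='batch_user_emb')
item_emb = tf.placeholder(tf.float32, [None, emb_size], name='batch_item_emb')
labels = tf.placeholder(tf.int64, [None], name='batch_labels')

x = tf.concat([user_emb, item_emb], axis=1)             # [batch, 2 * emb_size]
w1 = tf.get_variable('fcn_w1', [emb_size * 2, hidden])  # glorot-initialized by default
b1 = tf.get_variable('fcn_b1', [hidden], initializer=tf.zeros_initializer())
h1 = tf.sigmoid(tf.matmul(x, w1) + b1)                  # bias broadcasts over the batch

w2 = tf.get_variable('fcn_w2', [hidden, class_num])
b2 = tf.get_variable('fcn_b2', [class_num], initializer=tf.zeros_initializer())
logits = tf.matmul(h1, w2) + b2                         # raw scores: no sigmoid, no argmax

# the sparse variant takes integer class ids directly (no one-hot labels needed)
rec_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
predictions = tf.argmax(logits, axis=1)                 # argmax only for prediction/metrics
accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, labels), tf.float32))
```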
@@ -327,29 +325,35 @@ def score(x1, x2):

     def train(self):
-        # TODO: labels need a placeholder
-        rec_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.batch_labels, logits=)
+        rec_loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.batch_labels, logits=self.logits)
         # rec_loss = bpr_loss(self.batch_user_emb, self.batch_pos_item_emb, self.batch_neg_item_emb)
         reg_loss = 0
         for key in self.weights:
             reg_loss += self.reg * tf.nn.l2_loss(self.weights[key])
         reg_loss += self.reg * (tf.nn.l2_loss(self.batch_user_emb) + tf.nn.l2_loss(self.batch_neg_item_emb) + tf.nn.l2_loss(self.batch_pos_item_emb))
         total_loss = rec_loss + reg_loss + self.ss_loss
         opt = tf.compat.v1.train.AdamOptimizer(self.lRate)
         train_op = opt.minimize(total_loss)
         init = tf.compat.v1.global_variables_initializer()
         self.sess.run(init)  # launch the default session and train the network

-        # Suggested Maximum epoch Setting: LastFM 120 Douban 30 Yelp 30
-        # session.run(): runs operations and evaluates the tensors in `fetches`; fetches picks the corresponding graph elements out of the computation graph
-        # and may be a single element or any list, tuple or dict of elements (operations, tensors, sparse tensors, handles, strings, ...)
+        # accuracy
+        self.correct_predictions = tf.equal(self.logits, self.batch_labels)
+        self.accuracy = tf.reduce_mean(tf.cast(self.correct_predictions, "float"), name="accuracy")

+        # TODO: revise
         for epoch in range(self.maxEpoch):
             for n, batch in enumerate(next_batch_pairwise(self.data, self.batch_size)):
-                user_idx, i_idx, j_idx = batch
-                _, loss1, loss2 = self.sess.run([train_op, rec_loss, self.ss_loss], feed_dict={self.u_idx: user_idx, self.batch_labels: j_idx, self.v_idx: i_idx})
-                print('training:', epoch + 1, 'batch', n, 'rec loss:', loss1, 'ssl loss', loss2)
+                user_idx, i_idx, labels = batch
+                _, loss1, loss2, acc = self.sess.run([train_op, rec_loss, self.ss_loss, self.accuracy], feed_dict={self.u_idx: user_idx, self.v_idx: i_idx, self.batch_labels: labels})
+                print('training:', epoch + 1, 'batch', n, 'rec loss:', loss1, 'ssl loss', loss2, ' acc:', acc)
             self.U, self.V = self.sess.run([self.final_user_embeddings, self.final_item_embeddings])
-            self.fast_evaluation_by_item(epoch)
+            # validation
+            dev_acc = self.sess.run(self.accuracy, feed_dict={self.batch_labels: self.data.dev_data[4],
+                                                              self.u_idx: self.data.dev_data[3], self.v_idx: self.data.dev_data[1]})
+
+            print('Validating the model...')
+            print('-' * 120)
+            print('Epoch:', str(epoch + 1) + ', dev acc:', dev_acc)
         self.U, self.V = self.best_user_emb, self.best_item_emb

     def train_by_item(self):
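One observation on the validation step added in `train()` above: `self.data.dev_data` is a list of `[examiner, item, title, user, label]` rows, so `dev_data[4]`, `dev_data[3]` and `dev_data[1]` select the fifth, fourth and second rows, not the label, user and item columns. If column-wise access is the intent, something along these lines would be needed (a sketch only; it ignores the index mapping details and the filtering that ui_graph.py applies to dev entries):

```python
# column-wise extraction from dev rows of the form [examiner, item, title, user, label]
dev_users = [self.data.user[row[3]] for row in self.data.dev_data]   # worker indices
dev_items = [self.data.item[row[1]] for row in self.data.dev_data]   # document indices
dev_labels = [int(row[4]) for row in self.data.dev_data]             # loader keeps labels as strings

dev_acc = self.sess.run(self.accuracy, feed_dict={
    self.u_idx: dev_users, self.v_idx: dev_items, self.batch_labels: dev_labels})
```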
Binary file modified model/graph/__pycache__/MHCN.cpython-37.pyc
Binary file modified util/__pycache__/sampler.cpython-37.pyc
26 changes: 14 additions & 12 deletions util/sampler.py
@@ -39,20 +39,22 @@ def next_batch_pairwise(data, batch_size, n_negs=1):
             batch_end = ptr + batch_size
         else:
             batch_end = data_size
-        users = [training_data[idx][0] for idx in range(ptr, batch_end)]  # users of the current batch
+        users = [training_data[idx][3] for idx in range(ptr, batch_end)]  # users of the current batch
         items = [training_data[idx][1] for idx in range(ptr, batch_end)]  # items of the current batch
+        labels = [training_data[idx][4] for idx in range(ptr, batch_end)]
        ptr = batch_end  # end of the current batch
-        u_idx, i_idx, j_idx = [], [], []  # user indices, item indices and negative user samples
-        user_list = list(data.user.keys())
-        for i, item in enumerate(items):
-            i_idx.append(data.item[item])  # data.item[items[i]]: index of the item for this user
-            u_idx.append(data.user[users[i]])  # index of the user
-            for m in range(n_negs):  # !draw a random user from all training users as a negative sample
-                neg_user = choice(user_list)
-                while neg_user in data.training_set_i[item]:  # if the sampled user is among those who handled this item, resample; the reason it was wrong!
-                    neg_user = choice(user_list)
-                j_idx.append(data.user[neg_user])
-        yield u_idx, i_idx, j_idx
+        # users, labels = [], []
+        # u_idx, i_idx, j_idx = [], [], []  # user indices, item indices and negative user samples
+        # user_list = list(data.user.keys())
+        # for i, item in enumerate(items):
+        #     i_idx.append(data.item[item])  # data.item[items[i]]: index of the item for this user
+        #     u_idx.append(data.user[users[i]])  # index of the user
+        #     for m in range(n_negs):  # !draw a random user from all training users as a negative sample
+        #         neg_user = choice(user_list)
+        #         while neg_user in data.training_set_i[item]:  # if the sampled user is among those who handled this item, resample; the reason it was wrong!
+        #             neg_user = choice(user_list)
+        #         j_idx.append(data.user[neg_user])
+        yield users, items, labels

 def next_batch_pointwise(data,batch_size):
     training_data = data.training_data
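Note that the reworked generator yields the raw values stored in `training_data`: `users` and `items` are still string ids and `labels` are the `'0'`/`'1'` strings produced by loader.py, while the `u_idx`/`v_idx` placeholders in `MHCN.train()` presumably expect integer indices. If so, a mapping pass on the consumer side would look roughly like this (a sketch assuming the `data.user`/`data.item` dicts built in ui_graph.py):

```python
# map raw ids from next_batch_pairwise to the integer indices the model expects
for n, (users, items, labels) in enumerate(next_batch_pairwise(data, batch_size)):
    u_idx = [data.user[u] for u in users]  # worker id -> user index
    i_idx = [data.item[i] for i in items]  # document id -> item index
    y = [int(lab) for lab in labels]       # '0'/'1' -> 0/1
    # ... feed u_idx, i_idx, y into the graph ...
```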
