
Initial commit
luochen1992 committed Feb 7, 2018
0 parents commit 8bb7c12
Showing 8 changed files with 37,753 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
142 changes: 142 additions & 0 deletions LoadData.py
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 23 12:09:35 2018
@author: LuoJiacheng
"""
import numpy as np


class LoadData(object):
    def __init__(self, ratio):
        file = './datasets/ratings.txt'
        f_file = './datasets/trust.txt'
        data = np.loadtxt(file)
        fdata = np.loadtxt(f_file).astype(int)

        data[:, 2] = data[:, 2] / 5  # scale ratings into (0, 1] (the raw ratings are assumed to be on a 5-point scale)
        indices = np.arange(len(data[:, 2]))
        num_train_samples = int(len(indices) * ratio)

        np.random.seed(615)
        np.random.shuffle(indices)

        user = data[:, 0].copy().astype(int)
        item = data[:, 1].copy().astype(int)
        rating = data[:, 2].copy()
        self.user = user[indices]
        self.item = item[indices]
        self.rating = rating[indices]

        self.train_user = self.user[:num_train_samples]
        self.train_item = self.item[:num_train_samples]
        self.train_rating = self.rating[:num_train_samples]

        self.test_user = self.user[num_train_samples:]
        self.test_item = self.item[num_train_samples:]
        self.test_rating = self.rating[num_train_samples:]
        # find friends: map each user to the list of users they trust
        friends_dic = {}
        for u, f in fdata:
            friends_dic.setdefault(u, []).append(f)
        self.friends_dic = friends_dic

        # for each rating record, take the user's first listed friend (user 0 if none)
        friends = []
        for usr in self.user:
            try:
                friends.append(self.friends_dic[usr][0])
            except KeyError:
                friends.append(0)
        friends = np.array(friends)
        self.friends = friends

        self.train_friends = self.friends[:num_train_samples]
        self.test_friends = self.friends[num_train_samples:]

        # rating matrix (rows: users, columns: items), filled from the training
        # split only; ids index it directly, so each axis needs max id + 1 slots
        n_users = max(fdata[:, 1].max(), self.user.max()) + 1
        matrix = np.zeros([n_users, self.item.max() + 1])
        matrix[self.train_user, self.train_item] = self.train_rating
        self.matrix = matrix

        # similarity between each user and the chosen friend
        self.train_sim = self.comput_sim(self.train_user, self.train_friends)
        self.test_sim = self.comput_sim(self.test_user, self.test_friends)

    def comput_sim(self, users, friends):
        """Pearson similarity for each (user, friend) pair, rescaled to [0, 1]."""
        sim = []
        for i, j in zip(users, friends):
            sim.append(self.sim_pearson(self.matrix[i, :], self.matrix[j, :]))
        sim = (np.array(sim) + 1) / 2  # map from [-1, 1] into [0, 1]
        return sim

    def sim_pearson(self, user1, user2):
        # work on copies: the rows passed in are views into self.matrix, and
        # writing NaN into them would corrupt the shared rating matrix
        user1 = np.where(user1 == 0, np.nan, user1.astype(float))
        user2 = np.where(user2 == 0, np.nan, user2.astype(float))
        a = np.nan_to_num(user1) > 0  # items rated by user1
        b = np.nan_to_num(user2) > 0  # items rated by user2
        k = np.logical_and(a, b)      # items rated by both
        n = np.count_nonzero(k)
        if n == 0:
            return 0
        mean1 = np.nanmean(user1)
        mean2 = np.nanmean(user2)
        user1_k = user1[k]
        user2_k = user2[k]
        num = np.sum((user1_k - mean1) * (user2_k - mean2))
        den = np.sqrt(np.sum((user1_k - mean1) ** 2) * np.sum((user2_k - mean2) ** 2))
        if den == 0:
            return 0
        sim = num / den
        # significance weighting: shrink the correlation when the co-rated
        # items are only a small fraction of everything the two users rated
        sim_ac = sim * (2 * user1_k.size) / (np.sum(a) + np.sum(b))
        return sim_ac

    def get_batches(self, user, item, Y, batch_size):
        '''
        Batch iterator over (user, item, rating) triples.
        Yields user_batch, item_batch, friends, sim, y_batch.
        Returns one friend per user; you can revise this.
        '''
        if batch_size is None:
            batch_size = 128

        # keep only complete batches; the leftover samples that cannot fill
        # a batch are discarded
        n_batches = int(len(Y) / batch_size)

        while True:
            for count in range(n_batches):
                # inputs
                user_batch = user[count * batch_size:(count + 1) * batch_size]
                item_batch = item[count * batch_size:(count + 1) * batch_size]
                # targets
                y_batch = Y[count * batch_size:(count + 1) * batch_size]

                # first listed friend for every user in the batch (user 0 if none)
                friends = []
                for usr in user_batch:
                    try:
                        friends.append(self.friends_dic[usr][0])
                    except KeyError:
                        friends.append(0)
                friends = np.array(friends)

                sim = self.comput_sim(user_batch, friends)
                sim = np.reshape(sim, (batch_size, 1))

                user_batch = np.reshape(user_batch, (batch_size, 1))
                item_batch = np.reshape(item_batch, (batch_size, 1))
                friends = np.reshape(friends, (batch_size, 1))
                y_batch = np.reshape(y_batch, (batch_size, 1))
                yield user_batch, item_batch, friends, sim, y_batch
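
A quick sanity check of the similarity above (a minimal sketch, not part of the commit; it touches only sim_pearson, which uses no instance state and needs no dataset files):

import numpy as np
from LoadData import LoadData

ld = LoadData.__new__(LoadData)               # skip __init__, so ./datasets/ is not needed
u1 = np.array([4.0, 0.0, 5.0, 3.0, 0.0]) / 5  # one row of the rating matrix; 0 = unrated
u2 = np.array([5.0, 1.0, 4.0, 0.0, 0.0]) / 5
raw = ld.sim_pearson(u1, u2)                  # Pearson over co-rated items, shrunk by overlap
print((raw + 1) / 2)                          # comput_sim maps this into [0, 1]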
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# SRDF
244 changes: 244 additions & 0 deletions SRDF.py
@@ -0,0 +1,244 @@
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 13:39:25 2018
@author: Luo Jiacheng, Hao Wuhan
Social-Regularized Deep Factors Model for Rating Prediction (Baseline)
"""

import tensorflow as tf
import numpy as np
from LoadData import LoadData
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from time import time
import argparse
import math
from tensorflow.contrib.layers.python.layers import batch_norm



#################### Arguments ####################
def parse_args():
    parser = argparse.ArgumentParser(description="Run SRDF.")
    parser.add_argument('--hidden_factor', type=int, default=64, help='Number of hidden factors.')
    parser.add_argument('--layers', nargs='?', default='[64,64]', help='Size of each perceptron layer.')
    parser.add_argument('--epoch', type=int, default=200, help='Number of epochs.')
    parser.add_argument('--lr', type=float, default=0.05, help='Learning rate.')
    parser.add_argument('--social_lambda', type=float, default=0, help='Regularizer for the social relationship term.')
    parser.add_argument('--keep_prob', nargs='?', default='[0.8,0.5]', help='Keep probability (i.e., 1 - dropout ratio) for each deep layer. 1: no dropout.')
    parser.add_argument('--optimizer', nargs='?', default='AdagradOptimizer', help='Optimizer type (AdamOptimizer, AdagradOptimizer, GradientDescentOptimizer, MomentumOptimizer).')
    parser.add_argument('--batch_norm', type=int, default=1, help='Whether to perform batch normalization (0 or 1).')
    parser.add_argument('--activation', nargs='?', default='relu', help='Activation function for the deep layers: relu, sigmoid, tanh, identity.')
    parser.add_argument('--verbose', type=int, default=1, help='Show results every X epochs (0, 1, ... any positive integer).')

    # parser.add_argument('--early_stop', type=int, default=1, help='Whether to perform early stop (0 or 1)')
    return parser.parse_args()

class SRDF(BaseEstimator, TransformerMixin):
    def __init__(self, hidden_factor, layers, epoch, learning_rate, social_lambda,
                 keep_prob, optimizer_type, batch_norm, activation_function, verbose, random_seed=2018):
        # bind params to class
        self.hidden_factor = hidden_factor  # embedding size
        self.layers = layers  # perceptron layers
        # users/friends share one embedding table; items get their own
        self.social_lambda = social_lambda
        self.epoch = epoch
        self.random_seed = random_seed
        self.keep_prob = np.array(keep_prob)
        self.no_dropout = np.array([1 for i in range(len(keep_prob))])  # all-ones keep_prob, used at evaluation time
        self.optimizer_type = optimizer_type
        self.learning_rate = learning_rate
        self.batch_norm = batch_norm
        self.verbose = verbose
        self.activation_function = activation_function
        # self.early_stop = early_stop
        # performance of each epoch
        self.train_rmse, self.test_rmse = [], []

        # init all variables in a tensorflow graph
        self._init_graph()

    def _init_graph(self):
        '''
        Init a tensorflow Graph containing: input data, variables, model, loss, optimizer
        '''
        self.graph = tf.Graph()
        with self.graph.as_default():  # , tf.device('/cpu:0'):
            # Set graph level random seed
            tf.set_random_seed(self.random_seed)
            # Input data (each placeholder gets a distinct name).
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")
            self.lambda_ = tf.placeholder(tf.float32, name="social_lambda")
            self.dropout_keep = tf.placeholder(tf.float32, shape=[None], name="dropout")
            self.user = tf.placeholder(tf.int32, shape=[None, 1], name="user")      # None * 1
            self.item = tf.placeholder(tf.int32, shape=[None, 1], name="item")      # None * 1
            self.friend = tf.placeholder(tf.int32, shape=[None, 1], name="friend")  # None * 1
            self.sim = tf.placeholder(tf.float32, shape=[None, 1], name="sim")
            self.y_true = tf.placeholder(tf.float32, shape=[None, 1], name="train_ytrue")  # None * 1
            # Variables.
            self.weights = self._initialize_weights()



            # Model.
            # _________ Embedding Layer _____________
            self.u_emb = tf.nn.embedding_lookup(self.weights['user_embeddings'], self.user)    # None * 1 * embedding_size
            self.i_emb = tf.nn.embedding_lookup(self.weights['item_embeddings'], self.item)    # None * 1 * embedding_size
            self.f_emb = tf.nn.embedding_lookup(self.weights['user_embeddings'], self.friend)  # None * 1 * embedding_size

            self.u_emb = tf.reshape(self.u_emb, shape=[-1, self.hidden_factor])
            self.i_emb = tf.reshape(self.i_emb, shape=[-1, self.hidden_factor])
            self.f_emb = tf.reshape(self.f_emb, shape=[-1, self.hidden_factor])

            # _________ Interaction Layer _____________
            self.DF = tf.multiply(self.u_emb, self.i_emb)
            if self.batch_norm:
                self.DF = self.batch_norm_layer(self.DF, train_phase=self.train_phase, scope_bn='bn_CSMF')

            # ________ Perceptron Layers __________
            for i in range(0, len(self.layers)):
                self.DF = tf.add(tf.matmul(self.DF, self.weights['layer_%d' % i]), self.weights['bias_%d' % i])  # None * layers[i]
                if self.batch_norm:
                    self.DF = self.batch_norm_layer(self.DF, train_phase=self.train_phase, scope_bn='bn_%d' % i)
                self.DF = self.activation_function(self.DF)
                self.DF = tf.nn.dropout(self.DF, self.dropout_keep[i])  # dropout at each deep layer
            self.out = tf.add(tf.matmul(self.DF, self.weights['prediction']), self.weights['prediction_b'])

            # Social regularization: squared distance between a user's embedding
            # and the friend's, weighted by their similarity. Reduce per example
            # (axis=1), not over the whole batch.
            dev = tf.subtract(self.u_emb, self.f_emb)
            inner_product = tf.reduce_sum(tf.multiply(dev, dev), axis=1, keep_dims=True)  # None * 1
            # Compute the loss, averaged over the batch:
            #   loss = (y - y_hat)^2 + lambda * sim(u, f) * ||U_u - U_f||^2
            self.loss = tf.reduce_mean(tf.square(self.y_true - self.out) + self.lambda_ * inner_product * self.sim)
            # Optimizer.
            if self.optimizer_type == 'AdamOptimizer':
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == 'AdagradOptimizer':
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate, initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == 'GradientDescentOptimizer':
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == 'MomentumOptimizer':
                # listed in parse_args; 0.95 is an assumed, typical momentum value
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(self.loss)

            # init
            init = tf.global_variables_initializer()
            self.sess = tf.Session()
            self.sess.run(init)

    def _initialize_weights(self):
        all_weights = dict()

        # NOTE: table sizes come from the module-level LD (the LoadData instance
        # built in __main__). Users and friends share 'user_embeddings', so the
        # table must cover the largest id appearing in either.
        num_users = max(max(LD.user), max(LD.friends)) + 1
        all_weights['user_embeddings'] = tf.Variable(
            tf.random_normal([num_users, self.hidden_factor], 0.0, 0.01), name='left_embeddings')  # features_U * K
        all_weights['item_embeddings'] = tf.Variable(
            tf.random_normal([max(LD.item) + 1, self.hidden_factor], 0.0, 0.01), name='right_embeddings')  # features_I * K

        # deep layers
        num_layer = len(self.layers)
        if num_layer > 0:
            glorot = np.sqrt(2.0 / (self.hidden_factor + self.layers[0]))
            all_weights['layer_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.hidden_factor, self.layers[0])), dtype=np.float32)
            all_weights['bias_0'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(1, self.layers[0])), dtype=np.float32)  # 1 * layers[0]
            for i in range(1, num_layer):
                glorot = np.sqrt(2.0 / (self.layers[i - 1] + self.layers[i]))
                all_weights['layer_%d' % i] = tf.Variable(
                    np.random.normal(loc=0, scale=glorot, size=(self.layers[i - 1], self.layers[i])), dtype=np.float32)  # layers[i-1] * layers[i]
                all_weights['bias_%d' % i] = tf.Variable(
                    np.random.normal(loc=0, scale=glorot, size=(1, self.layers[i])), dtype=np.float32)  # 1 * layers[i]
            # prediction layer
            glorot = np.sqrt(2.0 / (self.layers[-1] + 1))
            all_weights['prediction'] = tf.Variable(np.random.normal(loc=0, scale=glorot, size=(self.layers[-1], 1)), dtype=np.float32)  # layers[-1] * 1
            all_weights['prediction_b'] = tf.Variable(
                np.random.normal(loc=0, scale=glorot, size=(1, 1)), dtype=np.float32)
        else:
            all_weights['prediction'] = tf.Variable(np.ones((self.hidden_factor, 1), dtype=np.float32))  # hidden_factor * 1
            all_weights['prediction_b'] = tf.Variable(np.ones((1, 1), dtype=np.float32))
        return all_weights

    def batch_norm_layer(self, x, train_phase, scope_bn):
        # two BN ops share one variable scope: the first creates the variables
        # and normalizes with batch statistics, the second reuses them with the
        # moving averages; tf.cond picks one based on the train_phase placeholder
        bn_train = batch_norm(x, decay=0.9, center=True, scale=True, updates_collections=None,
                              is_training=True, reuse=None, trainable=True, scope=scope_bn)
        bn_inference = batch_norm(x, decay=0.9, center=True, scale=True, updates_collections=None,
                                  is_training=False, reuse=True, trainable=True, scope=scope_bn)
        z = tf.cond(train_phase, lambda: bn_train, lambda: bn_inference)
        return z

    def partial_fit(self, data):  # fit a batch
        feed_dict = {self.user: data[0], self.item: data[1], self.friend: data[2], self.sim: data[3], self.y_true: data[4]}
        feed_dict.update({self.dropout_keep: self.keep_prob, self.train_phase: True, self.lambda_: self.social_lambda})
        loss, opt = self.sess.run((self.loss, self.optimizer), feed_dict=feed_dict)
        return loss


    def train(self, train_batch, test_batch, Train_data, Test_data):  # fit a dataset
        # Check init performance
        if self.verbose > 0:
            t2 = time()
            init_train = self.evaluate(Train_data)
            init_test = self.evaluate(Test_data)
            print("Init: \t train=%.4f, test=%.4f [%.1f s]" % (init_train, init_test, time() - t2))

        for epoch in range(self.epoch):
            t1 = time()
            for i in range(total_batch):  # total_batch is defined at module level in __main__
                # Fit training
                train_data = next(train_batch)
                self.partial_fit(train_data)
            t2 = time()

            # output validation
            train_result = self.evaluate(Train_data)
            test_result = self.evaluate(Test_data)

            self.train_rmse.append(train_result)
            # self.valid_rmse.append(valid_result)
            self.test_rmse.append(test_result)
            if self.verbose > 0 and epoch % self.verbose == 0:
                print("Epoch %d [%.1f s]\t train=%.4f, test=%.4f [%.1f s]"
                      % (epoch + 1, t2 - t1, train_result, test_result, time() - t2))

    def evaluate(self, data):  # evaluate the results for an input set
        num_example = len(data[-1])
        feed_dict = {self.user: data[0], self.item: data[1], self.friend: data[2], self.sim: data[3], self.y_true: data[4]}
        # evaluation runs in inference mode: no dropout, batch norm uses its moving averages
        feed_dict.update({self.dropout_keep: self.no_dropout, self.train_phase: False, self.lambda_: self.social_lambda})
        predictions = self.sess.run(self.out, feed_dict=feed_dict)
        y_pred = np.reshape(predictions, (num_example,))
        y_true = np.reshape(data[-1], (num_example,))
        predictions_bounded = np.maximum(y_pred, np.ones(num_example) * min(y_true))  # bound the lower values
        predictions_bounded = np.minimum(predictions_bounded, np.ones(num_example) * max(y_true))  # bound the higher values
        RMSE = math.sqrt(mean_squared_error(y_true, predictions_bounded))
        return RMSE

if __name__ == '__main__':
    # Data loading
    LD = LoadData(0.8)  # 80% train
    batch_size = 32
    train_batch = LD.get_batches(LD.train_user, LD.train_item, LD.train_rating, batch_size=batch_size)
    test_batch = LD.get_batches(LD.test_user, LD.test_item, LD.test_rating, batch_size=batch_size)

    Train_data = [LD.train_user, LD.train_item, LD.train_friends, LD.train_sim, LD.train_rating]
    Train_data = [np.reshape(x, [-1, 1]) for x in Train_data]

    Test_data = [LD.test_user, LD.test_item, LD.test_friends, LD.test_sim, LD.test_rating]
    Test_data = [np.reshape(x, [-1, 1]) for x in Test_data]

    total_batch = int(len(LD.train_user) / batch_size)
    args = parse_args()

    # if args.verbose > 0:
    #     print("SRDF: hidden_factor=%d, layers=%s, epoch=%d, lr=%.4f, lambda=%.4f, dropout_keep=%s, optimizer=%s, batch_norm=%d, activation=%s"
    #           % (args.hidden_factor, args.layers, args.epoch, args.lr, args.social_lambda, args.keep_prob, args.optimizer, args.batch_norm, args.activation))

    # # Training with the command-line arguments:
    # model = SRDF(args.hidden_factor, eval(args.layers), args.epoch, args.lr, args.social_lambda, eval(args.keep_prob), args.optimizer, args.batch_norm, activation_function, args.verbose)
    activation_function = tf.nn.relu
    model = SRDF(hidden_factor=10, layers=[128, 64], epoch=12, learning_rate=0.0001,
                 social_lambda=0.5, keep_prob=[1, 1],
                 optimizer_type='GradientDescentOptimizer', batch_norm=True,
                 activation_function=activation_function, verbose=1)
    model.train(train_batch, test_batch, Train_data, Test_data)
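
For reference, LoadData reads two whitespace-separated files from ./datasets/ (their diffs are not shown above). A minimal sketch of the assumed layout, inferred from the np.loadtxt calls and the division of ratings by 5:

# ./datasets/ratings.txt: user_id item_id rating (assumed 5-point scale)
1 102 4.0
1 87 2.5
# ./datasets/trust.txt: user_id friend_id (one directed trust edge per line)
1 7
2 7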
Binary file added __pycache__/LoadData.cpython-35.pyc
Binary file not shown.
