Skip to content

Commit

Permalink
bonus: on deep model
Browse files Browse the repository at this point in the history
  • Loading branch information
orbxball committed Jun 6, 2017
1 parent 3cb1869 commit 3e9deaa
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 6 deletions.
21 changes: 15 additions & 6 deletions hw6/Model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
from keras.layers import Input, Embedding, Reshape, Dense, Dropout, Lambda
from keras.layers import Input, Embedding, Dense, Dropout
from keras.layers import Reshape, Flatten, Lambda
from keras.layers.merge import concatenate, dot, add
from keras.models import Model
from keras import backend as K
Expand Down Expand Up @@ -43,18 +44,26 @@ def build_cf_model(n_users, n_movies, dim, isBest=False):


def build_deep_model(n_users, n_movies, dim, dropout=0.1):
u_input = Input(shape=(1,))
u_input = Input(shape=(4,))
u = Embedding(n_users, dim)(u_input)
u = Reshape((dim,))(u)
# u = Reshape((dim,))(u)
u = Flatten()(u)

m_input = Input(shape=(1,))
m_input = Input(shape=(19,))
m = Embedding(n_movies, dim)(m_input)
m = Reshape((dim,))(m)
# m = Reshape((dim,))(m)
m = Flatten()(m)

out = concatenate([u, m])
out = Dropout(dropout)(out)
out = Dense(dim, activation='relu')(out)
out = Dense(256, activation='relu')(out)
out = Dropout(dropout)(out)
out = Dense(128, activation='relu')(out)
out = Dropout(dropout)(out)
out = Dense(64, activation='relu')(out)
out = Dropout(0.15)(out)
out = Dense(dim, activation='relu')(out)
out = Dropout(0.2)(out)
out = Dense(1, activation='relu')(out)

model = Model(inputs=[u_input, m_input], outputs=out)
Expand Down
95 changes: 95 additions & 0 deletions hw6/test_bonus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os
import sys
import argparse
import numpy as np
import pandas as pd
from Model import build_cf_model, build_deep_model, rate

classes = ["Adventure", "Western", "Comedy", "Thriller", "Horror", "Mystery", "Crime", "Film-Noir", "Sci-Fi", "Fantasy", "Drama", "Musical", "War", "Documentary", "Children's", "Animation", "Action", "Romance"]


def parse_args():
    """Parse the command-line arguments of the prediction script.

    Returns:
        argparse.Namespace: carries two string attributes, ``data_dir``
        and ``output``, filled from the two positional arguments in that
        order.
    """
    cli = argparse.ArgumentParser(description='HW6: Matrix Factorization')
    # Both arguments are positional strings, registered in CLI order.
    for positional in ('data_dir', 'output'):
        cli.add_argument(positional, type=str)
    namespace = cli.parse_args()
    return namespace

def make_users(row, matrix):
    """Store one user's feature list in ``matrix`` and hand the row back.

    Args:
        row: Record supporting ``row[key]`` for the keys 'UserID',
            'Gender', 'Age' and 'Occupation'.
        matrix: Mutable mapping updated in place. The entry keyed by the
            row's 'UserID' is set to ``[UserID, Gender, Age, Occupation]``.

    Returns:
        The same ``row`` object that was passed in.
    """
    feature_names = ('UserID', 'Gender', 'Age', 'Occupation')
    features = [row[name] for name in feature_names]
    matrix[row['UserID']] = features
    return row

def categorize_movie(row, matrix, idx_map):
    """Record a movie's ID followed by a 0/1 genre vector in ``matrix``.

    Args:
        row: Record supporting ``row['Genres']`` (a '|'-separated string)
            and ``row['movieID']``.
        matrix: Mutable mapping updated in place, keyed by the row's
            'movieID'.
        idx_map: Mapping from a genre name to its slot in the vector.

    The vector has one slot per entry of the module-level ``classes``
    list; slots of the genres named in the row are set to 1, all others
    stay 0. Nothing is returned.
    """
    genre_flags = [0] * len(classes)
    genre_names = row['Genres'].split('|')
    for genre in genre_names:
        slot = idx_map[genre]
        genre_flags[slot] = 1
    # Stored layout: the movie ID first, then the genre flags.
    entry = [row['movieID']]
    entry.extend(genre_flags)
    matrix[row['movieID']] = entry

def predict_rating(trained_model, userid, movieid):
    """Return what ``rate`` produces for one (user, movie) pair.

    Thin wrapper: the three arguments are forwarded positionally, in the
    same order, to ``rate`` (imported from the project's ``Model``
    module) and its result is returned unchanged.
    """
    prediction = rate(trained_model, userid, movieid)
    return prediction

def ensure_dir(file_path):
    """Make sure the parent directory of ``file_path`` exists.

    Args:
        file_path: Path of a file that is about to be written. Only its
            directory component is used; the file itself is not created.

    Returns:
        None. When ``file_path`` has no directory component (a bare file
        name) nothing is done.

    Raises:
        FileExistsError: if the directory path already exists but is not
            a directory.
    """
    directory = os.path.dirname(file_path)
    if not directory:
        return
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and calling makedirs() afterwards.
    os.makedirs(directory, exist_ok=True)


def main(args):
    """Load side information, restore the trained model and write predictions.

    Besides ``args`` this function reads module-level names assigned in
    the ``__main__`` section of this script: USERS_CSV, MOVIES_CSV,
    TEST_CSV, MODEL_WEIGHTS_FILE, DIM, max_userid and max_movieid.

    Steps:
        1. Read the users file, shift 'UserID' down by one, encode
           'Gender' ('F' -> 0, 'M' -> 1) and build ``users_mx`` via
           ``make_users``.
        2. Read the movies file, shift 'movieID' down by one and build
           ``movies_mx`` via ``categorize_movie``.
        3. Build the deep model and load its weights.
        4. Call ``predict_rating`` for every test row and write the
           'TestDataID' / 'Rating' columns to ``args.output``.

    Args:
        args: Parsed command-line namespace; only ``args.output`` is
            used here.
    """
    users = pd.read_csv(USERS_CSV, sep='::', engine='python',
                        usecols=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
    users['UserID'] -= 1
    # Assign through a single .loc call: the chained form
    # users['Gender'][mask] = value may write to a temporary copy and
    # leave the frame untouched.
    users.loc[users['Gender'] == 'F', 'Gender'] = 0
    users.loc[users['Gender'] == 'M', 'Gender'] = 1
    users_mx = {}
    users.apply(lambda x: make_users(x, users_mx), axis=1)
    print('{} description of {} users loaded'.format(len(users), max_userid))

    movies = pd.read_csv(MOVIES_CSV, sep='::', engine='python',
                         usecols=['movieID', 'Title', 'Genres'])
    movies['movieID'] -= 1
    movies_mx = {}
    classes_idx = {c: i for i, c in enumerate(classes)}
    movies.apply(lambda x: categorize_movie(x, movies_mx, classes_idx), axis=1)
    print('{} descriptions of {} movies loaded'.format(len(movies), max_movieid))

    # Read the test file once; the ID column is reused for the output frame.
    test_data = pd.read_csv(TEST_CSV, usecols=['TestDataID', 'UserID', 'MovieID'])
    print('{} testing data loaded.'.format(test_data.shape[0]))

    trained_model = build_deep_model(max_userid, max_movieid, DIM)
    print('Loading model weights...')
    trained_model.load_weights(MODEL_WEIGHTS_FILE)
    print('Loading model done!!!')

    recommendations = test_data[['TestDataID']].copy()
    # IDs in the test file are shifted down by one to match the keys of
    # users_mx / movies_mx, which were shifted the same way above.
    recommendations['Rating'] = test_data.apply(
        lambda x: predict_rating(trained_model,
                                 users_mx[x['UserID'] - 1],
                                 movies_mx[x['MovieID'] - 1]),
        axis=1)

    ensure_dir(args.output)
    recommendations.to_csv(args.output, index=False, columns=['TestDataID', 'Rating'])


if __name__ == '__main__':
    # Script entry point. Every setting below is bound as a module-level
    # name because main() reads them as globals.
    args = parse_args()

    MODEL_DIR = './model'
    DATA_DIR = args.data_dir

    # Input files live under the data directory given on the command line.
    TEST_CSV = os.path.join(DATA_DIR, 'test.csv')
    USERS_CSV = os.path.join(DATA_DIR, 'users.csv')
    MOVIES_CSV = os.path.join(DATA_DIR, 'movies.csv')

    # Model artefacts live under MODEL_DIR.
    MODEL_WEIGHTS_FILE = os.path.join(MODEL_DIR, 'weights_bonus.h5')
    MAX_CSV = os.path.join(MODEL_DIR, 'max_bonus.csv')

    # The first row of the max-info CSV supplies the model dimensions.
    info = pd.read_csv(MAX_CSV)
    DIM, max_userid, max_movieid = (
        info[column].tolist()[0]
        for column in ('dim', 'max_userid', 'max_movieid')
    )

    main(args)
102 changes: 102 additions & 0 deletions hw6/train_bonus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import os
import sys
import argparse
import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from Model import build_cf_model, build_deep_model, rate

classes = ["Adventure", "Western", "Comedy", "Thriller", "Horror", "Mystery", "Crime", "Film-Noir", "Sci-Fi", "Fantasy", "Drama", "Musical", "War", "Documentary", "Children's", "Animation", "Action", "Romance"]


def parse_args():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace: carries the string attributes ``train``,
        ``users``, ``movies`` and ``test`` (positional, in that order)
        and the integer attribute ``dim`` (option ``--dim``, default 15).
    """
    cli = argparse.ArgumentParser(description='HW6: Matrix Factorization')
    # The four positional arguments share the same declaration.
    for positional in ('train', 'users', 'movies', 'test'):
        cli.add_argument(positional, type=str)
    cli.add_argument('--dim', type=int, default=15)
    namespace = cli.parse_args()
    return namespace

def make_users(row, matrix):
    """Store one user's feature list in ``matrix`` and hand the row back.

    Args:
        row: Record supporting ``row[key]`` for the keys 'UserID',
            'Gender', 'Age' and 'Occupation'.
        matrix: Mutable mapping updated in place. The entry keyed by the
            row's 'UserID' is set to ``[UserID, Gender, Age, Occupation]``.

    Returns:
        The same ``row`` object that was passed in.
    """
    feature_names = ('UserID', 'Gender', 'Age', 'Occupation')
    features = [row[name] for name in feature_names]
    matrix[row['UserID']] = features
    return row

def categorize_movie(row, matrix, idx_map):
    """Record a movie's ID followed by a 0/1 genre vector in ``matrix``.

    Args:
        row: Record supporting ``row['Genres']`` (a '|'-separated string)
            and ``row['movieID']``.
        matrix: Mutable mapping updated in place, keyed by the row's
            'movieID'.
        idx_map: Mapping from a genre name to its slot in the vector.

    The vector has one slot per entry of the module-level ``classes``
    list; slots of the genres named in the row are set to 1, all others
    stay 0. Nothing is returned.
    """
    genre_flags = [0] * len(classes)
    genre_names = row['Genres'].split('|')
    for genre in genre_names:
        slot = idx_map[genre]
        genre_flags[slot] = 1
    # Stored layout: the movie ID first, then the genre flags.
    entry = [row['movieID']]
    entry.extend(genre_flags)
    matrix[row['movieID']] = entry

def rmse(y_true, y_pred):
    """Root-mean-square error with predictions clipped to [1, 5].

    Args:
        y_true: Target values (backend tensor).
        y_pred: Predicted values (backend tensor); clipped to the range
            1..5 before the error is computed.

    Returns:
        Backend tensor: square root of the mean squared difference
        between ``y_true`` and the clipped predictions.
    """
    clipped = K.clip(y_pred, 1., 5.)
    squared_error = K.square((y_true - clipped))
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

def main(args):
    """Train the deep model on the ratings plus user/movie side information.

    Besides ``args`` this function reads module-level names assigned in
    the ``__main__`` section of this script: DIM, MAX_FILE and
    MODEL_WEIGHTS_FILE.

    Steps:
        1. Read the ratings and derive zero-based user / movie IDs.
        2. Read the users file, encode 'Gender' ('F' -> 0, 'M' -> 1) and
           build ``users_mx`` via ``make_users``.
        3. Read the movies file and build ``movies_mx`` via
           ``categorize_movie``.
        4. Save the largest user ID, largest movie ID and DIM to MAX_FILE.
        5. Shuffle the ratings, expand each ID into its feature list and
           fit the model with early stopping and checkpointing.

    Args:
        args: Parsed command-line namespace; ``args.train``,
            ``args.users`` and ``args.movies`` are read. ``args.test`` is
            not used by this function.
    """
    ratings = pd.read_csv(args.train,
                          usecols=['UserID', 'MovieID', 'Rating'])
    # max() already ignores repeats, so no drop_duplicates() is needed.
    max_userid = ratings['UserID'].max()
    max_movieid = ratings['MovieID'].max()
    ratings['User_emb_id'] = ratings['UserID'] - 1
    ratings['Movie_emb_id'] = ratings['MovieID'] - 1
    print('{} ratings loaded.'.format(ratings.shape[0]))

    users = pd.read_csv(args.users, sep='::', engine='python',
                        usecols=['UserID', 'Gender', 'Age', 'Occupation'])
    users['UserID'] -= 1
    # Assign through a single .loc call: the chained form
    # users['Gender'][mask] = value may write to a temporary copy and
    # leave the frame untouched.
    users.loc[users['Gender'] == 'F', 'Gender'] = 0
    users.loc[users['Gender'] == 'M', 'Gender'] = 1
    users_mx = {}
    users.apply(lambda x: make_users(x, users_mx), axis=1)
    print('{} description of {} users loaded'.format(len(users), max_userid))

    movies = pd.read_csv(args.movies, sep='::', engine='python',
                         usecols=['movieID', 'Genres'])
    movies['movieID'] -= 1
    movies_mx = {}
    classes_idx = {c: i for i, c in enumerate(classes)}
    movies.apply(lambda x: categorize_movie(x, movies_mx, classes_idx), axis=1)
    print('{} descriptions of {} movies loaded'.format(len(movies), max_movieid))

    # Persist the sizes the prediction script needs to rebuild the model.
    maximum = {
        'max_userid': [max_userid],
        'max_movieid': [max_movieid],
        'dim': [DIM],
    }
    pd.DataFrame(data=maximum).to_csv(MAX_FILE, index=False)
    print('max info save to {}'.format(MAX_FILE))

    # Shuffle all rows before the arrays are extracted.
    ratings = ratings.sample(frac=1)
    Users = ratings['User_emb_id'].values
    print('Users: {}, shape = {}'.format(Users, Users.shape))
    Movies = ratings['Movie_emb_id'].values
    print('Movies: {}, shape = {}'.format(Movies, Movies.shape))
    Ratings = ratings['Rating'].values
    print('Ratings: {}, shape = {}'.format(Ratings, Ratings.shape))

    # Replace each zero-based ID by the feature list stored for it.
    new_Users = np.array(list(map(users_mx.get, Users)))
    new_Movies = np.array(list(map(movies_mx.get, Movies)))

    model = build_deep_model(max_userid, max_movieid, DIM)
    model.compile(loss='mse', optimizer='adamax', metrics=[rmse])

    callbacks = [EarlyStopping('val_rmse', patience=2),
                 ModelCheckpoint(MODEL_WEIGHTS_FILE, save_best_only=True)]
    model.fit([new_Users, new_Movies], Ratings, epochs=1000, batch_size=256,
              validation_split=.1, verbose=1, callbacks=callbacks)


if __name__ == '__main__':
    # Script entry point. DIM, MODEL_WEIGHTS_FILE and MAX_FILE are bound
    # as module-level names because main() reads them as globals.
    args = parse_args()
    DIM = args.dim

    MODEL_DIR = './model'
    # Create the output directory for the weights and max-info files
    # when it does not exist yet.
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)

    MODEL_WEIGHTS_FILE = os.path.join(MODEL_DIR, 'weights_bonus.h5')
    MAX_FILE = os.path.join(MODEL_DIR, 'max_bonus.csv')

    main(args)

0 comments on commit 3e9deaa

Please sign in to comment.