Commit
Merge pull request #14 from etlundquist/instacart
Additions and Bug Fixes Working with Instacart Data
Eric Lundquist committed May 29, 2020
2 parents cf0d93e + 1bd28a7 commit 6553cca
Showing 4 changed files with 135 additions and 69 deletions.
rankfm/evaluation.py (14 changes: 7 additions & 7 deletions)
@@ -4,7 +4,7 @@

import numpy as np
import pandas as pd

from rankfm.utils import get_data

def hit_rate(model, test_interactions, k=10, filter_previous=False):
"""evaluate hit-rate (any match) wrt out-of-sample observed interactions
@@ -20,7 +20,7 @@ def hit_rate(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -47,7 +47,7 @@ def reciprocal_rank(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -75,7 +75,7 @@ def discounted_cumulative_gain(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -103,7 +103,7 @@ def precision(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -130,7 +130,7 @@ def recall(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -157,7 +157,7 @@ def diversity(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# get the unique set of test users
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_users = test_user_items['user_id'].unique()

# generate topK recommendations for all test users also present in the training data
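Each evaluation helper in this file follows the same pattern: build a user -> held-out-items dictionary from the test interactions (now via get_data, so either DataFrames or raw arrays are accepted), generate top-k recommendations for the test users seen during training, and score the overlap. As an illustrative sketch only (not part of this commit), typical usage might look like the following; the RankFM import path, constructor argument, and fit() arguments are assumptions about the package's public API, and the toy data is hypothetical.

import pandas as pd
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, precision, recall

# hypothetical [user_id, item_id] interaction data
train = pd.DataFrame({'user_id': [1, 1, 2, 2, 3], 'item_id': [10, 11, 10, 12, 11]})
test = pd.DataFrame({'user_id': [1, 2, 3], 'item_id': [12, 11, 10]})

model = RankFM(factors=10)
model.fit(train, epochs=20, verbose=False)

print(hit_rate(model, test, k=10, filter_previous=True))  # share of test users with any top-k hit
print(precision(model, test, k=10))                       # mean precision@k over test users
print(recall(model, test, k=10))                          # mean recall@k over test users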
rankfm/numba_methods.py (115 changes: 69 additions & 46 deletions)
@@ -46,10 +46,37 @@ def isin_2(item, items):


@nb.njit
def _fit(interactions, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
def assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if):
"""assert all model weights are finite"""

assert np.isfinite(np.sum(w_i)), "item weights [w_i] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(w_if)), "item feature weights [w_if] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_u)), "user factors [v_u] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_i)), "item factors [v_i] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_uf)), "user-feature factors [v_uf] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_if)), "item-feature factors [v_if] are not finite - try decreasing feature/sample_weight magnitudes"


@nb.njit
def reg_penalty(regularization, w_i, w_if, v_u, v_i, v_uf, v_if):
"""calculate the total regularization penalty for all model weights"""

penalty = 0.0
penalty += np.sum(regularization * np.square(w_i))
penalty += np.sum(regularization * np.square(w_if))
penalty += np.sum(regularization * np.square(v_u))
penalty += np.sum(regularization * np.square(v_i))
penalty += np.sum(regularization * np.square(v_uf))
penalty += np.sum(regularization * np.square(v_if))
return penalty
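For reference, reg_penalty above is the standard L2 penalty over every weight array, assuming (per the _fit docstring) a single scalar regularization strength λ:

    penalty = λ · ( ‖w_i‖² + ‖w_if‖² + ‖v_u‖² + ‖v_i‖² + ‖v_uf‖² + ‖v_if‖² )

where each squared norm is the elementwise sum of squares; this quantity is subtracted from the epoch log-likelihood before it is reported in verbose mode.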


@nb.njit
def _fit(interactions, sample_weight, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
"""private JIT model-fitting function
:param interactions: np.array[int32] of observed [user_idx, item_idx] interactions
:param sample_weight: vector of importance weights for each observed interaction
:param user_items: typed dict [int32 -> int32[:]] mapping user_idx to set of observed item_idx
:param item_idx: np.array[int32] of unique item_idx values found in interactions data
:param regularization: L2 regularization penalty
@@ -62,99 +89,95 @@ def _fit(interactions, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
"""

# define matrix dimension shapes
P = x_uf.shape[1]
Q = x_if.shape[1]
F = v_i.shape[1]
I = len(item_idx)

# create a shuffle index to diversify each training epoch
n_interaction = len(interactions)
shuffle_index = np.arange(n_interaction)

for epoch in range(epochs):

# set the new learning rate (eta) for this epoch
if learning_schedule == 'constant':
eta = learning_rate
elif learning_schedule == 'invscaling':
eta = learning_rate / (epoch + 1)**learning_exponent
else:
raise ValueError('unknown [learning_schedule]')

# randomly re-shuffle the observed interactions to diversify training epochs
shuffle_index = np.arange(len(interactions))
np.random.shuffle(shuffle_index)
interactions = interactions[shuffle_index]
sample_weight = sample_weight[shuffle_index]
log_likelihood = 0.0

for row in range(len(interactions)):
for row in range(n_interaction):

# locate the user (u) and observed item (i)
# locate the user (u), observed item (i), and sample weight (sw)
u = interactions[row, 0]
i = interactions[row, 1]
sw = sample_weight[row]

# randomly sample an unobserved item (j) for the user
while True:
j = int(I * random.random())
if not isin_1(j, user_items[u]):
break

# calculate the pairwise utility score for the (u, i, j) triplet

pu_i = w_i[i] - w_i[j]
pu_if = np.dot(x_if[i] - x_if[j], w_if)
pu_u_i = np.dot(v_i[i] - v_i[j], v_u[u])
pu_u_if = np.dot(x_if[i] - x_if[j], np.dot(v_if, v_u[u]))
pu_i_uf = np.dot(x_uf[u], np.dot(v_uf, v_i[i] - v_i[j]))
pu_uf_if = np.dot(np.dot(v_uf.T, x_uf[u]), np.dot(v_if.T, x_if[i] - x_if[j]))

# calculate the pairwise utility score for the (u, i, j) triplet and its associated log-likelihood
pairwise_utility = pu_i + pu_if + pu_u_i + pu_u_if + pu_i_uf + pu_uf_if
log_likelihood += np.log(1 / (1 + np.exp(-pairwise_utility)))

# calculate derivatives of the model penalized log-likelihood function
# calculate derivatives of the penalized log-likelihood function wrt all model weights
# NOTE: sample weights are applied like frequency weights: gradient updates are scaled up/down as if there were W identical (u, i, j) pairs

# calculate the outer-derivative [d_LL/d_g(pu)] and regularization derivative [d_LL/d_norm(theta)]
d_con = 1.0 / (np.exp(pairwise_utility) + 1.0)
d_reg = 2.0 * regularization

# calculate the [item] and [user/item factor] derivatives
d_w_i = 1.0
d_w_j = -1.0
d_w_if = x_if[i] - x_if[j]

d_v_u = v_i[i] - v_i[j] + np.dot(v_if.T, x_if[i] - x_if[j])
d_v_i = v_u[u] + np.dot(v_uf.T, x_uf[u])
d_v_j = -v_u[u] - np.dot(v_uf.T, x_uf[u])

d_v_uf = np.empty((P, F), np.float32)
d_v_if = np.empty((Q, F), np.float32)

for f in range(F):
for p in range(P):
if (x_uf[u][p]) == 0.0:
d_v_uf[p, f] = 0.0
else:
d_v_uf[p, f] = (x_uf[u][p]) * (v_i[i][f] - v_i[j][f] + np.dot(v_if.T[f], x_if[i] - x_if[j]))
for q in range(Q):
if (x_if[i][q] - x_if[j][q]) == 0.0:
d_v_if[q, f] = 0.0
else:
d_v_if[q, f] = (x_if[i][q] - x_if[j][q]) * (v_u[u][f] + np.dot(v_uf.T[f], x_uf[u]))

# update model weights for this (u, i, j) triplet with a gradient step
w_i[i] += eta * ((d_con * d_w_i) - (d_reg * w_i[i]))
w_i[j] += eta * ((d_con * d_w_j) - (d_reg * w_i[j]))
w_if += eta * ((d_con * d_w_if) - (d_reg * w_if))
v_u[u] += eta * ((d_con * d_v_u) - (d_reg * v_u[u]))
v_i[i] += eta * ((d_con * d_v_i) - (d_reg * v_i[i]))
v_i[j] += eta * ((d_con * d_v_j) - (d_reg * v_i[j]))
v_uf += eta * ((d_con * d_v_uf) - (d_reg * v_uf))
v_if += eta * ((d_con * d_v_if) - (d_reg * v_if))

# calculate the cumulative penalized log-likelihood for this training epoch
penalty = 0.0
penalty += np.sum(regularization * np.square(w_i))
penalty += np.sum(regularization * np.square(w_if))
penalty += np.sum(regularization * np.square(v_u))
penalty += np.sum(regularization * np.square(v_i))
penalty += np.sum(regularization * np.square(v_uf))
penalty += np.sum(regularization * np.square(v_if))
# update the [item] and [user/item factor] weights with a gradient step
w_i[i] += eta * (sw * (d_con * d_w_i) - (d_reg * w_i[i]))
w_i[j] += eta * (sw * (d_con * d_w_j) - (d_reg * w_i[j]))
v_u[u] += eta * (sw * (d_con * d_v_u) - (d_reg * v_u[u]))
v_i[i] += eta * (sw * (d_con * d_v_i) - (d_reg * v_i[i]))
v_i[j] += eta * (sw * (d_con * d_v_j) - (d_reg * v_i[j]))

# get the non-zero indices of user/item features for this (u, i, j) triplet
x_uf_nz = np.nonzero(x_uf[u])[0]
x_if_nz = np.nonzero(x_if[i] - x_if[j])[0]

# update [user-feature-factor] weights for the non-zero user features
for p in x_uf_nz:
for f in range(F):
d_v_uf = (x_uf[u][p]) * (v_i[i][f] - v_i[j][f] + np.dot(v_if.T[f], x_if[i] - x_if[j]))
v_uf[p, f] += eta * (sw * (d_con * d_v_uf) - (d_reg * v_uf[p, f]))

# update [item-feature] and [item-feature-factor] weights for the non-zero item features
for q in x_if_nz:
d_w_if = x_if[i][q] - x_if[j][q]
w_if[q] += eta * (sw * (d_con * d_w_if) - (d_reg * w_if[q]))
for f in range(F):
d_v_if = (x_if[i][q] - x_if[j][q]) * (v_u[u][f] + np.dot(v_uf.T[f], x_uf[u]))
v_if[q, f] += eta * (sw * (d_con * d_v_if) - (d_reg * v_if[q, f]))

# assert all model weights are finite as of the end of this epoch
assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if)

if verbose:
penalty = reg_penalty(regularization, w_i, w_if, v_u, v_i, v_uf, v_if)
log_likelihood = round(log_likelihood - penalty, 2)
print("\ntraining epoch:", epoch)
print("log likelihood:", log_likelihood)
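Since the headline change of this commit is threading a per-interaction sample_weight vector through the JIT training routine, a typical Instacart-style call might look like the sketch below. This is illustrative only: it assumes the public RankFM.fit() accepts and forwards a sample_weight argument to _fit(), and the file name, column names, and constructor argument are hypothetical.

import pandas as pd
from rankfm.rankfm import RankFM

# hypothetical prior-orders extract: one row per (user, order, product)
orders = pd.read_csv('order_products_prior.csv')

# weight each (user, product) pair by how many times the product was ordered,
# so repeat purchases act like multiple identical training examples
interactions = (
    orders.groupby(['user_id', 'product_id'])
          .size()
          .rename('n_orders')
          .reset_index()
)

model = RankFM(factors=20)
model.fit(
    interactions[['user_id', 'product_id']],
    sample_weight=interactions['n_orders'].values,
    epochs=30,
    verbose=True
)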

