Commit
Merge pull request #14 from etlundquist/instacart
Additions and Bug Fixes Working with Instacart Data
Eric Lundquist committed May 29, 2020
2 parents cf0d93e + 1bd28a7 commit 6553cca
Showing 4 changed files with 135 additions and 69 deletions.
rankfm/evaluation.py (14 changes: 7 additions & 7 deletions)
@@ -4,7 +4,7 @@

import numpy as np
import pandas as pd

from rankfm.utils import get_data

def hit_rate(model, test_interactions, k=10, filter_previous=False):
"""evaluate hit-rate (any match) wrt out-of-sample observed interactions
@@ -20,7 +20,7 @@ def hit_rate(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -47,7 +47,7 @@ def reciprocal_rank(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -75,7 +75,7 @@ def discounted_cumulative_gain(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -103,7 +103,7 @@ def precision(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -130,7 +130,7 @@ def recall(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# transform interactions into a user -> items dictionary
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
test_users = list(test_user_items.keys())

@@ -157,7 +157,7 @@ def diversity(model, test_interactions, k=10, filter_previous=False):
assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

# get the unique set of test users
test_user_items = pd.DataFrame(test_interactions, columns=['user_id', 'item_id'])
test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
test_users = test_user_items['user_id'].unique()

# generate topK recommendations for all test users also present in the training data
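Each evaluation helper in this file follows the same pattern: build a user -> held-out-items dictionary from the test interactions (now via get_data, so either DataFrames or raw arrays are accepted), generate top-k recommendations for the test users seen during training, and score the overlap. As an illustrative sketch only (not part of this commit), typical usage might look like the following; the RankFM import path, constructor argument, and fit() arguments are assumptions about the package's public API, and the toy data is hypothetical.

import pandas as pd
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate, precision, recall

# hypothetical [user_id, item_id] interaction data
train = pd.DataFrame({'user_id': [1, 1, 2, 2, 3], 'item_id': [10, 11, 10, 12, 11]})
test = pd.DataFrame({'user_id': [1, 2, 3], 'item_id': [12, 11, 10]})

model = RankFM(factors=10)
model.fit(train, epochs=20, verbose=False)

print(hit_rate(model, test, k=10, filter_previous=True))  # share of test users with any top-k hit
print(precision(model, test, k=10))                       # mean precision@k over test users
print(recall(model, test, k=10))                          # mean recall@k over test users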
rankfm/numba_methods.py (115 changes: 69 additions & 46 deletions)
@@ -46,10 +46,37 @@ def isin_2(item, items):


@nb.njit
def _fit(interactions, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
def assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if):
"""assert all model weights are finite"""

assert np.isfinite(np.sum(w_i)), "item weights [w_i] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(w_if)), "item feature weights [w_if] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_u)), "user factors [v_u] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_i)), "item factors [v_i] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_uf)), "user-feature factors [v_uf] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_if)), "item-feature factors [v_if] are not finite - try decreasing feature/sample_weight magnitudes"


@nb.njit
def reg_penalty(regularization, w_i, w_if, v_u, v_i, v_uf, v_if):
"""calculate the total regularization penalty for all model weights"""

penalty = 0.0
penalty += np.sum(regularization * np.square(w_i))
penalty += np.sum(regularization * np.square(w_if))
penalty += np.sum(regularization * np.square(v_u))
penalty += np.sum(regularization * np.square(v_i))
penalty += np.sum(regularization * np.square(v_uf))
penalty += np.sum(regularization * np.square(v_if))
return penalty
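For reference, reg_penalty above is the standard L2 penalty over every weight array, assuming (per the _fit docstring) a single scalar regularization strength λ:

    penalty = λ · ( ‖w_i‖² + ‖w_if‖² + ‖v_u‖² + ‖v_i‖² + ‖v_uf‖² + ‖v_if‖² )

where each squared norm is the elementwise sum of squares; this quantity is subtracted from the epoch log-likelihood before it is reported in verbose mode.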


@nb.njit
def _fit(interactions, sample_weight, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
"""private JIT model-fitting function
:param interactions: np.array[int32] of observed [user_idx, item_idx] interactions
:param sample_weight: vector of importance weights for each observed interaction
:param user_items: typed dict [int32 -> int32[:]] mapping user_idx to set of observed item_idx
:param item_idx: np.array[int32] of unique item_idx values found in interactions data
:param regularization: L2 regularization penalty
@@ -62,99 +89,95 @@ def _fit(interactions, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
"""

# define matrix dimension shapes
P = x_uf.shape[1]
Q = x_if.shape[1]
F = v_i.shape[1]
I = len(item_idx)

# create a shuffle index to diversify each training epoch
n_interaction = len(interactions)
shuffle_index = np.arange(n_interaction)

for epoch in range(epochs):

# set the new learning rate (eta) for this epoch
if learning_schedule == 'constant':
eta = learning_rate
elif learning_schedule == 'invscaling':
eta = learning_rate / (epoch + 1)**learning_exponent
else:
raise ValueError('unknown [learning_schedule]')

# randomly re-shuffle the observed interactions to diversify training epochs
shuffle_index = np.arange(len(interactions))
np.random.shuffle(shuffle_index)
interactions = interactions[shuffle_index]
sample_weight = sample_weight[shuffle_index]
log_likelihood = 0.0

for row in range(len(interactions)):
for row in range(n_interaction):

# locate the user (u) and observed item (i)
# locate the user (u), observed item (i), and sample weight (sw)
u = interactions[row, 0]
i = interactions[row, 1]
sw = sample_weight[row]

# randomly sample an unobserved item (j) for the user
while True:
j = int(I * random.random())
if not isin_1(j, user_items[u]):
break

# calculate the pairwise utility score for the (u, i, j) triplet

pu_i = w_i[i] - w_i[j]
pu_if = np.dot(x_if[i] - x_if[j], w_if)
pu_u_i = np.dot(v_i[i] - v_i[j], v_u[u])
pu_u_if = np.dot(x_if[i] - x_if[j], np.dot(v_if, v_u[u]))
pu_i_uf = np.dot(x_uf[u], np.dot(v_uf, v_i[i] - v_i[j]))
pu_uf_if = np.dot(np.dot(v_uf.T, x_uf[u]), np.dot(v_if.T, x_if[i] - x_if[j]))

# calculate the pairwise utility score for the (u, i, j) triplet and its associated log-likelihood
pairwise_utility = pu_i + pu_if + pu_u_i + pu_u_if + pu_i_uf + pu_uf_if
log_likelihood += np.log(1 / (1 + np.exp(-pairwise_utility)))

# calculate derivatives of the model penalized log-likelihood function
# calculate derivatives of the penalized log-likelihood function wrt all model weights
# NOTE: sample weights are applied like frequency weights: gradient updates are scaled up/down as if there were W identical (u, i, j) pairs

# calculate the outer-derivative [d_LL/d_g(pu)] and regularization derivative [d_LL/d_norm(theta)]
d_con = 1.0 / (np.exp(pairwise_utility) + 1.0)
d_reg = 2.0 * regularization

# calculate the [item] and [user/item factor] derivatives
d_w_i = 1.0
d_w_j = -1.0
d_w_if = x_if[i] - x_if[j]

d_v_u = v_i[i] - v_i[j] + np.dot(v_if.T, x_if[i] - x_if[j])
d_v_i = v_u[u] + np.dot(v_uf.T, x_uf[u])
d_v_j = -v_u[u] - np.dot(v_uf.T, x_uf[u])

d_v_uf = np.empty((P, F), np.float32)
d_v_if = np.empty((Q, F), np.float32)

for f in range(F):
for p in range(P):
if (x_uf[u][p]) == 0.0:
d_v_uf[p, f] = 0.0
else:
d_v_uf[p, f] = (x_uf[u][p]) * (v_i[i][f] - v_i[j][f] + np.dot(v_if.T[f], x_if[i] - x_if[j]))
for q in range(Q):
if (x_if[i][q] - x_if[j][q]) == 0.0:
d_v_if[q, f] = 0.0
else:
d_v_if[q, f] = (x_if[i][q] - x_if[j][q]) * (v_u[u][f] + np.dot(v_uf.T[f], x_uf[u]))

# update model weights for this (u, i, j) triplet with a gradient step
w_i[i] += eta * ((d_con * d_w_i) - (d_reg * w_i[i]))
w_i[j] += eta * ((d_con * d_w_j) - (d_reg * w_i[j]))
w_if += eta * ((d_con * d_w_if) - (d_reg * w_if))
v_u[u] += eta * ((d_con * d_v_u) - (d_reg * v_u[u]))
v_i[i] += eta * ((d_con * d_v_i) - (d_reg * v_i[i]))
v_i[j] += eta * ((d_con * d_v_j) - (d_reg * v_i[j]))
v_uf += eta * ((d_con * d_v_uf) - (d_reg * v_uf))
v_if += eta * ((d_con * d_v_if) - (d_reg * v_if))

# calculate the cumulative penalized log-likelihood for this training epoch
penalty = 0.0
penalty += np.sum(regularization * np.square(w_i))
penalty += np.sum(regularization * np.square(w_if))
penalty += np.sum(regularization * np.square(v_u))
penalty += np.sum(regularization * np.square(v_i))
penalty += np.sum(regularization * np.square(v_uf))
penalty += np.sum(regularization * np.square(v_if))
# update the [item] and [user/item factor] weights with a gradient step
w_i[i] += eta * (sw * (d_con * d_w_i) - (d_reg * w_i[i]))
w_i[j] += eta * (sw * (d_con * d_w_j) - (d_reg * w_i[j]))
v_u[u] += eta * (sw * (d_con * d_v_u) - (d_reg * v_u[u]))
v_i[i] += eta * (sw * (d_con * d_v_i) - (d_reg * v_i[i]))
v_i[j] += eta * (sw * (d_con * d_v_j) - (d_reg * v_i[j]))

# get the non-zero indices of user/item features for this (u, i, j) triplet
x_uf_nz = np.nonzero(x_uf[u])[0]
x_if_nz = np.nonzero(x_if[i] - x_if[j])[0]

# update [user-feature-factor] weights for the non-zero user features
for p in x_uf_nz:
for f in range(F):
d_v_uf = (x_uf[u][p]) * (v_i[i][f] - v_i[j][f] + np.dot(v_if.T[f], x_if[i] - x_if[j]))
v_uf[p, f] += eta * (sw * (d_con * d_v_uf) - (d_reg * v_uf[p, f]))

# update [item-feature] and [item-feature-factor] weights for the non-zero item features
for q in x_if_nz:
d_w_if = x_if[i][q] - x_if[j][q]
w_if[q] += eta * (sw * (d_con * d_w_if) - (d_reg * w_if[q]))
for f in range(F):
d_v_if = (x_if[i][q] - x_if[j][q]) * (v_u[u][f] + np.dot(v_uf.T[f], x_uf[u]))
v_if[q, f] += eta * (sw * (d_con * d_v_if) - (d_reg * v_if[q, f]))

# assert all model weights are finite as of the end of this epoch
assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if)

if verbose:
penalty = reg_penalty(regularization, w_i, w_if, v_u, v_i, v_uf, v_if)
log_likelihood = round(log_likelihood - penalty, 2)
print("\ntraining epoch:", epoch)
print("log likelihood:", log_likelihood)
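Since the headline change of this commit is threading a per-interaction sample_weight vector through the JIT training routine, a typical Instacart-style call might look like the sketch below. This is illustrative only: it assumes the public RankFM.fit() accepts and forwards a sample_weight argument to _fit(), and the file name, column names, and constructor argument are hypothetical.

import pandas as pd
from rankfm.rankfm import RankFM

# hypothetical prior-orders extract: one row per (user, order, product)
orders = pd.read_csv('order_products_prior.csv')

# weight each (user, product) pair by how many times the product was ordered,
# so repeat purchases act like multiple identical training examples
interactions = (
    orders.groupby(['user_id', 'product_id'])
          .size()
          .rename('n_orders')
          .reset_index()
)

model = RankFM(factors=20)
model.fit(
    interactions[['user_id', 'product_id']],
    sample_weight=interactions['n_orders'].values,
    epochs=30,
    verbose=True
)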

