
Commit

added support for sample weights
Eric Lundquist authored and committed May 29, 2020
1 parent 5144a73 commit 0648a5e
Showing 3 changed files with 77 additions and 37 deletions.
67 changes: 45 additions & 22 deletions rankfm/numba_methods.py
@@ -46,10 +46,37 @@ def isin_2(item, items):


@nb.njit
def _fit(interactions, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
def assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if):
"""assert all model weights are finite"""

assert np.isfinite(np.sum(w_i)), "item weights [w_i] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(w_if)), "item feature weights [w_if] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_u)), "user factors [v_u] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_i)), "item factors [v_i] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_uf)), "user-feature factors [v_uf] are not finite - try decreasing feature/sample_weight magnitudes"
assert np.isfinite(np.sum(v_if)), "item-feature factors [v_if] are not finite - try decreasing feature/sample_weight magnitudes"


@nb.njit
def reg_penalty(regularization, w_i, w_if, v_u, v_i, v_uf, v_if):
"""calculate the total regularization penalty for all model weights"""

penalty = 0.0
penalty += np.sum(regularization * np.square(w_i))
penalty += np.sum(regularization * np.square(w_if))
penalty += np.sum(regularization * np.square(v_u))
penalty += np.sum(regularization * np.square(v_i))
penalty += np.sum(regularization * np.square(v_uf))
penalty += np.sum(regularization * np.square(v_if))
return penalty


@nb.njit
def _fit(interactions, sample_weight, user_items, item_idx, regularization, learning_rate, learning_schedule, learning_exponent, epochs, verbose, x_uf, x_if, w_i, w_if, v_u, v_i, v_uf, v_if):
"""private JIT model-fitting function
:param interactions: np.array[int32] of observed [user_idx, item_idx] interactions
:param sample_weight: vector of importance weights for each observed interaction
:param user_items: typed dict [int32 -> int32[:]] mapping user_idx to set of observed item_idx
:param item_idx: np.array[int32] of unique item_idx values found in interactions data
:param regularization: L2 regularization penalty
@@ -61,13 +88,11 @@ def _fit(interactions, user_items, item_idx, regularization, learning_rate, lear
:return: updated model weights (w_i, w_if, v_u, v_i, v_uf, v_if)
"""

# define matrix dimension shapes
# define matrix dimension shapes and shuffle index
P = x_uf.shape[1]
Q = x_if.shape[1]
F = v_i.shape[1]
I = len(item_idx)

# define shuffle index to randomly permute each epoch
shuffle_index = np.arange(len(interactions))

for epoch in range(epochs):
@@ -85,9 +110,10 @@ def _fit(interactions, user_items, item_idx, regularization, learning_rate, lear

for row in shuffle_index:

# locate the user (u) and observed item (i)
# locate the user (u), observed item (i), and sample weight (sw)
u = interactions[row, 0]
i = interactions[row, 1]
sw = sample_weight[row]

# randomly sample an unobserved item (j) for the user
while True:
@@ -108,6 +134,8 @@ def _fit(interactions, user_items, item_idx, regularization, learning_rate, lear
log_likelihood += np.log(1 / (1 + np.exp(-pairwise_utility)))

# calculate derivatives of the model penalized log-likelihood function
# NOTE: apply the sample weights to d_LL/d_g(pu) to scale the magnitude of the gradient step updates
# NOTE: sample weights are applied like frequency weights: gradient updates are scaled as if there were W (u, i, j) pairs

d_con = 1.0 / (np.exp(pairwise_utility) + 1.0)
d_reg = 2.0 * regularization
@@ -136,25 +164,20 @@ def _fit(interactions, user_items, item_idx, regularization, learning_rate, lear
d_v_if[q, f] = (x_if[i][q] - x_if[j][q]) * (v_u[u][f] + np.dot(v_uf.T[f], x_uf[u]))

# update model weights for this (u, i, j) triplet with a gradient step
w_i[i] += eta * ((d_con * d_w_i) - (d_reg * w_i[i]))
w_i[j] += eta * ((d_con * d_w_j) - (d_reg * w_i[j]))
w_if += eta * ((d_con * d_w_if) - (d_reg * w_if))
v_u[u] += eta * ((d_con * d_v_u) - (d_reg * v_u[u]))
v_i[i] += eta * ((d_con * d_v_i) - (d_reg * v_i[i]))
v_i[j] += eta * ((d_con * d_v_j) - (d_reg * v_i[j]))
v_uf += eta * ((d_con * d_v_uf) - (d_reg * v_uf))
v_if += eta * ((d_con * d_v_if) - (d_reg * v_if))

# calculate the cumulative penalized log-likelihood for this training epoch
penalty = 0.0
penalty += np.sum(regularization * np.square(w_i))
penalty += np.sum(regularization * np.square(w_if))
penalty += np.sum(regularization * np.square(v_u))
penalty += np.sum(regularization * np.square(v_i))
penalty += np.sum(regularization * np.square(v_uf))
penalty += np.sum(regularization * np.square(v_if))
w_i[i] += eta * (sw * (d_con * d_w_i) - (d_reg * w_i[i]))
w_i[j] += eta * (sw * (d_con * d_w_j) - (d_reg * w_i[j]))
w_if += eta * (sw * (d_con * d_w_if) - (d_reg * w_if))
v_u[u] += eta * (sw * (d_con * d_v_u) - (d_reg * v_u[u]))
v_i[i] += eta * (sw * (d_con * d_v_i) - (d_reg * v_i[i]))
v_i[j] += eta * (sw * (d_con * d_v_j) - (d_reg * v_i[j]))
v_uf += eta * (sw * (d_con * d_v_uf) - (d_reg * v_uf))
v_if += eta * (sw * (d_con * d_v_if) - (d_reg * v_if))

# assert all model weights are finite as of the end of this epoch
assert_finite(w_i, w_if, v_u, v_i, v_uf, v_if)

if verbose:
penalty = reg_penalty(regularization, w_i, w_if, v_u, v_i, v_uf, v_if)
log_likelihood = round(log_likelihood - penalty, 2)
print("\ntraining epoch:", epoch)
print("log likelihood:", log_likelihood)
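
As the NOTE comments in the gradient section above describe, the sample weight acts like a frequency weight: it scales the likelihood part of each gradient step but not the L2 regularization term. A minimal NumPy sketch of one weighted update for a single item weight is shown below; the numeric values are hypothetical and the snippet is illustrative only, not part of the committed code.

import numpy as np

# hypothetical values for a single (u, i, j) training triplet
eta = 0.1                  # current learning rate
regularization = 0.01      # L2 penalty strength
sw = 3.0                   # sample weight of the observed (u, i) interaction
pairwise_utility = 0.5     # f(u, i) - f(u, j)
w_i_i = 0.2                # current first-order weight of the observed item i
d_w_i = 1.0                # derivative of the pairwise utility w.r.t. w_i[i]

# derivative of the log-sigmoid likelihood w.r.t. the pairwise utility
d_con = 1.0 / (np.exp(pairwise_utility) + 1.0)
d_reg = 2.0 * regularization

# weighted gradient step: sw multiplies only the likelihood gradient,
# as if the (u, i, j) pair had been observed sw times
w_i_i += eta * (sw * (d_con * d_w_i) - (d_reg * w_i_i))
print(round(w_i_i, 6))
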
43 changes: 30 additions & 13 deletions rankfm/rankfm.py
@@ -68,8 +68,11 @@ def _reset_state(self):
self.user_to_index = None
self.item_to_index = None

# user/item interactions
# user/item interactions and sample importance weights
self.interactions = None
self.sample_weight = None

# dictionary user observed items lookups
self.user_items_py = None
self.user_items_nb = None

@@ -91,12 +94,13 @@ def _reset_state(self):
self.is_fit = False


def _init_all(self, interactions, user_features=None, item_features=None):
def _init_all(self, interactions, user_features=None, item_features=None, sample_weight=None):
"""index the raw interaction and user/item features data to numpy arrays
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param sample_weight: vector of importance weights for each observed interaction
:return: None
"""

@@ -122,7 +126,7 @@ def _init_all(self, interactions, user_features=None, item_features=None):
self.item_idx = np.arange(len(self.item_id), dtype=np.int32)

# map the interactions to internal index positions
self._init_interactions(interactions)
self._init_interactions(interactions, sample_weight)

# map the user/item features to internal index positions
self._init_features(user_features, item_features)
@@ -131,10 +135,11 @@ def _init_all(self, interactions, user_features=None, item_features=None):
self._init_weights(user_features, item_features)


def _init_interactions(self, interactions):
def _init_interactions(self, interactions, sample_weight):
"""map new interaction data to existing internal user/item indexes
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param sample_weight: vector of importance weights for each observed interaction
:return: None
"""

@@ -150,6 +155,15 @@ def _init_interactions(self, interactions):
self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index).astype(np.int32)
self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna().astype(np.int32)

# store the sample weights internally or create a vector of ones if not passed
if sample_weight is not None:
assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.Series"
assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
assert len(sample_weight) == len(interactions), "[sample_weight] must have the same length as [interactions]"
self.sample_weight = get_data(sample_weight).astype(np.float32)
else:
self.sample_weight = np.ones(len(self.interactions), dtype=np.float32, order='C')

# create python/numba lookup dictionaries containing the set of observed items for each user
# NOTE: the typed numba dictionary will be used to sample unobserved items during training
# NOTE: the interactions data must be converted to np.ndarray prior to training to use @njit
@@ -180,7 +194,7 @@ def _init_features(self, user_features=None, item_features=None):
else:
raise KeyError('the users in [user_features] do not match the users in [interactions]')
else:
self.x_uf = np.zeros([len(self.user_idx), 1]).astype(np.float32)
self.x_uf = np.zeros([len(self.user_idx), 1], dtype=np.float32, order='C')

# store the item features as a ndarray [IxQ] row-ordered by item index position
if item_features is not None:
@@ -192,7 +206,7 @@ def _init_features(self, user_features=None, item_features=None):
else:
raise KeyError('the items in [item_features] do not match the items in [interactions]')
else:
self.x_if = np.zeros([len(self.item_idx), 1]).astype(np.float32)
self.x_if = np.zeros([len(self.item_idx), 1], dtype=np.float32, order='C')


def _init_weights(self, user_features, item_features):
@@ -211,14 +225,14 @@ def _init_weights(self, user_features, item_features):
if user_features is not None:
self.v_uf = np.random.normal(loc=0, scale=self.sigma, size=[self.x_uf.shape[1], self.factors]).astype(np.float32)
else:
self.v_uf = np.zeros([self.x_uf.shape[1], self.factors]).astype(np.float32)
self.v_uf = np.zeros([self.x_uf.shape[1], self.factors], dtype=np.float32, order='C')

# randomly initialize item feature factors if item features were supplied
# NOTE: set all item feature factor weights to zero to prevent random scoring influence otherwise
if item_features is not None:
self.v_if = np.random.normal(loc=0, scale=self.sigma, size=[self.x_if.shape[1], self.factors]).astype(np.float32)
else:
self.v_if = np.zeros([self.x_if.shape[1], self.factors]).astype(np.float32)
self.v_if = np.zeros([self.x_if.shape[1], self.factors], dtype=np.float32, order='C')



@@ -227,40 +241,43 @@ def _init_weights(self, user_features, item_features):
# -------------------------------


def fit(self, interactions, user_features=None, item_features=None, epochs=1, verbose=False):
def fit(self, interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False):
"""clear previous model state and learn new model weights using the input data
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param sample_weight: vector of importance weights for each observed interaction
:param epochs: number of training epochs (full passes through observed interactions)
:param verbose: whether to print epoch number and log-likelihood during training
:return: self
"""

self._reset_state()
self.fit_partial(interactions, user_features, item_features, epochs, verbose)
self.fit_partial(interactions, user_features, item_features, sample_weight, epochs, verbose)


def fit_partial(self, interactions, user_features=None, item_features=None, epochs=1, verbose=False):
def fit_partial(self, interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False):
"""learn or update model weights using the input data and resuming from the current model state
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param sample_weight: vector of importance weights for each observed interaction
:param epochs: number of training epochs (full passes through observed interactions)
:param verbose: whether to print epoch number and log-likelihood during training
:return: self
"""

if self.is_fit:
self._init_interactions(interactions)
self._init_interactions(interactions, sample_weight)
self._init_features(user_features, item_features)
else:
self._init_all(interactions, user_features, item_features)
self._init_all(interactions, user_features, item_features, sample_weight)

updated_weights = _fit(
self.interactions,
self.sample_weight,
self.user_items_nb,
self.item_idx,
self.regularization,
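
A short usage sketch of the new sample_weight argument, assuming the standard RankFM constructor; the example data, weight values, and factors setting are illustrative and not taken from the commit.

import numpy as np
import pandas as pd
from rankfm.rankfm import RankFM

# illustrative interactions: repeat purchases get larger importance weights
interactions = pd.DataFrame({
    'user_id': [1, 1, 2, 3],
    'item_id': ['a', 'b', 'a', 'c'],
})
sample_weight = np.array([1.0, 3.0, 1.0, 2.0], dtype=np.float32)

# the weights may also be passed as a pd.Series of the same length as interactions
model = RankFM(factors=10)
model.fit(interactions, sample_weight=sample_weight, epochs=20, verbose=True)
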
4 changes: 2 additions & 2 deletions rankfm/utils.py
@@ -9,10 +9,10 @@ def get_data(obj):
:return: the object's underlying np.ndarray data
"""

if obj.__class__.__name__ == 'DataFrame':
if obj.__class__.__name__ in ('DataFrame', 'Series'):
data = obj.values
elif obj.__class__.__name__ == 'ndarray':
data = obj
else:
raise TypeError("input data must be in either pd.dataframe or np.ndarray format")
raise TypeError("input data must be in either pd.dataframe/pd.series or np.ndarray format")
return data
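
Because get_data now also unwraps a pandas Series, sample_weight can be supplied either as a Series or as a plain ndarray. A quick illustrative sketch of the helper's behavior:

import numpy as np
import pandas as pd
from rankfm.utils import get_data

weights_series = pd.Series([1.0, 3.0, 0.5])
weights_array = np.array([1.0, 3.0, 0.5])

print(type(get_data(weights_series)))   # the Series' underlying np.ndarray (.values)
print(type(get_data(weights_array)))    # the ndarray is returned unchanged
# any other input type (e.g. a plain list) raises TypeError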
