Merge pull request #26 from etlundquist/cleanup
updating docs
Eric Lundquist committed Jul 19, 2020
2 parents 16627bf + 7d61fdc commit c3e9395
Showing 3 changed files with 39 additions and 49 deletions.
14 changes: 4 additions & 10 deletions README.md
@@ -11,9 +11,10 @@ The core (training, prediction, recommendation) methods are written in [Cython](

In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation.
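
As a rough illustration of that interface, here is a minimal sketch (the toy data, the variable names, and the `recommend()` keyword arguments are illustrative assumptions, not taken from this diff):

```python
import pandas as pd
from rankfm.rankfm import RankFM

# toy interaction data: one row per observed (user_id, item_id) pair
interactions = pd.DataFrame({
    "user_id": [1, 1, 2, 2, 3, 3],
    "item_id": [10, 20, 20, 30, 10, 40],
})

model = RankFM(factors=2, loss="bpr")
model.fit(interactions, epochs=5, verbose=False)

scores = model.predict(interactions)              # real-valued scores for (user, item) pairs
top_items = model.recommend(interactions["user_id"].unique(), n_items=2)  # assumed keyword args
peers = model.similar_users(1, n_users=2)         # users most similar to user 1
neighbors = model.similar_items(10, n_items=2)    # items most similar to item 10
```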

See the **Quickstart** section below to get started and the `/examples` folder for more in-depth jupyter notebook walkthroughs using several popular open-source data sets. For more comprehensive documentation on the main model class and the included evaluation module see the [Online Documentation](https://rankfm.readthedocs.io/en/latest/).

This package is currently under active development and should not yet be considered fully stable. The core functionality is in place and working, but has not yet been rigorously tested against a wide variety of real-world data sets, modeling objectives, edge cases, user errors, etc. If you do find a problem or have suggestions for improvement please let me know!
* see the **Quickstart** section below to get started with the basic functionality
* see the `/examples` folder for more in-depth jupyter notebook walkthroughs with several popular open-source data sets
* see the [Online Documentation](https://rankfm.readthedocs.io/en/latest/) for more comprehensive documentation on the main model class and separate evaluation module
* see the [Medium Article](https://towardsdatascience.com/factorization-machines-for-item-recommendation-with-implicit-feedback-data-5655a7c749db) for contextual motivation and a detailed mathematical description of the algorithm

---
### Dependencies
@@ -150,10 +151,3 @@ model.similar_items(2997, n_items=10)
3481 High Fidelity (2000)
```
[Let's get weird...](https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s)

---
That's all for now. To see more in-depth worked examples in jupyter notebook format, head to the `/examples` folder. Be sure to check back for added functionality and updated PyPI releases in the near future. There's more to come - stay tuned...

---
![under construction](./images/UnderConstruction.png)

2 changes: 0 additions & 2 deletions rankfm/_rankfm.pyx
@@ -217,15 +217,13 @@ def _fit(

for epoch in range(epochs):

# set the learning rate for this training epoch
if learning_schedule == 'constant':
eta = learning_rate
elif learning_schedule == 'invscaling':
eta = learning_rate / pow(epoch + 1, learning_exponent)
else:
raise ValueError('unknown [learning_schedule]')

# re-shuffle the interaction data for each epoch
np.random.shuffle(shuffle_index)
log_likelihood = 0.0
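
For intuition, the step size implied by the two schedules above behaves as in this small standalone sketch (plain Python, not part of the library):

```python
# illustrative decay of the per-epoch learning rate under each schedule
learning_rate, learning_exponent = 0.1, 0.25

for epoch in range(5):
    eta_constant = learning_rate                                        # 'constant'
    eta_invscaling = learning_rate / (epoch + 1) ** learning_exponent   # 'invscaling'
    print(f"epoch={epoch} constant={eta_constant:.4f} invscaling={eta_invscaling:.4f}")
```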

72 changes: 35 additions & 37 deletions rankfm/rankfm.py
@@ -12,7 +12,7 @@ class RankFM():
"""Factorization Machines for Ranking Problems with Implicit Feedback Data"""

def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1, sigma=0.1, learning_rate=0.1, learning_schedule='constant', learning_exponent=0.25):
"""store hyperparameters and initialize internal data
"""store hyperparameters and initialize internal model state
:param factors: latent factor rank
:param loss: optimization/loss function to use for training: ['bpr', 'warp']
@@ -26,7 +26,7 @@ def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1,
:return: None
"""

# validate user inputs
# validate user input
assert isinstance(factors, int) and factors >= 1, "[factors] must be a positive integer"
assert isinstance(loss, str) and loss in ('bpr', 'warp'), "[loss] must be in ('bpr', 'warp')"
assert isinstance(max_samples, int) and max_samples > 0, "[max_samples] must be a positive integer"
@@ -37,7 +37,7 @@ def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1,
assert isinstance(learning_schedule, str) and learning_schedule in ('constant', 'invscaling'), "[learning_schedule] must be in ('constant', 'invscaling')"
assert isinstance(learning_exponent, float) and learning_exponent > 0.0, "[learning_exponent] must be a positive float"

# store hyperparameters
# store model hyperparameters
self.factors = factors
self.loss = loss
self.max_samples = max_samples
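
Putting the constructor arguments above together, a hedged construction example (values arbitrary; only the keyword names come from the signature shown):

```python
from rankfm.rankfm import RankFM

# every keyword below maps to one of the validated hyperparameters above
model = RankFM(
    factors=20,
    loss="warp",
    max_samples=20,
    alpha=0.01,
    beta=0.1,
    sigma=0.1,
    learning_rate=0.05,
    learning_schedule="invscaling",
    learning_exponent=0.25,
)
```
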
@@ -58,47 +58,47 @@ def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1,


def _reset_state(self):
"""initialize/reset all [user/item/feature] indexes and model weights"""
"""initialize or reset internal model state"""

# user/item ID/INDEX arrays
# [ID, IDX] arrays
self.user_id = None
self.item_id = None
self.user_idx = None
self.item_idx = None

# user/item ID <--> INDEX mappings
# [ID <-> IDX] mappings
self.index_to_user = None
self.index_to_item = None
self.user_to_index = None
self.item_to_index = None

# user/item interactions and sample importance weights
# user/item interactions and importance weights
self.interactions = None
self.sample_weight = None

# dictionary of observed items for each user
# set of observed items for each user
self.user_items = None

# user/item features
# [user, item] features
self.x_uf = None
self.x_if = None

# item and item feature weights
# [item, item-feature] scalar weights
self.w_i = None
self.w_if = None

# user/item/user-feature/item-feature latent factors
# [user, item, user-feature, item-feature] latent factors
self.v_u = None
self.v_i = None
self.v_uf = None
self.v_if = None

# internal model state
# internal model state indicator
self.is_fit = False


def _init_all(self, interactions, user_features=None, item_features=None, sample_weight=None):
"""index the raw interaction and user/item features data to numpy arrays
"""index the interaction data and user/item features and initialize model weights
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
@@ -107,16 +107,15 @@ def _init_all(self, interactions, user_features=None, item_features=None, sample
:return: None
"""

# check user data inputs
assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

# save the unique lists of users/items in terms of original identifiers
# save unique arrays of users/items in terms of original identifiers
interactions_df = pd.DataFrame(get_data(interactions), columns=['user_id', 'item_id'])
self.user_id = pd.Series(np.sort(np.unique(interactions_df['user_id'])))
self.item_id = pd.Series(np.sort(np.unique(interactions_df['item_id'])))

# create zero-based index position to identifier mappings
# create zero-based index to identifier mappings
self.index_to_user = self.user_id
self.index_to_item = self.item_id

@@ -146,19 +145,17 @@ def _init_interactions(self, interactions, sample_weight):
:return: None
"""

# check user data inputs
assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

# map the raw user/item identifiers to internal zero-based index positions
# NOTE: any user/item pairs not found in the existing indexes will be dropped

self.interactions = pd.DataFrame(get_data(interactions).copy(), columns=['user_id', 'item_id'])
self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index).astype(np.int32)
self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index).astype(np.int32)
self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna()

# store the sample weights internally or create a vector of ones if not passed
# store the sample weights internally or generate a vector of ones if not given
if sample_weight is not None:
assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
@@ -167,9 +164,7 @@ def _init_interactions(self, interactions, sample_weight):
else:
self.sample_weight = np.ones(len(self.interactions), dtype=np.float32)

# create python/numba lookup dictionaries containing the set of observed items for each user
# NOTE: the typed numba dictionary will be used to sample unobserved items during training
# NOTE: the interactions data must be converted to np.ndarray prior to training to use @njit
# create a dictionary containing the set of observed items for each user
# NOTE: if the model has been previously fit, extend rather than replace the itemset for each user

if self.is_fit:
@@ -178,16 +173,16 @@ def _init_interactions(self, interactions, sample_weight):
else:
self.user_items = self.interactions.sort_values(['user_idx', 'item_idx']).groupby('user_idx')['item_idx'].apply(np.array, dtype=np.int32).to_dict()

# format the interactions data as a c-contiguous integer array for cython
# format the interactions data as a c-contiguous integer array for cython use
self.interactions = np.ascontiguousarray(self.interactions, dtype=np.int32)



def _init_features(self, user_features=None, item_features=None):
"""initialize the user/item features given existing internal user/item indexes
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:return: None
"""

@@ -216,8 +211,13 @@ def _init_features(self, user_features=None, item_features=None):
self.x_if = np.zeros([len(self.item_idx), 1], dtype=np.float32)


def _init_weights(self, user_features, item_features):
"""initialize model weights given user/item and user_feature/item_feature indexes/shapes"""
def _init_weights(self, user_features=None, item_features=None):
"""initialize model weights given user/item and user-feature/item-feature indexes/shapes
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:return: None
"""

# initialize scalar weights as ndarrays of zeros
self.w_i = np.zeros(len(self.item_idx)).astype(np.float32)
@@ -253,8 +253,8 @@ def fit(self, interactions, user_features=None, item_features=None, sample_weigh
"""clear previous model state and learn new model weights using the input data
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:param sample_weight: vector of importance weights for each observed interaction
:param epochs: number of training epochs (full passes through observed interactions)
:param verbose: whether to print epoch number and log-likelihood during training
@@ -263,25 +263,24 @@

self._reset_state()
self.fit_partial(interactions, user_features, item_features, sample_weight, epochs, verbose)
return self
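
A minimal sketch of how the two entry points differ in practice (`train_interactions` and `new_interactions` are hypothetical two-column [user_id, item_id] dataframes):

```python
# fit() resets all model state and trains from scratch
model = RankFM(factors=10, loss="bpr")
model.fit(train_interactions, epochs=10, verbose=True)

# fit_partial() resumes from the current weights, e.g. on newly observed interactions
model.fit_partial(new_interactions, epochs=5, verbose=True)
```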


def fit_partial(self, interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False):
"""learn or update model weights using the input data and resuming from the current model state
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:param sample_weight: vector of importance weights for each observed interaction
:param epochs: number of training epochs (full passes through observed interactions)
:param verbose: whether to print epoch number and log-likelihood during training
:return: self
"""

# validate user inputs
assert isinstance(epochs, int) and epochs >= 1, "[epochs] must be a positive integer"
assert isinstance(verbose, bool), "[verbose] must be a boolean value"

# initialize internal data representations
if self.is_fit:
self._init_interactions(interactions, sample_weight)
self._init_features(user_features, item_features)
@@ -299,7 +298,9 @@ def fit_partial(self, interactions, user_features=None, item_features=None, samp
else:
raise ValueError('[loss] function not recognized')

# NOTE: the cython internal fit method updates the model weights in place via memoryviews
# NOTE: the cython private _fit() method updates the model weights in-place via typed memoryviews
# NOTE: therefore there's nothing returned explicitly by either method

_fit(
self.interactions,
self.sample_weight,
@@ -334,7 +335,6 @@ def predict(self, pairs, cold_start='nan'):
:return: np.array of real-valued model scores
"""

# check user data inputs
assert isinstance(pairs, (np.ndarray, pd.DataFrame)), "[pairs] must be np.ndarray or pd.dataframe"
assert pairs.shape[1] == 2, "[pairs] should be: [user_id, item_id]"
assert self.is_fit, "you must fit the model prior to generating predictions"
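
For example (a sketch; the `pairs` frame below is hypothetical and follows the [user_id, item_id] layout from the docstring):

```python
import pandas as pd

# unseen users/items receive np.nan scores under the default cold_start='nan'
pairs = pd.DataFrame({"user_id": [1, 2, 999], "item_id": [10, 20, 10]})
scores = model.predict(pairs, cold_start="nan")
```
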
@@ -410,7 +410,6 @@ def similar_items(self, item_id, n_items=10):
:return: np.array of topN most similar items wrt latent factor representations
"""

# ensure that the model has been fit before attempting to generate predictions
assert item_id in self.item_id.values, "you must select an [item_id] present in the training data"
assert self.is_fit, "you must fit the model prior to generating similarities"

@@ -437,7 +436,6 @@ def similar_users(self, user_id, n_users=10):
:return: np.array of topN most similar users wrt latent factor representations
"""

# ensure that the model has been fit before attempting to generate predictions
assert user_id in self.user_id.values, "you must select a [user_id] present in the training data"
assert self.is_fit, "you must fit the model prior to generating similarities"
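
Usage mirrors the `similar_items()` example in the README (the user ID below is a placeholder):

```python
# top-5 users most similar to user 1 in latent factor space
peers = model.similar_users(1, n_users=5)
```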

