From ef5dfd08e9002584f4098532cd0282f7b37d9d0a Mon Sep 17 00:00:00 2001
From: Eric Lundquist
Date: Tue, 26 May 2020 12:18:37 -0700
Subject: [PATCH] added basic tests

---
 .gitignore           |   9 +-
 README.md            |  21 +++--
 rankfm/rankfm.py     |  29 +++----
 tests/test_rankfm.py | 203 ++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 227 insertions(+), 35 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6c438f4..e5c968c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
-# excluded folders
+# exclude data and private notebooks
 data/
-notebooks/old/
+examples/ignore/
 
 # system files
 *.DS_Store
@@ -17,8 +17,6 @@ lib/
 lib64/
 parts/
 sdist/
-var/
-wheels/
 pip-wheel-metadata/
 share/python-wheels/
 *.egg-info/
@@ -26,9 +24,6 @@ share/python-wheels/
 *.egg
 MANIFEST
 
-# sphinx documentation
-docs/_build/
-
 # spark stuff
 */derby.log
 */metastore_db/

diff --git a/README.md b/README.md
index 6c3aafe..44780ac 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,12 @@
 # RankFM
-RankFM is a python implementation of the general Factorization Machines model class described in [Rendle 2010](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) adapted for collaborative filtering recommendation/ranking problems with implicit feedback user-item interaction data. It uses the Bayesian Personalized Ranking (BPR-OPT) optimization criteria described in [Rendle 2009](https://arxiv.org/pdf/1205.2618.pdf) to learn model weights via Stochastic Gradient Descent (SGD). It can also incorporate user and/or item auxiliary features to augment the main interaction data which may increase model performance, especially in contexts where interaction data is highly sparse but rich user/item metadata features exist.
+RankFM is a Python implementation of the general Factorization Machines model class described in [Rendle 2010](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) adapted for collaborative filtering recommendation/ranking problems with implicit feedback user-item interaction data. It uses the Bayesian Personalized Ranking (BPR-OPT) optimization criterion described in [Rendle 2009](https://arxiv.org/pdf/1205.2618.pdf) to learn model weights via Stochastic Gradient Descent (SGD). It can also incorporate user and/or item auxiliary features to augment the main interaction data, which may increase model performance, especially in contexts where the interaction data is highly sparse but rich user and/or item metadata features exist.
 
-RankFM's core training/prediction/recommendation subroutines are converted to optimized machine code at runtime using the excellent [Numba](http://numba.pydata.org/) LLVM JIT compiler which can compile Python numerical algorithms to run at speeds approaching C/Fortran. This makes it possible to scale model training and recommendation to millions of user/item interactions.
+The core training/prediction/recommendation subroutines are converted to optimized machine code at runtime using the [Numba](http://numba.pydata.org/) LLVM JIT compiler. This makes it possible to scale model training and recommendation to millions of user/item interactions. Designed for ease-of-use, RankFM accepts both `pd.DataFrame` and `np.ndarray` inputs. You do not have to convert your data to `scipy.sparse` matrices or re-map user/item identifiers to array indexes prior to use - RankFM internally maps all user/item identifiers to zero-based integer indexes, but always converts its outputs back to the original user/item identifiers from your data, which can be arbitrary (non-zero-based, non-consecutive) integers or even strings.
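To make that identifier handling concrete, here is a minimal sketch (the user/item values are hypothetical; the constructor arguments mirror the Quickstart below):

```python
import pandas as pd
from rankfm.rankfm import RankFM

# string user ids and non-consecutive integer item ids are used as-is:
# RankFM maps them to internal zero-based indexes during training and
# maps its outputs back to the original identifiers afterwards
interactions = pd.DataFrame({
    'user_id': ['alice', 'alice', 'bob', 'carol'],
    'item_id': [377, 610, 377, 1196],
})

model = RankFM(factors=10, regularization=0.01, learning_rate=0.1)
model.fit(interactions, epochs=20)

# recommendations come back as a dataframe indexed by the original (string) user ids
recs = model.recommend(['alice', 'bob'], n_items=2)
```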
 
-Designed for ease-of-use, RankFM accepts both `pd.DataFrame` and `np.ndarray` inputs. You do not have to convert your data to `scipy.sparse` matrices or re-map user/item identifiers to array indexes prior to use - internally RankFM maps all user/item identifiers to zero-based integer indexes, but always converts its output back to the original user/item identifiers from your data, which can be arbitrary (non-zero-based, non-consecutive) integers or even strings.
+In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation. See the **Quickstart** section below to get started, and the `quickstart.ipynb` notebook in the `/examples` folder for a more in-depth walkthrough.
 
-In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities to find the most similar users/items to a given user/item based on user/item latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions are also included in the `evaluation` module to streamline model performance tuning and evaluation.
-
-See the **Quickstart** section below to get started, and the `quickstart.ipynb` notebook in the `/examples` folder for a more in-depth walkthrough. This package is currently under active development pre-release, and should not yet be considered stable. Release, build status, and PyPI information will be added once things get to a stable and satisfactory state for an initial release. The core functionality is mostly in place and working, but automated tests and CI workflows need to be added, and I need to teach myself how to do all that stuff first :)
+This package is currently in active pre-release development and should not yet be considered stable. Release, build status, and PyPI information will be added once things get to a stable and satisfactory state for an initial release. The core functionality is mostly in place and working, but automated tests and CI workflows need to be added, and I need to teach myself how to do all that stuff first :)
 
 ---
 
 ### Dependencies
@@ -34,7 +32,7 @@ Let's first look at the required shape of the interaction data:
 | 5 | 377 |
 | 8 | 610 |
 
-It has just two columns: a `user_id` and an `item_id` (although you can name these fields whatever you want or use a numpy array instead). Notice that there is no `rating` column - this library is for **implicit feedback** data (e.g. watches, page views, purchases, clicks) as opposed to **explicit feedback** data (e.g. 1-5 ratings, thumbs up/down). Implicit feedback is far more common in real-world recommendation contexts and doesn't suffer from the missing-not-at-random problem of pure explicit feedback approaches. Maciej Kula (legendary open-source recsys developer) provides an [excellent overview of the differences](https://resources.bibblio.org/hubfs/share/2018-01-24-RecSysLDN-Ravelin.pdf).
+It has just two columns: a `user_id` and an `item_id` (you can name these fields whatever you want or use a numpy array instead). Notice that there is no `rating` column - this library is for **implicit feedback** data (e.g. watches, page views, purchases, clicks) as opposed to **explicit feedback** data (e.g. 1-5 ratings, thumbs up/down). Implicit feedback is far more common in real-world recommendation contexts and doesn't suffer from the missing-not-at-random problem of pure explicit feedback approaches. Maciej Kula (legendary open-source recsys developer) provides an [excellent overview of the differences](https://resources.bibblio.org/hubfs/share/2018-01-24-RecSysLDN-Ravelin.pdf).
 
 Now let's import the library, initialize our model, and fit on the training data:
 ```python
@@ -44,13 +42,13 @@ model = RankFM(factors=10, regularization=0.01, learning_rate=0.1, learning_sche
 model.fit(interactions_train, epochs=20, verbose=True)
 # NOTE: this takes about 90 seconds for 750,000 interactions on my 2.3 GHz i5 8GB RAM MacBook
 ```
 
-If you set `verbose=True` the model will print the current epoch number as well as the epoch's log-likelihood during training. This can be useful to gauge both computational speed and training performance by epoch. If the log likelihood is not increasing then try upping the `learning_rate` or lowering the `regularization`. If the log likelihood is starting to sometimes decrease in later training epochs try lowering the `learning_rate` or using `learning_schedule='invscaling'` to gradually decrease the learning rate over time.
+If you set `verbose=True` the model will print the current epoch number as well as the epoch's log-likelihood during training. This can be useful to gauge both computational speed and training performance by epoch. If the log likelihood is not increasing, try raising the `learning_rate` or lowering the `regularization`. If the log likelihood starts to bounce up and down, try lowering the `learning_rate` or using `learning_schedule='invscaling'` to decrease the learning rate over time.
 
 Now let's generate some user-item model scores from the validation data:
 ```python
 valid_scores = model.predict(interactions_valid, cold_start='nan')
 ```
-this will produce an array of real-valued model scores generated using the Factorization Machine model equation. You can interpret it as a measure of the predicted utility of a user (u) getting recommended an item (i). The `cold_start='nan'` option can be used to set scores to `np.nan` for user/item pairs not found in the training data, or `cold_start='drop'` can be specified to drop those pairs so the results contain no missing values.
+This will produce an array of real-valued model scores generated using the Factorization Machines model equation. You can interpret each score as a measure of the predicted utility of item (i) for user (u). The `cold_start='nan'` option can be used to set scores to `np.nan` for user/item pairs not found in the training data, or `cold_start='drop'` can be specified to drop those pairs so the results contain no missing values.
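For a quick illustration of the two `cold_start` modes, here is a sketch (it assumes the fitted model above; user `5` and item `377` appear in the sample interaction data, while user `999999` is a hypothetical identifier not present in training):

```python
import pandas as pd

# one pair whose user was seen during training, one pair with an unseen user
pairs = pd.DataFrame({'user_id': [5, 999999], 'item_id': [377, 377]})

scores_nan = model.predict(pairs, cold_start='nan')    # length 2: the unseen pair is scored as np.nan
scores_drop = model.predict(pairs, cold_start='drop')  # length 1: the unseen pair is dropped
```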
 
 Now let's generate our topN recommended movies for each user:
 ```python
@@ -81,6 +79,7 @@ dcg: 0.704
 precision: 0.152
 recall: 0.068
 ```
+[That's a Bingo!](https://www.youtube.com/watch?v=q5pESPQpXxE)
 
 Now let's find the most similar other movies for a few movies based on their embedding representations in latent factor space:
 ```python
@@ -99,7 +98,7 @@ model.similar_items(589, n_items=10)
 480     Jurassic Park (1993)
 1200    Aliens (1986)
 ```
-I hope you like explosions...
+[I hope you like explosions...](https://www.youtube.com/watch?v=uENYMZNzg9w)
 
 ```python
 # Being John Malkovich (1999)
@@ -117,7 +116,7 @@ model.similar_items(2997, n_items=10)
 2908    Boys Don't Cry (1999)
 3481    High Fidelity (2000)
 ```
-Let's get weird...
+[Let's get weird...](https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s)
 
 ---
 
 That's all for now. To see more in-depth worked examples in jupyter notebook format head to the `/examples` folder. Be sure to check back for added functionality and PyPI release status in the near future as soon as I teach myself how to use CI workflows and go where few data scientists have gone before: a comprehensive set of unit tests. Stay tuned...

diff --git a/rankfm/rankfm.py b/rankfm/rankfm.py
index 6f16025..9a02d67 100644
--- a/rankfm/rankfm.py
+++ b/rankfm/rankfm.py
@@ -247,14 +247,12 @@ def fit_partial(self, interactions, user_features=None, item_features=None, epoc
         :return: self
         """
 
-        # initialize necessary internal data structures
         if self.is_fit:
             self._init_interactions(interactions)
             self._init_features(user_features, item_features)
         else:
             self._init_all(interactions, user_features, item_features)
 
-        # call numba internals
         updated_weights = _fit(
             self.interactions,
             self.user_items_nb,
@@ -285,18 +283,16 @@ def predict(self, pairs, cold_start='nan'):
 
         :param pairs: dataframe of [user, item] pairs to score
         :param cold_start: whether to generate missing values ('nan') or drop ('drop') user/item pairs not found in training data
-        :return: vector of real-valued model scores
+        :return: np.array of real-valued model scores
         """
 
-        # ensure that the model has been fit before attempting to generate predictions
+        assert pairs.shape[1] == 2, "[pairs] should be: [user_id, item_id]"
         assert self.is_fit, "you must fit the model prior to generating predictions"
 
-        # map raw user/item identifiers to internal index positions
         pred_pairs = pd.DataFrame(pairs.copy(), columns=['user_id', 'item_id'])
         pred_pairs['user_id'] = pred_pairs['user_id'].map(self.user_to_index)
         pred_pairs['item_id'] = pred_pairs['item_id'].map(self.item_to_index)
 
-        # call numba internals
         pred_pairs = pred_pairs.to_numpy().astype(np.float32)
         scores = _predict(
             pred_pairs,
@@ -321,17 +317,16 @@ def recommend(self, users, n_items=10, filter_previous=False, cold_start='nan'):
         """calculate the topN items for each user
 
-        :param users: list-like of user identifiers for which to generate recommendations
+        :param users: iterable of user identifiers for which to generate recommendations
         :param n_items: number of recommended items to generate for each user
         :param filter_previous: remove observed training items from generated recommendations
         :param cold_start: whether to generate missing values ('nan') or drop ('drop') users not found in training data
         :return: pandas dataframe where the index values are user identifiers and the columns are recommended items
         """
 
-        # ensure that the model has been fit before attempting to generate predictions
+        assert getattr(users, '__iter__', False), "[users] must be an iterable (e.g. list, array, series)"
(e.g. list, array, series)" assert self.is_fit, "you must fit the model prior to generating recommendations" - # call numba internals user_idx = pd.Series(users).map(self.user_to_index).to_numpy(dtype=np.float32) rec_items = _recommend( user_idx, @@ -362,10 +357,11 @@ def similar_items(self, item_id, n_items=10): :param item_id: item to search :param n_items: number of similar items to return - :return: topN most similar items wrt latent factor representations + :return: np.array of topN most similar items wrt latent factor representations """ # ensure that the model has been fit before attempting to generate predictions + assert item_id in self.item_id, "you must select an [item_id] present in the training data" assert self.is_fit, "you must fit the model prior to generating similarities" try: @@ -379,7 +375,7 @@ def similar_items(self, item_id, n_items=10): # calculate the most similar N items excluding the search item similarities = pd.Series(np.dot(lr_all_items, lr_item)).drop(item_idx).sort_values(ascending=False)[:n_items] - most_similar = pd.Series(similarities.index).map(self.index_to_item) + most_similar = pd.Series(similarities.index).map(self.index_to_item).values return most_similar @@ -388,10 +384,11 @@ def similar_users(self, user_id, n_users=10): :param user_id: user to search :param n_users: number of similar users to return - :return: topN most similar users wrt latent factor representations + :return: np.array of topN most similar users wrt latent factor representations """ # ensure that the model has been fit before attempting to generate predictions + assert user_id in self.user_id, "you must select an [user_id] present in the training data" assert self.is_fit, "you must fit the model prior to generating similarities" try: @@ -399,12 +396,12 @@ def similar_users(self, user_id, n_users=10): except (KeyError, TypeError): print("user_id={} not found in training data".format(user_id)) - # calculate item latent representations in F dimensional factor space - lr_user = self.v_i[user_idx] + np.dot(self.v_uf.T, self.x_uf[user_idx]) - lr_all_users = self.v_i + np.dot(self.x_uf, self.v_uf) + # calculate user latent representations in F dimensional factor space + lr_user = self.v_u[user_idx] + np.dot(self.v_uf.T, self.x_uf[user_idx]) + lr_all_users = self.v_u + np.dot(self.x_uf, self.v_uf) # calculate the most similar N users excluding the search user similarities = pd.Series(np.dot(lr_all_users, lr_user)).drop(user_idx).sort_values(ascending=False)[:n_users] - most_similar = pd.Series(similarities.index).map(self.index_to_user) + most_similar = pd.Series(similarities.index).map(self.index_to_user).values return most_similar diff --git a/tests/test_rankfm.py b/tests/test_rankfm.py index 90d253e..9bf6773 100644 --- a/tests/test_rankfm.py +++ b/tests/test_rankfm.py @@ -41,6 +41,13 @@ (3, 3, 3), (3, 6, 4), (3, 4, 5) ], columns=['user_id', 'item_id', 'rating'], dtype=np.int32) +# valid interactions with disjoint user/items +intx_valid_disjoint = pd.DataFrame([ + (1, 1), (1, 3), (1, 5), + (2, 1), (2, 2), (2, 7), + (4, 3), (4, 7), (4, 4) +], columns=['user_id', 'item_id'], dtype=np.int32) + # user features # ------------- @@ -115,6 +122,12 @@ (6, 0, 0, "G", 0.00) ], columns=['item_id', 'bin_1', 'bin_2', 'str', 'cnt']) +# user iterables +# -------------- + +train_users = np.array([1, 2, 3]) +valid_users = np.array([1, 2, 4, 5]) + # ------------------------------ # test basic model functionality # ------------------------------ @@ -178,10 +191,198 @@ def test__fit__bad__if_str_cols(): 
         model = RankFM(factors=2)
         model.fit(intx_train_pd_int, item_features=if_str_cols)
 
+# score prediction
+# ----------------
+
+def test__predict__good__train():
+    """test the predict() method on the training inputs"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    scores = model.predict(intx_train_pd_int)
+
+    shape = scores.shape == (9,)
+    dtype = scores.dtype == np.float32
+    nmiss = np.sum(np.isnan(scores).astype(np.int32)) == 0
+    assert shape and dtype and nmiss
+
+def test__predict__good__disjoint_nan():
+    """test the predict() method on disjoint validation pairs with the cold_start='nan' option"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    scores = model.predict(intx_valid_disjoint, cold_start='nan')
+
+    shape = scores.shape == (9,)
+    dtype = scores.dtype == np.float32
+    nmiss = np.sum(np.isnan(scores).astype(np.int32)) == 4
+    assert shape and dtype and nmiss
+
+def test__predict__good__disjoint_drop():
+    """test the predict() method on disjoint validation pairs with the cold_start='drop' option"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    scores = model.predict(intx_valid_disjoint, cold_start='drop')
+
+    shape = scores.shape == (5,)
+    dtype = scores.dtype == np.float32
+    nmiss = np.sum(np.isnan(scores).astype(np.int32)) == 0
+    assert shape and dtype and nmiss
+
+# user recommendation
+# -------------------
+
+def test__recommend__good__train():
+    """test the recommend() method on the training users"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    recs = model.recommend(train_users, n_items=3)
+
+    klass = isinstance(recs, pd.DataFrame)
+    shape = recs.shape == (3, 3)
+    index = np.array_equal(recs.index.values, train_users)
+    items = recs.isin(intx_train_pd_int['item_id'].values).all().all()
+    assert klass and shape and index and items
+
+def test__recommend__good__train__filter():
+    """test the recommend() method on the training users but filter previous items"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    recs = model.recommend(train_users, n_items=3, filter_previous=True)
+
+    klass = isinstance(recs, pd.DataFrame)
+    shape = recs.shape == (3, 3)
+    index = np.array_equal(recs.index.values, train_users)
+    items = recs.isin(intx_train_pd_int['item_id'].values).all().all()
+
+    recs_long = recs.stack().reset_index().drop('level_1', axis=1)
+    recs_long.columns = ['user_id', 'item_id']
+    intersect = pd.merge(intx_train_pd_int, recs_long, on=['user_id', 'item_id'], how='inner').empty
+    assert klass and shape and index and items and intersect
+
+def test__recommend__good__valid__nan():
+    """test the recommend() method on a disjoint set of validation users with the cold_start='nan' option"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    recs = model.recommend(valid_users, n_items=3, cold_start='nan')
+
+    klass = isinstance(recs, pd.DataFrame)
+    shape = recs.shape == (4, 3)
+    index = np.array_equal(sorted(recs.index.values), sorted(valid_users))
+    items = recs.dropna().isin(intx_train_pd_int['item_id'].values).all().all()
+    new_users = list(set(valid_users) - set(train_users))
+    nmiss = recs.loc[new_users].isnull().all().all()
+    assert klass and shape and index and items and nmiss
+
+def test__recommend__good__valid__drop():
+    """test the recommend() method on a disjoint set of validation users with the cold_start='drop' option"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    recs = model.recommend(valid_users, n_items=3, cold_start='drop')
+
+    klass = isinstance(recs, pd.DataFrame)
+    shape = recs.shape == (2, 3)
+    index = np.isin(recs.index.values, valid_users).all()
+    items = recs.dropna().isin(intx_train_pd_int['item_id'].values).all().all()
+
+    same_users = list(set(valid_users) & set(train_users))
+    match_users = np.array_equal(sorted(same_users), sorted(recs.index.values))
+    assert klass and shape and index and items and match_users
+
+# similar items/users
+# -------------------
+
+def test__similar_items__good():
+    """test the similar_items() method for a valid [item_id]"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    similar = model.similar_items(1, n_items=3)
+
+    shape = similar.shape == (3,)
+    items = np.isin(similar, intx_train_pd_int['item_id'].unique()).all()
+    assert shape and items
+
+def test__similar_items__bad():
+    """ensure the similar_items() method raises an exception for an item not in training data"""
+
+    with pytest.raises(AssertionError):
+        model = RankFM(factors=2)
+        model.fit(intx_train_pd_int)
+        similar = model.similar_items(99, n_items=3)
+
+def test__similar_users__good():
+    """test the similar_users() method for a valid [user_id]"""
+
+    model = RankFM(factors=2)
+    model.fit(intx_train_pd_int)
+    similar = model.similar_users(1, n_users=2)
+
+    shape = similar.shape == (2,)
+    users = np.isin(similar, intx_train_pd_int['user_id'].unique()).all()
+    assert shape and users
+
+def test__similar_users__bad():
+    """ensure the similar_users() method raises an exception for a user not in training data"""
+
+    with pytest.raises(AssertionError):
+        model = RankFM(factors=2)
+        model.fit(intx_train_pd_int)
+        similar = model.similar_users(9, n_users=1)
+
+# model evaluation
+# ----------------
+
 # model = RankFM(factors=2)
-# model.fit(interactions=intx_train_pd_int, user_features=uf_no_id)
+# model.fit(intx_train_pd_int)
\ No newline at end of file
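As a usage note for the new tests: a minimal sketch of invoking them programmatically (this assumes `pytest` is installed and the working directory is the repository root; running `pytest tests/test_rankfm.py -v` from a shell is equivalent):

```python
# run the new RankFM test module with verbose output
import pytest

pytest.main(["-v", "tests/test_rankfm.py"])
```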