Merge pull request #26 from etlundquist/cleanup
updating docs
Eric Lundquist committed Jul 19, 2020
2 parents 16627bf + 7d61fdc commit c3e9395
Showing 3 changed files with 39 additions and 49 deletions.
14 changes: 4 additions & 10 deletions README.md
@@ -11,9 +11,10 @@ The core (training, prediction, recommendation) methods are written in [Cython](

In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation.
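
As a rough illustration of that interface, here is a minimal sketch (the toy data, the variable names, and the `recommend()` keyword arguments are illustrative assumptions, not taken from this diff):

```python
import pandas as pd
from rankfm.rankfm import RankFM

# toy interaction data: one row per observed (user_id, item_id) pair
interactions = pd.DataFrame({
    "user_id": [1, 1, 2, 2, 3, 3],
    "item_id": [10, 20, 20, 30, 10, 40],
})

model = RankFM(factors=2, loss="bpr")
model.fit(interactions, epochs=5, verbose=False)

scores = model.predict(interactions)              # real-valued scores for (user, item) pairs
top_items = model.recommend(interactions["user_id"].unique(), n_items=2)  # assumed keyword args
peers = model.similar_users(1, n_users=2)         # users most similar to user 1
neighbors = model.similar_items(10, n_items=2)    # items most similar to item 10
```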

See the **Quickstart** section below to get started and the `/examples` folder for more in-depth jupyter notebook walkthroughs using several popular open-source data sets. For more comprehensive documentation on the main model class and the included evaluation module see the [Online Documentation](https://rankfm.readthedocs.io/en/latest/).

This package is currently under active development and should not yet be considered fully stable. The core functionality is in place and working, but has not yet been rigorously tested against a wide variety of real-world data sets, modeling objectives, edge cases, user errors, etc. If you do find a problem or have suggestions for improvement please let me know!
* see the **Quickstart** section below to get started with the basic functionality
* see the `/examples` folder for more in-depth jupyter notebook walkthroughs with several popular open-source data sets
* see the [Online Documentation](https://rankfm.readthedocs.io/en/latest/) for more comprehensive documentation on the main model class and separate evaluation module
* see the [Medium Article](https://towardsdatascience.com/factorization-machines-for-item-recommendation-with-implicit-feedback-data-5655a7c749db) for contextual motivation and a detailed mathematical description of the algorithm

---
### Dependencies
@@ -150,10 +151,3 @@ model.similar_items(2997, n_items=10)
3481 High Fidelity (2000)
```
[Let's get weird...](https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s)

---
That's all for now. To see more in-depth worked examples in jupyter notebook format, head to the `/examples` folder. Be sure to check back for added functionality and updated PyPI releases in the near future. There's more to come - stay tuned...

---
![under construction](./images/UnderConstruction.png)

2 changes: 0 additions & 2 deletions rankfm/_rankfm.pyx
@@ -217,15 +217,13 @@ def _fit(

for epoch in range(epochs):

# set the learning rate for this training epoch
if learning_schedule == 'constant':
eta = learning_rate
elif learning_schedule == 'invscaling':
eta = learning_rate / pow(epoch + 1, learning_exponent)
else:
raise ValueError('unknown [learning_schedule]')

# re-shuffle the interaction data for each epoch
np.random.shuffle(shuffle_index)
log_likelihood = 0.0
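
For intuition, the step size implied by the two schedules above behaves as in this small standalone sketch (plain Python, not part of the library):

```python
# illustrative decay of the per-epoch learning rate under each schedule
learning_rate, learning_exponent = 0.1, 0.25

for epoch in range(5):
    eta_constant = learning_rate                                        # 'constant'
    eta_invscaling = learning_rate / (epoch + 1) ** learning_exponent   # 'invscaling'
    print(f"epoch={epoch} constant={eta_constant:.4f} invscaling={eta_invscaling:.4f}")
```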

72 changes: 35 additions & 37 deletions rankfm/rankfm.py
@@ -12,7 +12,7 @@ class RankFM():
"""Factorization Machines for Ranking Problems with Implicit Feedback Data"""

def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1, sigma=0.1, learning_rate=0.1, learning_schedule='constant', learning_exponent=0.25):
"""store hyperparameters and initialize internal data
"""store hyperparameters and initialize internal model state
:param factors: latent factor rank
:param loss: optimization/loss function to use for training: ['bpr', 'warp']
@@ -26,7 +26,7 @@ def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1,
:return: None
"""

# validate user inputs
# validate user input
assert isinstance(factors, int) and factors >= 1, "[factors] must be a positive integer"
assert isinstance(loss, str) and loss in ('bpr', 'warp'), "[loss] must be in ('bpr', 'warp')"
assert isinstance(max_samples, int) and max_samples > 0, "[max_samples] must be a positive integer"
@@ -37,7 +37,7 @@ def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1,
assert isinstance(learning_schedule, str) and learning_schedule in ('constant', 'invscaling'), "[learning_schedule] must be in ('constant', 'invscaling')"
assert isinstance(learning_exponent, float) and learning_exponent > 0.0, "[learning_exponent] must be a positive float"

# store hyperparameters
# store model hyperparameters
self.factors = factors
self.loss = loss
self.max_samples = max_samples
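
Putting the constructor arguments above together, a hedged construction example (values arbitrary; only the keyword names come from the signature shown):

```python
from rankfm.rankfm import RankFM

# every keyword below maps to one of the validated hyperparameters above
model = RankFM(
    factors=20,
    loss="warp",
    max_samples=20,
    alpha=0.01,
    beta=0.1,
    sigma=0.1,
    learning_rate=0.05,
    learning_schedule="invscaling",
    learning_exponent=0.25,
)
```
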
@@ -58,47 +58,47 @@ def __init__(self, factors=10, loss='bpr', max_samples=10, alpha=0.01, beta=0.1,


def _reset_state(self):
"""initialize/reset all [user/item/feature] indexes and model weights"""
"""initialize or reset internal model state"""

# user/item ID/INDEX arrays
# [ID, IDX] arrays
self.user_id = None
self.item_id = None
self.user_idx = None
self.item_idx = None

# user/item ID <--> INDEX mappings
# [ID <-> IDX] mappings
self.index_to_user = None
self.index_to_item = None
self.user_to_index = None
self.item_to_index = None

# user/item interactions and sample importance weights
# user/item interactions and importance weights
self.interactions = None
self.sample_weight = None

# dictionary of observed items for each user
# set of observed items for each user
self.user_items = None

# user/item features
# [user, item] features
self.x_uf = None
self.x_if = None

# item and item feature weights
# [item, item-feature] scalar weights
self.w_i = None
self.w_if = None

# user/item/user-feature/item-feature latent factors
# [user, item, user-feature, item-feature] latent factors
self.v_u = None
self.v_i = None
self.v_uf = None
self.v_if = None

# internal model state
# internal model state indicator
self.is_fit = False


def _init_all(self, interactions, user_features=None, item_features=None, sample_weight=None):
"""index the raw interaction and user/item features data to numpy arrays
"""index the interaction data and user/item features and initialize model weights
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
@@ -107,16 +107,15 @@ def _init_all(self, interactions, user_features=None, item_features=None, sample
:return: None
"""

# check user data inputs
assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

# save the unique lists of users/items in terms of original identifiers
# save unique arrays of users/items in terms of original identifiers
interactions_df = pd.DataFrame(get_data(interactions), columns=['user_id', 'item_id'])
self.user_id = pd.Series(np.sort(np.unique(interactions_df['user_id'])))
self.item_id = pd.Series(np.sort(np.unique(interactions_df['item_id'])))

# create zero-based index position to identifier mappings
# create zero-based index to identifier mappings
self.index_to_user = self.user_id
self.index_to_item = self.item_id

@@ -146,19 +145,17 @@ def _init_interactions(self, interactions, sample_weight):
:return: None
"""

# check user data inputs
assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

# map the raw user/item identifiers to internal zero-based index positions
# NOTE: any user/item pairs not found in the existing indexes will be dropped

self.interactions = pd.DataFrame(get_data(interactions).copy(), columns=['user_id', 'item_id'])
self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index).astype(np.int32)
self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index).astype(np.int32)
self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna()

# store the sample weights internally or create a vector of ones if not passed
# store the sample weights internally or generate a vector of ones if not given
if sample_weight is not None:
assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
@@ -167,9 +164,7 @@ def _init_interactions(self, interactions, sample_weight):
else:
self.sample_weight = np.ones(len(self.interactions), dtype=np.float32)

# create python/numba lookup dictionaries containing the set of observed items for each user
# NOTE: the typed numba dictionary will be used to sample unobserved items during training
# NOTE: the interactions data must be converted to np.ndarray prior to training to use @njit
# create a dictionary containing the set of observed items for each user
# NOTE: if the model has been previously fit, extend rather than replace the itemset for each user

if self.is_fit:
@@ -178,16 +173,16 @@ def _init_interactions(self, interactions, sample_weight):
else:
self.user_items = self.interactions.sort_values(['user_idx', 'item_idx']).groupby('user_idx')['item_idx'].apply(np.array, dtype=np.int32).to_dict()

# format the interactions data as a c-contiguous integer array for cython
# format the interactions data as a c-contiguous integer array for cython use
self.interactions = np.ascontiguousarray(self.interactions, dtype=np.int32)



def _init_features(self, user_features=None, item_features=None):
"""initialize the user/item features given existing internal user/item indexes
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:return: None
"""

@@ -216,8 +211,13 @@ def _init_features(self, user_features=None, item_features=None):
self.x_if = np.zeros([len(self.item_idx), 1], dtype=np.float32)


def _init_weights(self, user_features, item_features):
"""initialize model weights given user/item and user_feature/item_feature indexes/shapes"""
def _init_weights(self, user_features=None, item_features=None):
"""initialize model weights given user/item and user-feature/item-feature indexes/shapes
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:return: None
"""

# initialize scalar weights as ndarrays of zeros
self.w_i = np.zeros(len(self.item_idx)).astype(np.float32)
@@ -253,8 +253,8 @@ def fit(self, interactions, user_features=None, item_features=None, sample_weigh
"""clear previous model state and learn new model weights using the input data
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:param sample_weight: vector of importance weights for each observed interaction
:param epochs: number of training epochs (full passes through observed interactions)
:param verbose: whether to print epoch number and log-likelihood during training
@@ -263,25 +263,24 @@

self._reset_state()
self.fit_partial(interactions, user_features, item_features, sample_weight, epochs, verbose)
return self
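
A minimal sketch of how the two entry points differ in practice (`train_interactions` and `new_interactions` are hypothetical two-column [user_id, item_id] dataframes):

```python
# fit() resets all model state and trains from scratch
model = RankFM(factors=10, loss="bpr")
model.fit(train_interactions, epochs=10, verbose=True)

# fit_partial() resumes from the current weights, e.g. on newly observed interactions
model.fit_partial(new_interactions, epochs=5, verbose=True)
```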


def fit_partial(self, interactions, user_features=None, item_features=None, sample_weight=None, epochs=1, verbose=False):
"""learn or update model weights using the input data and resuming from the current model state
:param interactions: dataframe of observed user/item interactions: [user_id, item_id]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
:param user_features: dataframe of user metadata features: [user_id, uf_1, ... , uf_n]
:param item_features: dataframe of item metadata features: [item_id, if_1, ... , if_n]
:param sample_weight: vector of importance weights for each observed interaction
:param epochs: number of training epochs (full passes through observed interactions)
:param verbose: whether to print epoch number and log-likelihood during training
:return: self
"""

# validate user inputs
assert isinstance(epochs, int) and epochs >= 1, "[epochs] must be a positive integer"
assert isinstance(verbose, bool), "[verbose] must be a boolean value"

# initialize internal data representations
if self.is_fit:
self._init_interactions(interactions, sample_weight)
self._init_features(user_features, item_features)
@@ -299,7 +298,9 @@ def fit_partial(self, interactions, user_features=None, item_features=None, samp
else:
raise ValueError('[loss] function not recognized')

# NOTE: the cython internal fit method updates the model weights in place via memoryviews
# NOTE: the cython private _fit() method updates the model weights in-place via typed memoryviews
# NOTE: therefore there's nothing returned explicitly by either method

_fit(
self.interactions,
self.sample_weight,
@@ -334,7 +335,6 @@ def predict(self, pairs, cold_start='nan'):
:return: np.array of real-valued model scores
"""

# check user data inputs
assert isinstance(pairs, (np.ndarray, pd.DataFrame)), "[pairs] must be np.ndarray or pd.dataframe"
assert pairs.shape[1] == 2, "[pairs] should be: [user_id, item_id]"
assert self.is_fit, "you must fit the model prior to generating predictions"
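
For example (a sketch; the `pairs` frame below is hypothetical and follows the [user_id, item_id] layout from the docstring):

```python
import pandas as pd

# unseen users/items receive np.nan scores under the default cold_start='nan'
pairs = pd.DataFrame({"user_id": [1, 2, 999], "item_id": [10, 20, 10]})
scores = model.predict(pairs, cold_start="nan")
```
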
@@ -410,7 +410,6 @@ def similar_items(self, item_id, n_items=10):
:return: np.array of topN most similar items wrt latent factor representations
"""

# ensure that the model has been fit before attempting to generate predictions
assert item_id in self.item_id.values, "you must select an [item_id] present in the training data"
assert self.is_fit, "you must fit the model prior to generating similarities"

@@ -437,7 +436,6 @@ def similar_users(self, user_id, n_users=10):
:return: np.array of topN most similar users wrt latent factor representations
"""

# ensure that the model has been fit before attempting to generate predictions
assert user_id in self.user_id.values, "you must select a [user_id] present in the training data"
assert self.is_fit, "you must fit the model prior to generating similarities"
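
Usage mirrors the `similar_items()` example in the README (the user ID below is a placeholder):

```python
# top-5 users most similar to user 1 in latent factor space
peers = model.similar_users(1, n_users=5)
```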

