adding sphinx docs

etlundquist · Jun 15, 2020 · 5e77bb3 · 5e77bb3
1 parent af3a621
commit 5e77bb3
Show file tree

Hide file tree

Showing 7 changed files with 398 additions and 0 deletions.
diff --git a/docs/Makefile b/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+# Every target is forwarded to $(SPHINXBUILD) in "make mode" (-M).
+
+# You can set these variables from the command line, and also from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file contains only a selection of the most common options. For a
+# full list see the documentation:
+# https://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'rankfm'
+copyright = '2020, Eric Lundquist'
+author = 'Eric Lundquist'
+
+# The full version, including alpha/beta/rc tags
+release = '0.2.5'
+# The short X.Y version, derived from `release` so the two cannot drift apart
+version = '.'.join(release.split('.')[:2])
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.mathjax',
+    'sphinx.ext.viewcode',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = 'en'  # explicit code: newer Sphinx warns on None and falls back to 'en'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by the theme itself. Builtin themes use these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'rankfmdoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # LaTeX figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'rankfm.tex', 'rankfm Documentation',
+     author, 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'rankfm', 'rankfm Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'rankfm', 'rankfm Documentation',
+     author, 'rankfm', 'Factorization Machines for ranking with implicit feedback data.',
+     'Miscellaneous'),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be an ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+
+# -- Extension configuration -------------------------------------------------
diff --git a/docs/source/evaluation.rst b/docs/source/evaluation.rst
@@ -0,0 +1,7 @@
+Model Evaluation
+================
+
+.. automodule:: rankfm.evaluation
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/source/home.rst b/docs/source/home.rst
@@ -0,0 +1,57 @@
+Welcome to RankFM's Documentation!
+==================================
+
+RankFM is a Python implementation of the general Factorization Machines model class described in `Rendle 2010 <https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf>`_ adapted for collaborative filtering recommendation/ranking problems with implicit feedback user-item interaction data. It uses `Bayesian Personalized Ranking (BPR) <https://arxiv.org/pdf/1205.2618.pdf>`_ and a variant of `Weighted Approximate-Rank Pairwise (WARP) <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.587.3946&rep=rep1&type=pdf>`_ loss to learn model weights via Stochastic Gradient Descent (SGD). It can (optionally) incorporate individual training sample weights and/or user/item auxiliary features to augment the main interaction data for model training.
+
+The core training/prediction/recommendation methods are written in `Cython <https://cython.org/>`_. This makes it possible to scale to millions of users, items, and interactions. Designed for ease-of-use, RankFM accepts both `pd.DataFrame` and `np.ndarray` inputs. You do not have to convert your data to `scipy.sparse` matrices or re-map user/item identifiers to matrix indexes prior to use - RankFM internally maps all user/item identifiers to zero-based integer indexes, but always converts its outputs back to the original user/item identifiers from your data, which can be arbitrary (non-zero-based, non-consecutive) integers or even strings.
+
+In addition to the familiar `fit()`, `predict()`, `recommend()` methods, RankFM includes additional utilities `similar_users()` and `similar_items()` to find the most similar users/items to a given user/item based on latent factor space embeddings. A number of popular recommendation/ranking evaluation metric functions have been included in the separate `evaluation` module to streamline model tuning and validation.
+
+Dependencies
+------------
+
+* Python 3.6+
+* numpy >= 1.15
+* pandas >= 0.24
+
+Installation
+------------
+
+Prerequisites
+^^^^^^^^^^^^^
+
+To install RankFM's C extensions you will need the `GNU Compiler Collection (GCC) <https://gcc.gnu.org/>`_. Check to see whether you already have it installed:
+
+.. code:: bash
+
+ gcc --version
+
+If you don't have it already you can easily install it using `Homebrew <https://brew.sh/>`_ on OSX or your default linux package manager:
+
+.. code:: bash
+
+ # OSX
+ brew install gcc
+
+ # linux
+ sudo yum install gcc
+
+ # ensure [gcc] has been installed correctly and is on the system PATH
+ gcc --version
+
+Package Installation
+^^^^^^^^^^^^^^^^^^^^
+
+You can install the latest published version from PyPI using `pip`:
+
+.. code:: bash
+
+ pip install rankfm
+
+Or alternatively install the current development build directly from GitHub:
+
+.. code:: bash
+ 
+ pip install git+https://github.com/etlundquist/rankfm.git#egg=rankfm
+
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -0,0 +1,14 @@
+.. include:: home.rst
+
+
+Contents 
+========
+
+.. toctree::
+ :maxdepth: 2
+
+ Home <home>
+ Quickstart <quickstart>
+ RankFM Model <rankfm>
+ Model Evaluation <evaluation>
+
diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -0,0 +1,115 @@
+Quickstart
+==========
+
+Let's work through a simple example of fitting a model, generating recommendations, evaluating performance, and assessing some item-item similarities. The data we'll be using here may already be somewhat familiar: you know it, you love it, it's the `MovieLens 1M <https://grouplens.org/datasets/movielens/1m/>`_!
+
+Let's first look at the required shape of the interaction data:
+
+======= =======
+user_id item_id
+======= =======
+3       233
+5       377
+8       610
+======= =======
+
+It has just two columns: a `user_id` and an `item_id` (you can name these fields whatever you want or use a numpy array instead). Notice that there is no `rating` column - this library is for **implicit feedback** data (e.g. watches, page views, purchases, clicks) as opposed to **explicit feedback** data (e.g. 1-5 ratings, thumbs up/down). Implicit feedback is far more common in real-world recommendation contexts and doesn't suffer from the `missing-not-at-random problem <https://resources.bibblio.org/hubfs/share/2018-01-24-RecSysLDN-Ravelin.pdf>`_ of pure explicit feedback approaches.
+
+Now let's import the library, initialize our model, and fit on the training data:
+
+.. code:: python
+
+ from rankfm.rankfm import RankFM
+ model = RankFM(factors=20, loss='warp', max_samples=20, learning_rate=0.1, learning_schedule='invscaling')
+ model.fit(interactions_train, epochs=20, verbose=True)
+
+If you set `verbose=True` the model will print the current epoch number as well as the epoch's log-likelihood during training. This can be useful to gauge both computational speed and training gains by epoch. If the log likelihood is not increasing then try upping the `learning_rate` or lowering the (`alpha`, `beta`) regularization strength terms. If the log likelihood is starting to bounce up and down try lowering the `learning_rate` or using `learning_schedule='invscaling'` to decrease the learning rate over time. If you run into overflow errors then decrease the feature and/or sample-weight magnitudes and try upping `beta`, especially if you have a small number of dense user-features and/or item-features. Selecting `BPR` loss will lead to faster training times, but `WARP` loss typically yields superior model performance.
+
+Now let's generate some user-item model scores from the validation data:
+
+.. code:: python
+
+ valid_scores = model.predict(interactions_valid, cold_start='nan')
+
+This will produce an array of real-valued model scores generated using the Factorization Machines model equation. You can interpret it as a measure of the predicted utility of item (i) for user (u). The `cold_start='nan'` option can be used to set scores to `np.nan` for user/item pairs not found in the training data, or `cold_start='drop'` can be specified to drop those pairs so the results contain no missing values.
+
+Now let's generate our topN recommended movies for each user:
+
+.. code:: python
+
+ valid_recs = model.recommend(valid_users, n_items=10, filter_previous=True, cold_start='drop')
+
+The input should be a `pd.Series`, `np.ndarray` or `list` of `user_id` values. You can use `filter_previous=True` to prevent generating recommendations that include any items observed by the user in the training data, which could be useful depending on your application context. The result will be a `pd.DataFrame` where `user_id` values will be the index and the rows will be each user's top recommended items in descending order (best item is in column 0):
+
+======= ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
+user_id 0    1    2    3    4    5    6    7    8    9
+======= ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
+3       2396 1265 357  34   2858 3175 1    2028 17   356
+5       608  1617 1610 3418 590  474  858  377  924  1036
+8       589  1036 2571 2028 2000 1220 1197 110  780  1954
+======= ==== ==== ==== ==== ==== ==== ==== ==== ==== ====
+
+Now let's see how the model is performing with respect to the included validation metrics evaluated on the hold-out data:
+
+.. code:: python
+
+ from rankfm.evaluation import hit_rate, reciprocal_rank, discounted_cumulative_gain, precision, recall
+
+ valid_hit_rate = hit_rate(model, interactions_valid, k=10)
+ valid_reciprocal_rank = reciprocal_rank(model, interactions_valid, k=10)
+ valid_dcg = discounted_cumulative_gain(model, interactions_valid, k=10)
+ valid_precision = precision(model, interactions_valid, k=10)
+ valid_recall = recall(model, interactions_valid, k=10)
+
+.. parsed-literal::
+
+ hit_rate: 0.796
+ reciprocal_rank: 0.339
+ dcg: 0.734
+ precision: 0.159
+ recall: 0.077
+
+`That's a Bingo! <https://www.youtube.com/watch?v=q5pESPQpXxE>`_
+
+Now let's find the most similar other movies for a few movies based on their embedding representations in latent factor space:
+
+.. code:: python
+
+ # Terminator 2: Judgment Day (1991)
+ model.similar_items(589, n_items=10)
+
+.. parsed-literal::
+
+ 2571 Matrix, The (1999)
+ 1527 Fifth Element, The (1997)
+ 2916 Total Recall (1990)
+ 3527 Predator (1987)
+ 780 Independence Day (ID4) (1996)
+ 1909 X-Files: Fight the Future, The (1998)
+ 733 Rock, The (1996)
+ 1376 Star Trek IV: The Voyage Home (1986)
+ 480 Jurassic Park (1993)
+ 1200 Aliens (1986)
+
+`I hope you like explosions... <https://www.youtube.com/watch?v=uENYMZNzg9w>`_
+
+.. code:: python
+
+ # Being John Malkovich (1999)
+ model.similar_items(2997, n_items=10)
+
+.. parsed-literal::
+
+ 2599 Election (1999)
+ 3174 Man on the Moon (1999)
+ 2858 American Beauty (1999)
+ 3317 Wonder Boys (2000)
+ 223 Clerks (1994)
+ 3897 Almost Famous (2000)
+ 2395 Rushmore (1998)
+ 2502 Office Space (1999)
+ 2908 Boys Don't Cry (1999)
+ 3481 High Fidelity (2000)
+
+`Let's get weird... <https://www.youtube.com/watch?v=lIpev8JXJHQ&t=5s>`_
+