refactor package and add utilities documentation #156

Merged
merged 21 commits into from
Jan 12, 2021

Changes from 1 commit
add kmerf docstrings
sampan501 committed Jan 11, 2021
commit 3831b4b1b21c7a37f47846f7e75a1bed26d74062
4 changes: 4 additions & 0 deletions docs/reference/independence.rst
@@ -3,6 +3,10 @@ Independence

.. currentmodule:: hyppo.independence

Kernel Mean Embedding Random Forest (KMERF)
-------------------------------------------
.. autoclass:: KMERF

Multiscale Graph Correlation (MGC)
----------------------------------
.. autoclass:: MGC
126 changes: 117 additions & 9 deletions hyppo/independence/kmerf.py
@@ -16,7 +16,77 @@

class KMERF(IndependenceTest):
r"""
Class for calculating the kernel mean embedding (KMERF) test statistic and p-value.
Class for calculating the KMERF test statistic and p-value.

The KMERF test statistic is a kernel method for testing independence that uses
a random forest-induced similarity matrix as its input, and has been shown to have
especially large gains in finite-sample testing power in high-dimensional settings
[#1KMERF]_.

Parameters
----------
forest : {"classifier", "regressor"}
Type of forest used when running the independence test. If the `y` input in
``test`` is categorical, use the "classifier" keyword.
ntrees : int, optional (default: 500)
The number of trees used in the random forest.
**kwargs : optional
Additional arguments used for the forest (see
``sklearn.ensemble.RandomForestClassifier`` or
``sklearn.ensemble.RandomForestRegressor``)

Notes
-----
A description of KMERF in greater detail can be found in [#1KMERF]_. It is computed
using the following steps:

Let :math:`x` and :math:`y` be :math:`(n, p)` samples of random variables
:math:`X` and :math:`Y`.

+ Run random forest with :math:`m` trees. Independent bootstrap samples of size
:math:`n_{b} \leq n` are drawn to build a tree each time; each tree structure
within the forest is denoted as :math:`\phi_w \in \mathbf{P}`,
:math:`w \in \{ 1, \ldots, m \}`; :math:`\phi_w(x_i)` denotes the partition
assigned to :math:`x_i`.

+ Calculate the proximity kernel:

.. math::

\mathbf{K}^{\mathbf{x}}_{ij} = \frac{1}{m} \sum_{w = 1}^{m} I(\phi_w(x_i)
= \phi_w(x_j))

where :math:`I(\cdot)` is the indicator function, so the kernel measures how
often two observations lie in the same partition.

+ Compute the induced kernel correlation: Let

.. math::

\mathbf{L}^{\mathbf{x}}_{ij}=
\begin{cases}
\mathbf{K}^{\mathbf{x}}_{ij}
- \frac{1}{n-2} \sum_{t=1}^{n} \mathbf{K}^{\mathbf{x}}_{it}
- \frac{1}{n-2} \sum_{s=1}^{n} \mathbf{K}^{\mathbf{x}}_{sj}
+ \frac{1}{(n-1)(n-2)} \sum_{s,t=1}^{n} \mathbf{K}^{\mathbf{x}}_{st}
& \mbox{when} \ i \neq j \\
0 & \mbox{ otherwise}
\end{cases}

+ Then let :math:`\mathbf{K}^{\mathbf{y}}` be the Euclidean distance induced kernel,
and similarly compute :math:`\mathbf{L}^{\mathbf{y}}` from
:math:`\mathbf{K}^{\mathbf{y}}`. The unbiased kernel correlation equals

.. math::

KMERF_n(\mathbf{x}, \mathbf{y}) = \frac{1}{n(n-3)}
\mathrm{tr} \left( \mathbf{L}^{\mathbf{x}} \mathbf{L}^{\mathbf{y}} \right)

References
----------
.. [#1KMERF] Shen, C., Panda, S., & Vogelstein, J. T. (2018). Learning
Interpretable Characteristic Kernels via Decision Forests.
arXiv preprint arXiv:1812.00029.
"""

def __init__(self, forest="regressor", ntrees=500, **kwargs):
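As a sanity check on the steps above, here is a minimal sketch of the statistic outside of hyppo, assuming a fitted scikit-learn forest. The helper names (`proximity_kernel`, `unbiased_center`) and the `max(d) - d` construction for the Euclidean-induced kernel of `y` are illustrative assumptions, not the package's implementation:

```python
# Sketch only: follows the formulas in the KMERF Notes above,
# not hyppo's internal code. Helper names are illustrative.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def proximity_kernel(forest, x):
    # leaves[i, w] = index of the leaf that tree w assigns to sample i
    leaves = forest.apply(x)  # shape (n, m)
    # K[i, j] = fraction of trees in which i and j share a partition
    return (leaves[:, None, :] == leaves[None, :, :]).mean(axis=2)

def unbiased_center(K):
    # The L matrix from the docstring: unbiased (HSIC-style) centering
    n = K.shape[0]
    L = (
        K
        - K.sum(axis=1, keepdims=True) / (n - 2)
        - K.sum(axis=0, keepdims=True) / (n - 2)
        + K.sum() / ((n - 1) * (n - 2))
    )
    np.fill_diagonal(L, 0)
    return L

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))
y = x[:, 0] + 0.1 * rng.normal(size=50)

forest = RandomForestRegressor(n_estimators=500).fit(x, y)
Kx = proximity_kernel(forest, x)
dy = np.abs(y[:, None] - y[None, :])
Ky = dy.max() - dy  # one common distance-induced kernel (an assumption here)
Lx, Ly = unbiased_center(Kx), unbiased_center(Ky)
n = len(y)
stat = np.trace(Lx @ Ly) / (n * (n - 3))
```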
@@ -30,6 +100,20 @@ def _statistic(self, x, y):
r"""
Helper function that calculates the KMERF test statistic.

Parameters
----------
x, y : ndarray
Input data matrices. `x` and `y` must have the same number of
samples. That is, the shapes must be `(n, p)` and `(n, q)` where
`n` is the number of samples and `p` and `q` are the number of
dimensions. Alternatively, `x` and `y` can be distance matrices,
where the shapes must both be `(n, n)`.

Returns
-------
stat : float
The computed KMERF statistic.

Notes
-----
`y` must be categorical.
"""
y = y.reshape(-1)
@@ -45,15 +129,39 @@ def _statistic(self, x, y):
def test(self, x, y, reps=1000, workers=1):
r"""
Calculates the KMERF test statistic and p-value.

Parameters
----------
x, y : ndarray
Input data matrices. `x` and `y` must have the same number of
samples. That is, the shapes must be `(n, p)` and `(n, q)` where
`n` is the number of samples and `p` and `q` are the number of
dimensions. Alternatively, `x` and `y` can be distance matrices,
where the shapes must both be `(n, n)`.
reps : int, optional (default: 1000)
The number of replications used to estimate the null distribution
in the permutation test that computes the p-value.
workers : int, optional (default: 1)
The number of cores to parallelize the p-value computation over.
Supply -1 to use all cores available to the process.

Returns
-------
stat : float
The computed KMERF statistic.
pvalue : float
The computed KMERF p-value.

Examples
--------
>>> import numpy as np
>>> from hyppo.independence import KMERF
>>> x = np.arange(100)
>>> y = x
>>> '%.1f, %.3f' % KMERF().test(x, y) # doctest: +SKIP
'1.0, 0.001'
"""
check_input = _CheckInputs(x, y, reps=reps)
x, y = check_input()

stat, pvalue, null_dist = perm_test(
self._statistic, x, y, reps=reps, workers=workers, is_distsim=False
)
self.stat = stat
self.pvalue = pvalue
self.null_dist = null_dist

return stat, pvalue
return super(KMERF, self).test(x, y, reps, workers, is_distsim=False)
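For reference, the permutation p-value controlled by ``reps`` and ``workers`` boils down to the following serial sketch; `perm_pvalue` is a hypothetical helper written here for illustration, not the package's ``perm_test``, which additionally handles input checks, distance/similarity inputs, and parallel workers:

```python
# Minimal serial permutation test (illustrative only).
import numpy as np

def perm_pvalue(statistic, x, y, reps=1000, seed=0):
    rng = np.random.default_rng(seed)
    observed = statistic(x, y)
    # Null distribution: recompute the statistic on row-permuted y
    null = np.array(
        [statistic(x, y[rng.permutation(len(y))]) for _ in range(reps)]
    )
    # Add-one correction so the estimated p-value is never exactly 0
    return (1 + (null >= observed).sum()) / (1 + reps)
```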
31 changes: 10 additions & 21 deletions hyppo/independence/mgc.py
@@ -55,7 +55,7 @@ class MGC(IndependenceTest):
:math:`n \times n` distance matrices :math:`A` and :math:`B` (the
centering and unbiased modification) [3]_.

+ For all values :math:`k` and :math:`l` from :math:`1, ..., n`,

* The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
are calculated for each property. Here, :math:`G_k (i, j)` indicates
@@ -66,19 +66,19 @@ class MGC(IndependenceTest):
* Let :math:`\circ` denote the entry-wise matrix product; local
correlations are then summed and normalized using the following statistic:

.. math::

c^{kl} = \frac{\sum_{ij} A G_k B H_l}
{\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}

+ The MGC test statistic is the smoothed optimal local correlation of
:math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
(which essentially sets all isolated large correlations to 0 and leaves
connected large correlations unchanged, see [#3MGC]_). MGC is,

.. math::

MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
\right)
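A sketch of one local correlation :math:`c^{kl}` from the formula above. It assumes the k-NN graphs are built from the raw distance matrices `Dx`, `Dy` while the sums use the centered matrices `A`, `B`; the full implementation additionally smooths over all :math:`(k, l)` pairs:

```python
# Sketch of a single local correlation c^{kl}; not hyppo's implementation.
import numpy as np

def knn_graph(D, k):
    # G[i, j] = 1 if j is among the k nearest neighbors of i.
    # Assumes D has a zero diagonal, so column 0 of the argsort is self.
    order = np.argsort(D, axis=1)
    G = np.zeros_like(D)
    rows = np.arange(D.shape[0])[:, None]
    G[rows, order[:, 1:k + 1]] = 1
    return G

def local_corr(A, B, Dx, Dy, k, l):
    # Graphs come from raw distances; the sums use the centered matrices
    Gk, Hl = knn_graph(Dx, k), knn_graph(Dy, l)
    num = (A * Gk * B * Hl).sum()
    den = np.sqrt((A ** 2 * Gk).sum() * (B ** 2 * Hl).sum())
    return num / den
```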

The test statistic returns a value between :math:`(-1, 1)` since it is
@@ -192,17 +192,6 @@ def test(self, x, y, reps=1000, workers=1):
>>> '%.1f, %.3f' % (stat, pvalue)
'1.0, 0.001'

Increasing the number of replications yields p-values with higher
resolution (smaller achievable alpha levels).

>>> import numpy as np
>>> from hyppo.independence import MGC
>>> x = np.arange(100)
>>> y = x
>>> stat, pvalue, _ = MGC().test(x, y, reps=10000)
>>> '%.1f, %.3f' % (stat, pvalue)
'1.0, 0.000'

In addition, the inputs can be distance matrices. Using this is the
same as before, except the ``compute_distance`` parameter must be set
to ``None``.
35 changes: 20 additions & 15 deletions hyppo/ksample/_utils.py
@@ -59,33 +59,38 @@ def _convert_inputs_float64(self):
def _check_indep_test(self):
tests = [CCA, Dcorr, HHG, RV, Hsic, MGC, KMERF]
if self.indep_test.__class__ not in tests and self.indep_test:
raise ValueError(
"indep_test must be in {}".format(tests)
)
raise ValueError("indep_test must be in {}".format(tests))

def _check_min_samples(self):
for i in self.inputs:
if i.shape[0] <= 3:
raise ValueError("Number of samples is too low")


def k_sample_transform(inputs):
def k_sample_transform(inputs, test_type="normal"):
n_inputs = len(inputs)
u = np.vstack(inputs)
if np.var(u) == 0:
raise ValueError("Test cannot be run, the inputs have 0 variance")

if n_inputs == 2:
n1 = inputs[0].shape[0]
n2 = inputs[1].shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
if test_type == "rf":
v = np.concatenate(
[np.repeat(i, inputs[i].shape[0]) for i in range(n_inputs)], axis=0
)
elif test_type == "normal":
if n_inputs == 2:
n1 = inputs[0].shape[0]
n2 = inputs[1].shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
else:
vs = []
for i in range(n_inputs):
n = inputs[i].shape[0]
encode = np.zeros(shape=(n, n_inputs))
encode[:, i] = np.ones(shape=n)
vs.append(encode)
v = np.concatenate(vs)
else:
vs = []
for i in range(n_inputs):
n = inputs[i].shape[0]
encode = np.zeros(shape=(n, n_inputs))
encode[:, i] = np.ones(shape=n)
vs.append(encode)
v = np.concatenate(vs)
raise ValueError("test_type must be normal or rf")

return u, v
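To see what the two encodings produce, here is a small standalone illustration of the logic above (written inline rather than importing the private helper):

```python
# Standalone illustration of the two label encodings in k_sample_transform
import numpy as np

samples = [np.zeros((3, 2)), np.ones((2, 2))]
u = np.vstack(samples)  # stacked data, shape (5, 2)

# test_type="normal": indicator labels for kernel-based tests
v_normal = np.vstack([np.zeros((3, 1)), np.ones((2, 1))])

# test_type="rf": integer class labels, as a forest classifier expects
v_rf = np.concatenate(
    [np.repeat(i, s.shape[0]) for i, s in enumerate(samples)]
)

print(v_normal.ravel())  # [0. 0. 0. 1. 1.]
print(v_rf)              # [0 0 0 1 1]
```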
12 changes: 9 additions & 3 deletions hyppo/ksample/ksamp.py
@@ -93,7 +93,7 @@ def __init__(self, indep_test, compute_distance="euclidean", bias=False, **kwarg
compute_distance=compute_distance, **kwargs
)
elif self.indep_test_name == "kmerf":
self.indep_test = indep_test(**kwargs)
self.indep_test = indep_test(forest="classifier", **kwargs)
else:
self.indep_test = indep_test()

@@ -115,7 +115,10 @@ def _statistic(self, *args):
matrices, where the shapes must all be `(n, n)`.
"""
inputs = list(args)
u, v = k_sample_transform(inputs)
if self.indep_test_name == "kmerf":
u, v = k_sample_transform(inputs, test_type="rf")
else:
u, v = k_sample_transform(inputs)

return self.indep_test._statistic(u, v)

@@ -167,7 +170,10 @@ def test(self, *args, reps=1000, workers=1, auto=True):
indep_test=self.indep_test,
)
inputs = check_input()
u, v = k_sample_transform(inputs)
if self.indep_test_name == "kmerf":
u, v = k_sample_transform(inputs, test_type="rf")
else:
u, v = k_sample_transform(inputs)

kwargs = {}
if self.indep_test_name in ["dcorr", "hsic"]:
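Putting the KSample wiring together, a hypothetical end-to-end usage might look like the sketch below. The constructor signature (passing the test class directly) is an assumption inferred from the calls above, so consult the package documentation for the exact API:

```python
# Hypothetical usage sketch; the KSample constructor signature is
# inferred from the diff above and may differ in the released API.
import numpy as np
from hyppo.independence import KMERF
from hyppo.ksample import KSample

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))
y = rng.normal(loc=0.5, size=(40, 3))

# For KMERF, KSample builds the test with a classifier forest and
# integer group labels (test_type="rf") before running it.
stat, pvalue = KSample(KMERF).test(x, y)  # doctest: +SKIP
```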
3 changes: 2 additions & 1 deletion pytest.ini
@@ -1,8 +1,9 @@
[pytest]
addopts = --cov=hyppo --doctest-modules
deselect = "hyppo/independence/kmerf.py"


filterwarnings =
# Matrix PendingDeprecationWarning.
ignore:Using or importing the ABCs from 'collections'
ignore:the matrix subclass is not