refactor package and add utilities documentation #156

Merged
merged 21 commits into from
Jan 12, 2021

Changes from 1 commit
add kmerf docstrings
sampan501 committed Jan 11, 2021
commit 3831b4b1b21c7a37f47846f7e75a1bed26d74062
4 changes: 4 additions & 0 deletions docs/reference/independence.rst
@@ -3,6 +3,10 @@ Independence

.. currentmodule:: hyppo.independence

Kernel Mean Embedding Random Forest (KMERF)
-------------------------------------------
.. autoclass:: KMERF

Multiscale Graph Correlation (MGC)
----------------------------------
.. autoclass:: MGC
126 changes: 117 additions & 9 deletions hyppo/independence/kmerf.py
@@ -16,7 +16,77 @@

class KMERF(IndependenceTest):
r"""
Class for calculating the kernel mean embedding (KMERF) test statistic and p-value.
Class for calculating the KMERF test statistic and p-value.

The KMERF test statistic is a kernel method for testing independence that uses
a random forest-induced similarity matrix as its input, and has been shown to have
especially large gains in finite-sample testing power in high-dimensional settings
[#1KMERF]_.

Parameters
----------
forest : {"classifier", "regressor"}
Type of forest used when running the independence test. If the `y` input in
``test`` is categorical, use the "classifier" keyword.
ntrees : int, optional (default: 500)
The number of trees used in the random forest.
**kwargs : optional
Additional arguments used for the forest (see
``sklearn.ensemble.RandomForestClassifier`` or
``sklearn.ensemble.RandomForestRegressor``)

Notes
-----
A description of KMERF in greater detail can be found in [#1KMERF]_. It is computed
using the following steps:

Let :math:`x` and :math:`y` be :math:`(n, p)` samples of random variables
:math:`X` and :math:`Y`.

+ Run random forest with :math:`m` trees. Independent bootstrap samples of size
:math:`n_{b} \leq n` are drawn to build a tree each time; each tree structure
within the forest is denoted as :math:`\phi_w \in \mathbf{P}`,
:math:`w \in \{ 1, \ldots, m \}`; :math:`\phi_w(x_i)` denotes the partition
assigned to :math:`x_i`.

+ Calculate the proximity kernel:

.. math::

\mathbf{K}^{\mathbf{x}}_{ij} = \frac{1}{m} \sum_{w = 1}^{m} I(\phi_w(x_i)
= \phi_w(x_j))

where :math:`I(\cdot)` is the indicator function, so the kernel measures how
often two observations lie in the same partition.

+ Compute the induced kernel correlation: Let

.. math::

\mathbf{L}^{\mathbf{x}}_{ij}=
\begin{cases}
\mathbf{K}^{\mathbf{x}}_{ij}
- \frac{1}{n-2} \sum_{t=1}^{n} \mathbf{K}^{\mathbf{x}}_{it}
- \frac{1}{n-2} \sum_{s=1}^{n} \mathbf{K}^{\mathbf{x}}_{sj}
+ \frac{1}{(n-1)(n-2)} \sum_{s,t=1}^{n} \mathbf{K}^{\mathbf{x}}_{st}
& \mbox{when} \ i \neq j \\
0 & \mbox{ otherwise}
\end{cases}

+ Then let :math:`\mathbf{K}^{\mathbf{y}}` be the Euclidean distance induced kernel,
and similarly compute :math:`\mathbf{L}^{\mathbf{y}}` from
:math:`\mathbf{K}^{\mathbf{y}}`. The unbiased kernel correlation equals

.. math::

KMERF_n(\mathbf{x}, \mathbf{y}) = \frac{1}{n(n-3)}
\mathrm{tr} \left( \mathbf{L}^{\mathbf{x}} \mathbf{L}^{\mathbf{y}} \right)

References
----------
.. [#1KMERF] Shen, C., Panda, S., & Vogelstein, J. T. (2018). Learning
Interpretable Characteristic Kernels via Decision Forests.
arXiv preprint arXiv:1812.00029.
"""

def __init__(self, forest="regressor", ntrees=500, **kwargs):
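As a sanity check on the steps above, here is a minimal sketch of the statistic outside of hyppo, assuming a fitted scikit-learn forest. The helper names (`proximity_kernel`, `unbiased_center`) and the `max(d) - d` construction for the Euclidean-induced kernel of `y` are illustrative assumptions, not the package's implementation:

```python
# Sketch only: follows the formulas in the KMERF Notes above,
# not hyppo's internal code. Helper names are illustrative.
import numpy as np
from sklearn.ensemble import RandomForestRegressor

def proximity_kernel(forest, x):
    # leaves[i, w] = index of the leaf that tree w assigns to sample i
    leaves = forest.apply(x)  # shape (n, m)
    # K[i, j] = fraction of trees in which i and j share a partition
    return (leaves[:, None, :] == leaves[None, :, :]).mean(axis=2)

def unbiased_center(K):
    # The L matrix from the docstring: unbiased (HSIC-style) centering
    n = K.shape[0]
    L = (
        K
        - K.sum(axis=1, keepdims=True) / (n - 2)
        - K.sum(axis=0, keepdims=True) / (n - 2)
        + K.sum() / ((n - 1) * (n - 2))
    )
    np.fill_diagonal(L, 0)
    return L

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))
y = x[:, 0] + 0.1 * rng.normal(size=50)

forest = RandomForestRegressor(n_estimators=500).fit(x, y)
Kx = proximity_kernel(forest, x)
dy = np.abs(y[:, None] - y[None, :])
Ky = dy.max() - dy  # one common distance-induced kernel (an assumption here)
Lx, Ly = unbiased_center(Kx), unbiased_center(Ky)
n = len(y)
stat = np.trace(Lx @ Ly) / (n * (n - 3))
```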
@@ -30,6 +100,20 @@ def _statistic(self, x, y):
r"""
Helper function that calculates the KMERF test statistic.

Parameters
----------
x, y : ndarray
Input data matrices. `x` and `y` must have the same number of
samples. That is, the shapes must be `(n, p)` and `(n, q)` where
`n` is the number of samples and `p` and `q` are the number of
dimensions. Alternatively, `x` and `y` can be distance matrices,
where the shapes must both be `(n, n)`.

Returns
-------
stat : float
The computed KMERF statistic.

Notes
-----
`y` must be categorical.
"""
y = y.reshape(-1)
@@ -45,15 +129,39 @@ def _statistic(self, x, y):
def test(self, x, y, reps=1000, workers=1):
r"""
Calculates the KMERF test statistic and p-value.

Parameters
----------
x, y : ndarray
Input data matrices. `x` and `y` must have the same number of
samples. That is, the shapes must be `(n, p)` and `(n, q)` where
`n` is the number of samples and `p` and `q` are the number of
dimensions. Alternatively, `x` and `y` can be distance matrices,
where the shapes must both be `(n, n)`.
reps : int, optional (default: 1000)
The number of replications used to estimate the null distribution
in the permutation test that computes the p-value.
workers : int, optional (default: 1)
The number of cores to parallelize the p-value computation over.
Supply -1 to use all cores available to the process.

Returns
-------
stat : float
The computed KMERF statistic.
pvalue : float
The computed KMERF p-value.

Examples
--------
>>> import numpy as np
>>> from hyppo.independence import KMERF
>>> x = np.arange(100)
>>> y = x
>>> '%.1f, %.3f' % KMERF().test(x, y) # doctest: +SKIP
'1.0, 0.001'
"""
check_input = _CheckInputs(x, y, reps=reps)
x, y = check_input()

stat, pvalue, null_dist = perm_test(
self._statistic, x, y, reps=reps, workers=workers, is_distsim=False
)
self.stat = stat
self.pvalue = pvalue
self.null_dist = null_dist

return stat, pvalue
return super(KMERF, self).test(x, y, reps, workers, is_distsim=False)
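For reference, the permutation p-value controlled by ``reps`` and ``workers`` boils down to the following serial sketch; `perm_pvalue` is a hypothetical helper written here for illustration, not the package's ``perm_test``, which additionally handles input checks, distance/similarity inputs, and parallel workers:

```python
# Minimal serial permutation test (illustrative only).
import numpy as np

def perm_pvalue(statistic, x, y, reps=1000, seed=0):
    rng = np.random.default_rng(seed)
    observed = statistic(x, y)
    # Null distribution: recompute the statistic on row-permuted y
    null = np.array(
        [statistic(x, y[rng.permutation(len(y))]) for _ in range(reps)]
    )
    # Add-one correction so the estimated p-value is never exactly 0
    return (1 + (null >= observed).sum()) / (1 + reps)
```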
31 changes: 10 additions & 21 deletions hyppo/independence/mgc.py
@@ -55,7 +55,7 @@ class MGC(IndependenceTest):
:math:`n \times n` distance matrices :math:`A` and :math:`B` (the
centering and unbiased modification) [3]_.

+ For all values :math:`k` and :math:`l` from :math:`1, ..., n`,

* The :math:`k`-nearest neighbor and :math:`l`-nearest neighbor graphs
are calculated for each property. Here, :math:`G_k (i, j)` indicates
@@ -66,19 +66,19 @@ class MGC(IndependenceTest):
* Let :math:`\circ` denote the entry-wise matrix product; local
correlations are then summed and normalized using the following statistic:

.. math::

c^{kl} = \frac{\sum_{ij} A G_k B H_l}
{\sqrt{\sum_{ij} A^2 G_k \times \sum_{ij} B^2 H_l}}

+ The MGC test statistic is the smoothed optimal local correlation of
:math:`\{ c^{kl} \}`. Denote the smoothing operation as :math:`R(\cdot)`
(which essentially sets all isolated large correlations to 0 and leaves
connected large correlations unchanged, see [#3MGC]_). MGC is,

.. math::

MGC_n (x, y) = \max_{(k, l)} R \left(c^{kl} \left( x_n, y_n \right)
\right)
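A sketch of one local correlation :math:`c^{kl}` from the formula above. It assumes the k-NN graphs are built from the raw distance matrices `Dx`, `Dy` while the sums use the centered matrices `A`, `B`; the full implementation additionally smooths over all :math:`(k, l)` pairs:

```python
# Sketch of a single local correlation c^{kl}; not hyppo's implementation.
import numpy as np

def knn_graph(D, k):
    # G[i, j] = 1 if j is among the k nearest neighbors of i.
    # Assumes D has a zero diagonal, so column 0 of the argsort is self.
    order = np.argsort(D, axis=1)
    G = np.zeros_like(D)
    rows = np.arange(D.shape[0])[:, None]
    G[rows, order[:, 1:k + 1]] = 1
    return G

def local_corr(A, B, Dx, Dy, k, l):
    # Graphs come from raw distances; the sums use the centered matrices
    Gk, Hl = knn_graph(Dx, k), knn_graph(Dy, l)
    num = (A * Gk * B * Hl).sum()
    den = np.sqrt((A ** 2 * Gk).sum() * (B ** 2 * Hl).sum())
    return num / den
```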

The test statistic returns a value between :math:`(-1, 1)` since it is
@@ -192,17 +192,6 @@ def test(self, x, y, reps=1000, workers=1):
>>> '%.1f, %.3f' % (stat, pvalue)
'1.0, 0.001'

Increasing the number of replications yields p-values with higher
resolution (smaller achievable alpha levels).

>>> import numpy as np
>>> from hyppo.independence import MGC
>>> x = np.arange(100)
>>> y = x
>>> stat, pvalue, _ = MGC().test(x, y, reps=10000)
>>> '%.1f, %.3f' % (stat, pvalue)
'1.0, 0.000'

In addition, the inputs can be distance matrices. Using this is the
same as before, except the ``compute_distance`` parameter must be set
to ``None``.
35 changes: 20 additions & 15 deletions hyppo/ksample/_utils.py
@@ -59,33 +59,38 @@ def _convert_inputs_float64(self):
def _check_indep_test(self):
tests = [CCA, Dcorr, HHG, RV, Hsic, MGC, KMERF]
if self.indep_test.__class__ not in tests and self.indep_test:
raise ValueError(
"indep_test must be in {}".format(tests)
)
raise ValueError("indep_test must be in {}".format(tests))

def _check_min_samples(self):
for i in self.inputs:
if i.shape[0] <= 3:
raise ValueError("Number of samples is too low")


def k_sample_transform(inputs):
def k_sample_transform(inputs, test_type="normal"):
n_inputs = len(inputs)
u = np.vstack(inputs)
if np.var(u) == 0:
raise ValueError("Test cannot be run, the inputs have 0 variance")

if n_inputs == 2:
n1 = inputs[0].shape[0]
n2 = inputs[1].shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
if test_type == "rf":
v = np.concatenate(
[np.repeat(i, inputs[i].shape[0]) for i in range(n_inputs)], axis=0
)
elif test_type == "normal":
if n_inputs == 2:
n1 = inputs[0].shape[0]
n2 = inputs[1].shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
else:
vs = []
for i in range(n_inputs):
n = inputs[i].shape[0]
encode = np.zeros(shape=(n, n_inputs))
encode[:, i] = np.ones(shape=n)
vs.append(encode)
v = np.concatenate(vs)
else:
vs = []
for i in range(n_inputs):
n = inputs[i].shape[0]
encode = np.zeros(shape=(n, n_inputs))
encode[:, i] = np.ones(shape=n)
vs.append(encode)
v = np.concatenate(vs)
raise ValueError("test_type must be normal or rf")

return u, v
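To see what the two encodings produce, here is a small standalone illustration of the logic above (written inline rather than importing the private helper):

```python
# Standalone illustration of the two label encodings in k_sample_transform
import numpy as np

samples = [np.zeros((3, 2)), np.ones((2, 2))]
u = np.vstack(samples)  # stacked data, shape (5, 2)

# test_type="normal": indicator labels for kernel-based tests
v_normal = np.vstack([np.zeros((3, 1)), np.ones((2, 1))])

# test_type="rf": integer class labels, as a forest classifier expects
v_rf = np.concatenate(
    [np.repeat(i, s.shape[0]) for i, s in enumerate(samples)]
)

print(v_normal.ravel())  # [0. 0. 0. 1. 1.]
print(v_rf)              # [0 0 0 1 1]
```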
12 changes: 9 additions & 3 deletions hyppo/ksample/ksamp.py
@@ -93,7 +93,7 @@ def __init__(self, indep_test, compute_distance="euclidean", bias=False, **kwarg
compute_distance=compute_distance, **kwargs
)
elif self.indep_test_name == "kmerf":
self.indep_test = indep_test(**kwargs)
self.indep_test = indep_test(forest="classifier", **kwargs)
else:
self.indep_test = indep_test()

@@ -115,7 +115,10 @@ def _statistic(self, *args):
matrices, where the shapes must all be `(n, n)`.
"""
inputs = list(args)
u, v = k_sample_transform(inputs)
if self.indep_test_name == "kmerf":
u, v = k_sample_transform(inputs, test_type="rf")
else:
u, v = k_sample_transform(inputs)

return self.indep_test._statistic(u, v)

@@ -167,7 +170,10 @@ def test(self, *args, reps=1000, workers=1, auto=True):
indep_test=self.indep_test,
)
inputs = check_input()
u, v = k_sample_transform(inputs)
if self.indep_test_name == "kmerf":
u, v = k_sample_transform(inputs, test_type="rf")
else:
u, v = k_sample_transform(inputs)

kwargs = {}
if self.indep_test_name in ["dcorr", "hsic"]:
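Putting the KSample wiring together, a hypothetical end-to-end usage might look like the sketch below. The constructor signature (passing the test class directly) is an assumption inferred from the calls above, so consult the package documentation for the exact API:

```python
# Hypothetical usage sketch; the KSample constructor signature is
# inferred from the diff above and may differ in the released API.
import numpy as np
from hyppo.independence import KMERF
from hyppo.ksample import KSample

rng = np.random.default_rng(0)
x = rng.normal(size=(50, 3))
y = rng.normal(loc=0.5, size=(40, 3))

# For KMERF, KSample builds the test with a classifier forest and
# integer group labels (test_type="rf") before running it.
stat, pvalue = KSample(KMERF).test(x, y)  # doctest: +SKIP
```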
3 changes: 2 additions & 1 deletion pytest.ini
@@ -1,8 +1,9 @@
[pytest]
addopts = --cov=hyppo --doctest-modules
deselect = "hyppo/independence/kmerf.py"


filterwarnings =
# Matrix PendingDeprecationWarning.
ignore:Using or importing the ABCs from 'collections'
ignore:the matrix subclass is not