Calc kernel once #83

Merged 12 commits on Apr 28, 2020
6 changes: 3 additions & 3 deletions .travis.yml
@@ -1,9 +1,9 @@
language: python

python:
- '3.5'
- '3.6'
- '3.7'
- '3.8'

jobs:
include:
@@ -24,11 +24,11 @@ cache: pip
install:
- pip install -r requirements.txt
- pip install -U pytest pytest-cov codecov
- if [[ $TRAVIS_PYTHON_VERSION != '3.5' ]]; then travis_retry pip install black; fi
- pip install black

script:
- pytest hyppo/
- if [[ $TRAVIS_PYTHON_VERSION != '3.5' ]]; then black --check --diff ./hyppo; fi
- black --check --diff ./hyppo

after_success:
- codecov
5 changes: 5 additions & 0 deletions README.md
@@ -15,4 +15,9 @@
- **Issues:** [https://github.com/neurodata/hyppo/issues](https://github.com/neurodata/hyppo/issues)
- **Contribution Guide:** [https://hyppo.neurodata.io/contributing.html](https://hyppo.neurodata.io/contributing.html)

Some system/package requirements:
- **Python**: 3.6+
- **OS**: All major platforms (Linux, macOS, Windows)
- **Dependencies**: numpy, scipy, numba, scikit-learn

`hyppo` intends to be a comprehensive multivariate hypothesis testing package and runs on all major operating systems. It also includes novel tests not found in other packages. It is quick to install and free of charge. If you need multivariate hypothesis testing, be sure to give `hyppo` a try!
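
As a quick sanity check of the install claim above, here is a minimal usage sketch. It assumes the package is published on PyPI under the name `hyppo` and that `CCA.test` returns a `(statistic, p-value)` pair, as the doctest later in this diff suggests; the data are purely illustrative.

```python
# pip install hyppo
import numpy as np
from hyppo.independence import CCA

# toy linearly related data: 100 samples, one feature each
x = np.arange(100, dtype=float).reshape(-1, 1)
y = x + np.random.normal(size=(100, 1))

stat, pvalue = CCA().test(x, y, reps=1000, workers=1)
print(stat, pvalue)  # statistic near 1, small p-value
```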
4 changes: 0 additions & 4 deletions benchmarks/power.py
@@ -35,10 +35,6 @@ def __call__(self, index):
# calculate permuted stats, store in null distribution
perm_stat = self.test._statistic(x, permy)

if self.test_name == "Pearson":
obs_stat = np.abs(obs_stat)
perm_stat = np.abs(perm_stat)

return obs_stat, perm_stat


12 changes: 0 additions & 12 deletions docs/reference/independence.rst
@@ -26,15 +26,3 @@ Canonical Correlation Analysis (CCA)
RV
---
.. autoclass:: RV

Pearson
-------
.. autoclass:: Pearson

Kendall's tau
-------------
.. autoclass:: Kendall

Spearman's rho
--------------
.. autoclass:: Spearman
18 changes: 7 additions & 11 deletions docs/tutorials/independence/indep_power.ipynb

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions docs/tutorials/independence/power.py
@@ -35,10 +35,6 @@ def __call__(self, index):
# calculate permuted stats, store in null distribution
perm_stat = self.test._statistic(x, permy)

if self.test_name == "Pearson":
obs_stat = np.abs(obs_stat)
perm_stat = np.abs(perm_stat)

return obs_stat, perm_stat


40 changes: 26 additions & 14 deletions hyppo/_utils.py
@@ -2,9 +2,9 @@
from joblib import Parallel, delayed

import numpy as np
from scipy._lib._util import check_random_state
from scipy.stats.distributions import chi2
from scipy.spatial.distance import cdist
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel


# from scipy
@@ -95,29 +95,37 @@ def check_inputs_distmat(inputs):
)


def euclidean(x):
def euclidean(x, workers=None):
"""Default euclidean distance function calculation"""
return cdist(x, x, metric="euclidean")
return pairwise_distances(X=x, metric="euclidean", n_jobs=workers)


def gaussian(x):
def gaussian(x, workers=None):
"""Default medial gaussian kernel similarity calculation"""
l1 = cdist(x, x, "cityblock")
mask = np.ones(l1.shape, dtype=bool)
np.fill_diagonal(mask, 0)
gamma = 1.0 / (2 * (np.median(l1[mask]) ** 2))
return np.exp(-gamma * cdist(x, x, "sqeuclidean"))
l1 = pairwise_distances(X=x, metric="l1", n_jobs=workers)
n = l1.shape[0]
med = np.median(
np.lib.stride_tricks.as_strided(
l1, (n - 1, n + 1), (l1.itemsize * (n + 1), l1.itemsize)
)[:, 1:]
)
gamma = 1.0 / (2 * (med ** 2))
return rbf_kernel(x, gamma=gamma)
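
The rewritten `gaussian` above computes the median-heuristic bandwidth from the off-diagonal entries of the L1 distance matrix via a strided view instead of a boolean mask, then delegates the kernel to `rbf_kernel` (which computes `exp(-gamma * ||x - y||^2)`, matching the removed `sqeuclidean` formula). A small sanity-check sketch of the strided view, not part of the PR, using hypothetical data:

```python
import numpy as np
from numpy.lib.stride_tricks import as_strided
from sklearn.metrics import pairwise_distances

x = np.random.RandomState(0).randn(10, 3)
l1 = pairwise_distances(x, metric="l1")
n = l1.shape[0]

# old route: mask out the diagonal explicitly
med_mask = np.median(l1[~np.eye(n, dtype=bool)])

# new route: each row of the strided view starts at a diagonal element,
# so dropping the first column leaves exactly the off-diagonal entries
off_diag = as_strided(
    l1, (n - 1, n + 1), (l1.itemsize * (n + 1), l1.itemsize)
)[:, 1:]
med_stride = np.median(off_diag)

assert np.isclose(med_mask, med_stride)
```

Both routes give the same median; the strided view simply avoids building and applying an n-by-n boolean mask.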


# p-value computation
def _perm_stat(calc_stat, x, y):
permy = np.random.permutation(y)
def _perm_stat(calc_stat, x, y, is_distsim=True):
if is_distsim:
order = np.random.permutation(y.shape[0])
permy = y[order][:, order]
else:
permy = np.random.permutation(y)
perm_stat = calc_stat(x, permy)

return perm_stat
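
This is the heart of the "calc kernel once" change: when `is_distsim` is True, `x` and `y` are precomputed distance or kernel matrices, so each permutation replicate just reindexes the rows and columns of `y` with one shared order instead of recomputing the matrix. A short equivalence sketch with illustrative data, not part of the PR:

```python
import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(1)
y = rng.randn(8, 2)
dist_y = pairwise_distances(y, metric="euclidean")

order = rng.permutation(y.shape[0])
permuted_matrix = dist_y[order][:, order]                      # what _perm_stat does
recomputed = pairwise_distances(y[order], metric="euclidean")  # the expensive route

assert np.allclose(permuted_matrix, recomputed)
```

Permuting rows and columns with the same index order therefore yields exactly the distance matrix of the permuted samples, at a fraction of the cost per replicate.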


def perm_test(calc_stat, x, y, reps=1000, workers=1):
def perm_test(calc_stat, x, y, reps=1000, workers=1, is_distsim=True):
"""
Calculate the p-value via permutation
"""
@@ -127,12 +135,16 @@ def perm_test(calc_stat, x, y, reps=1000, workers=1):
# calculate null distribution
null_dist = np.array(
Parallel(n_jobs=workers)(
[delayed(_perm_stat)(calc_stat, x, y) for rep in range(reps)]
[delayed(_perm_stat)(calc_stat, x, y, is_distsim) for rep in range(reps)]
)
)
pvalue = (null_dist >= stat).sum() / reps

# correct for a p-value of 0: with a finite number of permutation
# replicates, the smallest attainable p-value is 1 / reps
if pvalue == 0:
pvalue = 1 / reps

return stat, pvalue, null_dist


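On the p-value floor introduced in `perm_test` above: with `reps` permutation replicates the smallest resolvable p-value is `1 / reps`, so a raw count of zero is reported as that floor instead. A tiny worked example with hypothetical numbers:

```python
import numpy as np

reps = 1000
null_dist = np.random.RandomState(2).uniform(size=reps)  # stand-in null statistics
stat = 2.0                                                # exceeds every null draw

pvalue = (null_dist >= stat).sum() / reps  # raw count gives 0.0
if pvalue == 0:
    pvalue = 1 / reps                      # floor at the test's resolution
print(pvalue)  # 0.001
```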
3 changes: 1 addition & 2 deletions hyppo/discrim/_utils.py
@@ -1,4 +1,3 @@
import warnings
import numpy as np
from .._utils import (
contains_nan,
@@ -42,7 +41,7 @@ def __call__(self):
return self.x, self.y

def _condition_input(self, x1):
"""Checks whether there is only one subject and removes
"""Checks whether there is only one subject and removes
isolates and calculate distance."""
uniques, counts = np.unique(self.y, return_counts=True)

1 change: 0 additions & 1 deletion hyppo/discrim/base.py
@@ -1,6 +1,5 @@
from abc import ABC, abstractmethod
import numpy as np
from .._utils import euclidean


class DiscriminabilityTest(ABC):
1 change: 0 additions & 1 deletion hyppo/discrim/discrim_one_samp.py
@@ -1,6 +1,5 @@
from ._utils import _CheckInputs
import numpy as np
import random
from .base import DiscriminabilityTest
from scipy._lib._util import MapWrapper

14 changes: 7 additions & 7 deletions hyppo/discrim/discrim_two_samp.py
@@ -1,6 +1,5 @@
from ._utils import _CheckInputs
import numpy as np
import random
from numba import njit
from .base import DiscriminabilityTest
from scipy._lib._util import MapWrapper
@@ -27,7 +26,8 @@ class DiscrimTwoSample(DiscriminabilityTest):
Notes
-----
Let :math:`\hat D_{x_1}` denote the sample discriminability of one approach,
and :math:`\hat D_{x_2}` denote the sample discriminability of another approach. Then,
and :math:`\hat D_{x_2}` denote the sample discriminability of another approach.
Then,

.. math::

@@ -81,15 +81,15 @@ def test(self, x1, x2, y, reps=1000, alt="neq", workers=-1):
to ``True`` in this case.
y : ndarray
A vector containing the sample ids for our `n` samples. Should be matched
to the inputs such that ``y[i]`` is the corresponding label for ``x_1[i, :]``
and ``x_2[i, :]``.
to the inputs such that ``y[i]`` is the corresponding label for
``x_1[i, :]`` and ``x_2[i, :]``.
reps : int, optional (default: 1000)
The number of replications used to estimate the null distribution
when using the permutation test used to calculate the p-value.
alt : {"greater", "less", "neq"} (default: "neq")
The alternative hypothesis for the test. Can test that first dataset is more
discriminable (alt = "greater"), less discriminable (alt = "less") or unequal
discriminability (alt = "neq").
The alternative hypothesis for the test. Can test that first dataset is
more discriminable (alt = "greater"), less discriminable (alt = "less")
or unequal discriminability (alt = "neq").
workers : int, optional (default: -1)
The number of cores to parallelize the p-value computation over.
Supply -1 to use all cores available to the Process.
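For reference, a hypothetical call of the reformatted `test` method above, using data in the style of the unit test in the next file. The import path, default constructor, and return value are assumptions not shown in this diff, so treat this as a sketch rather than the package's documented API.

```python
import numpy as np
from hyppo.discrim import DiscrimTwoSample  # assumed export

# x1 is identical for every sample; x2 separates the two label groups
x1 = np.ones((100, 2), dtype=float)
x2 = np.concatenate((np.zeros((50, 2)), np.ones((50, 2))), axis=0)
y = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

# alt="neq": two-sided test of unequal discriminability
result = DiscrimTwoSample().test(x1, x2, y, reps=1000, alt="neq", workers=-1)
print(result)
```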
5 changes: 3 additions & 2 deletions hyppo/discrim/tests/test_discrim_two_samp.py
@@ -38,7 +38,8 @@ def test_less(self):
assert_almost_equal(p, obs_p, decimal=4)

def test_neq(self):
# test whether discriminability for x1 is not equal compared to discriminability for x2
# test whether discriminability for x1 is not equal compared to
# discriminability for x2
x1 = np.ones((100, 2), dtype=float)
x2 = np.concatenate((np.zeros((50, 2)), np.ones((50, 2))), axis=0)
y = np.concatenate((np.zeros(50), np.ones(50)), axis=0)
@@ -55,7 +56,7 @@


class TestTwoSampleWarn:
"""
"""
Tests errors and warnings derived from one sample test.
"""

3 changes: 0 additions & 3 deletions hyppo/independence/__init__.py
@@ -1,8 +1,5 @@
from .pearson import Pearson
from .rv import RV
from .cca import CCA
from .kendall import Kendall
from .spearman import Spearman
from .hhg import HHG
from .dcorr import Dcorr
from .hsic import Hsic
59 changes: 18 additions & 41 deletions hyppo/independence/_utils.py
@@ -1,7 +1,4 @@
import warnings

import numpy as np
from scipy.stats import chi2

from .._utils import (
contains_nan,
@@ -15,10 +12,9 @@
class _CheckInputs:
"""Checks inputs for all independence tests"""

def __init__(self, x, y, dim, reps=None, compute_distance=None):
def __init__(self, x, y, reps=None, compute_distance=None):
self.x = x
self.y = y
self.dim = dim
self.compute_distance = compute_distance
self.reps = reps

@@ -29,6 +25,7 @@ def __call__(self):
self.x, self.y = self.check_dim_xy()
self.x, self.y = convert_xy_float64(self.x, self.y)
self._check_min_samples()
self._check_variance()
check_compute_distance(self.compute_distance)

if self.reps:
@@ -38,30 +35,20 @@

def check_dim_xy(self):
"""Convert x and y to proper dimensions"""
# for kendall, pearson, and spearman
if self.dim == 1:
# check if x or y is shape (n,)
if self.x.ndim > 1 or self.y.ndim > 1:
self.x.shape = (-1,)
self.y.shape = (-1,)

# for other tests
elif self.dim > 1:
# convert arrays of type (n,) to (n, 1)
if self.x.ndim == 1:
self.x = self.x[:, np.newaxis]
elif self.x.ndim != 2:
raise ValueError(
"Expected a 2-D array `x`, found shape " "{}".format(self.x.shape)
)
if self.y.ndim == 1:
self.y = self.y[:, np.newaxis]
elif self.y.ndim != 2:
raise ValueError(
"Expected a 2-D array `y`, found shape " "{}".format(self.y.shape)
)
if self.x.ndim == 1:
self.x = self.x[:, np.newaxis]
elif self.x.ndim != 2:
raise ValueError(
"Expected a 2-D array `x`, found shape " "{}".format(self.x.shape)
)
if self.y.ndim == 1:
self.y = self.y[:, np.newaxis]
elif self.y.ndim != 2:
raise ValueError(
"Expected a 2-D array `y`, found shape " "{}".format(self.y.shape)
)

self._check_nd_indeptest()
self._check_nd_indeptest()

return self.x, self.y

@@ -82,16 +69,6 @@ def _check_min_samples(self):
if nx <= 3 or ny <= 3:
raise ValueError("Number of samples is too low")


def _chi2_approx(stat, null_dist, samps):
mu = np.mean(null_dist)
sigma = np.std(null_dist)

if sigma < 10e-4 and mu < 10e-4:
x = 0.0
else:
x = samps * (stat - mu) / sigma + 1

pvalue = 1 - chi2.cdf(x, 1)

return pvalue
def _check_variance(self):
if np.var(self.x) == 0 or np.var(self.y) == 0:
raise ValueError("Test cannot be run, one of the inputs has 0 variance")
6 changes: 2 additions & 4 deletions hyppo/independence/base.py
@@ -1,7 +1,5 @@
from abc import ABC, abstractmethod

import numpy as np

from .._utils import euclidean, perm_test


@@ -40,7 +38,7 @@ def _statistic(self, x, y):
"""

@abstractmethod
def test(self, x, y, reps=1000, workers=1):
def test(self, x, y, reps=1000, workers=1, is_distsim=True):
r"""
Calculates the independence test p-value.

@@ -66,7 +64,7 @@ def test(self, x, y, reps=1000, workers=1):

# calculate p-value
stat, pvalue, null_dist = perm_test(
self._statistic, x, y, reps=reps, workers=workers
self._statistic, x, y, reps=reps, workers=workers, is_distsim=is_distsim
)
self.stat = stat
self.pvalue = pvalue
5 changes: 2 additions & 3 deletions hyppo/independence/cca.py
@@ -15,7 +15,6 @@ class CCA(IndependenceTest):

See Also
--------
Pearson : Pearson product-moment correlation test statistic and p-value.
RV : RV test statistic and p-value.

Notes
@@ -140,8 +139,8 @@ def test(self, x, y, reps=1000, workers=1):
>>> '%.1f, %.2f' % (stat, pvalue)
'1.0, 0.00'
"""
check_input = _CheckInputs(x, y, dim=2, reps=reps)
check_input = _CheckInputs(x, y, reps=reps)
x, y = check_input()

# use default permutation test
return super(CCA, self).test(x, y, reps, workers)
return super(CCA, self).test(x, y, reps, workers, is_distsim=False)