Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add discriminability (one sample and two sample) #27

Merged
merged 30 commits into from
Dec 16, 2019
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2f7a50b
Adding one sample test for discriminability
Nov 19, 2019
f32855b
Adding discriminability doc
Nov 20, 2019
588b213
Adding discriminability
Nov 21, 2019
31d37e3
adding unit test
Nov 23, 2019
82aa258
Bug fixing unit test
Nov 25, 2019
a304bac
Bug fixing unit test
Nov 25, 2019
e7bb5e5
Bug fixing unit test
Nov 26, 2019
d98a0b1
adding correction for the one sample test
Dec 2, 2019
633ec91
Fixed issues with docs, unit tests and addressing PR review comments
Dec 3, 2019
3f0480f
Merge branch 'master' into JD
sampan501 Dec 3, 2019
b18efb5
Update discrimTwoSample.py
sampan501 Dec 3, 2019
e946d77
Merge branch 'master' into JD
sampan501 Dec 9, 2019
6e37aec
Merge branch 'master' into JD
jdey4 Dec 10, 2019
3988a5e
merging master
Dec 10, 2019
00fe235
Merge branch 'master' into JD
jdey4 Dec 13, 2019
e548b03
Merge branch 'master' into JD
sampan501 Dec 15, 2019
be643b0
adding correction for coding formats
Dec 15, 2019
27d4d31
adding correction for final pr
Dec 15, 2019
fe7b3cf
adding correction for final pr
Dec 15, 2019
1615834
adding correction for final pr
Dec 15, 2019
aade79a
adding correction for final pr
Dec 15, 2019
0335642
Adding correction for code format using black
Dec 15, 2019
feda8d5
Adding correction for coding format
Dec 15, 2019
ddcc89a
Adding correction for coding format
Dec 15, 2019
33bf538
Adding correction for coding format
Dec 15, 2019
44ddaa1
added numba for two sample test
Dec 16, 2019
8950dbc
correcting doc formats
Dec 16, 2019
dce1584
correcting doc formats
Dec 16, 2019
087eb4b
correcting doc formats
Dec 16, 2019
2fd2f67
correcting doc formats
Dec 16, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/reference/discrim.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Discriminability
****************

.. currentmodule:: mgc.discrim

Discriminability one sample test
--------------------------------
.. autoclass:: oneSample

Discriminability two sample test
--------------------------------
.. autoclass:: twoSample
3 changes: 2 additions & 1 deletion docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ Reference
*********

.. toctree::
:maxdepth: 2
:maxdepth: 3

discrim
independence
ksample
3 changes: 2 additions & 1 deletion mgc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import mgc.independence
import mgc.ksample
import mgc.time_series
import mgc.discrim

__version__ = "0.0.1"
__version__ = "0.0.1"
4 changes: 4 additions & 0 deletions mgc/discrim/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .oneSample import oneSample
from .twoSample import twoSample

__all__ = ["oneSample", "twoSample"]
31 changes: 31 additions & 0 deletions mgc/discrim/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import warnings
import numpy as np
from sklearn.utils import check_X_y


class _CheckInputs:
"""Checks inputs for discriminability tests"""
def __init__(self, X, Y, reps = None):
self.X = X
self.Y = Y
self.reps = reps
sampan501 marked this conversation as resolved.
Show resolved Hide resolved

def __call__(self):
check_X_y(self.X, self.Y, accept_sparse=True)
self.check_reps()

return self.X, self.Y

def check_reps(self):
"""Check if reps is valid"""

# check if reps is an integer > than 0
if not isinstance(self.reps, int) or self.reps < 0:
raise ValueError("Number of reps must be an integer greater than 0.")

# check if reps is under 1000 (recommended)
elif self.reps < 1000:
msg = ("The number of replications is low (under 1000), and p-value "
"calculations may be unreliable. Use the p-value result, with "
"caution!")
warnings.warn(msg, RuntimeWarning)
123 changes: 123 additions & 0 deletions mgc/discrim/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from abc import ABC, abstractmethod
from sklearn.metrics import euclidean_distances
from sklearn.utils import check_X_y
sampan501 marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np


class discriminabilityTest(ABC):
    r"""
    A base class for a Discriminability test.
    """

    def __init__(self):
        super().__init__()

    def _statistic(self, X, Y, is_dist=False, remove_isolates=True, return_rdfs=False):
        r"""
        Calculates the discriminability test statistic.

        Parameters
        ----------
        X : ndarray
            An `(n, d)` data matrix, or an `(n, n)` distance matrix when
            ``is_dist`` is True.
        Y : ndarray
            A vector of `n` sample ids.
        is_dist : bool, optional (default: False)
            Whether ``X`` is already a distance matrix.
        remove_isolates : bool, optional (default: True)
            Whether to drop samples whose id occurs only once.
        return_rdfs : bool, optional (default: False)
            Whether to also return the per-sample reliability densities.

        Returns
        -------
        stat : float
            Mean of the reliability density values (NaN padding ignored).
        rdfs : ndarray
            Per-sample reliability densities; only when ``return_rdfs``.
        """
        uniques, counts = np.unique(Y, return_counts=True)

        if remove_isolates:
            # keep only samples whose id appears more than once
            idx = np.isin(Y, uniques[counts != 1])
            labels = Y[idx]

            # BUG FIX: the original used `if ~is_dist:`. On a Python bool,
            # `~` is bitwise NOT (~True == -2, ~False == -1), both truthy,
            # so the distance-matrix branch was unreachable.
            if not is_dist:
                X = X[idx]
            else:
                # subset both rows and columns of the distance matrix
                X = X[np.ix_(idx, idx)]
        else:
            labels = Y

        # same `~is_dist` bug fixed here: a user-supplied distance matrix
        # was previously re-fed through euclidean_distances
        if not is_dist:
            dissimilarities = euclidean_distances(X)
        else:
            dissimilarities = X

        rdfs = self._discr_rdf(dissimilarities, labels)
        stat = np.nanmean(rdfs)

        if return_rdfs:
            return stat, rdfs
        else:
            return stat

    def _discr_rdf(self, dissimilarities, labels):
        """Compute the reliability density values for each sample.

        Parameters
        ----------
        dissimilarities : ndarray
            An `(n, n)` matrix of pairwise dissimilarities.
        labels : ndarray
            A vector of `n` sample ids.

        Returns
        -------
        out : ndarray
            Row `i` holds sample `i`'s reliability values; rows are padded
            with NaN where a sample has fewer within-id comparisons.
        """
        check_X_y(dissimilarities, labels, accept_sparse=True)

        rdfs = []
        for i, label in enumerate(labels):
            di = dissimilarities[i]

            # All other samples except its own label
            idx = labels == label
            Dij = di[~idx]

            # All samples except itself
            idx[i] = False
            Dii = di[idx]

            # fraction of cross-id distances at least as large as each
            # within-id distance (ties counted as one half)
            rdf = [
                1 - ((Dij < d).sum() + 0.5 * (Dij == d).sum()) / Dij.size
                for d in Dii
            ]
            rdfs.append(rdf)

        # pad ragged rows with NaN so np.nanmean can ignore the padding
        out = np.full((len(rdfs), max(map(len, rdfs))), np.nan)
        for i, rdf in enumerate(rdfs):
            out[i, : len(rdf)] = rdf

        return out

    @abstractmethod
    def _perm_stat(self, index):
        r"""
        Helper function that is used to calculate parallel permuted test
        statistics.

        Parameters
        ----------
        index : int
            Iterator used for parallel statistic calculation

        Returns
        -------
        perm_stat : float
            Test statistic for each value in the null distribution.
        """

    @abstractmethod
    def test(self, x, y, reps=1000, workers=-1):
        r"""
        Calculates the discriminability test p-value.

        Parameters
        ----------
        x, y : ndarray
            Input data matrices.
        reps : int, optional
            The number of replications used in permutation, by default 1000.
        workers : int, optional
            Evaluates method using `multiprocessing.Pool <multiprocessing>`).
            Supply `-1` to use all cores available to the Process.

        Returns
        -------
        stat : float
            The computed discriminability test statistic.
        pvalue : float
            The pvalue obtained via permutation.
        """
116 changes: 116 additions & 0 deletions mgc/discrim/oneSample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from sklearn.metrics import euclidean_distances
from ._utils import _CheckInputs
import numpy as np
import random
from .base import discriminabilityTest
from scipy._lib._util import MapWrapper



class oneSample(discriminabilityTest):
    r"""
    A class that performs a one-sample test for whether the discriminability
    differs from random chance, as described in [#1Dscr]_. Discriminability
    index is a measure of whether a data acquisition and preprocessing
    pipeline is more discriminable among different subjects. The key insight
    is that each measurement of the same item should be more similar to other
    measurements of that item, as compared to measurements of any other item.
    With :math:`D_X` as the sample discriminability of :math:`X`, it tests whether:

    .. math::
         H_0: D_X = D_0

    and

    .. math::
         H_A: D_X > D_0

    where :math:`D_0` is the discriminability that would be observed by random chance.

    References
    ----------
    .. [#1Dscr] Eric W. Bridgeford, et al. "Optimal Decisions for Reference
                Pipelines and Datasets: Applications in Connectomics." Bioarxiv (2019).
    """

    def __init__(self):
        discriminabilityTest.__init__(self)

    def test(self, X, Y, is_dist=False, remove_isolates=True, reps=1000, workers=-1):
        r"""
        Calculates the test statistic and p-value for Discriminability one sample test.

        Parameters
        ----------
        X : ndarray

            * An :math:`n \times d` data matrix with :math:`n` samples in :math:`d` dimensions, if flag :math:`(is\_dist = False)`

            * An :math:`n \times n` distance matrix, if flag :math:`(is\_dist = True)`

        Y : ndarray
            a vector containing the sample ids for our :math:`n` samples.

        is_dist : bool, optional (default: False)
            Whether `X` is a distance matrix or not.
        remove_isolates : bool, optional (default: True)
            Whether to remove the samples with a single instance or not.
        reps : int, optional (default: 1000)
            The number of replications used to estimate the null distribution
            when using the permutation test used to calculate the p-value.
        workers : int, optional (default: -1)
            The number of cores to parallelize the p-value computation over.
            Supply -1 to use all cores available to the Process.

        Returns
        -------
        stat : float
            The computed Discriminability statistic.
        pvalue : float
            The computed one sample test p-value.

        Raises
        ------
        ValueError
            If the inputs fail validation, or ``Y`` contains only a single
            unique sample id with repeated measurements.
        """
        check_input = _CheckInputs(X, Y, reps=reps)
        X, Y = check_input()

        _, counts = np.unique(Y, return_counts=True)

        # need at least two distinct ids with repeated measurements,
        # otherwise there is nothing to discriminate between
        if (counts != 1).sum() <= 1:
            msg = "You have passed a vector containing only a single unique sample id."
            raise ValueError(msg)

        self.X = X
        self.Y = Y

        stat = super()._statistic(
            self.X, self.Y, is_dist, remove_isolates, return_rdfs=False
        )
        self.stat_ = stat

        # use all cores to create function that parallelizes over number of reps
        mapwrapper = MapWrapper(workers)
        null_dist = np.array(list(mapwrapper(self._perm_stat, range(reps))))
        self.null_dist = null_dist

        # calculate p-value; the "+ 1" in numerator and denominator counts
        # the observed statistic itself, so the p-value is always > 0 and
        # the original's separate zero-correction branch was dead code.
        pvalue = ((null_dist >= stat).sum() + 1) / (reps + 1)
        self.pvalue_ = pvalue

        return stat, pvalue

    def _perm_stat(self, index):
        """Compute one null-distribution draw by permuting the sample ids.

        Parameters
        ----------
        index : int
            Iterator index from the parallel map; not used beyond driving
            the number of permutations.

        Returns
        -------
        perm_stat : float
            Test statistic under one permuted label assignment.
        """
        permx = self.X
        permy = np.random.permutation(self.Y)

        return super()._statistic(permx, permy)
Empty file added mgc/discrim/tests/__init__.py
Empty file.
63 changes: 63 additions & 0 deletions mgc/discrim/tests/test_oneSample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pytest
import numpy as np
from numpy.testing import assert_almost_equal, assert_warns, assert_raises
from .. import oneSample

class TestOneSample:
    """Sanity checks for the one-sample discriminability test statistic
    and p-value on degenerate inputs."""

    def test_same_one(self):
        # identical observations: discriminability sits at chance (0.5)
        # and the permutation p-value is maximal
        data = np.ones((100, 2), dtype=float)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        np.random.seed(123456789)
        stat, pvalue = oneSample().test(data, labels)

        assert_almost_equal(stat, 0.5, decimal=2)
        assert_almost_equal(pvalue, 1, decimal=2)

    def test_diff_one(self):
        # perfectly separated groups: discriminability of 1 and the
        # smallest achievable p-value at the default 1000 reps
        data = np.concatenate((np.zeros((50, 2)), np.ones((50, 2))), axis=0)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        np.random.seed(123456789)
        one_samp = oneSample()
        stat, pvalue = one_samp.test(data, labels)

        assert_almost_equal(stat, 1.0, decimal=3)
        assert_almost_equal(pvalue, 0.001, decimal=3)


class TestOneSampleWarn:
    """ Tests errors and warnings derived from one sample test.
    """

    def test_error_nans(self):
        # raises error if inputs contain NaNs
        bad = np.arange(20, dtype=float)
        bad[0] = np.nan
        assert_raises(ValueError, oneSample().test, bad, bad)

        ids = np.arange(20)
        assert_raises(ValueError, oneSample().test, bad, ids)

    @pytest.mark.parametrize("reps", [
        -1,  # reps is negative
        '1',  # reps is not integer
    ])
    def test_error_reps(self, reps):
        # raises error if reps is negative
        data = np.ones((100, 2), dtype=float)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        assert_raises(ValueError, oneSample().test, data, labels, reps=reps)

    def test_warns_reps(self):
        # raises warning when reps is less than 1000
        data = np.ones((100, 2), dtype=float)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        assert_warns(RuntimeWarning, oneSample().test, data, labels, reps=100)
Loading