Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add discriminability (one sample and two sample) #27

Merged
merged 30 commits into from
Dec 16, 2019
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2f7a50b
Adding one sample test for discriminability
Nov 19, 2019
f32855b
Adding discriminability doc
Nov 20, 2019
588b213
Adding discriminability
Nov 21, 2019
31d37e3
adding unit test
Nov 23, 2019
82aa258
Bug fixing unit test
Nov 25, 2019
a304bac
Bug fixing unit test
Nov 25, 2019
e7bb5e5
Bug fixing unit test
Nov 26, 2019
d98a0b1
adding correction for the one sample test
Dec 2, 2019
633ec91
Fixed issues with docs, unit tests and addressing PR review comments
Dec 3, 2019
3f0480f
Merge branch 'master' into JD
sampan501 Dec 3, 2019
b18efb5
Update discrimTwoSample.py
sampan501 Dec 3, 2019
e946d77
Merge branch 'master' into JD
sampan501 Dec 9, 2019
6e37aec
Merge branch 'master' into JD
jdey4 Dec 10, 2019
3988a5e
merging master
Dec 10, 2019
00fe235
Merge branch 'master' into JD
jdey4 Dec 13, 2019
e548b03
Merge branch 'master' into JD
sampan501 Dec 15, 2019
be643b0
adding correction for coding formats
Dec 15, 2019
27d4d31
adding correction for final pr
Dec 15, 2019
fe7b3cf
adding correction for final pr
Dec 15, 2019
1615834
adding correction for final pr
Dec 15, 2019
aade79a
adding correction for final pr
Dec 15, 2019
0335642
Adding correction for code format using black
Dec 15, 2019
feda8d5
Adding correction for coding format
Dec 15, 2019
ddcc89a
Adding correction for coding format
Dec 15, 2019
33bf538
Adding correction for coding format
Dec 15, 2019
44ddaa1
added numba for two sample test
Dec 16, 2019
8950dbc
correcting doc formats
Dec 16, 2019
dce1584
correcting doc formats
Dec 16, 2019
087eb4b
correcting doc formats
Dec 16, 2019
2fd2f67
correcting doc formats
Dec 16, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/reference/discrim.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Discriminability
****************

.. currentmodule:: mgc.discrim

Discriminability one sample test
--------------------------------
.. autoclass:: oneSample

Discriminability two sample test
--------------------------------
.. autoclass:: twoSample
3 changes: 2 additions & 1 deletion docs/reference/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ Reference
*********

.. toctree::
:maxdepth: 2
:maxdepth: 3

discrim
independence
ksample
3 changes: 2 additions & 1 deletion mgc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import mgc.independence
import mgc.ksample
import mgc.time_series
import mgc.discrim

__version__ = "0.0.1"
__version__ = "0.0.1"
4 changes: 4 additions & 0 deletions mgc/discrim/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .oneSample import oneSample
from .twoSample import twoSample

__all__ = ["oneSample", "twoSample"]
31 changes: 31 additions & 0 deletions mgc/discrim/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import warnings
import numpy as np
from sklearn.utils import check_X_y


class _CheckInputs:
"""Checks inputs for discriminability tests"""
def __init__(self, X, Y, reps = None):
self.X = X
self.Y = Y
self.reps = reps
sampan501 marked this conversation as resolved.
Show resolved Hide resolved

def __call__(self):
check_X_y(self.X, self.Y, accept_sparse=True)
self.check_reps()

return self.X, self.Y

def check_reps(self):
"""Check if reps is valid"""

# check if reps is an integer > than 0
if not isinstance(self.reps, int) or self.reps < 0:
raise ValueError("Number of reps must be an integer greater than 0.")

# check if reps is under 1000 (recommended)
elif self.reps < 1000:
msg = ("The number of replications is low (under 1000), and p-value "
"calculations may be unreliable. Use the p-value result, with "
"caution!")
warnings.warn(msg, RuntimeWarning)
123 changes: 123 additions & 0 deletions mgc/discrim/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from abc import ABC, abstractmethod
from sklearn.metrics import euclidean_distances
from sklearn.utils import check_X_y
sampan501 marked this conversation as resolved.
Show resolved Hide resolved
import numpy as np


class discriminabilityTest(ABC):
    r"""
    A base class for a Discriminability test.
    """

    def __init__(self):
        super().__init__()

    def _statistic(self, X, Y, is_dist=False, remove_isolates=True, return_rdfs=False):
        r"""
        Calculates the discriminability test statistic.

        Parameters
        ----------
        X : ndarray
            An `(n, d)` data matrix, or an `(n, n)` distance matrix when
            ``is_dist`` is True.
        Y : ndarray
            A vector of `n` sample ids.
        is_dist : bool, optional (default: False)
            Whether ``X`` is already a distance matrix.
        remove_isolates : bool, optional (default: True)
            Whether to drop samples whose id occurs only once.
        return_rdfs : bool, optional (default: False)
            Whether to also return the per-sample reliability densities.

        Returns
        -------
        stat : float
            Mean of the reliability density values (NaN padding ignored).
        rdfs : ndarray
            Per-sample reliability densities; only when ``return_rdfs``.
        """
        uniques, counts = np.unique(Y, return_counts=True)

        if remove_isolates:
            # keep only samples whose id appears more than once
            idx = np.isin(Y, uniques[counts != 1])
            labels = Y[idx]

            # BUG FIX: the original used `if ~is_dist:`. On a Python bool,
            # `~` is bitwise NOT (~True == -2, ~False == -1), both truthy,
            # so the distance-matrix branch was unreachable.
            if not is_dist:
                X = X[idx]
            else:
                # subset both rows and columns of the distance matrix
                X = X[np.ix_(idx, idx)]
        else:
            labels = Y

        # same `~is_dist` bug fixed here: a user-supplied distance matrix
        # was previously re-fed through euclidean_distances
        if not is_dist:
            dissimilarities = euclidean_distances(X)
        else:
            dissimilarities = X

        rdfs = self._discr_rdf(dissimilarities, labels)
        stat = np.nanmean(rdfs)

        if return_rdfs:
            return stat, rdfs
        else:
            return stat

    def _discr_rdf(self, dissimilarities, labels):
        """Compute the reliability density values for each sample.

        Parameters
        ----------
        dissimilarities : ndarray
            An `(n, n)` matrix of pairwise dissimilarities.
        labels : ndarray
            A vector of `n` sample ids.

        Returns
        -------
        out : ndarray
            Row `i` holds sample `i`'s reliability values; rows are padded
            with NaN where a sample has fewer within-id comparisons.
        """
        check_X_y(dissimilarities, labels, accept_sparse=True)

        rdfs = []
        for i, label in enumerate(labels):
            di = dissimilarities[i]

            # All other samples except its own label
            idx = labels == label
            Dij = di[~idx]

            # All samples except itself
            idx[i] = False
            Dii = di[idx]

            # fraction of cross-id distances at least as large as each
            # within-id distance (ties counted as one half)
            rdf = [
                1 - ((Dij < d).sum() + 0.5 * (Dij == d).sum()) / Dij.size
                for d in Dii
            ]
            rdfs.append(rdf)

        # pad ragged rows with NaN so np.nanmean can ignore the padding
        out = np.full((len(rdfs), max(map(len, rdfs))), np.nan)
        for i, rdf in enumerate(rdfs):
            out[i, : len(rdf)] = rdf

        return out

    @abstractmethod
    def _perm_stat(self, index):
        r"""
        Helper function that is used to calculate parallel permuted test
        statistics.

        Parameters
        ----------
        index : int
            Iterator used for parallel statistic calculation

        Returns
        -------
        perm_stat : float
            Test statistic for each value in the null distribution.
        """

    @abstractmethod
    def test(self, x, y, reps=1000, workers=-1):
        r"""
        Calculates the discriminability test p-value.

        Parameters
        ----------
        x, y : ndarray
            Input data matrices.
        reps : int, optional
            The number of replications used in permutation, by default 1000.
        workers : int, optional
            Evaluates method using `multiprocessing.Pool <multiprocessing>`).
            Supply `-1` to use all cores available to the Process.

        Returns
        -------
        stat : float
            The computed discriminability test statistic.
        pvalue : float
            The pvalue obtained via permutation.
        """
116 changes: 116 additions & 0 deletions mgc/discrim/oneSample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from sklearn.metrics import euclidean_distances
from ._utils import _CheckInputs
import numpy as np
import random
from .base import discriminabilityTest
from scipy._lib._util import MapWrapper



class oneSample(discriminabilityTest):
    r"""
    A class that performs a one-sample test for whether the discriminability
    differs from random chance, as described in [#1Dscr]_. Discriminability
    index is a measure of whether a data acquisition and preprocessing
    pipeline is more discriminable among different subjects. The key insight
    is that each measurement of the same item should be more similar to other
    measurements of that item, as compared to measurements of any other item.
    With :math:`D_X` as the sample discriminability of :math:`X`, it tests whether:

    .. math::
         H_0: D_X = D_0

    and

    .. math::
         H_A: D_X > D_0

    where :math:`D_0` is the discriminability that would be observed by random chance.

    References
    ----------
    .. [#1Dscr] Eric W. Bridgeford, et al. "Optimal Decisions for Reference
                Pipelines and Datasets: Applications in Connectomics." Bioarxiv (2019).
    """

    def __init__(self):
        discriminabilityTest.__init__(self)

    def test(self, X, Y, is_dist=False, remove_isolates=True, reps=1000, workers=-1):
        r"""
        Calculates the test statistic and p-value for Discriminability one sample test.

        Parameters
        ----------
        X : ndarray

            * An :math:`n \times d` data matrix with :math:`n` samples in :math:`d` dimensions, if flag :math:`(is\_dist = False)`

            * An :math:`n \times n` distance matrix, if flag :math:`(is\_dist = True)`

        Y : ndarray
            a vector containing the sample ids for our :math:`n` samples.

        is_dist : bool, optional (default: False)
            Whether `X` is a distance matrix or not.
        remove_isolates : bool, optional (default: True)
            Whether to remove the samples with a single instance or not.
        reps : int, optional (default: 1000)
            The number of replications used to estimate the null distribution
            when using the permutation test used to calculate the p-value.
        workers : int, optional (default: -1)
            The number of cores to parallelize the p-value computation over.
            Supply -1 to use all cores available to the Process.

        Returns
        -------
        stat : float
            The computed Discriminability statistic.
        pvalue : float
            The computed one sample test p-value.

        Raises
        ------
        ValueError
            If the inputs fail validation, or ``Y`` contains only a single
            unique sample id with repeated measurements.
        """
        check_input = _CheckInputs(X, Y, reps=reps)
        X, Y = check_input()

        _, counts = np.unique(Y, return_counts=True)

        # need at least two distinct ids with repeated measurements,
        # otherwise there is nothing to discriminate between
        if (counts != 1).sum() <= 1:
            msg = "You have passed a vector containing only a single unique sample id."
            raise ValueError(msg)

        self.X = X
        self.Y = Y

        stat = super()._statistic(
            self.X, self.Y, is_dist, remove_isolates, return_rdfs=False
        )
        self.stat_ = stat

        # use all cores to create function that parallelizes over number of reps
        mapwrapper = MapWrapper(workers)
        null_dist = np.array(list(mapwrapper(self._perm_stat, range(reps))))
        self.null_dist = null_dist

        # calculate p-value; the "+ 1" in numerator and denominator counts
        # the observed statistic itself, so the p-value is always > 0 and
        # the original's separate zero-correction branch was dead code.
        pvalue = ((null_dist >= stat).sum() + 1) / (reps + 1)
        self.pvalue_ = pvalue

        return stat, pvalue

    def _perm_stat(self, index):
        """Compute one null-distribution draw by permuting the sample ids.

        Parameters
        ----------
        index : int
            Iterator index from the parallel map; not used beyond driving
            the number of permutations.

        Returns
        -------
        perm_stat : float
            Test statistic under one permuted label assignment.
        """
        permx = self.X
        permy = np.random.permutation(self.Y)

        return super()._statistic(permx, permy)
Empty file added mgc/discrim/tests/__init__.py
Empty file.
63 changes: 63 additions & 0 deletions mgc/discrim/tests/test_oneSample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pytest
import numpy as np
from numpy.testing import assert_almost_equal, assert_warns, assert_raises
from .. import oneSample

class TestOneSample:
    """Sanity checks for the one-sample discriminability test statistic
    and p-value on degenerate inputs."""

    def test_same_one(self):
        # identical observations: discriminability sits at chance (0.5)
        # and the permutation p-value is maximal
        data = np.ones((100, 2), dtype=float)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        np.random.seed(123456789)
        stat, pvalue = oneSample().test(data, labels)

        assert_almost_equal(stat, 0.5, decimal=2)
        assert_almost_equal(pvalue, 1, decimal=2)

    def test_diff_one(self):
        # perfectly separated groups: discriminability of 1 and the
        # smallest achievable p-value at the default 1000 reps
        data = np.concatenate((np.zeros((50, 2)), np.ones((50, 2))), axis=0)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        np.random.seed(123456789)
        one_samp = oneSample()
        stat, pvalue = one_samp.test(data, labels)

        assert_almost_equal(stat, 1.0, decimal=3)
        assert_almost_equal(pvalue, 0.001, decimal=3)


class TestOneSampleWarn:
    """ Tests errors and warnings derived from one sample test.
    """

    def test_error_nans(self):
        # raises error if inputs contain NaNs
        bad = np.arange(20, dtype=float)
        bad[0] = np.nan
        assert_raises(ValueError, oneSample().test, bad, bad)

        ids = np.arange(20)
        assert_raises(ValueError, oneSample().test, bad, ids)

    @pytest.mark.parametrize("reps", [
        -1,  # reps is negative
        '1',  # reps is not integer
    ])
    def test_error_reps(self, reps):
        # raises error if reps is negative
        data = np.ones((100, 2), dtype=float)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        assert_raises(ValueError, oneSample().test, data, labels, reps=reps)

    def test_warns_reps(self):
        # raises warning when reps is less than 1000
        data = np.ones((100, 2), dtype=float)
        labels = np.concatenate((np.zeros(50), np.ones(50)), axis=0)

        assert_warns(RuntimeWarning, oneSample().test, data, labels, reps=100)
Loading