Add discriminability (one sample and two sample) #27

Merged
merged 30 commits into from
Dec 16, 2019
Changes from all commits
Commits
30 commits
2f7a50b
Adding one sample test for discriminability
Nov 19, 2019
f32855b
Adding discriminability doc
Nov 20, 2019
588b213
Adding discriminability
Nov 21, 2019
31d37e3
adding unit test
Nov 23, 2019
82aa258
Bug fixing unit test
Nov 25, 2019
a304bac
Bug fixing unit test
Nov 25, 2019
e7bb5e5
Bug fixing unit test
Nov 26, 2019
d98a0b1
adding correction for the one sample test
Dec 2, 2019
633ec91
Fixed issues with docs, unit tests and addressing PR review comments
Dec 3, 2019
3f0480f
Merge branch 'master' into JD
sampan501 Dec 3, 2019
b18efb5
Update discrimTwoSample.py
sampan501 Dec 3, 2019
e946d77
Merge branch 'master' into JD
sampan501 Dec 9, 2019
6e37aec
Merge branch 'master' into JD
jdey4 Dec 10, 2019
3988a5e
merging master
Dec 10, 2019
00fe235
Merge branch 'master' into JD
jdey4 Dec 13, 2019
e548b03
Merge branch 'master' into JD
sampan501 Dec 15, 2019
be643b0
adding correction for coding formats
Dec 15, 2019
27d4d31
adding correction for final pr
Dec 15, 2019
fe7b3cf
adding correction for final pr
Dec 15, 2019
1615834
adding correction for final pr
Dec 15, 2019
aade79a
adding correction for final pr
Dec 15, 2019
0335642
Adding correction for code format using black
Dec 15, 2019
feda8d5
Adding correction for coding format
Dec 15, 2019
ddcc89a
Adding correction for coding format
Dec 15, 2019
33bf538
Adding correction for coding format
Dec 15, 2019
44ddaa1
added numba for two sample test
Dec 16, 2019
8950dbc
correcting doc formats
Dec 16, 2019
dce1584
correcting doc formats
Dec 16, 2019
087eb4b
correcting doc formats
Dec 16, 2019
2fd2f67
correcting doc formats
Dec 16, 2019
12 changes: 12 additions & 0 deletions docs/reference/discrim.rst
@@ -0,0 +1,12 @@
Discriminability
****************

.. currentmodule:: mgc.discrim

Discriminability one sample test
--------------------------------
.. autoclass:: DiscrimOneSample

Discriminability two sample test
--------------------------------
.. autoclass:: DiscrimTwoSample
3 changes: 2 additions & 1 deletion docs/reference/index.rst
@@ -4,8 +4,9 @@ Reference
*********

.. toctree::
:maxdepth: 2
:maxdepth: 3

discrim
independence
ksample
sims
309 changes: 309 additions & 0 deletions docs/tutorials/benchmarks/3samp_power_epsilon.ipynb

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions docs/tutorials/benchmarks/indep_power_dimension.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -27,7 +27,7 @@
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from power import power_dim\n",
"from benchmarks import power_dim\n",
"from mgc.independence import *\n",
"from mgc.sims import *\n",
"\n",
4 changes: 2 additions & 2 deletions docs/tutorials/benchmarks/indep_power_sampsize.ipynb
@@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -27,7 +27,7 @@
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from power import power_sample\n",
"from benchmarks import power_sample\n",
"from mgc.independence import *\n",
"from mgc.sims import *\n",
"\n",
1 change: 1 addition & 0 deletions mgc/__init__.py
@@ -2,5 +2,6 @@
import mgc.ksample
import mgc.time_series
import mgc.sims
import mgc.discrim

__version__ = "0.0.1"
4 changes: 4 additions & 0 deletions mgc/discrim/__init__.py
@@ -0,0 +1,4 @@
from .discrim_one_samp import DiscrimOneSample
from .discrim_two_samp import DiscrimTwoSample

__all__ = ["DiscrimOneSample", "DiscrimTwoSample"]
74 changes: 74 additions & 0 deletions mgc/discrim/_utils.py
@@ -0,0 +1,74 @@
import warnings
import numpy as np
from .._utils import (
contains_nan,
check_ndarray_xy,
convert_xy_float64,
check_reps,
euclidean,
)


class _CheckInputs:
"""Checks inputs for discriminability tests"""

def __init__(self, x, y, reps=None, is_dist=False, remove_isolates=True):
self.x = x
self.y = y
self.reps = reps
self.is_distance = is_dist
self.remove_isolates = remove_isolates

def __call__(self):
if len(self.x) > 1:
if self.x[0].shape[0] != self.x[1].shape[0]:
msg = "The input matrices do not have the same number of rows."
raise ValueError(msg)

tmp_ = []
for x1 in self.x:
check_ndarray_xy(x1, self.y)
contains_nan(x1)
contains_nan(self.y)
check_min_samples(x1)
x1, self.y = convert_xy_float64(x1, self.y)
tmp_.append(self._condition_input(x1))

self.x = tmp_

if self.reps:
check_reps(self.reps)

return self.x, self.y

def _condition_input(self, x1):
"""Checks whether there is only one subject and removes
isolates and calculate distance."""
uniques, counts = np.unique(self.y, return_counts=True)

if (counts != 1).sum() <= 1:
msg = "You have passed a vector containing only a single unique sample id."
raise ValueError(msg)

if self.remove_isolates:
idx = np.isin(self.y, uniques[counts != 1])
self.y = self.y[idx]

x1 = np.asarray(x1)
if not self.is_distance:
x1 = x1[idx]
else:
x1 = x1[np.ix_(idx, idx)]

if not self.is_distance:
x1 = euclidean(x1)

return x1


def check_min_samples(x1):
"""Check if the number of samples is at least 3"""
nx = x1.shape[0]

if nx <= 10:
raise ValueError("Number of samples is too low")
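
For orientation, here is a minimal sketch of how this input checker appears to be used by the tests later in this PR; the data shapes and random data are illustrative and not taken from the diff.

import numpy as np
from mgc.discrim._utils import _CheckInputs

# illustrative data: 5 subjects, each measured 4 times in 3 dimensions
x = np.random.rand(20, 3)
y = np.repeat(np.arange(5), 4)

# wrap x in a list, as DiscrimOneSample.test does below
checker = _CheckInputs([x], y, reps=1000, is_dist=False, remove_isolates=True)
xs, y = checker()
# xs[0] is now an (n, n) Euclidean distance matrix, and y keeps only
# subjects with repeated measurements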
79 changes: 79 additions & 0 deletions mgc/discrim/base.py
@@ -0,0 +1,79 @@
from abc import ABC, abstractmethod
import numpy as np
from .._utils import euclidean


class DiscriminabilityTest(ABC):
r"""
A base class for a Discriminability test.
"""

def __init__(self):
self.pvalue_ = None
super().__init__()

# @abstractmethod
def _statistic(self, x, y):
r"""
Calculates the discriminability test statistic.

Parameters
----------
x, y : ndarray
Input data matrices.
"""

rdfs = self._discr_rdf(x, y)
stat = np.nanmean(rdfs)

return stat

def _discr_rdf(self, dissimilarities, labels):
# calculates the reliability density function (rdf) values for each sample
rdfs = []

for i, label in enumerate(labels):
di = dissimilarities[i]

# distances to samples with a different label
idx = labels == label
Dij = di[~idx]

# distances to other samples with the same label (excluding itself)
idx[i] = False
Dii = di[idx]

rdf = [
1 - ((Dij < d).sum() + 0.5 * (Dij == d).sum()) / Dij.size for d in Dii
]
rdfs.append(rdf)

out = np.full((len(rdfs), max(map(len, rdfs))), np.nan)
for i, rdf in enumerate(rdfs):
out[i, : len(rdf)] = rdf

return out

@abstractmethod
def _perm_stat(self, index):
r"""
Helper function used to calculate permuted test
statistics in parallel.

Parameters
----------
index : int
Iterator used for parallel statistic calculation

Returns
-------
perm_stat : float
Test statistic for each value in the null distribution.
"""

@abstractmethod
def test(self):
r"""
Calculates the test statistic and p-value for the discriminability one
sample and two sample tests.
"""
143 changes: 143 additions & 0 deletions mgc/discrim/discrim_one_samp.py
@@ -0,0 +1,143 @@
from ._utils import _CheckInputs
import numpy as np
import random
from .base import DiscriminabilityTest
from scipy._lib._util import MapWrapper


class DiscrimOneSample(DiscriminabilityTest):
r"""
A class that performs a one-sample test for discriminability.

The discriminability index measures how well a data acquisition and
preprocessing pipeline distinguishes among different subjects. The key
insight is that each measurement of an item should be more similar to
other measurements of that item than to measurements of any other item.
The one sample test measures whether the discriminability of a dataset
differs from random chance. More details are described in [#1Dscr]_.

Parameters
----------
is_dist : bool, optional (default: False)
Whether `x` is already a distance matrix.
remove_isolates : bool, optional (default: True)
Whether to remove subjects that have only a single measurement.

See Also
--------
DiscrimTwoSample : Two sample test for comparing the discriminability of two datasets

Notes
-----
With :math:`D_X` as the sample discriminability of :math:`X`,
the one sample test verifies whether

.. math::

H_0: D_X = D_0

and

.. math::

H_A: D_X > D_0

where :math:`D_0` is the discriminability that would be observed by random chance.

References
----------
.. [#1Dscr] Eric W. Bridgeford, et al. "Optimal Decisions for Reference
Pipelines and Datasets: Applications in Connectomics." bioRxiv (2019).
"""

def __init__(self, is_dist=False, remove_isolates=True):
# whether `x` is passed in as a precomputed distance matrix
self.is_distance = is_dist
self.remove_isolates = remove_isolates
DiscriminabilityTest.__init__(self)

def _statistic(self, x, y):
"""
Helper function that calculates the discriminability test statistic.
"""
stat = super(DiscrimOneSample, self)._statistic(x, y)

return stat

def test(self, x, y, reps=1000, workers=-1):
r"""
Calculates the test statistic and p-value for Discriminability one sample test.

Parameters
----------
x : ndarray
An `(n, d)` data matrix with `n` samples in `d` dimensions if
``is_dist`` is False, or an `(n, n)` distance matrix if ``is_dist``
is True.

y : ndarray
A vector containing the sample ids for the :math:`n` samples.
reps : int, optional (default: 1000)
The number of replications used to estimate the null distribution
for the permutation test that computes the p-value.
workers : int, optional (default: -1)
The number of cores to parallelize the p-value computation over.
Supply -1 to use all cores available to the Process.

Returns
-------
stat : float
The computed discriminability statistic.
pvalue : float
The computed one sample test p-value.

Examples
--------
>>> import numpy as np
>>> from mgc.discrim import DiscrimOneSample
>>> x = np.concatenate((np.zeros((50, 2)), np.ones((50, 2))), axis=0)
>>> y = np.concatenate((np.zeros(50), np.ones(50)), axis=0)
>>> stat, p = DiscrimOneSample().test(x, y)
>>> '%.1f, %.2f' % (stat, p)
'1.0, 0.00'
"""

check_input = _CheckInputs(
[x],
y,
reps=reps,
is_dist=self.is_distance,
remove_isolates=self.remove_isolates,
)
x, y = check_input()

self.x = np.asarray(x[0])
self.y = y

stat = self._statistic(self.x, self.y)
self.stat = stat

# create a map that parallelizes the permutation replicates over the workers
mapwrapper = MapWrapper(workers)
null_dist = np.array(list(mapwrapper(self._perm_stat, range(reps))))
self.null_dist = null_dist

# calculate the p-value from the permutation null distribution
pvalue = ((null_dist >= stat).sum()) / reps

# correct a p-value of 0; with a finite number of permutations, the
# smallest attainable p-value is 1 / reps
if pvalue == 0:
pvalue = 1 / reps

self.pvalue_ = pvalue

return stat, pvalue

def _perm_stat(self, index):
permy = np.random.permutation(self.y)

perm_stat = self._statistic(self.x, permy)

return perm_stat
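
The permutation scheme in `test` and `_perm_stat` reduces to the serial sketch below; the `MapWrapper` parallelism is omitted, and `stat_fn` stands in for any statistic such as the illustrative `discr_stat_sketch` above, so none of these names are part of this PR's API.

import numpy as np

def perm_pvalue_sketch(x, y, stat_fn, reps=1000, seed=0):
    """Serial permutation test with the same 1 / reps floor used in this PR."""
    rng = np.random.default_rng(seed)
    observed = stat_fn(x, y)
    # permute the subject labels and recompute the statistic for each replicate
    null = np.array([stat_fn(x, rng.permutation(y)) for _ in range(reps)])
    pvalue = (null >= observed).sum() / reps
    return observed, max(pvalue, 1 / reps)

On the well-separated toy data above, the observed statistic will typically exceed every permuted statistic, so the p-value collapses to the 1 / reps floor.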