Doc resample bench #24

Merged
merged 23 commits on Dec 8, 2019
Changes from 1 commit
fixing input to knn
plutasnyy committed Dec 4, 2019
commit cc338889fb5adbe89d08557ee4ece9a4db6a173f
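
This commit drops the single NearestNeighbors instance that was fitted in __init__ and instead fits a fresh model inside _construct_class_safe_levels on the current X, querying k + 1 neighbours and discarding the first column so that a sample is not counted among its own neighbours. Below is a minimal sketch (not part of this PR) of why that [:, 1:] slice is needed when kneighbors() is queried with the same points used for fitting.

# Minimal sketch (not part of this PR): when kneighbors() is queried with the
# points the model was fitted on, each point's nearest neighbour is the point
# itself, so one extra neighbour is requested and the first column is dropped.
import numpy as np
from sklearn.neighbors import NearestNeighbors

points = np.array([[0.0], [1.0], [2.0], [10.0]])
k = 2

neigh = NearestNeighbors(n_neighbors=k + 1).fit(points)
indices = neigh.kneighbors(points, return_distance=False)
print(indices[:, 0])   # each sample's own index (the self-match)
print(indices[:, 1:])  # the k real neighbours, as used in _construct_class_safe_levels
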
52 changes: 27 additions & 25 deletions multi_imbalance/resampling/SOUP.py
@@ -1,4 +1,5 @@
from collections import Counter, defaultdict
from copy import deepcopy
from operator import itemgetter

import numpy as np
@@ -16,10 +17,9 @@ class SOUP:

def __init__(self, k: int = 9) -> None:
self.k = k
self.neigh_clf = NearestNeighbors(n_neighbors=self.k + 1)
self.quantities, self.goal_quantity = [None] * 2

def fit_transform(self, X, y, shuffle: bool = True):
def fit_transform(self, _X, _y, shuffle: bool = True):
"""

Parameters
@@ -31,34 +31,35 @@ def fit_transform(self, X, y, shuffle: bool = True):
-------
Resampled X (mean class quantity * number of unique classes), y (number of rows in X) as numpy array
"""

X = deepcopy(_X)
y = deepcopy(_y)

assert len(X.shape) == 2, 'X should have 2 dimension'
assert X.shape[0] == y.shape[0], 'Number of labels must be equal to number of samples'

self.quantities = Counter(y)
self.goal_quantity = self._calculate_goal_quantity()

dsc_maj_cls = sorted(((v, i) for v, i in self.quantities.items() if i >= self.goal_quantity), key=itemgetter(1),
reverse=True)
asc_min_cls = sorted(((v, i) for v, i in self.quantities.items() if i < self.goal_quantity), key=itemgetter(1),
reverse=False)
result_X, result_y = list(), list()
for class_name, class_quantity in dsc_maj_cls:
self.neigh_clf.fit(X)
self._undersample(X, y, class_name, result_X, result_y)
self._undersample(X, y, class_name)

for class_name, class_quantity in asc_min_cls:
self.neigh_clf.fit(X)
self._oversample(X, y, class_name, result_X, result_y)
self._oversample(X, y, class_name)

if shuffle:
result_X, result_y = sklearn.utils.shuffle(result_X, result_y)
result_X, result_y = sklearn.utils.shuffle(X, y)

return np.array(result_X), np.array(result_y)

def _construct_class_safe_levels(self, X, y, class_name) -> defaultdict:
indices_in_class = [i for i, value in enumerate(y) if value == class_name]

neighbour_indices = self.neigh_clf.kneighbors(X[indices_in_class], return_distance=False)
neigh_clf = NearestNeighbors(n_neighbors=self.k + 1).fit(X)
neighbour_indices = neigh_clf.kneighbors(X[indices_in_class], return_distance=False)[:, 1:]
neighbour_classes = y[neighbour_indices]

class_safe_levels = defaultdict(float)
@@ -74,37 +75,38 @@ def _calculate_sample_safe_level(self, class_name, neighbours_quantities: Counte

for neigh_label, neigh_q in neighbours_quantities.items():
similarity_between_classes = min(q[class_name], q[neigh_label]) / max(q[class_name], q[neigh_label])
safe_level += neigh_q * similarity_between_classes
return safe_level / self.k
safe_level += neigh_q * similarity_between_classes / self.k

def _undersample(self, X, y, class_name, result_X, result_y):
if safe_level > 1:
raise ValueError(f'Safe level is bigger than 1: {safe_level}')

return safe_level

def _undersample(self, X, y, class_name):
safe_levels_of_samples_in_class = self._construct_class_safe_levels(X, y, class_name)

class_quantity = self.quantities[class_name]
safe_levels_list = sorted(safe_levels_of_samples_in_class.items(), key=itemgetter(1))
samples_to_remove_quantity = max(0, int(class_quantity - self.goal_quantity))
safe_levels_list = safe_levels_list[samples_to_remove_quantity:]

undersampled_X = [X[idx] for idx, _ in safe_levels_list]
undersampled_y = [y[idx] for idx, _ in safe_levels_list]
if samples_to_remove_quantity > 0:
remove_indices = list(map(itemgetter(0), safe_levels_list[-samples_to_remove_quantity:]))
X = np.delete(X, remove_indices, axis=0)
y = np.delete(y, remove_indices, axis=0)

result_X.extend(undersampled_X)
result_y.extend(undersampled_y)
return X, y

def _oversample(self, X, y, class_name, result_X, result_y):
def _oversample(self, X, y, class_name):
safe_levels_of_samples_in_class = self._construct_class_safe_levels(X, y, class_name)
class_quantity = self.quantities[class_name]
safe_levels_list = sorted(safe_levels_of_samples_in_class.items(), key=itemgetter(1), reverse=True)

oversampled_X, oversampled_y = list(), list()
for i in range(self.goal_quantity):
sample_level_ranking_to_duplicate: int = i % class_quantity
sample_id, sample_safe_level = safe_levels_list[sample_level_ranking_to_duplicate]
oversampled_X.append(X[sample_id])
oversampled_y.append(y[sample_id])
X.append(X[sample_id])
y.append(y[sample_id])

result_X.extend(oversampled_X)
result_y.extend(oversampled_y)
return X, y

def _calculate_goal_quantity(self):
max_q = max(list(self.quantities.values()))
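
The division by self.k now happens inside the loop of _calculate_sample_safe_level, and the updated expected values in the tests below follow from it. As a reader's aid, here is a standalone re-computation of the first sample's safe level (an illustrative helper mirroring the arithmetic above, not the library code itself):

from collections import Counter

def sample_safe_level(class_name, neighbour_counts, class_quantities, k):
    # Mirrors the loop above: each neighbour contributes its class similarity,
    # already divided by k.
    level = 0.0
    for neigh_label, neigh_q in neighbour_counts.items():
        similarity = (min(class_quantities[class_name], class_quantities[neigh_label])
                      / max(class_quantities[class_name], class_quantities[neigh_label]))
        level += neigh_q * similarity / k
    return level

neighbours = Counter({0: 3, 1: 1})  # the Counter used in test_calculating_safe_levels_for_sample
# Balanced data (10 vs 10): (3 * 1 + 1 * 1) / 5 = 0.8
print(sample_safe_level(0, neighbours, Counter({0: 10, 1: 10}), k=5))
# Imbalanced data (14 vs 6): (3 * 1 + 1 * 6/14) / 5 ≈ 0.685714
print(sample_safe_level(0, neighbours, Counter({0: 14, 1: 6}), k=5))
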
31 changes: 14 additions & 17 deletions multi_imbalance/resampling/tests/test_soup.py
@@ -30,7 +30,7 @@
])

y_balanced = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
y_balanced_first_sample_safe_level = 1
y_balanced_first_sample_safe_level = 0.8
y_balanced_0_class_safe_levels = defaultdict(float,
{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0,
9: 1.0})
@@ -39,7 +39,7 @@
18: 1.0, 19: 1.0})

y_imb_easy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1])
y_imb_easy_first_sample_safe_level = 0.7714285714285714
y_imb_easy_first_sample_safe_level = 0.685714
y_imb_easy_0_class_safe_levels = defaultdict(float, {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0,
8: 0.8857142857142858, 9: 1.0, 10: 0.7714285714285714,
11: 0.7714285714285714, 12: 0.6571428571428571,
@@ -49,7 +49,7 @@
16: 0.7714285714285714, 18: 0.8857142857142858, 19: 0.8857142857142858})

y_imb_hard = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0])
y_imb_hard_first_sample_safe_level = 0.7714285714285714
y_imb_hard_first_sample_safe_level = 0.685714
y_imb_hard_quantities_0_class_safe_levels = defaultdict(float, {0: 0.8857142857142858, 1: 0.7714285714285714,
2: 0.8857142857142858, 3: 0.8857142857142858,
4: 0.8857142857142858, 5: 0.7714285714285714,
@@ -70,22 +70,21 @@

safe_levels_test_data = [
# x,y,class_name,expected undersampling,oversampling quantity
(X, y_balanced, 0, 10, 10),
(X, y_balanced, 1, 10, 10),
(X, y_imb_easy, 0, 10, 10),
(X, y_imb_easy, 1, 6, 14),
(X, y_imb_hard, 0, 10, 10),
(X, y_imb_hard, 1, 6, 14,),
(X, y_balanced, 0, 20, 20),
(X, y_balanced, 1, 20, 20),
(X, y_imb_easy, 0, 16, 20),
(X, y_imb_easy, 1, 20, 24),
(X, y_imb_hard, 0, 16, 20),
(X, y_imb_hard, 1, 20, 24),
]


@pytest.fixture()
def soup_mock():
def _get_parametrized_soup(X, y):
clf = SOUP(k=5)
clf.neigh_clf.fit(X)
clf.quantities = Counter(y)
clf.goal_quantity = 10
clf.goal_quantity = clf._calculate_goal_quantity()
return clf

return _get_parametrized_soup
@@ -94,7 +93,7 @@ def _get_parametrized_soup(X, y):
@pytest.mark.parametrize("X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data)
def test_calculating_safe_levels_for_sample(X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock):
clf = soup_mock(X, y)
neighbour_quantities = Counter({0: 3, 1: 2})
neighbour_quantities = Counter({0: 3, 1: 1})

safe_level = clf._calculate_sample_safe_level(0, neighbour_quantities)
assert_array_almost_equal(safe_level, first_sample_safe)
@@ -113,19 +112,17 @@ def test_calculating_safe_levels_for_class(X, y, zero_safe_levels, one_safe_leve


@pytest.mark.parametrize("X, y, class_name, expected_undersampling, expected_oversampling", safe_levels_test_data)
def test_undersample(X, y, class_name, expected_undersampling, expected_oversampling, soup_mock):
def test_oversample(X, y, class_name, expected_undersampling, expected_oversampling, soup_mock):
clf = soup_mock(X, y)
oversampled_X, oversampled_y = list(), list()
clf._oversample(X, y, class_name, oversampled_X, oversampled_y)
oversampled_X, oversampled_y = clf._oversample(X, y, class_name)
assert len(oversampled_X) == expected_oversampling
assert len(oversampled_y) == expected_oversampling


@pytest.mark.parametrize("X, y, class_name, expected_undersampling, expected_oversampling", safe_levels_test_data)
def test_undersample(X, y, class_name, expected_undersampling, expected_oversampling, soup_mock):
clf = soup_mock(X, y)
undersampled_X, undersampled_y = list(), list()
clf._undersample(X, y, class_name, undersampled_X, undersampled_y)
undersampled_X, undersampled_y = clf._undersample(X, y, class_name)
assert len(undersampled_X) == expected_undersampling
assert len(undersampled_y) == expected_undersampling

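
For context, a hedged end-to-end sketch of how the resampler is meant to be used after this change; the import path is assumed from the file path shown in the diff (multi_imbalance/resampling/SOUP.py) and may differ in later revisions of the package.

import numpy as np
from multi_imbalance.resampling.SOUP import SOUP  # assumed path, taken from the diff's file location

rng = np.random.RandomState(0)
X = rng.rand(20, 2)
y = np.array([0] * 14 + [1] * 6)

soup = SOUP(k=5)
resampled_X, resampled_y = soup.fit_transform(X, y)
# fit_transform returns numpy arrays; per the docstring, class sizes are driven
# towards the mean class quantity.
print(resampled_X.shape, np.bincount(resampled_y))
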