Add Gmm sampler #98

Merged 9 commits on May 20, 2023
Changes from 1 commit
add more tests
mateusz-wozny committed Apr 23, 2023
commit 263a7225ef8a34218b3cf7a8ccba8723f3142a86
8 changes: 4 additions & 4 deletions multi_imbalance/resampling/gmm_sampler.py
@@ -279,7 +279,7 @@ def _construct_neighborhood(self, X: np.ndarray, y: np.ndarray) -> None:
     def _check_sample_neighborhood(self, sample_class: int, neigh_counts: Counter[int]) -> float:
         neighborhood = 0.0
         for neigh_class, count in neigh_counts.items():
-            class_sizes: List = [
+            class_sizes = [
                 self.class_sizes[sample_class],
                 self.class_sizes[neigh_class],
             ]
@@ -388,12 +388,12 @@ def _get_coefficients(self, gaussian_mixture: GaussianMixture) -> Tuple[np.ndarr
         if self.covariance_type == "tied":
             covariances = np.array([covariances] * gaussian_mixture.n_components)
         elif self.covariance_type == "diag":
-            cov_list: List = []
+            cov_list = []
             for component in range(gaussian_mixture.n_components):
                 cov_list.append(np.diagflat(covariances[component, :]))
             covariances = np.array(cov_list)
         elif self.covariance_type == "spherical":
-            cov_list: List = []
+            cov_list = []
             for component in range(gaussian_mixture.n_components):
                 var = np.array([covariances[component]] * self.__x_subset.shape[1])
                 cov_list.append(np.diagflat(var))
@@ -431,7 +431,7 @@ def _compute_mdist(self, in_data: np.ndarray, mean: np.ndarray, covariance: np.n
             mdist = cdist(data, [mean], metric="mahalanobis", VI=np.linalg.inv(covariance))[:, 0]
         except Exception as e:
             self.__logger.error("Can't compute 'cdist' function. Distance threshold is set to 2.0")
-            self.__logger.info(f"For more information, examine exception: {e}")
+            self.__logger.error(f"For more information, examine exception: {e}")
             mdist = np.full_like(in_data, fill_value=2.0)[:, 0]
         return mdist

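Aside on the hunk above: _compute_mdist falls back to a fixed distance of 2.0 whenever cdist fails (for example, on a singular covariance matrix), and this commit raises the second log message from INFO to ERROR. Below is a minimal standalone sketch of that fallback pattern, assuming an illustrative helper name rather than the library's internal API.

import logging

import numpy as np
from scipy.spatial.distance import cdist

logger = logging.getLogger(__name__)


def mahalanobis_or_fallback(data: np.ndarray, mean: np.ndarray, covariance: np.ndarray) -> np.ndarray:
    # Sketch of the pattern in the diff: Mahalanobis distances, with a constant
    # threshold of 2.0 when the covariance matrix cannot be inverted.
    try:
        return cdist(data, [mean], metric="mahalanobis", VI=np.linalg.inv(covariance))[:, 0]
    except Exception as exc:
        logger.error("Can't compute 'cdist' function. Distance threshold is set to 2.0")
        logger.error(f"For more information, examine exception: {exc}")
        return np.full(len(data), 2.0)


# A rank-deficient covariance (all ones) triggers the fallback, as in test_compute_mdist below.
x = np.random.default_rng(0).normal(size=(4, 2))
print(mahalanobis_or_fallback(x, np.zeros(2), np.ones((2, 2))))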
1 change: 1 addition & 0 deletions pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "matplotlib~=3.6.2",
     "click~=8.1.3",
     "scikit-posthocs~=0.7.0",
+    "pydantic~=1.10.5",
 ]

 [project.optional-dependencies]
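The diff adds pydantic~=1.10.5 as a runtime dependency but does not show where it is used; pydantic 1.x is commonly used to validate constructor parameters. The sketch below is hypothetical (the model and field names are illustrative, not taken from this PR) and only reuses the error message asserted in the new tests.

from typing import List, Optional

from pydantic import BaseModel, validator


class GMMSamplerParams(BaseModel):
    # Hypothetical parameter model; the field names mirror keyword arguments
    # exercised in the tests below, not the sampler's actual schema.
    undersample: bool = True
    strategy: str = "median"
    filter_new: float = 1.0
    minority_classes: Optional[List[int]] = None

    @validator("strategy")
    def strategy_must_be_known(cls, value: str) -> str:
        if value not in ("median", "average"):
            raise ValueError(f'Unrecognized {value}. Only "median" and "average" are allowed.')
        return value


print(GMMSamplerParams(strategy="average").dict())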
99 changes: 90 additions & 9 deletions tests/resampling/test_gmm_sampler.py
@@ -1,6 +1,7 @@
+import logging
 import numpy as np
 import pytest
-from collections import Counter
+from collections import Counter, OrderedDict

 from multi_imbalance.resampling.gmm_sampler import GMMSampler

@@ -45,8 +46,8 @@

 @pytest.fixture()
 def gmm_sampler_mock():
-    def _get_parametrized_gmm_sampler(X, y, undersample):
-        gmm_sampler = GMMSampler(undersample=undersample)
+    def _get_parametrized_gmm_sampler(*args, **kwargs) -> GMMSampler:
+        gmm_sampler = GMMSampler(*args, **kwargs)
         return gmm_sampler

     return _get_parametrized_gmm_sampler
@@ -57,13 +58,14 @@ def get_goal_quantity(y):
     return np.mean((quantities[minority_class], quantities[majority_class]), dtype=int)


+@pytest.mark.parametrize("strategy, filter_new", [("median", 1.0), ("average", 0.0)])
 @pytest.mark.parametrize("X, y", complete_test_data)
-def test_output_length_with_undersample(X, y, gmm_sampler_mock):
-    gmm_sampler = gmm_sampler_mock(X, y, True)
+def test_output_length_with_undersample(X, y, strategy, filter_new, gmm_sampler_mock):
+    gmm_sampler = gmm_sampler_mock(undersample=True, strategy=strategy, filter_new=filter_new)
     resampled_X, resampled_y = gmm_sampler.fit_resample(X, y)

     y_resampled_count = Counter(resampled_y)
-    for _, quantity in y_resampled_count.items():
+    for quantity in y_resampled_count.values():
         assert quantity == get_goal_quantity(y)

     assert len(resampled_X) == get_goal_quantity(y) * num_classes
@@ -72,8 +74,8 @@ def test_output_length_with_undersample(X, y, gmm_sampler_mock):

 @pytest.mark.parametrize("X, y", complete_test_data)
 def test_output_length_without_undersample(X, y, gmm_sampler_mock):
-    gmm_sampler = gmm_sampler_mock(X, y, False)
-    resampled_X, resampled_y = gmm_sampler.fit_resample(X, y)
+    gmm_sampler = gmm_sampler_mock(undersample=False)
+    _, resampled_y = gmm_sampler.fit_resample(X, y)

     y_count = Counter(y)
     y_resampled_count = Counter(resampled_y)
@@ -83,9 +85,88 @@ def test_output_length_without_undersample(X, y, gmm_sampler_mock):


 def test_perform_step_condition(gmm_sampler_mock):
-    gmm_sampler = GMMSampler()
+    gmm_sampler = gmm_sampler_mock()
     assert gmm_sampler._perform_step(n_components=2, likelihood=1.0, num_samples=3)
     assert not gmm_sampler._perform_step(n_components=2, likelihood=-1.0, num_samples=3)
     assert not gmm_sampler._perform_step(n_components=2, likelihood=1.0, num_samples=1)
     gmm_sampler.max_components = 1
     assert not gmm_sampler._perform_step(n_components=4, likelihood=1.0, num_samples=3)
+
+
+def test_minority_classes(gmm_sampler_mock):
+    minority_classes = [0, 1]
+    gmm_sampler = gmm_sampler_mock(minority_classes=minority_classes)
+
+    assert gmm_sampler.minority_classes == minority_classes
+
+
+@pytest.mark.parametrize(
+    "maj_int_min, expected_size",
+    [
+        ({"maj": [], "int": [], "min": [1]}, 6),
+        ({"maj": [1], "int": [], "min": []}, 6),
+        ({"maj": [], "int": [1], "min": []}, 6),
+    ],
+)
+def test_set_size_to_align(gmm_sampler_mock, expected_size, maj_int_min):
+    gmm_sampler = gmm_sampler_mock()
+    gmm_sampler.class_sizes = Counter(y_imb_hard)
+    gmm_sampler.maj_int_min = OrderedDict(maj_int_min)
+
+    gmm_sampler._set_size_to_align()
+    assert gmm_sampler.size_to_align == expected_size
+
+
+def test_compute_mdist(gmm_sampler_mock, caplog):
+    caplog.set_level(logging.ERROR)
+    gmm_sampler = gmm_sampler_mock()
+    mean = [0, 0]
+    covariance = np.eye(2)
+    x = np.random.multivariate_normal(mean, covariance, size=2)
+
+    gmm_sampler._compute_mdist(x, mean, np.ones((2, 2)))
+
+    no_exception_check = 0
+    for record in caplog.records:
+        if record.levelname == "ERROR":
+            msg = record.message
+            no_exception_check += (
+                msg == "Can't compute 'cdist' function. Distance threshold is set to 2.0"
+                or msg == "For more information, examine exception: Singular matrix"
+            )
+
+    assert no_exception_check == 2
+
+
+@pytest.mark.parametrize(
+    "strategy, class_count, expected_middle_size",
+    [("median", [10, 6, 4], 6), ("median", [4, 12, 4], 4), ("average", [4, 4, 12], 6), ("average", [8, 4, 8], 6)],
+)
+def test_get_middle_size_based_on_strategy(strategy, class_count, expected_middle_size, gmm_sampler_mock):
+    gmm_sampler = gmm_sampler_mock(strategy=strategy)
+
+    gmm_sampler._fit(X, np.array([*[0] * class_count[0], *[1] * class_count[1], *[2] * class_count[2]]))
+    middle_size = gmm_sampler._get_middle_size_based_on_strategy()
+    assert middle_size == expected_middle_size
+
+
+def test_get_middle_size_based_on_strategy_exception(gmm_sampler_mock):
+    gmm_sampler = gmm_sampler_mock()
+    gmm_sampler.strategy = "min"
+
+    with pytest.raises(ValueError) as ex:
+        gmm_sampler._get_middle_size_based_on_strategy()
+
+    assert str(ex.value) == 'Unrecognized min. Only "median" and "average" are allowed.'
+
+
+def test_set_size_to_align_exception(gmm_sampler_mock):
+    maj_int_min = {"maj": [], "int": [], "min": []}
+    gmm_sampler = gmm_sampler_mock()
+    gmm_sampler.class_sizes = Counter(y_imb_hard)
+    gmm_sampler.maj_int_min = OrderedDict(maj_int_min)
+
+    with pytest.raises(ValueError) as ex:
+        gmm_sampler._set_size_to_align()
+
+    assert str(ex.value) == "Bad input - can not obtain desire size."
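
Taken together, the new tests pin down the sampler surface this PR relies on: the GMMSampler constructor keywords (undersample, strategy, filter_new, minority_classes) and fit_resample. Below is a short usage sketch with illustrative toy data; defaults and exact behaviour may differ in the library itself.

from collections import Counter

import numpy as np

from multi_imbalance.resampling.gmm_sampler import GMMSampler

# Toy imbalanced data set: 30 / 10 / 5 samples across three classes.
rng = np.random.default_rng(42)
X = np.vstack([
    rng.normal(0.0, 1.0, size=(30, 2)),
    rng.normal(3.0, 1.0, size=(10, 2)),
    rng.normal(-3.0, 1.0, size=(5, 2)),
])
y = np.array([0] * 30 + [1] * 10 + [2] * 5)

# Keyword arguments mirror those used in the tests; other parameters keep their defaults.
sampler = GMMSampler(undersample=True, strategy="median", filter_new=1.0)
X_resampled, y_resampled = sampler.fit_resample(X, y)

print("before:", Counter(y))
print("after: ", Counter(y_resampled))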