add MC-CCR implementation

damianhorna · dddddddddtd · Jun 21, 2023 · Jan 6, 2023 · Jan 6, 2023 · Jan 6, 2023
commit 434809d8d090531c9a38ddb7eb1d74871a070997
diff --git a/examples/resampling/CCR.ipynb b/examples/resampling/CCR.ipynb
diff --git a/multi_imbalance/resampling/ccr.py b/multi_imbalance/resampling/ccr.py
@@ -1,3 +1,4 @@
+from collections import Counter
 from typing import Tuple
 
 import numpy as np
@@ -42,20 +43,22 @@ def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.nd
  minority_examples = X[y == minority_class]
  majority_examples = X[y != minority_class]
 
- clean_majority, synthetic_minority = self.clean_and_generate(minority_examples, majority_examples)
+ clean_majority, synthetic_minority = self._clean_and_generate(minority_examples, majority_examples)
 
  return np.vstack([minority_examples, clean_majority, synthetic_minority]), np.hstack([
  np.full((minority_examples.shape[0],), minority_class),
  y[y != minority_class],
  np.full((synthetic_minority.shape[0],), minority_class)
  ])
 
- def clean_and_generate(self, minority_examples: np.ndarray, majority_examples: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ def _clean_and_generate(self, minority_examples: np.ndarray, majority_examples: np.ndarray, synthetic_examples_total: int = None) -> Tuple[np.ndarray, np.ndarray]:
  """
  :param minority_examples:
  two-dimensional numpy array (number of samples x number of features) with float numbers of minority class
  :param majority_examples:
  two-dimensional numpy array (number of samples x number of features) with float numbers of majority class
+ :param synthetic_examples_total:
+ number of synthetic examples to be generated, if left as None it is calculated as difference of class counts
  :return:
  clean majority X, synthetic minority X
  """
@@ -102,19 +105,116 @@ def clean_and_generate(self, minority_examples: np.ndarray, majority_examples: n
 
  clean_majority_examples += t
 
- number_of_synthetic_examples = majority_examples.shape[0] - minority_examples.shape[0]
- inverse_radius_sum = (r ** -1).sum()
+ generation_order = r.argsort()
+ if synthetic_examples_total is None:
+ synthetic_examples_total = majority_examples.shape[0] - minority_examples.shape[0]
+ synthetic_examples_counts = (r ** -1 / (r ** -1).sum()) * synthetic_examples_total
+ synthetic_leftovers = int((synthetic_examples_counts - synthetic_examples_counts.astype(int)).sum())
+ synthetic_examples_counts = np.floor(synthetic_examples_counts).astype(int)
+
+ for i in range(synthetic_leftovers):
+ synthetic_examples_counts[generation_order[i % len(generation_order)]] += 1
 
  generated = []
- for i, x in enumerate(minority_examples):
- synthetic_examples = int(np.round(r[i] ** -1 / inverse_radius_sum * number_of_synthetic_examples))
- for j in range(synthetic_examples):
+ for i in generation_order:
+ x = minority_examples[i]
+ for j in range(synthetic_examples_counts[i]):
  random_translation = np.random.rand(majority_examples.shape[1]) * 2 - 1
  multiplier = random_translation / abs(random_translation).sum()
  new_point = x + multiplier * r[i] * np.random.rand(1)
  generated.append(new_point)
- generated = np.vstack(generated)
+
+ if len(generated) == synthetic_examples_total:
+ break
+ if len(generated) == synthetic_examples_total:
+ break
+
+ if len(generated) > 0:
+ generated = np.array(generated)
+ else:
+ generated = np.empty((0, minority_examples.shape[1]))
+
  return clean_majority_examples, generated
 
  def distances(self, minority_example, majority_examples):
  return (abs(minority_example - majority_examples)).sum(1)
+
+
+class MultiClassCCR(BaseSampler):
+ """
+ CCR for multi-class problems.
+
+ The approach consists of the following steps:
+ 1. The classes are sorted in the descending order by the number of associated observations.
+ 2. For each of the minority classes, a collection of combined majority observations is constructed, consisting of
+ a randomly sampled fraction of observations from each of the already considered class.
+ 3. Preprocessing with the CCR algorithm is performed, using the observations from the currently considered class
+ as a minority, and the combined majority observations as the majority class. Both the generated synthetic
+ minority observations and the applied translations are incorporated into the original data, and the synthetic
+ observations can be used to construct the collection of combined majority observations for later classes.
+
+ Koziarski, M., Wozniak, M., Krawczyk, B.: Combined Cleaning and Resampling Algorithm for Multi-Class Imbalanced
+ Data with Label Noise. (2020)
+ """
+
+ def __init__(self, energy: float):
+ """
+ :param energy:
+ initial energy budget for each minority example to use for sphere expansion
+ """
+ super().__init__()
+ self._sampling_type = "over-sampling"
+ self.CCR = CCR(energy=energy)
+
+ def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ """
+ :param X:
+ two-dimensional numpy array (number of samples x number of features) with float numbers
+ :param y:
+ one-dimensional numpy array with labels for rows in X, assumes minority class = 1 and majority class = 0
+ :return:
+ resampled X, resampled y
+ """
+ sorted_class_counts = sorted(list(Counter(y).items()), key=lambda x: x[1], reverse=True)
+ n_max = sorted_class_counts[0][1]
+ class_X = {clazz: X[y == clazz] for clazz, _ in sorted_class_counts}
+
+ for i in range(1, len(sorted_class_counts)):
+ current_class, current_class_count = sorted_class_counts[i]
+ number_of_classes_with_higher_count = self._number_of_classes_with_higher_count(sorted_class_counts, i)
+ if number_of_classes_with_higher_count > 0:
+ X_minority = class_X[current_class]
+ X_majority = []
+ class_samples = []
+ for clazz, _ in sorted_class_counts[:i]:
+ if clazz != current_class:
+ sampled_X = class_X[clazz]
+ sampled_size = sampled_X.shape[0]
+ sample_size = int(n_max / number_of_classes_with_higher_count)
+ sample_size = min(sample_size, sampled_size)
+ sample = np.random.choice(sampled_size, sample_size, replace=False)
+ class_samples.append((clazz, sample))
+ X_majority.append(sampled_X[sample])
+ X_majority = np.concatenate(X_majority)
+ clean_X_majority, synthetic_minority = self.CCR._clean_and_generate(X_minority, X_majority,
+ n_max - current_class_count)
+ class_X[current_class] = np.vstack([class_X[current_class], synthetic_minority])
+ clean_X_splits = [sample.shape[0] for _, sample in class_samples[:-1]]
+ for j in range(1, len(clean_X_splits)):
+ clean_X_splits[j] += clean_X_splits[j - 1]
+ split_clean_X = np.split(clean_X_majority, clean_X_splits)
+
+ for j, (clazz, sample) in enumerate(class_samples):
+ class_X[clazz][sample] = split_clean_X[j]
+
+ final_X = np.vstack([class_X[clazz] for clazz, _ in sorted_class_counts])
+ final_y = np.hstack([np.full((class_X[clazz].shape[0],), clazz) for clazz, _ in sorted_class_counts])
+ return final_X, final_y
+
+ def _number_of_classes_with_higher_count(self, sorted_class_counts, i):
+ number_of_classes_with_higher_count = 0
+ _, current_class_count = sorted_class_counts[i]
+ for _, class_count in sorted_class_counts[:i]:
+ if class_count > current_class_count:
+ number_of_classes_with_higher_count += 1
+ return number_of_classes_with_higher_count
diff --git a/tests/resampling/test_ccr.py b/tests/resampling/test_ccr.py
@@ -44,7 +44,7 @@ def _get_parametrized_ccr():
 
 @pytest.mark.parametrize("X, y", nop_test_data)
 def test_compare_cleaning_results_to_original_article_implementation(X, y, ccr_mock):
- clf = ccr_mock(X, y)
- oversampled_X, oversampled_y = clf.fit_resample(X, y)
- assert np.array_equiv(np.sort(oversampled_X[:X.shape[0]], axis=0),
+ clf = ccr_mock()
+ resampled_X, resampled_y = clf.fit_resample(X, y)
+ assert np.array_equiv(np.sort(resampled_X[:X.shape[0]], axis=0),
  np.sort(original_cleaning_results, axis=0)) == True