From 5d060b0d90ef279cdd75da0f5b24352e8f8a8723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Thu, 3 Nov 2022 13:04:48 +0100 Subject: [PATCH 01/48] black --- docs/conf.py | 19 +- examples/ensemble/SOUPBagging.ipynb | 15 +- examples/ensemble/mrbbagging.ipynb | 4 +- examples/ensemble/mrbbagging_pipeline.ipynb | 2 +- examples/example_ecml_presentation.ipynb | 90 +++++--- examples/resampling/GlobalCS.ipynb | 34 ++- examples/resampling/MDO.ipynb | 42 ++-- examples/resampling/SOUP.ipynb | 35 ++- examples/resampling/spider3.ipynb | 55 +++-- examples/use_case.ipynb | 48 ++-- multi_imbalance/__init__.py | 2 +- multi_imbalance/datasets/__init__.py | 2 +- multi_imbalance/datasets/_data_loader.py | 31 ++- .../datasets/tests/test_data_loader.py | 34 +-- multi_imbalance/ensemble/ecoc.py | 197 ++++++++++++----- multi_imbalance/ensemble/mrbbagging.py | 63 ++++-- multi_imbalance/ensemble/ovo.py | 93 +++++--- multi_imbalance/ensemble/soup_bagging.py | 56 +++-- multi_imbalance/ensemble/tests/test_ecoc.py | 155 ++++++++----- .../ensemble/tests/test_mrbbagging.py | 92 +++++--- multi_imbalance/ensemble/tests/test_ovo.py | 137 +++++++----- .../ensemble/tests/test_soupbagging.py | 81 ++++--- multi_imbalance/resampling/global_cs.py | 16 +- multi_imbalance/resampling/mdo.py | 90 +++++--- multi_imbalance/resampling/soup.py | 70 ++++-- multi_imbalance/resampling/spider.py | 49 +++-- multi_imbalance/resampling/static_smote.py | 9 +- .../resampling/tests/test_globalcs.py | 46 ++-- multi_imbalance/resampling/tests/test_mdo.py | 85 ++++--- multi_imbalance/resampling/tests/test_soup.py | 208 +++++++++++++----- .../resampling/tests/test_spider.py | 36 ++- .../resampling/tests/test_static_smote.py | 10 +- multi_imbalance/utils/data.py | 46 ++-- multi_imbalance/utils/min_int_maj.py | 38 ++-- multi_imbalance/utils/plot.py | 44 +++- multi_imbalance/utils/tests/test_data.py | 85 ++++--- setup.py | 16 +- 37 files changed, 1398 insertions(+), 737 deletions(-) diff --git 
a/docs/conf.py b/docs/conf.py index 29a80d6..8079333 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,17 +13,18 @@ import os import sys import sphinx_rtd_theme -sys.path.insert(0, os.path.abspath('../')) + +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -master_doc = 'index' -project = 'multi-imbalance' -copyright = '2020, Damian Horna, Jacek Grycza, Hanna Klimczak, Kamil Pluciński' -author = 'Damian Horna, Jacek Grycza, Hanna Klimczak, Kamil Pluciński' +master_doc = "index" +project = "multi-imbalance" +copyright = "2020, Damian Horna, Jacek Grycza, Hanna Klimczak, Kamil Pluciński" +author = "Damian Horna, Jacek Grycza, Hanna Klimczak, Kamil Pluciński" # The full version, including alpha/beta/rc tags -release = '0.0.4' +release = "0.0.4" # -- General configuration --------------------------------------------------- @@ -40,12 +41,12 @@ html_theme = "sphinx_rtd_theme" # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # -- Options for HTML output ------------------------------------------------- @@ -65,4 +66,4 @@ # for mod_name in MOCK_MODULES: # sys.modules[mod_name] = mock.Mock() -autoclass_content = 'both' # for including __init__ in docs +autoclass_content = "both" # for including __init__ in docs diff --git a/examples/ensemble/SOUPBagging.ipynb b/examples/ensemble/SOUPBagging.ipynb index 5daa2e5..9a31b1d 100644 --- a/examples/ensemble/SOUPBagging.ipynb +++ b/examples/ensemble/SOUPBagging.ipynb @@ -30,7 +30,7 @@ "from multi_imbalance.utils.min_int_maj import maj_int_min\n", "\n", "%matplotlib inline\n", - "sns.set_style('darkgrid')" + "sns.set_style(\"darkgrid\")" ], "metadata": { "collapsed": false, @@ -57,7 +57,7 @@ } ], "source": [ - "dataset = load_datasets()['new_ecoli']\n", + "dataset = load_datasets()[\"new_ecoli\"]\n", "\n", "X, y = dataset.data, dataset.target\n", "print(X[:5])\n", @@ -75,7 +75,7 @@ "execution_count": 6, "outputs": [], "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25)" + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)" ], "metadata": { "collapsed": false, @@ -99,7 +99,9 @@ ], "source": [ "clf = KNeighborsClassifier()\n", - "vote_classifier = SOUPBagging(clf, n_classifiers=50, maj_int_min=maj_int_min['new_ecoli'])\n", + "vote_classifier = SOUPBagging(\n", + " clf, n_classifiers=50, maj_int_min=maj_int_min[\"new_ecoli\"]\n", + ")\n", "vote_classifier.fit(X_train, y_train)\n", "y_pred = vote_classifier.predict(X_test)\n", "geometric_mean_score(y_test, y_pred, correction=0.001)" @@ -140,11 +142,10 @@ } ], "source": [ - "X, y = load_arff_dataset(f'{os.getcwd()}/../../data/arff/new_ecoli.arff')\n", + "X, y = load_arff_dataset(f\"{os.getcwd()}/../../data/arff/new_ecoli.arff\")\n", "clf = make_pipeline(StandardScaler(), SOUPBagging())\n", "cv = ShuffleSplit(n_splits=5, 
test_size=0.3, random_state=0)\n", - "print(cross_val_score(clf, X, y, cv=cv))\n", - "\n" + "print(cross_val_score(clf, X, y, cv=cv))" ], "metadata": { "collapsed": false, diff --git a/examples/ensemble/mrbbagging.ipynb b/examples/ensemble/mrbbagging.ipynb index 8a33abd..df9d53f 100644 --- a/examples/ensemble/mrbbagging.ipynb +++ b/examples/ensemble/mrbbagging.ipynb @@ -29,7 +29,9 @@ "# initialization of MEBBagging object\n", "mrbbagging = MRBBagging(30, tree.DecisionTreeClassifier())\n", "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.33, random_state=42\n", + ")\n", "\n", "# train the MRBBagging classifier\n", "mrbbagging.fit(X_train, y_train)\n", diff --git a/examples/ensemble/mrbbagging_pipeline.ipynb b/examples/ensemble/mrbbagging_pipeline.ipynb index dfc6463..70cf341 100644 --- a/examples/ensemble/mrbbagging_pipeline.ipynb +++ b/examples/ensemble/mrbbagging_pipeline.ipynb @@ -43,7 +43,7 @@ "from multi_imbalance.utils.data import load_arff_dataset\n", "\n", "# an example of how mrbbagging can be used in sklearn pipeline\n", - "X, y = load_arff_dataset(f'{os.getcwd()}/../../data/arff/new_ecoli.arff')\n", + "X, y = load_arff_dataset(f\"{os.getcwd()}/../../data/arff/new_ecoli.arff\")\n", "clf = make_pipeline(StandardScaler(), MRBBagging(30, tree.DecisionTreeClassifier()))\n", "cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)\n", "print(cross_val_score(clf, X, y, cv=cv))" diff --git a/examples/example_ecml_presentation.ipynb b/examples/example_ecml_presentation.ipynb index e916873..e6d69eb 100644 --- a/examples/example_ecml_presentation.ipynb +++ b/examples/example_ecml_presentation.ipynb @@ -31,11 +31,11 @@ "import pandas as pd\n", "import seaborn as sns\n", "\n", - "seed=0\n", + "seed = 0\n", "np.random.seed(seed)\n", "\n", - "sns.set_style('white')\n", - "sns.set(rc={'figure.figsize':(12,7)})\n", + 
"sns.set_style(\"white\")\n", + "sns.set(rc={\"figure.figsize\": (12, 7)})\n", "sns.set_context(\"notebook\", font_scale=1.5)" ] }, @@ -72,12 +72,19 @@ "datasets = load_datasets_arff(return_non_cat_length=True)\n", "\n", "scores = defaultdict(lambda: defaultdict(dict))\n", - "datasets_to_compare = ['glass', 'flare', 'new_ecoli', 'new_winequality-red', 'new_yeast', 'balance-scale']\n", + "datasets_to_compare = [\n", + " \"glass\",\n", + " \"flare\",\n", + " \"new_ecoli\",\n", + " \"new_winequality-red\",\n", + " \"new_yeast\",\n", + " \"balance-scale\",\n", + "]\n", "for dataset_name in tqdm(datasets_to_compare):\n", " dataset_values = datasets[dataset_name]\n", " X, y = dataset_values.data, dataset_values.target\n", " non_cat_length = dataset_values.non_cat_length\n", - " for method in ['Tree', 'SOUP', 'MDO', 'OVO', 'MRBB']:\n", + " for method in [\"Tree\", \"SOUP\", \"MDO\", \"OVO\", \"MRBB\"]:\n", " k_fold_score = list()\n", " for i in range(10):\n", " skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)\n", @@ -86,23 +93,40 @@ " y_train, y_test = y[train_index], y[test_index]\n", " if non_cat_length > 0:\n", " normalizer = StandardScaler().fit(X_train[:, :non_cat_length])\n", - " X_train[:, :non_cat_length] = normalizer.transform(X_train[:, :non_cat_length])\n", - " X_test[:, :non_cat_length] = normalizer.transform(X_test[:, :non_cat_length])\n", + " X_train[:, :non_cat_length] = normalizer.transform(\n", + " X_train[:, :non_cat_length]\n", + " )\n", + " X_test[:, :non_cat_length] = normalizer.transform(\n", + " X_test[:, :non_cat_length]\n", + " )\n", "\n", - " if method == 'SOUP':\n", + " if method == \"SOUP\":\n", " soup = SOUP(k=7, maj_int_min=maj_int_min[dataset_name])\n", - " X_train_resampled, y_train_resampled = soup.fit_resample(np.copy(X_train), np.copy(y_train))\n", - " elif method == 'MDO':\n", - " mdo = MDO(k=5, k1_frac=0.4, maj_int_min=maj_int_min[dataset_name], seed=seed)\n", - " X_train_resampled, y_train_resampled = 
mdo.fit_resample(np.copy(X_train), np.copy(y_train))\n", + " X_train_resampled, y_train_resampled = soup.fit_resample(\n", + " np.copy(X_train), np.copy(y_train)\n", + " )\n", + " elif method == \"MDO\":\n", + " mdo = MDO(\n", + " k=5,\n", + " k1_frac=0.4,\n", + " maj_int_min=maj_int_min[dataset_name],\n", + " seed=seed,\n", + " )\n", + " X_train_resampled, y_train_resampled = mdo.fit_resample(\n", + " np.copy(X_train), np.copy(y_train)\n", + " )\n", " else:\n", " X_train_resampled, y_train_resampled = X_train, y_train\n", "\n", - " if method == 'OVO':\n", - " clf = OVO(binary_classifier=DecisionTreeClassifier(random_state=seed),\n", - " preprocessing=SOUP(maj_int_min=maj_int_min[dataset_name]))\n", - " elif method == 'MRBB':\n", - " clf = MRBBagging(k=100, learning_algorithm=DecisionTreeClassifier())\n", + " if method == \"OVO\":\n", + " clf = OVO(\n", + " binary_classifier=DecisionTreeClassifier(random_state=seed),\n", + " preprocessing=SOUP(maj_int_min=maj_int_min[dataset_name]),\n", + " )\n", + " elif method == \"MRBB\":\n", + " clf = MRBBagging(\n", + " k=100, learning_algorithm=DecisionTreeClassifier()\n", + " )\n", " else:\n", " clf = DecisionTreeClassifier(random_state=i)\n", "\n", @@ -137,7 +161,7 @@ "outputs": [], "source": [ "df = pd.DataFrame(scores).T\n", - "df.to_csv(f'results.csv')" + "df.to_csv(f\"results.csv\")" ] }, { @@ -160,11 +184,10 @@ } ], "source": [ - "\n", - "df = read_csv('results.csv')\n", - "df.columns = ['Dataset', 'Baseline', 'SOUP', 'MDO','OVO + SOUP', 'MRBBagging']\n", - "df = df.set_index('Dataset').T\n", - "df.columns = ['glass', 'flare', 'ecoli', 'winequality-red', 'yeast', 'balance-scale']\n", + "df = read_csv(\"results.csv\")\n", + "df.columns = [\"Dataset\", \"Baseline\", \"SOUP\", \"MDO\", \"OVO + SOUP\", \"MRBBagging\"]\n", + "df = df.set_index(\"Dataset\").T\n", + "df.columns = [\"glass\", \"flare\", \"ecoli\", \"winequality-red\", \"yeast\", \"balance-scale\"]\n", "df = df.T\n", "df" ] @@ -184,9 +207,12 @@ } ], "source": [ 
- "df_rank = df.rank(axis=1,ascending=True).astype(int)\n", - "df_meanrank = pd.DataFrame(df_rank.mean().sort_values(ascending=False),columns=['Mean ranking (the higher the better)']).round(2)\n", - "df_meanrank.index.name = 'Method'\n", + "df_rank = df.rank(axis=1, ascending=True).astype(int)\n", + "df_meanrank = pd.DataFrame(\n", + " df_rank.mean().sort_values(ascending=False),\n", + " columns=[\"Mean ranking (the higher the better)\"],\n", + ").round(2)\n", + "df_meanrank.index.name = \"Method\"\n", "df_meanrank" ], "metadata": { @@ -210,7 +236,13 @@ } ], "source": [ - "g = sns.barplot(x='Mean ranking (the higher the better)',y=df_meanrank.index, data=df_meanrank, palette=['grey'], dodge=True)\n", + "g = sns.barplot(\n", + " x=\"Mean ranking (the higher the better)\",\n", + " y=df_meanrank.index,\n", + " data=df_meanrank,\n", + " palette=[\"grey\"],\n", + " dodge=True,\n", + ")\n", "g.set(ylabel=None)\n", "g.set(xlabel=None)\n", "plt.show()" @@ -226,9 +258,7 @@ "cell_type": "code", "execution_count": 6, "outputs": [], - "source": [ - "\n" - ], + "source": [], "metadata": { "collapsed": false, "pycharm": { diff --git a/examples/resampling/GlobalCS.ipynb b/examples/resampling/GlobalCS.ipynb index cb468e0..db44f63 100644 --- a/examples/resampling/GlobalCS.ipynb +++ b/examples/resampling/GlobalCS.ipynb @@ -49,10 +49,10 @@ "from multi_imbalance.utils.data import construct_flat_2pc_df\n", "\n", "%matplotlib inline\n", - "sns.set_style('darkgrid')\n", + "sns.set_style(\"darkgrid\")\n", "\n", - "df = load_datasets()['new_ecoli']\n", - "X, y = df['data'], df['target']\n", + "df = load_datasets()[\"new_ecoli\"]\n", + "X, y = df[\"data\"], df[\"target\"]\n", "print(X[:5])\n", "print(y[:5])" ], @@ -131,21 +131,41 @@ "pca.fit(X)\n", "\n", "fig, axs = plt.subplots(ncols=2, nrows=2)\n", - "fig.set_size_inches( 16, 10)\n", + "fig.set_size_inches(16, 10)\n", "axs = axs.flatten()\n", "\n", "axs[1].set_title(\"Base\")\n", "sns.countplot(y, ax=axs[0], palette=p)\n", "X = 
pca.transform(X)\n", "df = construct_flat_2pc_df(X, y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[1],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")\n", "\n", "\n", "axs[3].set_title(\"MDO\")\n", - "sns.countplot(resampled_y, ax=axs[2],palette=p)\n", + "sns.countplot(resampled_y, ax=axs[2], palette=p)\n", "resampled_X = pca.transform(resampled_X)\n", "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[3], legend='full', palette=p)" + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[3],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")" ], "metadata": { "collapsed": false, diff --git a/examples/resampling/MDO.ipynb b/examples/resampling/MDO.ipynb index 364c3fd..450a37a 100644 --- a/examples/resampling/MDO.ipynb +++ b/examples/resampling/MDO.ipynb @@ -49,12 +49,12 @@ "from multi_imbalance.utils.min_int_maj import maj_int_min\n", "\n", "%matplotlib inline\n", - "sns.set_style('darkgrid')\n", + "sns.set_style(\"darkgrid\")\n", "\n", "\n", "dataset = load_datasets()\n", "print(dataset.keys())\n", - "dataset = dataset['new_ecoli']\n", + "dataset = dataset[\"new_ecoli\"]\n", "X, y = dataset.data, dataset.target\n", "print(X[:5])\n", "print(y[:5])" @@ -77,7 +77,7 @@ }, "outputs": [], "source": [ - "clf = MDO(k=5, k1_frac=0.3, maj_int_min=maj_int_min['new_ecoli'])\n", + "clf = MDO(k=5, k1_frac=0.3, maj_int_min=maj_int_min[\"new_ecoli\"])\n", "resampled_X, resampled_y = clf.fit_resample(X, y)" ] }, @@ -122,21 +122,41 @@ "pca.fit(X)\n", "\n", "fig, axs = plt.subplots(ncols=2, nrows=2)\n", - "fig.set_size_inches( 16, 10)\n", + "fig.set_size_inches(16, 10)\n", "axs = 
axs.flatten()\n", "\n", "axs[1].set_title(\"Base\")\n", "sns.countplot(y, ax=axs[0], palette=p)\n", "X = pca.transform(X)\n", "df = construct_flat_2pc_df(X, y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[1],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")\n", "\n", "\n", "axs[3].set_title(\"MDO\")\n", - "sns.countplot(resampled_y, ax=axs[2],palette=p)\n", + "sns.countplot(resampled_y, ax=axs[2], palette=p)\n", "resampled_X = pca.transform(resampled_X)\n", "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[3], legend='full', palette=p)" + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[3],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")" ] }, { @@ -163,14 +183,10 @@ } ], "source": [ - "X, y = load_arff_dataset(f'{os.getcwd()}/../../data/arff/new_ecoli.arff')\n", + "X, y = load_arff_dataset(f\"{os.getcwd()}/../../data/arff/new_ecoli.arff\")\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)\n", "\n", - "pipeline = Pipeline([\n", - " ('scaler', StandardScaler()),\n", - " ('mdo', MDO()),\n", - " ('knn', KNN())\n", - "])\n", + "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"mdo\", MDO()), (\"knn\", KNN())])\n", "\n", "pipeline.fit(X_train, y_train)\n", "y_hat = pipeline.predict(X_test)\n", diff --git a/examples/resampling/SOUP.ipynb b/examples/resampling/SOUP.ipynb index 9eadd0f..52d99b4 100644 --- a/examples/resampling/SOUP.ipynb +++ b/examples/resampling/SOUP.ipynb @@ -50,8 +50,8 @@ "from multi_imbalance.utils.min_int_maj import maj_int_min\n", "\n", "%matplotlib inline\n", - "sns.set_style('darkgrid')\n", - 
"dataset_name = 'new_ecoli'\n", + "sns.set_style(\"darkgrid\")\n", + "dataset_name = \"new_ecoli\"\n", "dataset = load_datasets()[dataset_name]\n", "\n", "X, y = dataset.data, dataset.target\n", @@ -91,7 +91,7 @@ ], "source": [ "clf = SOUP(maj_int_min=maj_int_min[dataset_name], shuffle=False)\n", - "print(maj_int_min['new_ecoli'])\n", + "print(maj_int_min[\"new_ecoli\"])\n", "resampled_X, resampled_y = clf.fit_resample(X, y)" ], "metadata": { @@ -135,7 +135,6 @@ } ], "source": [ - "\n", "n = len(Counter(y).keys())\n", "p = sns.color_palette(\"husl\", n)\n", "\n", @@ -143,21 +142,41 @@ "pca.fit(X)\n", "\n", "fig, axs = plt.subplots(ncols=2, nrows=2)\n", - "fig.set_size_inches( 16, 10)\n", + "fig.set_size_inches(16, 10)\n", "axs = axs.flatten()\n", "\n", "axs[1].set_title(\"Base\")\n", "sns.countplot(y, ax=axs[0], palette=p)\n", "X = pca.transform(X)\n", "df = construct_flat_2pc_df(X, y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[1],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")\n", "\n", "\n", "axs[3].set_title(\"SOUP\")\n", - "sns.countplot(resampled_y, ax=axs[2],palette=p)\n", + "sns.countplot(resampled_y, ax=axs[2], palette=p)\n", "resampled_X = pca.transform(resampled_X)\n", "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[3], legend='full', palette=p)" + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[3],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")" ], "metadata": { "collapsed": false, diff --git a/examples/resampling/spider3.ipynb b/examples/resampling/spider3.ipynb index 3a5b407..b6c4400 100644 --- a/examples/resampling/spider3.ipynb +++ 
b/examples/resampling/spider3.ipynb @@ -55,12 +55,12 @@ "from multi_imbalance.utils.data import construct_flat_2pc_df\n", "\n", "%matplotlib inline\n", - "sns.set_style('darkgrid')\n", + "sns.set_style(\"darkgrid\")\n", "\n", "%matplotlib inline\n", - "sns.set_style('darkgrid')\n", + "sns.set_style(\"darkgrid\")\n", "\n", - "dataset = load_datasets()['new_ecoli']\n", + "dataset = load_datasets()[\"new_ecoli\"]\n", "\n", "X, y = dataset.data, dataset.target\n", "print(X[:5])\n", @@ -105,14 +105,10 @@ "cost = np.random.rand(64).reshape((8, 8)) # np.ones((8, 8))\n", "for i in range(8):\n", " cost[i][i] = 0\n", - " \n", - "maj_int_min = {\n", - " 'maj':[0,1],\n", - " 'int':[4],\n", - " 'min':[2,3]\n", - "}\n", "\n", - "clf = SPIDER3(k=1, maj_int_min = maj_int_min, cost=cost)\n", + "maj_int_min = {\"maj\": [0, 1], \"int\": [4], \"min\": [2, 3]}\n", + "\n", + "clf = SPIDER3(k=1, maj_int_min=maj_int_min, cost=cost)\n", "resampled_X, resampled_y = clf.fit_resample(X, y)" ] }, @@ -150,7 +146,6 @@ } ], "source": [ - "\n", "n = len(Counter(y).keys())\n", "p = sns.color_palette(\"husl\", n)\n", "\n", @@ -158,21 +153,41 @@ "pca.fit(X)\n", "\n", "fig, axs = plt.subplots(ncols=2, nrows=2)\n", - "fig.set_size_inches( 16, 10)\n", + "fig.set_size_inches(16, 10)\n", "axs = axs.flatten()\n", "\n", "axs[1].set_title(\"Base\")\n", "sns.countplot(y, ax=axs[0], palette=p)\n", "X = pca.transform(X)\n", "df = construct_flat_2pc_df(X, y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[1],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")\n", "\n", "\n", "axs[3].set_title(\"Spider\")\n", - "sns.countplot(resampled_y, ax=axs[2],palette=p)\n", + "sns.countplot(resampled_y, ax=axs[2], palette=p)\n", "resampled_X = pca.transform(resampled_X)\n", "df = 
construct_flat_2pc_df(resampled_X, resampled_y)\n", - "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[3], legend='full', palette=p)" + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[3],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")" ] }, { @@ -213,15 +228,13 @@ "from sklearn.neighbors import KNeighborsClassifier as KNN\n", "from sklearn.metrics import classification_report\n", "\n", - "dataset = load_datasets()['new_ecoli']\n", + "dataset = load_datasets()[\"new_ecoli\"]\n", "\n", "X, y = dataset.data, dataset.target\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)\n", - "pipeline = Pipeline([\n", - " ('scaler', StandardScaler()),\n", - " ('spider', SPIDER3(k=1)),\n", - " ('knn', KNN())\n", - "])\n", + "pipeline = Pipeline(\n", + " [(\"scaler\", StandardScaler()), (\"spider\", SPIDER3(k=1)), (\"knn\", KNN())]\n", + ")\n", "pipeline.fit(X_train, y_train)\n", "y_hat = pipeline.predict(X_test)\n", "print(classification_report(y_test, y_hat))" diff --git a/examples/use_case.ipynb b/examples/use_case.ipynb index 1d81824..5d95474 100644 --- a/examples/use_case.ipynb +++ b/examples/use_case.ipynb @@ -38,7 +38,7 @@ "\n", "%matplotlib inline\n", "\n", - "dataset_name = 'glass'\n", + "dataset_name = \"glass\"\n", "dir = os.getcwd()" ] }, @@ -86,7 +86,7 @@ "source": [ "from multi_imbalance.utils.data import load_arff_dataset\n", "\n", - "X, y = load_arff_dataset(f'{dir}/../data/arff/{dataset_name}.arff')\n", + "X, y = load_arff_dataset(f\"{dir}/../data/arff/{dataset_name}.arff\")\n", "\n", "print(X[:5])\n", "print(y[:5])" @@ -122,7 +122,8 @@ ], "source": [ "from multi_imbalance.utils.plot import plot_cardinality_and_2d_data\n", - "plot_cardinality_and_2d_data(X, y, 'Glass')" + "\n", + "plot_cardinality_and_2d_data(X, y, \"Glass\")" ] }, { @@ -151,7 +152,9 @@ "from sklearn.model_selection import 
train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, stratify=y, random_state=seed)\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, y, test_size=0.35, stratify=y, random_state=seed\n", + ")\n", "\n", "scaler = StandardScaler()\n", "scaler.fit(X_train)\n", @@ -218,7 +221,7 @@ "from multi_imbalance.utils.metrics import gmean_score\n", "\n", "gmean = gmean_score(y_test, y_pred, correction=0.001)\n", - "f'Decision Tree: {round(gmean,3)}'" + "f\"Decision Tree: {round(gmean,3)}\"" ] }, { @@ -246,11 +249,8 @@ "source": [ "from multi_imbalance.resampling.mdo import MDO\n", "\n", - "mdo = MDO(k1_frac=.3, maj_int_min={\n", - " 'maj': [0, 1],\n", - " 'min': [2, 3, 4, 5]\n", - " })\n", - "X_train_res, y_train_res = mdo.fit_resample(X_train, y_train)\n" + "mdo = MDO(k1_frac=0.3, maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]})\n", + "X_train_res, y_train_res = mdo.fit_resample(X_train, y_train)" ] }, { @@ -284,7 +284,10 @@ ], "source": [ "from multi_imbalance.utils.plot import plot_visual_comparision_datasets\n", - "plot_visual_comparision_datasets(X_train, y_train, X_train_res, y_train_res, 'Glass', 'Resampled Glass')" + "\n", + "plot_visual_comparision_datasets(\n", + " X_train, y_train, X_train_res, y_train_res, \"Glass\", \"Resampled Glass\"\n", + ")" ] }, { @@ -322,7 +325,7 @@ "y_pred = clf.predict(X_test)\n", "\n", "gmean = gmean_score(y_test, y_pred, correction=0.001)\n", - "f'Decision Tree + MDO: {round(gmean,3)}'\n" + "f\"Decision Tree + MDO: {round(gmean,3)}\"" ] }, { @@ -359,12 +362,11 @@ "source": [ "from multi_imbalance.resampling.soup import SOUP\n", "\n", - "soup = SOUP(maj_int_min={\n", - " 'maj': [0, 1],\n", - " 'min': [2, 3, 4, 5]\n", - " })\n", + "soup = SOUP(maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]})\n", "X_train_res, y_train_res = soup.fit_resample(X_train, y_train)\n", - "plot_visual_comparision_datasets(X_train, 
y_train, X_train_res, y_train_res, 'Glass', 'Resampled Glass')" + "plot_visual_comparision_datasets(\n", + " X_train, y_train, X_train_res, y_train_res, \"Glass\", \"Resampled Glass\"\n", + ")" ] }, { @@ -398,7 +400,7 @@ "y_pred = clf.predict(X_test)\n", "\n", "gmean = gmean_score(y_test, y_pred, correction=0.001)\n", - "f'Decision Tree + SOUP: {round(gmean,3)}'" + "f\"Decision Tree + SOUP: {round(gmean,3)}\"" ] }, { @@ -439,21 +441,17 @@ "source": [ "from multi_imbalance.ensemble.ovo import OVO\n", "\n", - "soup = SOUP(maj_int_min={\n", - " 'maj': [0, 1],\n", - " 'min': [2, 3, 4, 5]\n", - " })\n", + "soup = SOUP(maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]})\n", "\n", "clf = OVO(\n", - " binary_classifier=DecisionTreeClassifier(random_state=seed),\n", - " preprocessing=soup\n", + " binary_classifier=DecisionTreeClassifier(random_state=seed), preprocessing=soup\n", ")\n", "\n", "clf.fit(X_train, y_train)\n", "y_pred = clf.predict(X_test)\n", "\n", "gmean = gmean_score(y_test, y_pred, correction=0.001)\n", - "f'OVO with DecisionTree and SOUP: {round(gmean,3)}'" + "f\"OVO with DecisionTree and SOUP: {round(gmean,3)}\"" ] }, { diff --git a/multi_imbalance/__init__.py b/multi_imbalance/__init__.py index 853c793..a942444 100644 --- a/multi_imbalance/__init__.py +++ b/multi_imbalance/__init__.py @@ -3,4 +3,4 @@ from . 
import utils name = "multi_imbalance" -__all__ = ['ensemble', 'resampling', 'utils'] +__all__ = ["ensemble", "resampling", "utils"] diff --git a/multi_imbalance/datasets/__init__.py b/multi_imbalance/datasets/__init__.py index 0fcecfd..8aeb5ac 100644 --- a/multi_imbalance/datasets/__init__.py +++ b/multi_imbalance/datasets/__init__.py @@ -1,3 +1,3 @@ from ._data_loader import load_datasets -__all__ = ['load_datasets'] +__all__ = ["load_datasets"] diff --git a/multi_imbalance/datasets/_data_loader.py b/multi_imbalance/datasets/_data_loader.py index 9a63c7d..c4abe2e 100644 --- a/multi_imbalance/datasets/_data_loader.py +++ b/multi_imbalance/datasets/_data_loader.py @@ -26,14 +26,29 @@ from sklearn.datasets._base import Bunch -PRE_FILENAME = 'x' -POST_FILENAME = 'data.npz' +PRE_FILENAME = "x" +POST_FILENAME = "data.npz" DATA_HOME_BASIC = "./../../data/" -MAP_NAME_ID_KEYS = ['1czysty-cut', '2delikatne-cut', '3mocniej-cut', '4delikatne-bezover-cut', - 'balance-scale', 'cleveland', 'cleveland_v2', 'cmc', 'dermatology', - 'glass', 'hayes-roth', 'new_ecoli', 'new_led7digit', 'new_vehicle', - 'new_winequality-red', 'new_yeast', 'thyroid-newthyroid'] +MAP_NAME_ID_KEYS = [ + "1czysty-cut", + "2delikatne-cut", + "3mocniej-cut", + "4delikatne-bezover-cut", + "balance-scale", + "cleveland", + "cleveland_v2", + "cmc", + "dermatology", + "glass", + "hayes-roth", + "new_ecoli", + "new_led7digit", + "new_vehicle", + "new_winequality-red", + "new_yeast", + "thyroid-newthyroid", +] MAP_NAME_ID = OrderedDict() MAP_ID_NAME = OrderedDict() @@ -69,13 +84,13 @@ def load_datasets(data_home=DATA_HOME_BASIC): if not available: makedirs(extracted_dir, exist_ok=True) - with open(f'{data_home}data.tar.gz', 'rb') as fin: + with open(f"{data_home}data.tar.gz", "rb") as fin: f = BytesIO(fin.read()) tar = tarfile.open(fileobj=f) tar.extractall(path=extracted_dir) data = np.load(filename) - X, y = data['data'], data['label'] + X, y = data["data"], data["label"] datasets[it] = Bunch(data=X, target=y, 
DESCR=it) diff --git a/multi_imbalance/datasets/tests/test_data_loader.py b/multi_imbalance/datasets/tests/test_data_loader.py index 5f56a15..6c734ad 100644 --- a/multi_imbalance/datasets/tests/test_data_loader.py +++ b/multi_imbalance/datasets/tests/test_data_loader.py @@ -4,23 +4,23 @@ from multi_imbalance.datasets import load_datasets DATASET_SHAPE = { - '1czysty-cut': (1200, 2), - '2delikatne-cut': (1200, 2), - '3mocniej-cut': (1200, 2), - '4delikatne-bezover-cut': (1200, 2), - 'balance-scale': (625, 4), - 'cleveland': (303, 13), - 'cleveland_v2': (303, 13), - 'cmc': (1473, 9), - 'dermatology': (366, 34), - 'glass': (214, 9), - 'hayes-roth': (160, 4), - 'new_ecoli': (336, 7), - 'new_led7digit': (500, 7), - 'new_vehicle': (846, 18), - 'new_winequality-red': (1599, 11), - 'new_yeast': (1484, 8), - 'thyroid-newthyroid': (215, 5) + "1czysty-cut": (1200, 2), + "2delikatne-cut": (1200, 2), + "3mocniej-cut": (1200, 2), + "4delikatne-bezover-cut": (1200, 2), + "balance-scale": (625, 4), + "cleveland": (303, 13), + "cleveland_v2": (303, 13), + "cmc": (1473, 9), + "dermatology": (366, 34), + "glass": (214, 9), + "hayes-roth": (160, 4), + "new_ecoli": (336, 7), + "new_led7digit": (500, 7), + "new_vehicle": (846, 18), + "new_winequality-red": (1599, 11), + "new_yeast": (1484, 8), + "thyroid-newthyroid": (215, 5), } diff --git a/multi_imbalance/ensemble/ecoc.py b/multi_imbalance/ensemble/ecoc.py index 4b1161a..6b111cd 100644 --- a/multi_imbalance/ensemble/ecoc.py +++ b/multi_imbalance/ensemble/ecoc.py @@ -24,13 +24,19 @@ class ECOC(BaggingClassifier): which is closest to test instance in the sense of Hamming distance is chosen. 
""" - _allowed_encodings = ['dense', 'sparse', 'complete', 'OVA', 'OVO'] - _allowed_oversampling = [None, 'globalCS', 'SMOTE', 'SOUP'] - _allowed_classifiers = ['tree', 'NB', 'KNN'] - _allowed_weights = [None, 'acc', 'avg_tpr_min'] - - def __init__(self, binary_classifier='KNN', preprocessing='SOUP', encoding='OVO', n_neighbors=3, - weights=None): + _allowed_encodings = ["dense", "sparse", "complete", "OVA", "OVO"] + _allowed_oversampling = [None, "globalCS", "SMOTE", "SOUP"] + _allowed_classifiers = ["tree", "NB", "KNN"] + _allowed_weights = [None, "acc", "avg_tpr_min"] + + def __init__( + self, + binary_classifier="KNN", + preprocessing="SOUP", + encoding="OVO", + n_neighbors=3, + weights=None, + ): """ :param binary_classifier: binary classifier used by the algorithm. Possible classifiers: @@ -114,14 +120,17 @@ def fit(self, X, y, minority_classes=None): self.minority_classes = minority_classes if self.weights is not None: - X_train, X_for_weights, y_train, y_for_weights = train_test_split(X, y, test_size=0.2, stratify=y, - random_state=0) + X_train, X_for_weights, y_train, y_for_weights = train_test_split( + X, y, test_size=0.2, stratify=y, random_state=0 + ) else: X_train, y_train = X, y self._labels = np.unique(y) self._gen_code_matrix() - self._binary_classifiers = [self._get_classifier() for _ in range(self._code_matrix.shape[1])] + self._binary_classifiers = [ + self._get_classifier() for _ in range(self._code_matrix.shape[1]) + ] self._learn_binary_classifiers(X_train, y_train) if self.weights is not None: self._calc_weights(X_for_weights, y_for_weights) @@ -146,37 +155,57 @@ def predict(self, X): def _learn_binary_classifiers(self, X, y): for classifier_idx, classifier in enumerate(self._binary_classifiers): - excluded_classes_indices = [idx for idx in range(len(y)) if - self._code_matrix[self._labels.tolist().index(y[idx])][classifier_idx] == 0] + excluded_classes_indices = [ + idx + for idx in range(len(y)) + if 
self._code_matrix[self._labels.tolist().index(y[idx])][ + classifier_idx + ] + == 0 + ] X_filtered = np.delete(X, excluded_classes_indices, 0) y_filtered = np.delete(y, excluded_classes_indices) - binary_labels = np.array([self._code_matrix[self._labels.tolist().index(clazz)][classifier_idx] for clazz in - y_filtered]) + binary_labels = np.array( + [ + self._code_matrix[self._labels.tolist().index(clazz)][ + classifier_idx + ] + for clazz in y_filtered + ] + ) X_filtered, binary_labels = self._oversample(X_filtered, binary_labels) classifier.fit(X_filtered, binary_labels) def _gen_code_matrix(self): - if self.encoding == 'dense': + if self.encoding == "dense": self._code_matrix = self._encode_dense(self._labels.shape[0]) - elif self.encoding == 'sparse': + elif self.encoding == "sparse": self._code_matrix = self._encode_sparse(self._labels.shape[0]) - elif self.encoding == 'complete': + elif self.encoding == "complete": self._code_matrix = self._encode_complete(self._labels.shape[0]) - elif self.encoding == 'OVO': + elif self.encoding == "OVO": self._code_matrix = self._encode_ovo(self._labels.shape[0]) - elif self.encoding == 'OVA': + elif self.encoding == "OVA": self._code_matrix = self._encode_ova(self._labels.shape[0]) else: - raise ValueError("Unknown matrix generation encoding: %s, expected to be one of %s." - % (self.encoding, ECOC._allowed_encodings)) + raise ValueError( + "Unknown matrix generation encoding: %s, expected to be one of %s." 
+ % (self.encoding, ECOC._allowed_encodings) + ) - def _encode_dense(self, number_of_classes, random_state=0, number_of_code_generations=10000): + def _encode_dense( + self, number_of_classes, random_state=0, number_of_code_generations=10000 + ): try: dirname = os.path.dirname(__file__) - matrix = np.load(dirname + f'/cached_matrices/dense_{number_of_classes}.npy') + matrix = np.load( + dirname + f"/cached_matrices/dense_{number_of_classes}.npy" + ) return matrix except IOError: - print(f'Could not find cached matrix for dense code for {number_of_classes} classes, generating matrix...') + print( + f"Could not find cached matrix for dense code for {number_of_classes} classes, generating matrix..." + ) number_of_columns = int(np.ceil(10 * np.log2(number_of_classes))) code_matrix = np.ones((number_of_classes, number_of_columns)) @@ -185,7 +214,7 @@ def _encode_dense(self, number_of_classes, random_state=0, number_of_code_genera max_min_dist = 0 for i in range(number_of_code_generations): tmp_code_matrix = np.ones((number_of_classes, number_of_columns)) - min_dist = float('inf') + min_dist = float("inf") for row in range(0, number_of_classes): for col in range(0, number_of_columns): @@ -193,7 +222,9 @@ def _encode_dense(self, number_of_classes, random_state=0, number_of_code_genera tmp_code_matrix[row, col] = -1 for compared_row in range(0, row): - dist = self._hamming_distance(tmp_code_matrix[compared_row], tmp_code_matrix[row]) + dist = self._hamming_distance( + tmp_code_matrix[compared_row], tmp_code_matrix[row] + ) if dist < min_dist: min_dist = dist @@ -202,13 +233,19 @@ def _encode_dense(self, number_of_classes, random_state=0, number_of_code_genera code_matrix = tmp_code_matrix return code_matrix - def _encode_sparse(self, number_of_classes, random_state=0, number_of_code_generations=10000): + def _encode_sparse( + self, number_of_classes, random_state=0, number_of_code_generations=10000 + ): try: dirname = os.path.dirname(__file__) - matrix = np.load(dirname + 
f'/cached_matrices/sparse_{number_of_classes}.npy') + matrix = np.load( + dirname + f"/cached_matrices/sparse_{number_of_classes}.npy" + ) return matrix except IOError: - print(f'Could not find cached matrix for sparse code for {number_of_classes} classes, generating matrix...') + print( + f"Could not find cached matrix for sparse code for {number_of_classes} classes, generating matrix..." + ) number_of_columns = int(np.ceil(15 * np.log2(number_of_classes))) code_matrix = np.ones((number_of_classes, number_of_columns)) @@ -217,7 +254,7 @@ def _encode_sparse(self, number_of_classes, random_state=0, number_of_code_gener max_min_dist = 0 for i in range(number_of_code_generations): tmp_code_matrix = np.ones((number_of_classes, number_of_columns)) - min_dist = float('inf') + min_dist = float("inf") for row in range(0, number_of_classes): for col in range(0, number_of_columns): @@ -231,7 +268,9 @@ def _encode_sparse(self, number_of_classes, random_state=0, number_of_code_gener break for compared_row in range(0, row): - dist = self._hamming_distance(tmp_code_matrix[compared_row], tmp_code_matrix[row]) + dist = self._hamming_distance( + tmp_code_matrix[compared_row], tmp_code_matrix[row] + ) if dist < min_dist: min_dist = dist @@ -277,7 +316,7 @@ def _encode_complete(self, number_of_classes): digit = -1 partial_code_len = 2 ** (number_of_classes - row_idx - 1) for idx in range(0, code_length, partial_code_len): - matrix[row_idx][idx:idx + partial_code_len] = digit + matrix[row_idx][idx : idx + partial_code_len] = digit digit *= -1 return matrix @@ -291,11 +330,21 @@ def _get_closest_class(self, row): if self.weights is not None: return self._labels[ np.argmin( - [sum(np.multiply(self.dich_weights, (encoded_class - row) ** 2)) for encoded_class in - self._code_matrix])] + [ + sum(np.multiply(self.dich_weights, (encoded_class - row) ** 2)) + for encoded_class in self._code_matrix + ] + ) + ] else: return self._labels[ - np.argmin([self._hamming_distance(row, encoded_class) 
for encoded_class in self._code_matrix])] + np.argmin( + [ + self._hamming_distance(row, encoded_class) + for encoded_class in self._code_matrix + ] + ) + ] def _oversample(self, X, y): if self.preprocessing is None: @@ -303,20 +352,22 @@ def _oversample(self, X, y): if isinstance(self.preprocessing, str): if self.preprocessing not in ECOC._allowed_oversampling: - raise ValueError("Unknown preprocessing method: %s, expected to be one of %s." - % (self.preprocessing, ECOC._allowed_oversampling)) + raise ValueError( + "Unknown preprocessing method: %s, expected to be one of %s." + % (self.preprocessing, ECOC._allowed_oversampling) + ) elif np.unique(y).size == 1: return X, y - elif self.preprocessing == 'globalCS': + elif self.preprocessing == "globalCS": gcs = GlobalCS() return gcs.fit_resample(X, y) - elif self.preprocessing == 'SMOTE': + elif self.preprocessing == "SMOTE": return self._smote_oversample(X, y) - elif self.preprocessing == 'SOUP': + elif self.preprocessing == "SOUP": soup = SOUP() return soup.fit_resample(X, y) else: - if not hasattr(self.preprocessing, 'fit_transform'): + if not hasattr(self.preprocessing, "fit_transform"): raise ValueError("Your resampler must implement fit_transform method") return self.preprocessing.fit_transform(X, y) @@ -325,58 +376,86 @@ def _get_classifier(self): if self.binary_classifier not in ECOC._allowed_classifiers: raise ValueError( "Unknown binary classifier: %s, expected to be one of %s." 
- % (self.binary_classifier, ECOC._allowed_classifiers)) - elif self.binary_classifier == 'tree': + % (self.binary_classifier, ECOC._allowed_classifiers) + ) + elif self.binary_classifier == "tree": decision_tree_classifier = DecisionTreeClassifier(random_state=42) return decision_tree_classifier - elif self.binary_classifier == 'NB': + elif self.binary_classifier == "NB": gnb = GaussianNB() return gnb - elif self.binary_classifier == 'KNN': + elif self.binary_classifier == "KNN": knn = KNeighborsClassifier(n_neighbors=self.n_neighbors) return knn else: - if not hasattr(self.binary_classifier, 'fit') or not hasattr(self.binary_classifier, 'predict'): - raise ValueError("Your classifier must implement fit and predict methods") + if not hasattr(self.binary_classifier, "fit") or not hasattr( + self.binary_classifier, "predict" + ): + raise ValueError( + "Your classifier must implement fit and predict methods" + ) return deepcopy(self.binary_classifier) def _smote_oversample(self, X, y): n_neighbors = min(3, min(np.unique(y, return_counts=True)[1]) - 1) if n_neighbors == 0: raise ValueError( - 'In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class') + "In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class" + ) smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) def _calc_weights(self, X_for_weights, y_for_weights): if self.weights not in ECOC._allowed_weights: - raise ValueError("Unknown weighting strategy: %s, expected to be one of %s." - % (self.weights, ECOC._allowed_weights)) + raise ValueError( + "Unknown weighting strategy: %s, expected to be one of %s." 
+ % (self.weights, ECOC._allowed_weights) + ) dich_weights = np.ones(self._code_matrix.shape[1]) - if self.weights == 'acc': + if self.weights == "acc": for clf_idx, clf in enumerate(self._binary_classifiers): samples_no = 0 correct_no = 0 for sample, sample_label in zip(X_for_weights, y_for_weights): - if self._code_matrix[np.where(self._labels == sample_label)[0][0]][clf_idx] != 0: + if ( + self._code_matrix[np.where(self._labels == sample_label)[0][0]][ + clf_idx + ] + != 0 + ): samples_no += 1 - if clf.predict([sample])[0] == \ - self._code_matrix[np.where(self._labels == sample_label)[0][0]][clf_idx]: + if ( + clf.predict([sample])[0] + == self._code_matrix[ + np.where(self._labels == sample_label)[0][0] + ][clf_idx] + ): correct_no += 1 if samples_no != 0: acc = correct_no / samples_no dich_weights[clf_idx] = -1 + 2 * acc - elif self.weights == 'avg_tpr_min': - min_counter = Counter([y for y in y_for_weights if y in self.minority_classes]) + elif self.weights == "avg_tpr_min": + min_counter = Counter( + [y for y in y_for_weights if y in self.minority_classes] + ) for clf_idx, clf in enumerate(self._binary_classifiers): min_correct_pred = defaultdict(lambda: 0) for sample, sample_label in zip(X_for_weights, y_for_weights): - if clf.predict([sample])[0] == \ - self._code_matrix[np.where(self._labels == sample_label)[0][0]][clf_idx]: + if ( + clf.predict([sample])[0] + == self._code_matrix[ + np.where(self._labels == sample_label)[0][0] + ][clf_idx] + ): min_correct_pred[sample_label] += 1 - avg_tpr_min = np.mean([min_correct_pred[clazz] / min_counter[clazz] for clazz in min_counter.keys()]) + avg_tpr_min = np.mean( + [ + min_correct_pred[clazz] / min_counter[clazz] + for clazz in min_counter.keys() + ] + ) dich_weights[clf_idx] = avg_tpr_min self.dich_weights = dich_weights diff --git a/multi_imbalance/ensemble/mrbbagging.py b/multi_imbalance/ensemble/mrbbagging.py index 6197e20..6ac4bbc 100644 --- a/multi_imbalance/ensemble/mrbbagging.py +++ 
b/multi_imbalance/ensemble/mrbbagging.py @@ -21,8 +21,16 @@ class MRBBagging(BaggingClassifier): J. Intell Inf Syst (2018) 50: 97 """ - def __init__(self, k, learning_algorithm, undersampling=True, feature_selection=False, random_fs=False, - half_features=True, random_state=None): + def __init__( + self, + k, + learning_algorithm, + undersampling=True, + feature_selection=False, + random_fs=False, + half_features=True, + random_state=None, + ): """ :param k: number of classifiers (multiplied by 3 when choosing feature selection) @@ -114,7 +122,12 @@ def _resample(self, n, prob, classes, grouped_data): subset_x, subset_y = [], [] for no, j in enumerate(classes): data = grouped_data[j] - resample_class = resample(data, replace=True, n_samples=samples_no[no], random_state=self.random_state) + resample_class = resample( + data, + replace=True, + n_samples=samples_no[no], + random_state=self.random_state, + ) for sample in resample_class: subset_x.append(sample[0]) subset_y.append(sample[1]) @@ -137,7 +150,9 @@ def _find_random_features(self, labels_no, features_no, subset_x): def _get_features_array(self, subset_x, random_features_idx): random_features = np.array(subset_x[:, random_features_idx[0]]) for f in range(1, len(random_features_idx)): - random_features = np.vstack((random_features, subset_x[:, random_features_idx[f]])) + random_features = np.vstack( + (random_features, subset_x[:, random_features_idx[f]]) + ) if random_features.ndim == 1: return random_features[:, np.newaxis] return random_features.T @@ -160,14 +175,26 @@ def _train_with_feature_selection(self, la_list, n, prob, classes, grouped_data) subset_y = np.array(subset_y).astype(np.float) if self.all_random: - subset1, subset1_idx = self._find_random_features(labels_no, features_no, subset_x) - subset2, subset2_idx = self._find_random_features(labels_no, features_no, subset_x) - subset3, subset3_idx = self._find_random_features(labels_no, features_no, subset_x) + subset1, subset1_idx = 
self._find_random_features( + labels_no, features_no, subset_x + ) + subset2, subset2_idx = self._find_random_features( + labels_no, features_no, subset_x + ) + subset3, subset3_idx = self._find_random_features( + labels_no, features_no, subset_x + ) else: - subset1, subset1_idx = self._get_kbest_classifier(chi2, features_no, subset_x, subset_y) - subset2, subset2_idx = self._get_kbest_classifier(f_classif, features_no, subset_x, subset_y) - subset3, subset3_idx = self._find_random_features(labels_no, features_no, subset_x) + subset1, subset1_idx = self._get_kbest_classifier( + chi2, features_no, subset_x, subset_y + ) + subset2, subset2_idx = self._get_kbest_classifier( + f_classif, features_no, subset_x, subset_y + ) + subset3, subset3_idx = self._find_random_features( + labels_no, features_no, subset_x + ) self.feature_selection_methods[i] = subset1_idx self.feature_selection_methods[i + 1] = subset2_idx @@ -183,12 +210,18 @@ def _set_classes_dict(self, classes): def _select_data(self, classifier_id, data): if self.feature_selection: if self.all_random: - new_data = self._get_features_array(data, self.feature_selection_methods[classifier_id]) + new_data = self._get_features_array( + data, self.feature_selection_methods[classifier_id] + ) else: if (classifier_id % 3) - 2 == 0: - new_data = self._get_features_array(data, self.feature_selection_methods[classifier_id]) + new_data = self._get_features_array( + data, self.feature_selection_methods[classifier_id] + ) else: - new_data = self.feature_selection_methods[classifier_id].transform(data) + new_data = self.feature_selection_methods[classifier_id].transform( + data + ) return new_data return data @@ -199,7 +232,9 @@ def _count_votes(self, data): classes = self.classifiers[classifier_id].predict(new_data) probabilities = self.classifiers[classifier_id].predict_proba(new_data) for i, cl in enumerate(classes): - idx = list(self.classifier_classes.keys())[list(self.classifier_classes.values()).index(int(cl))] + idx 
= list(self.classifier_classes.keys())[ + list(self.classifier_classes.values()).index(int(cl)) + ] voting_matrix[i][idx] += max(probabilities[i]) return voting_matrix diff --git a/multi_imbalance/ensemble/ovo.py b/multi_imbalance/ensemble/ovo.py index 375ff26..c4ae5af 100644 --- a/multi_imbalance/ensemble/ovo.py +++ b/multi_imbalance/ensemble/ovo.py @@ -22,11 +22,17 @@ class OVO(BaggingClassifier): """ - _allowed_classifiers = ['tree', 'NB', 'KNN'] - _allowed_preprocessing = [None, 'globalCS', 'SMOTE', 'SOUP'] - _allowed_preprocessing_between = ['all', 'maj-min'] - - def __init__(self, binary_classifier='tree', n_neighbors=3, preprocessing='SOUP', preprocessing_between='all'): + _allowed_classifiers = ["tree", "NB", "KNN"] + _allowed_preprocessing = [None, "globalCS", "SMOTE", "SOUP"] + _allowed_preprocessing_between = ["all", "maj-min"] + + def __init__( + self, + binary_classifier="tree", + n_neighbors=3, + preprocessing="SOUP", + preprocessing_between="all", + ): """ :param binary_classifier: binary classifier. 
Possible classifiers: @@ -88,8 +94,9 @@ def fit(self, X, y, minority_classes=None): self._labels = np.unique(y) self._minority_classes = minority_classes num_of_classes = len(self._labels) - self._binary_classifiers = [[self._get_classifier() for _ in range(n)] for n in - range(0, num_of_classes)] + self._binary_classifiers = [ + [self._get_classifier() for _ in range(n)] for n in range(0, num_of_classes) + ] self._learn_binary_classifiers(X, y) return self @@ -103,7 +110,9 @@ def predict(self, X): num_of_classes = len(self._labels) predicted = list() for instance in X: - binary_outputs_matrix = self._construct_binary_outputs_matrix(instance, num_of_classes) + binary_outputs_matrix = self._construct_binary_outputs_matrix( + instance, num_of_classes + ) predicted.append(self._perform_max_voting(binary_outputs_matrix)) return np.array(predicted) @@ -112,15 +121,20 @@ def _construct_binary_outputs_matrix(self, instance, num_of_classes): binary_outputs_matrix = np.zeros((num_of_classes, num_of_classes)) for class_idx1 in range(len(self._labels)): for class_idx2 in range(class_idx1): - binary_outputs_matrix[class_idx1][class_idx2] = self._binary_classifiers[class_idx1][class_idx2] \ - .predict([instance]) + binary_outputs_matrix[class_idx1][ + class_idx2 + ] = self._binary_classifiers[class_idx1][class_idx2].predict([instance]) return binary_outputs_matrix def _learn_binary_classifiers(self, X, y): for row in range(len(self._labels)): for col in range(row): first_class, second_class = self._labels[row], self._labels[col] - filtered_indices = [idx for idx in range(len(y)) if y[idx] in (first_class, second_class)] + filtered_indices = [ + idx + for idx in range(len(y)) + if y[idx] in (first_class, second_class) + ] X_filtered, y_filtered = X[filtered_indices], y[filtered_indices] if self.should_perform_oversampling(first_class, second_class): X_filtered, y_filtered = self._oversample(X_filtered, y_filtered) @@ -131,26 +145,33 @@ def _get_classifier(self): if 
self.binary_classifier not in OVO._allowed_classifiers: raise ValueError( "Unknown binary classifier: %s, expected to be one of %s." - % (self.binary_classifier, OVO._allowed_classifiers)) - elif self.binary_classifier == 'tree': + % (self.binary_classifier, OVO._allowed_classifiers) + ) + elif self.binary_classifier == "tree": decision_tree_classifier = DecisionTreeClassifier(random_state=42) return decision_tree_classifier - elif self.binary_classifier == 'NB': + elif self.binary_classifier == "NB": gnb = GaussianNB() return gnb - elif self.binary_classifier == 'KNN': + elif self.binary_classifier == "KNN": knn = KNeighborsClassifier(n_neighbors=self.n_neighbors) return knn else: - if not hasattr(self.binary_classifier, 'fit') or not hasattr(self.binary_classifier, 'predict'): - raise ValueError("Your classifier must implement fit and predict methods") + if not hasattr(self.binary_classifier, "fit") or not hasattr( + self.binary_classifier, "predict" + ): + raise ValueError( + "Your classifier must implement fit and predict methods" + ) return deepcopy(self.binary_classifier) def _perform_max_voting(self, binary_outputs_matrix): scores = np.zeros(len(self._labels)) for clf_1 in range(len(binary_outputs_matrix)): for clf_2 in range(clf_1): - scores[self._labels.tolist().index(binary_outputs_matrix[clf_1][clf_2])] += 1 + scores[ + self._labels.tolist().index(binary_outputs_matrix[clf_1][clf_2]) + ] += 1 return self._labels[np.argmax(scores)] def _oversample(self, X, y): @@ -159,20 +180,22 @@ def _oversample(self, X, y): if isinstance(self.preprocessing, str): if self.preprocessing not in OVO._allowed_preprocessing: - raise ValueError("Unknown preprocessing: %s, expected to be one of %s." - % (self.preprocessing, OVO._allowed_preprocessing)) + raise ValueError( + "Unknown preprocessing: %s, expected to be one of %s." 
+ % (self.preprocessing, OVO._allowed_preprocessing) + ) elif np.unique(y).size == 1: return X, y - elif self.preprocessing == 'globalCS': + elif self.preprocessing == "globalCS": gcs = GlobalCS() return gcs.fit_resample(X, y) - elif self.preprocessing == 'SMOTE': + elif self.preprocessing == "SMOTE": return self._smote_oversample(X, y) - elif self.preprocessing == 'SOUP': + elif self.preprocessing == "SOUP": soup = SOUP() return soup.fit_resample(X, y) else: - if not hasattr(self.preprocessing, 'fit_resample'): + if not hasattr(self.preprocessing, "fit_resample"): raise ValueError("Your resampler must implement fit_resample method") return self.preprocessing.fit_resample(X, y) @@ -180,16 +203,24 @@ def _smote_oversample(self, X, y): n_neighbors = min(3, min(np.unique(y, return_counts=True)[1]) - 1) if n_neighbors == 0: raise ValueError( - 'In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class') + "In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class" + ) smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) def should_perform_oversampling(self, first_class, second_class): if self.oversample_between not in OVO._allowed_preprocessing_between: - raise ValueError("Unknown strategy for oversampling: %s, expected to be one of %s." - % (self.oversample_between, OVO._allowed_preprocessing_between)) - elif self.oversample_between == 'all': + raise ValueError( + "Unknown strategy for oversampling: %s, expected to be one of %s." 
+ % (self.oversample_between, OVO._allowed_preprocessing_between) + ) + elif self.oversample_between == "all": return True - elif self.oversample_between == 'maj-min': - return (first_class in self._minority_classes and second_class not in self._minority_classes) or \ - (second_class in self._minority_classes and first_class not in self._minority_classes) + elif self.oversample_between == "maj-min": + return ( + first_class in self._minority_classes + and second_class not in self._minority_classes + ) or ( + second_class in self._minority_classes + and first_class not in self._minority_classes + ) diff --git a/multi_imbalance/ensemble/soup_bagging.py b/multi_imbalance/ensemble/soup_bagging.py index 8073422..f2e8ed5 100644 --- a/multi_imbalance/ensemble/soup_bagging.py +++ b/multi_imbalance/ensemble/soup_bagging.py @@ -52,21 +52,30 @@ def fit_classifier(args): clf, X, y, resampled, maj_int_min = args x_sampled, y_sampled = resampled - out_of_bag = setdiff(np.hstack((X, y[:, np.newaxis])), np.hstack((x_sampled, y_sampled[:, np.newaxis]))) + out_of_bag = setdiff( + np.hstack((X, y[:, np.newaxis])), + np.hstack((x_sampled, y_sampled[:, np.newaxis])), + ) x_out, y_out = out_of_bag[:, :-1], out_of_bag[:, -1].astype(int) - x_resampled, y_resampled = SOUP(maj_int_min=maj_int_min).fit_resample(x_sampled, y_sampled) + x_resampled, y_resampled = SOUP(maj_int_min=maj_int_min).fit_resample( + x_sampled, y_sampled + ) clf.fit(x_resampled, y_resampled) result = clf.predict_proba(x_out) class_sum_prob = np.sum(result, axis=0) + 0.001 class_quantities = Counter(y_out) - expected_sum_prob = np.array([class_quantities[i] for i in range(len(Counter(y)))]) + expected_sum_prob = np.array( + [class_quantities[i] for i in range(len(Counter(y)))] + ) try: global_weights = expected_sum_prob / class_sum_prob except Exception: global_weights = np.ones(shape=len(Counter(y))) - print(f'Exc {Counter(y)} {Counter(y_out)} {result.shape} {expected_sum_prob.shape} {class_sum_prob.shape}') + print( + 
f"Exc {Counter(y)} {Counter(y_out)} {result.shape} {expected_sum_prob.shape} {class_sum_prob.shape}" + ) return clf, global_weights def fit(self, X, y, **kwargs): @@ -83,8 +92,19 @@ def fit(self, X, y, **kwargs): self.classes = np.unique(y) pool = multiprocessing.Pool(self.num_core) - results = pool.map(fit_clf, [(clf, X, y, resample(X, y, stratify=y, random_state=i), self.maj_int_min) - for i, clf in enumerate(self.classifiers)]) + results = pool.map( + fit_clf, + [ + ( + clf, + X, + y, + resample(X, y, stratify=y, random_state=i), + self.maj_int_min, + ) + for i, clf in enumerate(self.classifiers) + ], + ) pool.close() pool.join() for i, (clf, weights) in enumerate(results): @@ -93,7 +113,7 @@ def fit(self, X, y, **kwargs): self.clf_weights = np.array(self.clf_weights) - def predict(self, X, strategy: str = 'average'): + def predict(self, X, strategy: str = "average"): """ Predict class for X. The predicted class of an input sample is computed as the class with the highest sum of predicted probability. @@ -113,31 +133,35 @@ def predict(self, X, strategy: str = 'average'): array of shape = [n_samples]. The predicted classes. 
""" weights_sum = self.predict_proba(X) - if strategy == 'average': + if strategy == "average": p = np.sum(weights_sum, axis=0) - elif strategy == 'optimistic': + elif strategy == "optimistic": p = np.max(weights_sum, axis=0) - elif strategy == 'pessimistic': + elif strategy == "pessimistic": p = np.min(weights_sum, axis=0) - elif strategy == 'mixed': + elif strategy == "mixed": n_samples = X.shape[0] n_classes = self.classes.shape[0] p = np.zeros(shape=(n_samples, n_classes)) - 1 for i in range(n_classes): - two_dim_class_vector = weights_sum[:, :, i] # [:,:,1] -> [classifiers x samples] - if i in self.maj_int_min['min']: + two_dim_class_vector = weights_sum[ + :, :, i + ] # [:,:,1] -> [classifiers x samples] + if i in self.maj_int_min["min"]: squeeze_with_strategy = np.max(two_dim_class_vector, axis=0) else: - squeeze_with_strategy = np.min(two_dim_class_vector, axis=0) # [1, n_samples, 1] -> [n_samples] + squeeze_with_strategy = np.min( + two_dim_class_vector, axis=0 + ) # [1, n_samples, 1] -> [n_samples] p[:, i] = squeeze_with_strategy assert -1 not in p - elif strategy == 'global': + elif strategy == "global": for i, weight in enumerate(self.clf_weights): weights_sum[i] *= weight p = np.sum(weights_sum, axis=0) else: - raise KeyError(f'Incorrect strategy param: ${strategy}') + raise KeyError(f"Incorrect strategy param: ${strategy}") y_result = np.argmax(p, axis=1) return y_result diff --git a/multi_imbalance/ensemble/tests/test_ecoc.py b/multi_imbalance/ensemble/tests/test_ecoc.py index 7ffe21b..6acdadb 100644 --- a/multi_imbalance/ensemble/tests/test_ecoc.py +++ b/multi_imbalance/ensemble/tests/test_ecoc.py @@ -5,44 +5,79 @@ import multi_imbalance.ensemble.ecoc as ecoc -X = np.array([ - [1.8938566839198983, 0.7347724642028586, 1.5817290619305417], - [1.6893330472771877, 1.3729481360429043, 0.1779576959347715], - [1.1103882804642866, 0.2684931500114267, 0.24565717871603532], - [0.9635120154904986, 0.44338438370111577, 1.6559238383999697], - 
[0.6525827502237067, 0.8978087724631425, 1.5056794207545134], - [0.8232009732859464, 0.5270243940630088, 1.434695372722657], - [0.519304726338536, 0.4635228434262648, 0.014170648565480004], - [1.3938520002157688, 1.524670776643407, 0.9011189423913637], - [0.09993454781831534, 0.5991188594563008, 0.6462181194010983], - [1.5300019511124079, 0.08177359763506553, 1.7642527715349894], - [1.1770688242955876, 0.9604049547799067, 0.6989025594835503], - [1.5143651712498534, 1.4914673103908214, 1.3377704178955587], - [1.1299009013495136, 0.700540900007983, 1.071829181951729], - [1.530652133805449, 0.2992536048983532, 1.957731948975865], - [1.6236761570974148, 0.5919033806975751, 1.6334065904199757], - [0.9365056250644108, 1.526475631725099, 1.420298571686271], - [0.9063995770780813, 1.0248369545634513, 1.36911505163145], - [0.3861789635773656, 0.5758917834278445, 0.910187724154228], - [0.7165380621896438, 1.494299618627891, 0.521854931610239], - [1.3621939213912113, 0.387219837127391, 1.321376123618781], - [1.6764775993219296, 0.15364096535456317, 1.3219739817389], - [3.6764775993219296, 0.15364096535456317, 1.3789731273891], - [4.6764775993219296, 2.15364096535456317, 1.9830281123211], - [5.6764775993219296, 3.15364096535456317, 1.3213121322321], - [6.321312, 11.15364096535456317, 1.0998908320132], - [8.414132131, 2.15364096535456317, 5.0998908320132], - [10.6764775993219296, -2.15364096535456317, 3.0998908320132], - [4.6764775993219296, 0.15364096535456317, 2.0998908320132], - [-4.6764775993219296, -1.15364096535456317, 9.0998908320132], - [-6.6764775993219296, 11.15364096535456317, 5.0998908320132], -]) - -y = np.array([2, 0, 2, 3, 0, 3, 1, 0, 2, 0, 2, 3, 1, 2, 1, 3, 0, 3, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 1, 2]) +X = np.array( + [ + [1.8938566839198983, 0.7347724642028586, 1.5817290619305417], + [1.6893330472771877, 1.3729481360429043, 0.1779576959347715], + [1.1103882804642866, 0.2684931500114267, 0.24565717871603532], + [0.9635120154904986, 0.44338438370111577, 
1.6559238383999697], + [0.6525827502237067, 0.8978087724631425, 1.5056794207545134], + [0.8232009732859464, 0.5270243940630088, 1.434695372722657], + [0.519304726338536, 0.4635228434262648, 0.014170648565480004], + [1.3938520002157688, 1.524670776643407, 0.9011189423913637], + [0.09993454781831534, 0.5991188594563008, 0.6462181194010983], + [1.5300019511124079, 0.08177359763506553, 1.7642527715349894], + [1.1770688242955876, 0.9604049547799067, 0.6989025594835503], + [1.5143651712498534, 1.4914673103908214, 1.3377704178955587], + [1.1299009013495136, 0.700540900007983, 1.071829181951729], + [1.530652133805449, 0.2992536048983532, 1.957731948975865], + [1.6236761570974148, 0.5919033806975751, 1.6334065904199757], + [0.9365056250644108, 1.526475631725099, 1.420298571686271], + [0.9063995770780813, 1.0248369545634513, 1.36911505163145], + [0.3861789635773656, 0.5758917834278445, 0.910187724154228], + [0.7165380621896438, 1.494299618627891, 0.521854931610239], + [1.3621939213912113, 0.387219837127391, 1.321376123618781], + [1.6764775993219296, 0.15364096535456317, 1.3219739817389], + [3.6764775993219296, 0.15364096535456317, 1.3789731273891], + [4.6764775993219296, 2.15364096535456317, 1.9830281123211], + [5.6764775993219296, 3.15364096535456317, 1.3213121322321], + [6.321312, 11.15364096535456317, 1.0998908320132], + [8.414132131, 2.15364096535456317, 5.0998908320132], + [10.6764775993219296, -2.15364096535456317, 3.0998908320132], + [4.6764775993219296, 0.15364096535456317, 2.0998908320132], + [-4.6764775993219296, -1.15364096535456317, 9.0998908320132], + [-6.6764775993219296, 11.15364096535456317, 5.0998908320132], + ] +) + +y = np.array( + [ + 2, + 0, + 2, + 3, + 0, + 3, + 1, + 0, + 2, + 0, + 2, + 3, + 1, + 2, + 1, + 3, + 0, + 3, + 2, + 0, + 0, + 1, + 2, + 3, + 0, + 1, + 2, + 3, + 1, + 2, + ] +) def test_random_oversampling(): - ecoc_clf = ecoc.ECOC(preprocessing='globalCS') + ecoc_clf = ecoc.ECOC(preprocessing="globalCS") X_oversampled, y_oversampled = 
ecoc_clf._oversample(X, y) assert len(X_oversampled) == len(y_oversampled) @@ -58,8 +93,10 @@ def test_no_oversampling(): assert y.shape == y_oversampled.shape -@pytest.mark.parametrize("encoding_strategy", ['dense', 'sparse', 'OVO', 'OVA', 'complete']) -@pytest.mark.parametrize("oversampling", [None, 'globalCS', 'SMOTE', 'SOUP']) +@pytest.mark.parametrize( + "encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"] +) +@pytest.mark.parametrize("oversampling", [None, "globalCS", "SMOTE", "SOUP"]) def test_encoding(encoding_strategy, oversampling): ecoc_clf = ecoc.ECOC(encoding=encoding_strategy, preprocessing=oversampling) ecoc_clf.fit(X, y) @@ -72,7 +109,7 @@ def test_encoding(encoding_strategy, oversampling): assert bool((~matrix.any(axis=0)).any()) is False -@pytest.mark.parametrize("encoding_strategy", ['dense', 'sparse']) +@pytest.mark.parametrize("encoding_strategy", ["dense", "sparse"]) def test_dense_and_sparse_with_not_cached_matrices(encoding_strategy): X1 = np.concatenate((X, 2 * X, 3 * X, 4 * X, 5 * X), axis=0) y1 = np.concatenate((y + 4, y + 8, y + 12, y + 16, y + 20)) @@ -124,10 +161,10 @@ def fit_transform(self, X, y): def test_unknown_classifier(): - ecoc_clf = ecoc.ECOC(binary_classifier='DUMMY_CLASSIFIER', preprocessing=None) + ecoc_clf = ecoc.ECOC(binary_classifier="DUMMY_CLASSIFIER", preprocessing=None) with pytest.raises(ValueError) as e: ecoc_clf.fit(X, y) - assert 'DUMMY_CLASSIFIER' in str(e.value) + assert "DUMMY_CLASSIFIER" in str(e.value) def test_own_classifier_without_predict_and_fit(): @@ -142,24 +179,26 @@ def bar(self, X): ecoc_clf = ecoc.ECOC(binary_classifier=dummy_clf, preprocessing=None) with pytest.raises(ValueError) as e: ecoc_clf.fit(X, y) - assert 'predict' in str(e.value) - assert 'fit' in str(e.value) + assert "predict" in str(e.value) + assert "fit" in str(e.value) -@pytest.mark.parametrize("classifier", ['tree', 'NB', 'KNN']) -@pytest.mark.parametrize("weights", [None, 'acc', 'avg_tpr_min']) 
+@pytest.mark.parametrize("classifier", ["tree", "NB", "KNN"]) +@pytest.mark.parametrize("weights", [None, "acc", "avg_tpr_min"]) def test_predefined_classifiers_and_weighting_without_exceptions(classifier, weights): ecoc_clf = ecoc.ECOC(binary_classifier=classifier, weights=weights) ecoc_clf.fit(X, y) - predicted = ecoc_clf.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) + predicted = ecoc_clf.predict( + np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]) + ) assert len(predicted) == 3 def test_unknown_preprocessing(): - ecoc_clf = ecoc.ECOC(preprocessing='DUMMY_OVERSAMPLING') + ecoc_clf = ecoc.ECOC(preprocessing="DUMMY_OVERSAMPLING") with pytest.raises(ValueError) as e: ecoc_clf.fit(X, y) - assert 'DUMMY_OVERSAMPLING' in str(e.value) + assert "DUMMY_OVERSAMPLING" in str(e.value) def test_own_preprocessing_without_fit_transform(): @@ -174,16 +213,22 @@ def bar(self, X): ecoc_clf = ecoc.ECOC(preprocessing=dummy_oversampler) with pytest.raises(ValueError) as e: ecoc_clf.fit(X, y) - assert 'fit_transform' in str(e.value) + assert "fit_transform" in str(e.value) -@pytest.mark.parametrize("encoding_strategy", ['dense', 'sparse', 'OVO', 'OVA', 'complete']) -@pytest.mark.parametrize("oversampling", [None, 'globalCS', 'SMOTE', 'SOUP']) +@pytest.mark.parametrize( + "encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"] +) +@pytest.mark.parametrize("oversampling", [None, "globalCS", "SMOTE", "SOUP"]) def test_ecoc_with_sklearn_pipeline(encoding_strategy, oversampling): - pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('ecoc', ecoc.ECOC(encoding=encoding_strategy, preprocessing=oversampling)) - ]) + pipeline = Pipeline( + [ + ("scaler", StandardScaler()), + ("ecoc", ecoc.ECOC(encoding=encoding_strategy, preprocessing=oversampling)), + ] + ) pipeline.fit(X, y) - y_hat = pipeline.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) + y_hat = pipeline.predict( + np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], 
[7.7, 8.8, 9.9]]) + ) assert len(y_hat) == 3 diff --git a/multi_imbalance/ensemble/tests/test_mrbbagging.py b/multi_imbalance/ensemble/tests/test_mrbbagging.py index af15adf..c63c0dc 100644 --- a/multi_imbalance/ensemble/tests/test_mrbbagging.py +++ b/multi_imbalance/ensemble/tests/test_mrbbagging.py @@ -6,63 +6,88 @@ from multi_imbalance.ensemble.mrbbagging import MRBBagging import numpy as np -X_train = np.array([ - [0.05837771, 0.57543339], - [0.06153624, 0.99871925], - [0.14308529, 0.00681144], - [0.23401697, 0.21188708], - [0.2418553, 0.02137086], - [0.32480534, 0.81547632], - [0.42478482, 0.31995162], - [0.50726834, 0.72621157], - [0.54580968, 0.58025914], - [0.55748531, 0.71866238], - [0.69208769, 0.63759459], - [0.70797377, 0.16348051], - [0.76410615, 0.70451542], - [0.81680686, 0.50793884], - [0.8490789, 0.53826627], - [0.8847505, 0.96856011], -]) +X_train = np.array( + [ + [0.05837771, 0.57543339], + [0.06153624, 0.99871925], + [0.14308529, 0.00681144], + [0.23401697, 0.21188708], + [0.2418553, 0.02137086], + [0.32480534, 0.81547632], + [0.42478482, 0.31995162], + [0.50726834, 0.72621157], + [0.54580968, 0.58025914], + [0.55748531, 0.71866238], + [0.69208769, 0.63759459], + [0.70797377, 0.16348051], + [0.76410615, 0.70451542], + [0.81680686, 0.50793884], + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + ] +) y_train = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0]) -X_test = np.array([[0.9287003, 0.97580299], - [0.9584236, 0.10536541], - [0.01, 0.87666093], - [0.97352367, 0.78807909], ]) +X_test = np.array( + [ + [0.9287003, 0.97580299], + [0.9584236, 0.10536541], + [0.01, 0.87666093], + [0.97352367, 0.78807909], + ] +) y_test = np.array([0, 0, 0, 0]) class TestMRBBagging(unittest.TestCase): def test_api(self): - mrbbagging = MRBBagging(1, DecisionTreeClassifier(random_state=0), random_state=0) + mrbbagging = MRBBagging( + 1, DecisionTreeClassifier(random_state=0), random_state=0 + ) mrbbagging.fit(X_train, y_train) y_pred = 
mrbbagging.predict(X_test) assert all(y_pred == y_test) def test_api_multiple_trees(self): - mrbbagging = MRBBagging(5, DecisionTreeClassifier(random_state=0), random_state=0) + mrbbagging = MRBBagging( + 5, DecisionTreeClassifier(random_state=0), random_state=0 + ) mrbbagging.fit(X_train, y_train) y_pred = mrbbagging.predict(X_test) assert all(y_pred == y_test) def test_api_with_feature_selection(self): - mrbbagging = MRBBagging(1, DecisionTreeClassifier(random_state=0), feature_selection=True, random_state=0) + mrbbagging = MRBBagging( + 1, + DecisionTreeClassifier(random_state=0), + feature_selection=True, + random_state=0, + ) mrbbagging.fit(X_train, y_train) y_pred = mrbbagging.predict(X_test) assert all(y_pred == y_test) def test_api_with_random_feature_selection(self): - mrbbagging = MRBBagging(1, DecisionTreeClassifier(random_state=0), feature_selection=True, random_fs=True, - random_state=0) + mrbbagging = MRBBagging( + 1, + DecisionTreeClassifier(random_state=0), + feature_selection=True, + random_fs=True, + random_state=0, + ) mrbbagging.fit(X_train, y_train) y_pred = mrbbagging.predict(X_test) assert all(y_pred == y_test) def test_api_with_feature_selection_sqrt_features(self): - mrbbagging = MRBBagging(1, DecisionTreeClassifier(random_state=0), feature_selection=True, - half_features=False, random_state=0) + mrbbagging = MRBBagging( + 1, + DecisionTreeClassifier(random_state=0), + feature_selection=True, + half_features=False, + random_state=0, + ) mrbbagging.fit(X_train, y_train) y_pred = mrbbagging.predict(X_test) assert all(y_pred == y_test) @@ -72,8 +97,11 @@ def test__group_data(self): x = [[1, 1, 1], [2, 2, 2], [3, 3, 3]] y = ["A", "B", "C"] classes, grouped_data = mrbbagging._group_data(x, y) - self.assertEqual(classes, {'A', 'B', 'C'}) - self.assertEqual(grouped_data, {'C': [[[3, 3, 3], 'C']], 'A': [[[1, 1, 1], 'A']], 'B': [[[2, 2, 2], 'B']]}) + self.assertEqual(classes, {"A", "B", "C"}) + self.assertEqual( + grouped_data, + {"C": [[[3, 3, 3], 
"C"]], "A": [[[1, 1, 1], "A"]], "B": [[[2, 2, 2], "B"]]}, + ) def test__group_data_with_none(self): mrbbagging = MRBBagging(1, DecisionTreeClassifier()) @@ -98,5 +126,5 @@ def test_with_invalid_k(self): MRBBagging(0, DecisionTreeClassifier()) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/multi_imbalance/ensemble/tests/test_ovo.py b/multi_imbalance/ensemble/tests/test_ovo.py index e7b09b4..9d337ce 100644 --- a/multi_imbalance/ensemble/tests/test_ovo.py +++ b/multi_imbalance/ensemble/tests/test_ovo.py @@ -5,35 +5,39 @@ import multi_imbalance.ensemble.ovo as ovo import numpy as np -X = np.array([ - [-0.5813674466943386, -0.37091887120486655, -0.4465813355321204], - [-0.630844420005455, 0.2871060228285258, 0.25613448374582437], - [-0.6714353752038125, -0.3537255703996809, -0.9687281557330454], - [0.48996953214789785, -0.09697447345439447, 0.5495667841083927], - [-0.8821146485100975, -0.7739441502933209, 0.34906417794620515], - [-0.6652132165510964, -0.8488383805882527, -0.030511639438375093], - [0.7846621478367604, -0.9231479370667406, 0.7262231362586529], - [0.7860907554630845, -0.33615224298146584, 0.6928271619140047], - [0.7630774674537872, -0.7753044382704197, -0.7570971821030896], - [-0.5843764899573332, -0.524996569658353, 0.9675951634125524], - [-33.5843764899573332, -0.303030303030303, 5.9292929292929290], - [22.5843764899573332, -2.020202022020202, 6.3213211113213211], - [11.5843764899573332, -22.110101010011010, 7.3213123131232111], - [2.5843764899573332, -1.010123211231232, 1.9675951634125524], - [-0.5843764899573332, 1.321321312112321, 2.3213123123123222], - [-0.2321312313211321, -412.321321312112321, 6.1010101010100101], - [-0.3921809321038213, -2.321321312112321, 3.1010101010100101], - [1.5843764899573332, -0.4324234243242342, 32.1010101010100101], - [2.5843764899573332, -0.321321312112321, 53.1010101010100101], - [42.5843764899573332, 66.321321312112321, 3242.1010101010100101], - [1.5843764899573332, 
44.321321312112321, 2.999909909090909090], - [-2.5843764899573332, 12.321321312112321, 2112.342423], - [-5.5843764899573332, 3.321321312112321, 2212.1010101010100101], - [-8.5843764899573332, 1.321321312112321, 1222.1010101010100101], - [-2.3213123213211232, -2.321321312112321, 992.1010101010100101], -]) - -y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 1, 1, 2, 3, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1]) +X = np.array( + [ + [-0.5813674466943386, -0.37091887120486655, -0.4465813355321204], + [-0.630844420005455, 0.2871060228285258, 0.25613448374582437], + [-0.6714353752038125, -0.3537255703996809, -0.9687281557330454], + [0.48996953214789785, -0.09697447345439447, 0.5495667841083927], + [-0.8821146485100975, -0.7739441502933209, 0.34906417794620515], + [-0.6652132165510964, -0.8488383805882527, -0.030511639438375093], + [0.7846621478367604, -0.9231479370667406, 0.7262231362586529], + [0.7860907554630845, -0.33615224298146584, 0.6928271619140047], + [0.7630774674537872, -0.7753044382704197, -0.7570971821030896], + [-0.5843764899573332, -0.524996569658353, 0.9675951634125524], + [-33.5843764899573332, -0.303030303030303, 5.9292929292929290], + [22.5843764899573332, -2.020202022020202, 6.3213211113213211], + [11.5843764899573332, -22.110101010011010, 7.3213123131232111], + [2.5843764899573332, -1.010123211231232, 1.9675951634125524], + [-0.5843764899573332, 1.321321312112321, 2.3213123123123222], + [-0.2321312313211321, -412.321321312112321, 6.1010101010100101], + [-0.3921809321038213, -2.321321312112321, 3.1010101010100101], + [1.5843764899573332, -0.4324234243242342, 32.1010101010100101], + [2.5843764899573332, -0.321321312112321, 53.1010101010100101], + [42.5843764899573332, 66.321321312112321, 3242.1010101010100101], + [1.5843764899573332, 44.321321312112321, 2.999909909090909090], + [-2.5843764899573332, 12.321321312112321, 2112.342423], + [-5.5843764899573332, 3.321321312112321, 2212.1010101010100101], + [-8.5843764899573332, 1.321321312112321, 1222.1010101010100101], + 
[-2.3213123213211232, -2.321321312112321, 992.1010101010100101], + ] +) + +y = np.array( + [1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 1, 1, 2, 3, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1] +) def test_fit_predict(): @@ -43,7 +47,7 @@ def test_fit_predict(): assert predicted in (1, 2, 3) -@pytest.mark.parametrize("classifier", ['tree', 'NB', 'KNN']) +@pytest.mark.parametrize("classifier", ["tree", "NB", "KNN"]) def test_binary_classifiers(classifier): clf = ovo.OVO(binary_classifier=classifier, preprocessing=None) clf.fit(X[:-1], y[:-1]) @@ -57,11 +61,15 @@ def test_binary_classifiers(classifier): def test_max_voting(): labels = np.array([4, 3, 6, 5, 7]) - binary_outputs = np.array([[0, 0, 0, 0, 0], - [3, 0, 0, 0, 0], - [4, 3, 0, 0, 0], - [4, 5, 6, 0, 0], - [7, 7, 7, 5, 0]]) + binary_outputs = np.array( + [ + [0, 0, 0, 0, 0], + [3, 0, 0, 0, 0], + [4, 3, 0, 0, 0], + [4, 5, 6, 0, 0], + [7, 7, 7, 5, 0], + ] + ) clf = ovo.OVO() clf._labels = labels @@ -96,22 +104,29 @@ def fit_resample(self, X, y): assert len(y_oversampled) == 2 * len(y) -@pytest.mark.parametrize("preprocessing_btwn", ['all', 'maj-min']) -@pytest.mark.parametrize("classifier", ['tree', 'NB', 'KNN']) -@pytest.mark.parametrize("preprocessing", [None, 'globalCS', 'SMOTE', 'SOUP']) -def test_predefined_classifiers_and_preprocessings_without_errors(classifier, preprocessing, preprocessing_btwn): - ovo_clf = ovo.OVO(binary_classifier=classifier, preprocessing=preprocessing, - preprocessing_between=preprocessing_btwn) +@pytest.mark.parametrize("preprocessing_btwn", ["all", "maj-min"]) +@pytest.mark.parametrize("classifier", ["tree", "NB", "KNN"]) +@pytest.mark.parametrize("preprocessing", [None, "globalCS", "SMOTE", "SOUP"]) +def test_predefined_classifiers_and_preprocessings_without_errors( + classifier, preprocessing, preprocessing_btwn +): + ovo_clf = ovo.OVO( + binary_classifier=classifier, + preprocessing=preprocessing, + preprocessing_between=preprocessing_btwn, + ) ovo_clf.fit(X, y) - predicted = 
ovo_clf.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) + predicted = ovo_clf.predict( + np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]) + ) assert len(predicted) == 3 def test_unknown_preprocessing(): - ovo_clf = ovo.OVO(preprocessing='DUMMY_OVERSAMPLING') + ovo_clf = ovo.OVO(preprocessing="DUMMY_OVERSAMPLING") with pytest.raises(ValueError) as e: ovo_clf.fit(X, y) - assert 'DUMMY_OVERSAMPLING' in str(e.value) + assert "DUMMY_OVERSAMPLING" in str(e.value) def test_own_preprocessing_without_fit_resample(): @@ -126,25 +141,35 @@ def bar(self, X): ovo_clf = ovo.OVO(preprocessing=dummy_oversampler) with pytest.raises(ValueError) as e: ovo_clf.fit(X, y) - assert 'fit_resample' in str(e.value) + assert "fit_resample" in str(e.value) def test_unknown_preprocessing_between_strategy_raises_exception(): - ovo_clf = ovo.OVO(preprocessing_between='min-intermediate') + ovo_clf = ovo.OVO(preprocessing_between="min-intermediate") with pytest.raises(ValueError) as e: ovo_clf.fit(X, y) - assert 'min-intermediate' in str(e.value) + assert "min-intermediate" in str(e.value) -@pytest.mark.parametrize("preprocessing_btwn", ['all', 'maj-min']) -@pytest.mark.parametrize("classifier", ['tree', 'NB', 'KNN']) -@pytest.mark.parametrize("preprocessing", [None, 'globalCS', 'SMOTE', 'SOUP']) +@pytest.mark.parametrize("preprocessing_btwn", ["all", "maj-min"]) +@pytest.mark.parametrize("classifier", ["tree", "NB", "KNN"]) +@pytest.mark.parametrize("preprocessing", [None, "globalCS", "SMOTE", "SOUP"]) def test_ecoc_with_sklearn_pipeline(preprocessing_btwn, classifier, preprocessing): - pipeline = Pipeline([ - ('scaler', StandardScaler()), - ('ecoc', ovo.OVO(binary_classifier=classifier, preprocessing=preprocessing, - preprocessing_between=preprocessing_btwn)) - ]) + pipeline = Pipeline( + [ + ("scaler", StandardScaler()), + ( + "ecoc", + ovo.OVO( + binary_classifier=classifier, + preprocessing=preprocessing, + preprocessing_between=preprocessing_btwn, + ), + 
), + ] + ) pipeline.fit(X, y) - y_hat = pipeline.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.99]])) + y_hat = pipeline.predict( + np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.99]]) + ) assert len(y_hat) == 3 diff --git a/multi_imbalance/ensemble/tests/test_soupbagging.py b/multi_imbalance/ensemble/tests/test_soupbagging.py index d4ca00e..8360efc 100644 --- a/multi_imbalance/ensemble/tests/test_soupbagging.py +++ b/multi_imbalance/ensemble/tests/test_soupbagging.py @@ -7,62 +7,68 @@ from multi_imbalance.ensemble.soup_bagging import SOUPBagging -X_train = np.array([ - [0.05837771, 0.57543339], - [0.06153624, 0.99871925], - [0.14308529, 0.00681144], - [0.23401697, 0.21188708], - [0.2418553, 0.02137086], - [0.32480534, 0.81547632], - [0.42478482, 0.31995162], - [0.50726834, 0.72621157], - [0.54580968, 0.58025914], - [0.55748531, 0.71866238], - [0.69208769, 0.63759459], - [0.70797377, 0.16348051], - [0.76410615, 0.70451542], - [0.81680686, 0.50793884], - [0.8490789, 0.53826627], - [0.8847505, 0.96856011], -]) +X_train = np.array( + [ + [0.05837771, 0.57543339], + [0.06153624, 0.99871925], + [0.14308529, 0.00681144], + [0.23401697, 0.21188708], + [0.2418553, 0.02137086], + [0.32480534, 0.81547632], + [0.42478482, 0.31995162], + [0.50726834, 0.72621157], + [0.54580968, 0.58025914], + [0.55748531, 0.71866238], + [0.69208769, 0.63759459], + [0.70797377, 0.16348051], + [0.76410615, 0.70451542], + [0.81680686, 0.50793884], + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + ] +) y_train = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0]) -X_test = np.array([[0.9287003, 0.97580299], - [0.9584236, 0.10536541], - [0.01, 0.87666093], - [0.97352367, 0.78807909], ]) +X_test = np.array( + [ + [0.9287003, 0.97580299], + [0.9584236, 0.10536541], + [0.01, 0.87666093], + [0.97352367, 0.78807909], + ] +) y_test = np.array([0, 1, 0, 0]) def test_soubagging(): clf = KNeighborsClassifier() - maj_int_min = {'maj': [0], 'int': [], 'min': [1]} + 
maj_int_min = {"maj": [0], "int": [], "min": [1]} clf = SOUPBagging(clf, n_classifiers=2, maj_int_min=maj_int_min) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) assert all(y_pred == y_test) - y_pred = clf.predict(X_test, strategy='mixed') + y_pred = clf.predict(X_test, strategy="mixed") assert all(y_pred == y_test) - y_pred = clf.predict(X_test, strategy='optimistic') + y_pred = clf.predict(X_test, strategy="optimistic") assert all(y_pred == y_test) - y_pred = clf.predict(X_test, strategy='pessimistic') + y_pred = clf.predict(X_test, strategy="pessimistic") assert all(y_pred == y_test) - y_pred = clf.predict(X_test, strategy='global') + y_pred = clf.predict(X_test, strategy="global") assert all(y_pred == y_test) def test_exception(): clf = KNeighborsClassifier() - maj_int_min = {'maj': [0], 'int': [], 'min': [1]} + maj_int_min = {"maj": [0], "int": [], "min": [1]} clf = SOUPBagging(clf, n_classifiers=2, maj_int_min=maj_int_min) clf.fit(X_train, y_train) with pytest.raises(KeyError): - clf.predict(X_test, 'incorrect') + clf.predict(X_test, "incorrect") def test_default_classifier(): - maj_int_min = {'maj': [0], 'int': [], 'min': [1]} + maj_int_min = {"maj": [0], "int": [], "min": [1]} clf = SOUPBagging(n_classifiers=2, maj_int_min=maj_int_min) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) @@ -70,14 +76,19 @@ def test_default_classifier(): def test_fit_classifier_classifier(): - maj_int_min = {'maj': [0], 'int': [], 'min': [1]} + maj_int_min = {"maj": [0], "int": [], "min": [1]} clf = KNeighborsClassifier() - clf, weights = SOUPBagging.fit_classifier([clf, X_train, y_train, - resample(X_train, y_train, stratify=y_train, random_state=0), - maj_int_min]) + clf, weights = SOUPBagging.fit_classifier( + [ + clf, + X_train, + y_train, + resample(X_train, y_train, stratify=y_train, random_state=0), + maj_int_min, + ] + ) y_pred = clf.predict(X_test) check_is_fitted(clf) assert all(y_pred == y_test) assert_array_almost_equal(weights, np.array([1.33288904, 
0.66644452])) - diff --git a/multi_imbalance/resampling/global_cs.py b/multi_imbalance/resampling/global_cs.py index 8991e0b..b9b9b8f 100644 --- a/multi_imbalance/resampling/global_cs.py +++ b/multi_imbalance/resampling/global_cs.py @@ -13,7 +13,7 @@ class GlobalCS(BaseSampler): def __init__(self, shuffle: bool = True): super().__init__() - self._sampling_type = 'over-sampling' + self._sampling_type = "over-sampling" self.shuffle = shuffle self.quantities, self.max_quantity, self.X, self.y = [None] * 4 @@ -26,8 +26,10 @@ def _fit_resample(self, X, y): :return: Resampled X (max class quantity * number of unique classes), y (number of rows in X) as numpy array """ - assert len(X.shape) == 2, 'X should have 2 dimension' - assert X.shape[0] == y.shape[0], 'Number of labels must be equal to number of samples' + assert len(X.shape) == 2, "X should have 2 dimension" + assert ( + X.shape[0] == y.shape[0] + ), "Number of labels must be equal to number of samples" self.quantities = Counter(y) self.max_quantity = int(np.max(list(self.quantities.values()))) @@ -48,10 +50,14 @@ def _fit_resample(self, X, y): return np.array(result_X), np.array(result_y) def _equal_oversample(self, X, y, class_name): - indices_in_class = [i for i, class_label in enumerate(y) if class_label == class_name] + indices_in_class = [ + i for i, class_label in enumerate(y) if class_label == class_name + ] desired_quantity = self.max_quantity - len(indices_in_class) - oversampled_X, oversampled_y = list(X[indices_in_class]), list(y[indices_in_class]) + oversampled_X, oversampled_y = list(X[indices_in_class]), list( + y[indices_in_class] + ) for i in range(desired_quantity): sample_index_to_duplicate: int = i % self.quantities[class_name] diff --git a/multi_imbalance/resampling/mdo.py b/multi_imbalance/resampling/mdo.py index 941fa07..30288bf 100644 --- a/multi_imbalance/resampling/mdo.py +++ b/multi_imbalance/resampling/mdo.py @@ -17,7 +17,7 @@ class MDO(BaseSampler): """ - def __init__(self, k=5, 
k1_frac=.4, seed=0, prop=1, maj_int_min=None): + def __init__(self, k=5, k1_frac=0.4, seed=0, prop=1, maj_int_min=None): """ :param k: Number of neighbours considered during the neighbourhood analysis @@ -32,7 +32,7 @@ def __init__(self, k=5, k1_frac=.4, seed=0, prop=1, maj_int_min=None): dict {'maj': majority class labels, 'min': minority class labels} """ super().__init__() - self._sampling_type = 'over-sampling' + self._sampling_type = "over-sampling" self.knn = NearestNeighbors(n_neighbors=k) self.k2 = k self.k1 = int(k * k1_frac) @@ -60,48 +60,80 @@ def _fit_resample(self, X, y): quantities = Counter(self.y) goal_quantity = int(max(list(quantities.values()))) labels = list(set(self.y)) - minority_classes = self.class_balances['min'] + minority_classes = self.class_balances["min"] for class_label in labels: if minority_classes is not None and class_label not in minority_classes: continue - chosen_minor_class_samples_to_oversample, weights = self._choose_samples(class_label) + chosen_minor_class_samples_to_oversample, weights = self._choose_samples( + class_label + ) if len(chosen_minor_class_samples_to_oversample) == 0: continue - oversampling_rate = int((goal_quantity - quantities[class_label]) * self.prop) + oversampling_rate = int( + (goal_quantity - quantities[class_label]) * self.prop + ) if oversampling_rate > 0: if len(chosen_minor_class_samples_to_oversample) == 1: - oversampled_set = np.repeat(chosen_minor_class_samples_to_oversample, oversampling_rate, axis=0) + oversampled_set = np.repeat( + chosen_minor_class_samples_to_oversample, + oversampling_rate, + axis=0, + ) else: - chosen_samples_features_mean = np.mean(chosen_minor_class_samples_to_oversample, axis=0) - zero_mean_samples = chosen_minor_class_samples_to_oversample - chosen_samples_features_mean + chosen_samples_features_mean = np.mean( + chosen_minor_class_samples_to_oversample, axis=0 + ) + zero_mean_samples = ( + chosen_minor_class_samples_to_oversample + - chosen_samples_features_mean + 
) n_components = min(zero_mean_samples.shape) pca = PCA(n_components=n_components).fit(zero_mean_samples) uncorrelated_samples = pca.transform(zero_mean_samples) - variables_variance = np.diag(np.cov(uncorrelated_samples, rowvar=False)) - - oversampled_set = self._MDO_oversampling(uncorrelated_samples, variables_variance, - oversampling_rate, - weights) - oversampled_set = pca.inverse_transform(oversampled_set) + chosen_samples_features_mean + variables_variance = np.diag( + np.cov(uncorrelated_samples, rowvar=False) + ) + + oversampled_set = self._MDO_oversampling( + uncorrelated_samples, + variables_variance, + oversampling_rate, + weights, + ) + oversampled_set = ( + pca.inverse_transform(oversampled_set) + + chosen_samples_features_mean + ) oversampled_X = np.vstack((oversampled_X, oversampled_set)) - oversampled_y = np.hstack((oversampled_y, np.array([class_label] * oversampling_rate))) + oversampled_y = np.hstack( + (oversampled_y, np.array([class_label] * oversampling_rate)) + ) return oversampled_X, oversampled_y def _choose_samples(self, class_label): - minor_class_indices = [i for i, value in enumerate(self.y) if value == class_label] + minor_class_indices = [ + i for i, value in enumerate(self.y) if value == class_label + ] minor_set = self.X[minor_class_indices] - quantity_same_class_neighbours = self.calculate_same_class_neighbour_quantities(minor_set, class_label) - chosen_minor_class_samples_to_oversample = minor_set[quantity_same_class_neighbours >= self.k1] - - weights = quantity_same_class_neighbours[quantity_same_class_neighbours >= self.k1] / self.k2 + quantity_same_class_neighbours = self.calculate_same_class_neighbour_quantities( + minor_set, class_label + ) + chosen_minor_class_samples_to_oversample = minor_set[ + quantity_same_class_neighbours >= self.k1 + ] + + weights = ( + quantity_same_class_neighbours[quantity_same_class_neighbours >= self.k1] + / self.k2 + ) weights_sum = np.sum(weights) if weights_sum != 0: @@ -127,12 +159,14 @@ def 
_MDO_oversampling(self, T, v, oversampling_rate, weights): for alpha_V_j in alpha_V[:-1]: sqrt_avj = np.sqrt(alpha_V_j) r = self.random_state.uniform(low=-sqrt_avj, high=sqrt_avj) - s += r ** 2 / alpha_V_j + s += r**2 / alpha_V_j features_vector.append(r) last = (1 - s) * alpha_V[-1] last_feature = np.sqrt(last) if last > 0 else 0 - random_last_feature = self.random_state.choice([-last_feature, last_feature], 1)[0] + random_last_feature = self.random_state.choice( + [-last_feature, last_feature], 1 + )[0] features_vector.append(random_last_feature) oversampled_set.append(features_vector) @@ -140,10 +174,16 @@ def _MDO_oversampling(self, T, v, oversampling_rate, weights): return np.array(oversampled_set) def calculate_same_class_neighbour_quantities(self, S_minor, S_minor_label): - minority_class_neighbours_indices = self.knn.kneighbors(S_minor, return_distance=False) + minority_class_neighbours_indices = self.knn.kneighbors( + S_minor, return_distance=False + ) quantity_with_same_label_in_neighbourhood = list() for i in range(len(S_minor)): sample_neighbours_indices = minority_class_neighbours_indices[i][1:] - quantity_sample_neighbours_indices_with_same_label = sum(self.y[sample_neighbours_indices] == S_minor_label) - quantity_with_same_label_in_neighbourhood.append(quantity_sample_neighbours_indices_with_same_label) + quantity_sample_neighbours_indices_with_same_label = sum( + self.y[sample_neighbours_indices] == S_minor_label + ) + quantity_with_same_label_in_neighbourhood.append( + quantity_sample_neighbours_indices_with_same_label + ) return np.array(quantity_with_same_label_in_neighbourhood) diff --git a/multi_imbalance/resampling/soup.py b/multi_imbalance/resampling/soup.py index a69e7aa..4141744 100644 --- a/multi_imbalance/resampling/soup.py +++ b/multi_imbalance/resampling/soup.py @@ -28,7 +28,7 @@ def __init__(self, k: int = 7, shuffle=False, maj_int_min=None) -> None: dict {'maj': majority class labels, 'min': minority class labels} """ 
super().__init__() - self._sampling_type = 'clean-sampling' + self._sampling_type = "clean-sampling" self.k = k self.shuffle = shuffle self.maj_int_min = maj_int_min @@ -54,15 +54,23 @@ def _fit_resample(self, X, y): self._X = deepcopy(X) self._y = deepcopy(y) - assert len(self._X.shape) == 2, 'X should have 2 dimension' - assert self._X.shape[0] == self._y.shape[0], 'Number of labels must be equal to number of samples' + assert len(self._X.shape) == 2, "X should have 2 dimension" + assert ( + self._X.shape[0] == self._y.shape[0] + ), "Number of labels must be equal to number of samples" self.quantities = Counter(self._y) self.goal_quantity = self._calculate_goal_quantity(self.maj_int_min) - self.dsc_maj_cls = sorted(((v, i) for v, i in self.quantities.items() if i >= self.goal_quantity), - key=itemgetter(1), reverse=True) - self.asc_min_cls = sorted(((v, i) for v, i in self.quantities.items() if i < self.goal_quantity), - key=itemgetter(1), reverse=False) + self.dsc_maj_cls = sorted( + ((v, i) for v, i in self.quantities.items() if i >= self.goal_quantity), + key=itemgetter(1), + reverse=True, + ) + self.asc_min_cls = sorted( + ((v, i) for v, i in self.quantities.items() if i < self.goal_quantity), + key=itemgetter(1), + reverse=False, + ) for class_name, class_quantity in self.dsc_maj_cls: self._X, self._y = self._undersample(self._X, self._y, class_name) @@ -80,13 +88,17 @@ def _construct_class_safe_levels(self, X, y, class_name) -> defaultdict: indices_in_class = [i for i, value in enumerate(y) if value == class_name] neigh_clf = NearestNeighbors(n_neighbors=self.k + 1).fit(X) - neighbour_indices = neigh_clf.kneighbors(X[indices_in_class], return_distance=False)[:, 1:] + neighbour_indices = neigh_clf.kneighbors( + X[indices_in_class], return_distance=False + )[:, 1:] neighbour_classes = y[neighbour_indices] class_safe_levels = defaultdict(float) for i, sample_id in enumerate(indices_in_class): neighbours_quantities = Counter(neighbour_classes[i]) - 
class_safe_levels[sample_id] = self._calculate_sample_safe_level(class_name, neighbours_quantities) + class_safe_levels[sample_id] = self._calculate_sample_safe_level( + class_name, neighbours_quantities + ) return class_safe_levels @@ -95,38 +107,54 @@ def _calculate_sample_safe_level(self, class_name, neighbours_quantities: Counte q: Counter = self.quantities for neigh_label, neigh_q in neighbours_quantities.items(): - similarity_between_classes = min(q[class_name], q[neigh_label]) / max(q[class_name], q[neigh_label]) + similarity_between_classes = min(q[class_name], q[neigh_label]) / max( + q[class_name], q[neigh_label] + ) safe_level += neigh_q * similarity_between_classes safe_level /= self.k if safe_level > 1: - raise ValueError(f'Safe level is bigger than 1: {safe_level}') + raise ValueError(f"Safe level is bigger than 1: {safe_level}") return safe_level def _undersample(self, X, y, class_name): - safe_levels_of_samples_in_class = self._construct_class_safe_levels(X, y, class_name) + safe_levels_of_samples_in_class = self._construct_class_safe_levels( + X, y, class_name + ) class_quantity = self.quantities[class_name] - safe_levels_list = sorted(safe_levels_of_samples_in_class.items(), key=itemgetter(1)) + safe_levels_list = sorted( + safe_levels_of_samples_in_class.items(), key=itemgetter(1) + ) samples_to_remove_quantity = max(0, int(class_quantity - self.goal_quantity)) if samples_to_remove_quantity > 0: - remove_indices = list(map(itemgetter(0), safe_levels_list[:samples_to_remove_quantity])) + remove_indices = list( + map(itemgetter(0), safe_levels_list[:samples_to_remove_quantity]) + ) X = np.delete(X, remove_indices, axis=0) y = np.delete(y, remove_indices, axis=0) return X, y def _oversample(self, X, y, class_name): - safe_levels_of_samples_in_class = self._construct_class_safe_levels(X, y, class_name) + safe_levels_of_samples_in_class = self._construct_class_safe_levels( + X, y, class_name + ) class_quantity = self.quantities[class_name] - 
safe_levels_list = list(sorted(safe_levels_of_samples_in_class.items(), key=itemgetter(1), reverse=True)) + safe_levels_list = list( + sorted( + safe_levels_of_samples_in_class.items(), key=itemgetter(1), reverse=True + ) + ) difference = self.goal_quantity - class_quantity while difference > 0: quantity_items_to_copy = min(difference, class_quantity) - indices_to_copy = list(map(itemgetter(0), safe_levels_list[:quantity_items_to_copy])) + indices_to_copy = list( + map(itemgetter(0), safe_levels_list[:quantity_items_to_copy]) + ) X = np.vstack((X, X[indices_to_copy])) y = np.hstack((y, y[indices_to_copy])) difference -= quantity_items_to_copy @@ -139,9 +167,13 @@ def _calculate_goal_quantity(self, maj_int_min=None): min_q = min(list(self.quantities.values())) return np.mean((min_q, maj_q), dtype=int) else: - maj_classes = {k: v for k, v in self.quantities.items() if k in maj_int_min['maj']} + maj_classes = { + k: v for k, v in self.quantities.items() if k in maj_int_min["maj"] + } maj_q = list(maj_classes.values()) - min_classes = {k: v for k, v in self.quantities.items() if k in maj_int_min['min']} + min_classes = { + k: v for k, v in self.quantities.items() if k in maj_int_min["min"] + } min_q = list(min_classes.values()) if len(maj_q) == 0: diff --git a/multi_imbalance/resampling/spider.py b/multi_imbalance/resampling/spider.py index df403d4..91f2075 100644 --- a/multi_imbalance/resampling/spider.py +++ b/multi_imbalance/resampling/spider.py @@ -4,7 +4,7 @@ from imblearn.base import BaseSampler from sklearn.neighbors import NearestNeighbors -from multi_imbalance.utils.array_util import (union, setdiff, contains) +from multi_imbalance.utils.array_util import union, setdiff, contains from multi_imbalance.utils.data import construct_maj_int_min @@ -30,7 +30,7 @@ def __init__(self, k, maj_int_min=None, cost=None): """ super().__init__() - self._sampling_type = 'clean-sampling' + self._sampling_type = "clean-sampling" self.k = k self.neigh_clf = 
NearestNeighbors(n_neighbors=self.k) self.maj_int_min = maj_int_min @@ -69,9 +69,9 @@ def _fit_resample(self, X, y): def _initialize_algorithm(self, X, y): if self.maj_int_min is None: self.maj_int_min = construct_maj_int_min(y) - self.majority_classes = self.maj_int_min['maj'] - self.intermediate_classes = self.maj_int_min['int'] - self.minority_classes = self.maj_int_min['min'] + self.majority_classes = self.maj_int_min["maj"] + self.intermediate_classes = self.maj_int_min["int"] + self.minority_classes = self.maj_int_min["min"] self.stds, self.means = [1] * X.shape[1], [0] * X.shape[1] if self.cost is None: @@ -105,8 +105,12 @@ def _estimate_cost_matrix(y): def _sort_by_cardinality(self, y): class_cardinality = Counter(y) # to ensure looping over classes with decreasing cardinality. - int_classes = sorted(self.intermediate_classes, key=lambda clazz: -class_cardinality[clazz]) - min_classes = sorted(self.minority_classes, key=lambda clazz: -class_cardinality[clazz]) + int_classes = sorted( + self.intermediate_classes, key=lambda clazz: -class_cardinality[clazz] + ) + min_classes = sorted( + self.minority_classes, key=lambda clazz: -class_cardinality[clazz] + ) return int_classes, min_classes def amplify(self, int_min_class): @@ -209,7 +213,9 @@ def _min_cost_classes(self, x, DS): for cj in C: s = 0 for ci in C: - s += ((kneighbors[:, -1] == ci).astype(int).sum() / self.k) * self.cost[C.index(ci), C.index(cj)] + s += ((kneighbors[:, -1] == ci).astype(int).sum() / self.k) * self.cost[ + C.index(ci), C.index(cj) + ] vals.append(s) C = np.array(C) vals = np.array(vals) @@ -225,8 +231,12 @@ def _relabel_nn(self, x): """ nearest_neighbors = self._knn(x, self._ds_as_rs_union()) for neighbor in nearest_neighbors: - if contains(self.RS, neighbor) and self._class_of(neighbor) in self.majority_classes and self._class_of( - neighbor) in self._min_cost_classes(x, self._ds_as_rs_union()): + if ( + contains(self.RS, neighbor) + and self._class_of(neighbor) in 
self.majority_classes + and self._class_of(neighbor) + in self._min_cost_classes(x, self._ds_as_rs_union()) + ): self.RS = setdiff(self.RS, np.array([neighbor])) neighbor[-1] = x[-1] self.AS = union(self.AS, np.array([neighbor])) @@ -240,8 +250,9 @@ def _clean_nn(self, x): """ nearest_neighbors = self._knn(x, self._ds_as_rs_union()) for neighbor in nearest_neighbors: - if self._class_of(neighbor) in self.majority_classes and \ - self._class_of(neighbor) in self._min_cost_classes(x, self._ds_as_rs_union()): + if self._class_of(neighbor) in self.majority_classes and self._class_of( + neighbor + ) in self._min_cost_classes(x, self._ds_as_rs_union()): self.DS = setdiff(self.DS, np.array([neighbor])) self.RS = setdiff(self.RS, np.array([neighbor])) @@ -267,9 +278,13 @@ def _knn(self, x, DS): self.neigh_clf.fit(DS[:, :-1]) - within_radius = self.neigh_clf.radius_neighbors([x[:-1]], radius= - self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1] + 0.0001 * - self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1], return_distance=True) + within_radius = self.neigh_clf.radius_neighbors( + [x[:-1]], + radius=self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1] + + 0.0001 + * self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1], + return_distance=True, + ) unique_distances = np.unique(sorted(within_radius[0][0])) all_distances = within_radius[0][0] @@ -289,7 +304,9 @@ def _amplify_nn(self, x): Single observation. 
""" - while self._class_of(x) not in self._min_cost_classes(x, self._ds_as_rs_union()): + while self._class_of(x) not in self._min_cost_classes( + x, self._ds_as_rs_union() + ): y = x.copy() self.AS = union(self.AS, np.asarray([y])) diff --git a/multi_imbalance/resampling/static_smote.py b/multi_imbalance/resampling/static_smote.py index a4b209f..6f05f78 100644 --- a/multi_imbalance/resampling/static_smote.py +++ b/multi_imbalance/resampling/static_smote.py @@ -14,9 +14,10 @@ class StaticSMOTE(BaseSampler): procedure based on sensitivity for multi-class problems. Pattern Recognit. 44, 1821–1833 (2011) """ + def __init__(self): super().__init__() - self._sampling_type = 'over-sampling' + self._sampling_type = "over-sampling" def _fit_resample(self, X, y): """ @@ -38,9 +39,11 @@ def _fit_resample(self, X, y): for _ in range(M): sm = SMOTE(sampling_strategy={min_class: cnt[min_class] * 2}) X_smote, y_smote = sm.fit_resample(X_original, y_original) - X_added_examples = X_smote[y_smote == min_class][cnt[min_class]:, :] + X_added_examples = X_smote[y_smote == min_class][cnt[min_class] :, :] X_resampled = np.vstack([X_resampled, X_added_examples]) - y_resampled = np.hstack([y_resampled, y_smote[y_smote == min_class][cnt[min_class]:]]) + y_resampled = np.hstack( + [y_resampled, y_smote[y_smote == min_class][cnt[min_class] :]] + ) cnt = Counter(y_resampled) min_class = min(cnt, key=cnt.get) diff --git a/multi_imbalance/resampling/tests/test_globalcs.py b/multi_imbalance/resampling/tests/test_globalcs.py index 6115f85..3502eef 100644 --- a/multi_imbalance/resampling/tests/test_globalcs.py +++ b/multi_imbalance/resampling/tests/test_globalcs.py @@ -5,28 +5,30 @@ from multi_imbalance.resampling.global_cs import GlobalCS -X = np.array([ - [0.05837771, 0.57543339], - [0.06153624, 0.99871925], - [0.14308529, 0.00681144], - [0.23401697, 0.21188708], - [0.2418553, 0.02137086], - [0.32480534, 0.81547632], - [0.42478482, 0.31995162], - [0.50726834, 0.72621157], - [0.54580968, 
0.58025914], - [0.55748531, 0.71866238], - [0.69208769, 0.63759459], - [0.70797377, 0.16348051], - [0.76410615, 0.70451542], - [0.81680686, 0.50793884], - [0.8490789, 0.53826627], - [0.8847505, 0.96856011], - [0.9287003, 0.97580299], - [0.9584236, 0.10536541], - [0.96983103, 0.87666093], - [0.97352367, 0.78807909], -]) +X = np.array( + [ + [0.05837771, 0.57543339], + [0.06153624, 0.99871925], + [0.14308529, 0.00681144], + [0.23401697, 0.21188708], + [0.2418553, 0.02137086], + [0.32480534, 0.81547632], + [0.42478482, 0.31995162], + [0.50726834, 0.72621157], + [0.54580968, 0.58025914], + [0.55748531, 0.71866238], + [0.69208769, 0.63759459], + [0.70797377, 0.16348051], + [0.76410615, 0.70451542], + [0.81680686, 0.50793884], + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + [0.9287003, 0.97580299], + [0.9584236, 0.10536541], + [0.96983103, 0.87666093], + [0.97352367, 0.78807909], + ] +) y_balanced = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) y_imb_easy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) diff --git a/multi_imbalance/resampling/tests/test_mdo.py b/multi_imbalance/resampling/tests/test_mdo.py index 349ad86..0e88012 100644 --- a/multi_imbalance/resampling/tests/test_mdo.py +++ b/multi_imbalance/resampling/tests/test_mdo.py @@ -6,17 +6,33 @@ from multi_imbalance.resampling.mdo import MDO -X = np.array([ - [0.05837771, 0.57543339], - [0.06153624, 0.99871925], - [0.14308529, 0.00681144], - [0.23401697, 0.21188708], - [0.2418553, 0.02137086], - [0.32480534, 0.81547632], - [0.42478482, 0.31995162], - [0.50726834, 0.72621157], - [0.54580968, 0.58025914], - [0.55748531, 0.71866238], +X = np.array( + [ + [0.05837771, 0.57543339], + [0.06153624, 0.99871925], + [0.14308529, 0.00681144], + [0.23401697, 0.21188708], + [0.2418553, 0.02137086], + [0.32480534, 0.81547632], + [0.42478482, 0.31995162], + [0.50726834, 0.72621157], + [0.54580968, 0.58025914], + [0.55748531, 0.71866238], + [0.69208769, 0.63759459], + 
[0.70797377, 0.16348051], + [0.76410615, 0.70451542], + [0.81680686, 0.50793884], + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + [0.9287003, 0.97580299], + [0.9584236, 0.10536541], + [0.96983103, 0.87666093], + [0.97352367, 0.78807909], + ] +) + +y_balanced = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) +y_balanced_SC_minor = [ [0.69208769, 0.63759459], [0.70797377, 0.16348051], [0.76410615, 0.70451542], @@ -27,26 +43,29 @@ [0.9584236, 0.10536541], [0.96983103, 0.87666093], [0.97352367, 0.78807909], -]) +] -y_balanced = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) -y_balanced_SC_minor = [[0.69208769, 0.63759459], - [0.70797377, 0.16348051], - [0.76410615, 0.70451542], - [0.81680686, 0.50793884], - [0.8490789, 0.53826627], - [0.8847505, 0.96856011], - [0.9287003, 0.97580299], - [0.9584236, 0.10536541], - [0.96983103, 0.87666093], - [0.97352367, 0.78807909]] - -y_balanced_weights = [0.058824, 0.088235, 0.088235, 0.088235, 0.117647, 0.117647, 0.117647, 0.088235, 0.117647, - 0.117647] +y_balanced_weights = [ + 0.058824, + 0.088235, + 0.088235, + 0.088235, + 0.117647, + 0.117647, + 0.117647, + 0.088235, + 0.117647, + 0.117647, +] y_imb_easy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) -y_imb_easy_SC_minor = [[0.8490789, 0.53826627], [0.8847505, 0.96856011], [0.9287003, 0.97580299], - [0.96983103, 0.87666093], [0.97352367, 0.78807909]] +y_imb_easy_SC_minor = [ + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + [0.9287003, 0.97580299], + [0.96983103, 0.87666093], + [0.97352367, 0.78807909], +] y_imb_easy_weights = [0.142857, 0.214286, 0.214286, 0.214286, 0.214286] y_imb_hard = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]) @@ -63,7 +82,7 @@ @pytest.fixture() def mdo_mock(): def _get_parametrized_mdo(X, y): - clf = MDO(k1_frac=.5) + clf = MDO(k1_frac=0.5) clf.knn.fit(X) clf.X, clf.y = X, y return clf @@ -71,7 +90,9 @@ def _get_parametrized_mdo(X, y): return 
_get_parametrized_mdo -@pytest.mark.parametrize("X, y, sc_minor_expected, weights_expected", complete_test_data) +@pytest.mark.parametrize( + "X, y, sc_minor_expected, weights_expected", complete_test_data +) def test_choose_samples(X, y, sc_minor_expected, weights_expected, mdo_mock): clf = mdo_mock(X, y) SC_minor, weights = clf._choose_samples(1) @@ -116,8 +137,8 @@ def test_zero_variance(mdo_mock): def test_mdo_api(mdo_mock): clf = mdo_mock(X, y_imb_hard) - maj_int_min = {'maj': [0], 'int': [], 'min': [1]} + maj_int_min = {"maj": [0], "int": [], "min": [1]} clf.k1 = 0 clf.class_balances = maj_int_min X_r, y_r = clf.fit_resample(X, y_imb_hard) - assert X_r.shape == (28,2) \ No newline at end of file + assert X_r.shape == (28, 2) diff --git a/multi_imbalance/resampling/tests/test_soup.py b/multi_imbalance/resampling/tests/test_soup.py index 587ad62..16a3585 100644 --- a/multi_imbalance/resampling/tests/test_soup.py +++ b/multi_imbalance/resampling/tests/test_soup.py @@ -6,66 +6,141 @@ from multi_imbalance.resampling.soup import SOUP -X = np.array([ - [0.05837771, 0.57543339], - [0.06153624, 0.99871925], - [0.14308529, 0.00681144], - [0.23401697, 0.21188708], - [0.2418553, 0.02137086], - [0.32480534, 0.81547632], - [0.42478482, 0.31995162], - [0.50726834, 0.72621157], - [0.54580968, 0.58025914], - [0.55748531, 0.71866238], - [0.69208769, 0.63759459], - [0.70797377, 0.16348051], - [0.76410615, 0.70451542], - [0.81680686, 0.50793884], - [0.8490789, 0.53826627], - [0.8847505, 0.96856011], - [0.9287003, 0.97580299], - [0.9584236, 0.10536541], - [0.96983103, 0.87666093], - [0.97352367, 0.78807909], -]) +X = np.array( + [ + [0.05837771, 0.57543339], + [0.06153624, 0.99871925], + [0.14308529, 0.00681144], + [0.23401697, 0.21188708], + [0.2418553, 0.02137086], + [0.32480534, 0.81547632], + [0.42478482, 0.31995162], + [0.50726834, 0.72621157], + [0.54580968, 0.58025914], + [0.55748531, 0.71866238], + [0.69208769, 0.63759459], + [0.70797377, 0.16348051], + [0.76410615, 
0.70451542], + [0.81680686, 0.50793884], + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + [0.9287003, 0.97580299], + [0.9584236, 0.10536541], + [0.96983103, 0.87666093], + [0.97352367, 0.78807909], + ] +) y_balanced = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) y_balanced_first_sample_safe_level = 0.8 -y_balanced_0_class_safe_levels = defaultdict(float, - {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, - 9: 1.0}) -y_balanced_1_class_safe_levels = defaultdict(float, - {10: 1.0, 11: 1.0, 12: 1.0, 13: 1.0, 14: 1.0, 15: 1.0, 16: 1.0, 17: 1.0, - 18: 1.0, 19: 1.0}) +y_balanced_0_class_safe_levels = defaultdict( + float, + {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, 8: 1.0, 9: 1.0}, +) +y_balanced_1_class_safe_levels = defaultdict( + float, + { + 10: 1.0, + 11: 1.0, + 12: 1.0, + 13: 1.0, + 14: 1.0, + 15: 1.0, + 16: 1.0, + 17: 1.0, + 18: 1.0, + 19: 1.0, + }, +) y_imb_easy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) y_imb_easy_first_sample_safe_level = 0.685714 -y_imb_easy_0_class_safe_levels = defaultdict(float, {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0, - 8: 0.8857142857142858, 9: 1.0, 10: 0.7714285714285714, - 11: 0.7714285714285714, 12: 0.6571428571428571, - 17: 0.7714285714285714}) -y_imb_easy_1_class_safe_levels = defaultdict(float, - {13: 0.6571428571428571, 14: 0.6571428571428571, 15: 0.7714285714285714, - 16: 0.7714285714285714, 18: 0.8857142857142858, 19: 0.8857142857142858}) +y_imb_easy_0_class_safe_levels = defaultdict( + float, + { + 0: 1.0, + 1: 1.0, + 2: 1.0, + 3: 1.0, + 4: 1.0, + 5: 1.0, + 6: 1.0, + 7: 1.0, + 8: 0.8857142857142858, + 9: 1.0, + 10: 0.7714285714285714, + 11: 0.7714285714285714, + 12: 0.6571428571428571, + 17: 0.7714285714285714, + }, +) +y_imb_easy_1_class_safe_levels = defaultdict( + float, + { + 13: 0.6571428571428571, + 14: 0.6571428571428571, + 15: 0.7714285714285714, + 16: 0.7714285714285714, + 18: 
0.8857142857142858, + 19: 0.8857142857142858, + }, +) y_imb_hard = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]) y_imb_hard_first_sample_safe_level = 0.685714 -y_imb_hard_quantities_0_class_safe_levels = defaultdict(float, {0: 0.8857142857142858, 1: 0.7714285714285714, - 2: 0.8857142857142858, 3: 0.8857142857142858, - 4: 0.8857142857142858, 5: 0.7714285714285714, - 7: 0.7714285714285714, 10: 0.6571428571428571, - 11: 0.6571428571428571, 12: 0.7714285714285714, - 13: 0.7714285714285714, 15: 0.7714285714285714, - 17: 0.7714285714285714, 19: 0.6571428571428571}) -y_imb_hard_quantities_1_class_safe_levels = defaultdict(float, {6: 0.5428571428571429, 8: 0.5428571428571429, - 9: 0.5428571428571429, 14: 0.5428571428571429, - 16: 0.5428571428571429, 18: 0.6571428571428571}) +y_imb_hard_quantities_0_class_safe_levels = defaultdict( + float, + { + 0: 0.8857142857142858, + 1: 0.7714285714285714, + 2: 0.8857142857142858, + 3: 0.8857142857142858, + 4: 0.8857142857142858, + 5: 0.7714285714285714, + 7: 0.7714285714285714, + 10: 0.6571428571428571, + 11: 0.6571428571428571, + 12: 0.7714285714285714, + 13: 0.7714285714285714, + 15: 0.7714285714285714, + 17: 0.7714285714285714, + 19: 0.6571428571428571, + }, +) +y_imb_hard_quantities_1_class_safe_levels = defaultdict( + float, + { + 6: 0.5428571428571429, + 8: 0.5428571428571429, + 9: 0.5428571428571429, + 14: 0.5428571428571429, + 16: 0.5428571428571429, + 18: 0.6571428571428571, + }, +) complete_test_data = [ - (X, y_balanced, y_balanced_0_class_safe_levels, y_balanced_1_class_safe_levels, y_balanced_first_sample_safe_level), - (X, y_imb_easy, y_imb_easy_0_class_safe_levels, y_imb_easy_1_class_safe_levels, y_imb_easy_first_sample_safe_level), - (X, y_imb_hard, y_imb_hard_quantities_0_class_safe_levels, y_imb_hard_quantities_1_class_safe_levels, - y_imb_hard_first_sample_safe_level), + ( + X, + y_balanced, + y_balanced_0_class_safe_levels, + y_balanced_1_class_safe_levels, + 
y_balanced_first_sample_safe_level, + ), + ( + X, + y_imb_easy, + y_imb_easy_0_class_safe_levels, + y_imb_easy_1_class_safe_levels, + y_imb_easy_first_sample_safe_level, + ), + ( + X, + y_imb_hard, + y_imb_hard_quantities_0_class_safe_levels, + y_imb_hard_quantities_1_class_safe_levels, + y_imb_hard_first_sample_safe_level, + ), ] safe_levels_test_data = [ @@ -90,8 +165,12 @@ def _get_parametrized_soup(X, y): return _get_parametrized_soup -@pytest.mark.parametrize("X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data) -def test_calculating_safe_levels_for_sample(X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock): +@pytest.mark.parametrize( + "X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data +) +def test_calculating_safe_levels_for_sample( + X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock +): clf = soup_mock(X, y) neighbour_quantities = Counter({0: 3, 1: 1}) @@ -99,9 +178,12 @@ def test_calculating_safe_levels_for_sample(X, y, zero_safe_levels, one_safe_lev assert_array_almost_equal(safe_level, first_sample_safe) -@pytest.mark.parametrize("X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data) -def test_calculating_safe_levels_for_class(X, y, zero_safe_levels, one_safe_levels, first_sample_safe, - soup_mock): +@pytest.mark.parametrize( + "X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data +) +def test_calculating_safe_levels_for_class( + X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock +): clf = soup_mock(X, y) zero_levels = clf._construct_class_safe_levels(X, y, 0) @@ -111,16 +193,26 @@ def test_calculating_safe_levels_for_class(X, y, zero_safe_levels, one_safe_leve one_levels == one_safe_levels -@pytest.mark.parametrize("X, y, class_name, expected_undersampling, expected_oversampling", safe_levels_test_data) -def test_oversample(X, y, class_name, expected_undersampling, 
expected_oversampling, soup_mock): +@pytest.mark.parametrize( + "X, y, class_name, expected_undersampling, expected_oversampling", + safe_levels_test_data, +) +def test_oversample( + X, y, class_name, expected_undersampling, expected_oversampling, soup_mock +): clf = soup_mock(X, y) oversampled_X, oversampled_y = clf._oversample(X, y, class_name) assert len(oversampled_X) == expected_oversampling assert len(oversampled_y) == expected_oversampling -@pytest.mark.parametrize("X, y, class_name, expected_undersampling, expected_oversampling", safe_levels_test_data) -def test_undersample(X, y, class_name, expected_undersampling, expected_oversampling, soup_mock): +@pytest.mark.parametrize( + "X, y, class_name, expected_undersampling, expected_oversampling", + safe_levels_test_data, +) +def test_undersample( + X, y, class_name, expected_undersampling, expected_oversampling, soup_mock +): clf = soup_mock(X, y) undersampled_X, undersampled_y = clf._undersample(X, y, class_name) assert len(undersampled_X) == expected_undersampling diff --git a/multi_imbalance/resampling/tests/test_spider.py b/multi_imbalance/resampling/tests/test_spider.py index 15484ce..5791f58 100644 --- a/multi_imbalance/resampling/tests/test_spider.py +++ b/multi_imbalance/resampling/tests/test_spider.py @@ -3,11 +3,13 @@ import numpy as np from multi_imbalance.resampling.spider import SPIDER3 -from multi_imbalance.utils.array_util import (union, intersect, setdiff) +from multi_imbalance.utils.array_util import union, intersect, setdiff cost = np.ones((3, 3)) np.fill_diagonal(cost, 0) -spider = SPIDER3(1, maj_int_min={'maj': ["MAJ"], 'int': ["INT"], 'min': ["MIN"]}, cost=cost) +spider = SPIDER3( + 1, maj_int_min={"maj": ["MAJ"], "int": ["INT"], "min": ["MIN"]}, cost=cost +) def test_union(): @@ -47,13 +49,7 @@ def test_setdiff(): def test_knn(): - X = np.array([ - [1, 1], - [1, -1], - [-1, 1], - [-1, -1], - [0, 0] - ]).astype(object) + X = np.array([[1, 1], [1, -1], [-1, 1], [-1, -1], [0, 
0]]).astype(object) y = np.array(["MIN", "MIN", "MAJ", "MAJ", "MAJ"]) @@ -63,13 +59,7 @@ def test_knn(): def test_min_cost_classes(): - X = np.array([ - [1, 1], - [1, -1], - [-1, 1], - [-1, -1], - [0, 0] - ]).astype(object) + X = np.array([[1, 1], [1, -1], [-1, 1], [-1, -1], [0, 0]]).astype(object) y = np.array(["MIN", "MIN", "MAJ", "MAJ", "MAJ"]) @@ -79,7 +69,7 @@ def test_min_cost_classes(): assert (spider._min_cost_classes(DS[0], DS) == ["MAJ"]).all() assert (spider._min_cost_classes(DS[4], DS) == ["MIN", "MAJ"]).all() - + def test_estimate_cost_matrix(): y = [0, 1, 1, 2, 2, 2, 2, 2, 2] cost = SPIDER3._estimate_cost_matrix(y).ravel().tolist() @@ -88,9 +78,13 @@ def test_estimate_cost_matrix(): def test_fit_resample(): np.random.seed(7) - X = np.vstack([np.random.normal(0, 1, (100, 2)), - np.random.normal(3, 5, (30, 2)), - np.random.normal(-2, 2, (20, 2))]) + X = np.vstack( + [ + np.random.normal(0, 1, (100, 2)), + np.random.normal(3, 5, (30, 2)), + np.random.normal(-2, 2, (20, 2)), + ] + ) y = np.array([1] * 100 + [2] * 30 + [3] * 20) sp = SPIDER3(5) @@ -98,4 +92,4 @@ def test_fit_resample(): cnt = Counter(y_resampled) assert cnt[1] == 72 assert cnt[2] == 57 - assert cnt[3] == 30 \ No newline at end of file + assert cnt[3] == 30 diff --git a/multi_imbalance/resampling/tests/test_static_smote.py b/multi_imbalance/resampling/tests/test_static_smote.py index 687673e..e3a7adb 100644 --- a/multi_imbalance/resampling/tests/test_static_smote.py +++ b/multi_imbalance/resampling/tests/test_static_smote.py @@ -6,9 +6,13 @@ def test_static_smote(): - X = np.vstack([np.random.normal(0, 1, (100, 2)), - np.random.normal(3, 5, (30, 2)), - np.random.normal(-2, 2, (20, 2))]) + X = np.vstack( + [ + np.random.normal(0, 1, (100, 2)), + np.random.normal(3, 5, (30, 2)), + np.random.normal(-2, 2, (20, 2)), + ] + ) y = np.array([1] * 100 + [2] * 30 + [3] * 20) ssm = StaticSMOTE() diff --git a/multi_imbalance/utils/data.py b/multi_imbalance/utils/data.py index bbb53e4..522e479 100644 
--- a/multi_imbalance/utils/data.py +++ b/multi_imbalance/utils/data.py @@ -21,8 +21,8 @@ def construct_flat_2pc_df(X, y) -> pd.DataFrame: :return: Data frame with 3 columns x1 x2 and y and with number of rows equal to number of rows in X """ - y = pd.DataFrame({'y': y}) - X_df = pd.DataFrame(data=X, columns=['x1', 'x2']) + y = pd.DataFrame({"y": y}) + X_df = pd.DataFrame(data=X, columns=["x1", "x2"]) df = pd.concat([X_df, y], axis=1) @@ -34,7 +34,9 @@ def get_project_root() -> Path: # pragma no cover return Path(__file__).parent.parent.parent -def load_arff_dataset(path: str, one_hot_encode: bool = True, return_non_cat_length: bool = False): +def load_arff_dataset( + path: str, one_hot_encode: bool = True, return_non_cat_length: bool = False +): """ Load and return the dataset saved in arff type file @@ -63,7 +65,7 @@ def load_arff_dataset(path: str, one_hot_encode: bool = True, return_non_cat_len categorical_cols = df.columns[categorical_feature_mask].tolist() non_categorical_cols = df.columns[~categorical_feature_mask].tolist() - df[categorical_cols] = df[categorical_cols].replace({b'?': np.NaN}) + df[categorical_cols] = df[categorical_cols].replace({b"?": np.NaN}) mode = df.mode().iloc[0] mean = df.filter(non_categorical_cols).mean() @@ -84,15 +86,19 @@ def load_arff_dataset(path: str, one_hot_encode: bool = True, return_non_cat_len def load_datasets_arff(return_non_cat_length=False, dataset_paths=None): if dataset_paths is None: - dataset_paths = glob.glob(f'{get_project_root()}/data/arff/*') + dataset_paths = glob.glob(f"{get_project_root()}/data/arff/*") datasets = OrderedDict() for path in sorted(dataset_paths): - dataset_file = path.split('/')[-1] - dataset_name = dataset_file.split('.')[0] + dataset_file = path.split("/")[-1] + dataset_name = dataset_file.split(".")[0] if return_non_cat_length: - X, y, cat_length = load_arff_dataset(path, return_non_cat_length=return_non_cat_length) - datasets[dataset_name] = Bunch(data=X, target=y, 
non_cat_length=cat_length, DESCR=dataset_name) + X, y, cat_length = load_arff_dataset( + path, return_non_cat_length=return_non_cat_length + ) + datasets[dataset_name] = Bunch( + data=X, target=y, non_cat_length=cat_length, DESCR=dataset_name + ) else: X, y = load_arff_dataset(path, return_non_cat_length=return_non_cat_length) datasets[dataset_name] = Bunch(data=X, target=y, DESCR=dataset_name) @@ -100,7 +106,7 @@ def load_datasets_arff(return_non_cat_length=False, dataset_paths=None): return datasets -def construct_maj_int_min(y: np.ndarray, strategy='median') -> OrderedDict: +def construct_maj_int_min(y: np.ndarray, strategy="median") -> OrderedDict: """ This function creates dictionary with information which classes are minority or majority @@ -120,25 +126,23 @@ def construct_maj_int_min(y: np.ndarray, strategy='median') -> OrderedDict: """ class_sizes = Counter(y) - if strategy == 'median': + if strategy == "median": middle_size = median(list(class_sizes.values())) - elif strategy == 'average': + elif strategy == "average": middle_size = np.mean(list(class_sizes.values())) else: - raise ValueError(f'Unrecognized {strategy}. Only "median" and "average" are allowed.') + raise ValueError( + f'Unrecognized {strategy}. Only "median" and "average" are allowed.' 
+ ) - maj_int_min = OrderedDict({ - 'maj': list(), - 'int': list(), - 'min': list() - }) + maj_int_min = OrderedDict({"maj": list(), "int": list(), "min": list()}) for class_label, class_size in class_sizes.items(): if class_size == middle_size: - class_group = 'int' + class_group = "int" elif class_size < middle_size: - class_group = 'min' + class_group = "min" else: - class_group = 'maj' + class_group = "maj" maj_int_min[class_group].append(class_label) diff --git a/multi_imbalance/utils/min_int_maj.py b/multi_imbalance/utils/min_int_maj.py index e87e01f..434c6f6 100644 --- a/multi_imbalance/utils/min_int_maj.py +++ b/multi_imbalance/utils/min_int_maj.py @@ -1,21 +1,21 @@ maj_int_min = { # pragma no cover - "1czysty-cut": {'maj': [0], 'int': [2], 'min': [1]}, - "2delikatne-cut": {'maj': [0], 'int': [2], 'min': [1]}, - "3mocniej-cut": {'maj': [0], 'int': [2], 'min': [1]}, - "4delikatne-bezover-cut": {'maj': [0], 'int': [2], 'min': [1]}, - "balance-scale": {'maj': [2, 1], 'int': [], 'min': [0]}, - "cleveland": {'maj': [0], 'int': [], 'min': [1, 2, 3, 4]}, - "cleveland_v2": {'maj': [0], 'int': [], 'min': [1, 2, 3]}, - "car": {'maj': [2, 0], 'int': [], 'min': [1, 3]}, - "cmc": {'maj': [0, 2], 'int': [], 'min': [1]}, - "dermatology": {'maj': [0, 2, 1, 4, 3], 'int': [], 'min': [5]}, - "flare": {'maj': [1, 2, 3, 6], 'int': [], 'min': [4, 5]}, - "glass": {'maj': [1, 0, 3], 'int': [], 'min': [5, 2, 4]}, - "hayes-roth": {'maj': [0, 1], 'int': [], 'min': [2]}, - "new_ecoli": {'maj': [0, 1], 'int': [], 'min': [4, 2, 3]}, - "new_led7digit": {'maj': [3, 5, 0, 2], 'int': [], 'min': [4, 1]}, - "new_vehicle": {'maj': [1], 'int': [], 'min': [0, 2]}, - "new_winequality-red": {'maj': [0, 1], 'int': [], 'min': [2, 3]}, - "new_yeast": {'maj': [0, 1, 8, 7], 'int': [], 'min': [6, 5, 4, 3, 2]}, - "thyroid-newthyroid": {'maj': [0], 'int': [], 'min': [1, 2]} + "1czysty-cut": {"maj": [0], "int": [2], "min": [1]}, + "2delikatne-cut": {"maj": [0], "int": [2], "min": [1]}, + "3mocniej-cut": 
{"maj": [0], "int": [2], "min": [1]}, + "4delikatne-bezover-cut": {"maj": [0], "int": [2], "min": [1]}, + "balance-scale": {"maj": [2, 1], "int": [], "min": [0]}, + "cleveland": {"maj": [0], "int": [], "min": [1, 2, 3, 4]}, + "cleveland_v2": {"maj": [0], "int": [], "min": [1, 2, 3]}, + "car": {"maj": [2, 0], "int": [], "min": [1, 3]}, + "cmc": {"maj": [0, 2], "int": [], "min": [1]}, + "dermatology": {"maj": [0, 2, 1, 4, 3], "int": [], "min": [5]}, + "flare": {"maj": [1, 2, 3, 6], "int": [], "min": [4, 5]}, + "glass": {"maj": [1, 0, 3], "int": [], "min": [5, 2, 4]}, + "hayes-roth": {"maj": [0, 1], "int": [], "min": [2]}, + "new_ecoli": {"maj": [0, 1], "int": [], "min": [4, 2, 3]}, + "new_led7digit": {"maj": [3, 5, 0, 2], "int": [], "min": [4, 1]}, + "new_vehicle": {"maj": [1], "int": [], "min": [0, 2]}, + "new_winequality-red": {"maj": [0, 1], "int": [], "min": [2, 3]}, + "new_yeast": {"maj": [0, 1, 8, 7], "int": [], "min": [6, 5, 4, 3, 2]}, + "thyroid-newthyroid": {"maj": [0], "int": [], "min": [1, 2]}, } diff --git a/multi_imbalance/utils/plot.py b/multi_imbalance/utils/plot.py index fe1ad6f..3c66d22 100644 --- a/multi_imbalance/utils/plot.py +++ b/multi_imbalance/utils/plot.py @@ -6,10 +6,10 @@ from multi_imbalance.utils.data import construct_flat_2pc_df -sns.set_style('darkgrid') +sns.set_style("darkgrid") -def plot_cardinality_and_2d_data(X, y, dataset_name='') -> None: # pragma no cover +def plot_cardinality_and_2d_data(X, y, dataset_name="") -> None: # pragma no cover """ Plots cardinality of classes from y as well as scatter plot of X transformed to two dimensions using PCA @@ -35,13 +35,25 @@ def plot_cardinality_and_2d_data(X, y, dataset_name='') -> None: # pragma no co sns.countplot(y, ax=axs[0], palette=p) X = pca.transform(X) df = construct_flat_2pc_df(X, y) - sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p) + sns.scatterplot( + x="x1", + y="x2", + hue="y", + style="y", + data=df, + alpha=0.7, 
+ ax=axs[1], + legend="full", + palette=p, + ) axs[0].set_xlabel("class") axs[0].set_ylabel("cardinality") -def plot_visual_comparision_datasets(X1, y1, X2, y2, dataset_name1='', dataset_name2='') -> None: # pragma no cover +def plot_visual_comparision_datasets( + X1, y1, X2, y2, dataset_name1="", dataset_name2="" +) -> None: # pragma no cover """ Plots comparision of X1 y1 and X2 y2 using plot_cardinality_and_2d_data, which plots cardinality of classes from y as well as scatter plot of X transformed to two dimensions using PCA @@ -72,12 +84,32 @@ def plot_visual_comparision_datasets(X1, y1, X2, y2, dataset_name1='', dataset_n sns.countplot(y1, ax=axs[0], palette=p) transformed_X = pca.transform(X1) df = construct_flat_2pc_df(transformed_X, y1) - sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p) + sns.scatterplot( + x="x1", + y="x2", + hue="y", + style="y", + data=df, + alpha=0.7, + ax=axs[1], + legend="full", + palette=p, + ) sns.countplot(y2, ax=axs[2], palette=p) transformed_X2 = pca.transform(X2) df = construct_flat_2pc_df(transformed_X2, y2) - sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[3], legend='full', palette=p) + sns.scatterplot( + x="x1", + y="x2", + hue="y", + style="y", + data=df, + alpha=0.7, + ax=axs[3], + legend="full", + palette=p, + ) axs[0].set_xlabel("class") axs[2].set_xlabel("class") diff --git a/multi_imbalance/utils/tests/test_data.py b/multi_imbalance/utils/tests/test_data.py index b55d368..459de9b 100644 --- a/multi_imbalance/utils/tests/test_data.py +++ b/multi_imbalance/utils/tests/test_data.py @@ -4,8 +4,12 @@ import numpy as np import pytest -from multi_imbalance.utils.data import construct_flat_2pc_df, load_arff_dataset, load_datasets_arff, \ - construct_maj_int_min +from multi_imbalance.utils.data import ( + construct_flat_2pc_df, + load_arff_dataset, + load_datasets_arff, + construct_maj_int_min, +) def test_2pc(): @@ -18,7 +22,7 @@ def 
test_2pc(): def test_preprocess(): dir_path = os.path.dirname(os.path.realpath(__file__)) - ds_path = os.path.join(dir_path, 'ds_example.arrf') + ds_path = os.path.join(dir_path, "ds_example.arrf") x, y, non_cat = load_arff_dataset(ds_path, return_non_cat_length=True) assert all(y == np.array([0, 0, 0, 0, 0, 0, 0])) assert non_cat == 2 @@ -27,29 +31,29 @@ def test_preprocess(): def test_load_arff_datasets(): dir_path = os.path.dirname(os.path.realpath(__file__)) - ds_paths = [os.path.join(dir_path, 'ds_example.arrf')] + ds_paths = [os.path.join(dir_path, "ds_example.arrf")] datasets = load_datasets_arff(return_non_cat_length=False, dataset_paths=ds_paths) keys = list(datasets.keys()) assert type(datasets) == OrderedDict - assert 'ds_example' in keys + assert "ds_example" in keys assert len(keys) == 1 - for k in ['data', 'target', 'DESCR']: - assert k in list(datasets['ds_example'].keys()) + for k in ["data", "target", "DESCR"]: + assert k in list(datasets["ds_example"].keys()) def test_load_arff_datasets_wth_non_cats(): dir_path = os.path.dirname(os.path.realpath(__file__)) - ds_paths = [os.path.join(dir_path, 'ds_example.arrf')] + ds_paths = [os.path.join(dir_path, "ds_example.arrf")] datasets = load_datasets_arff(return_non_cat_length=True, dataset_paths=ds_paths) keys = list(datasets.keys()) assert type(datasets) == OrderedDict - assert 'ds_example' in keys + assert "ds_example" in keys assert len(keys) == 1 - for k in ['data', 'target', 'DESCR', 'non_cat_length']: - assert k in list(datasets['ds_example'].keys()) + for k in ["data", "target", "DESCR", "non_cat_length"]: + assert k in list(datasets["ds_example"].keys()) def test_construct_maj_int_min_when_correct_and_median_strategy(): @@ -58,21 +62,27 @@ def test_construct_maj_int_min_when_correct_and_median_strategy(): 1: 6, 3: 7, # median 5: 10, - 8: 12 + 8: 12, } - y = np.array([class_label for class_label, class_size in class_sizes.items() for _ in range(class_size)]) + y = np.array( + [ + class_label + for 
class_label, class_size in class_sizes.items() + for _ in range(class_size) + ] + ) np.random.shuffle(y) - maj_int_dict = construct_maj_int_min(y, strategy='median') + maj_int_dict = construct_maj_int_min(y, strategy="median") - assert len(maj_int_dict['int']) == 1 - assert maj_int_dict['int'][0] == 3 + assert len(maj_int_dict["int"]) == 1 + assert maj_int_dict["int"][0] == 3 - assert len(maj_int_dict['min']) == 2 - assert all(i in maj_int_dict['min'] for i in [0, 1]) + assert len(maj_int_dict["min"]) == 2 + assert all(i in maj_int_dict["min"] for i in [0, 1]) - assert len(maj_int_dict['maj']) == 2 - assert all(i in maj_int_dict['maj'] for i in [5, 8]) + assert len(maj_int_dict["maj"]) == 2 + assert all(i in maj_int_dict["maj"] for i in [5, 8]) def test_construct_maj_int_min_when_correct_and_average_strategy(): @@ -81,20 +91,26 @@ def test_construct_maj_int_min_when_correct_and_average_strategy(): 1: 6, 3: 7, 5: 10, - 8: 2000 + 8: 2000, } - y = np.array([class_label for class_label, class_size in class_sizes.items() for _ in range(class_size)]) + y = np.array( + [ + class_label + for class_label, class_size in class_sizes.items() + for _ in range(class_size) + ] + ) np.random.shuffle(y) - maj_int_dict = construct_maj_int_min(y, strategy='average') + maj_int_dict = construct_maj_int_min(y, strategy="average") - assert len(maj_int_dict['int']) == 0 + assert len(maj_int_dict["int"]) == 0 - assert len(maj_int_dict['min']) == 4 - assert all(i in maj_int_dict['min'] for i in [0, 1, 3, 5]) + assert len(maj_int_dict["min"]) == 4 + assert all(i in maj_int_dict["min"] for i in [0, 1, 3, 5]) - assert len(maj_int_dict['maj']) == 1 - assert maj_int_dict['maj'][0] == 8 + assert len(maj_int_dict["maj"]) == 1 + assert maj_int_dict["maj"][0] == 8 def test_construct_maj_int_min_when_wrong_strategy(): @@ -103,11 +119,16 @@ def test_construct_maj_int_min_when_wrong_strategy(): 1: 6, 3: 7, 5: 10, - 8: 2000 + 8: 2000, } - y = np.array([class_label for class_label, class_size in 
class_sizes.items() for _ in range(class_size)]) + y = np.array( + [ + class_label + for class_label, class_size in class_sizes.items() + for _ in range(class_size) + ] + ) np.random.shuffle(y) with pytest.raises(ValueError): - construct_maj_int_min(y, strategy='WRONG_STRATEGY') - + construct_maj_int_min(y, strategy="WRONG_STRATEGY") diff --git a/setup.py b/setup.py index 80940c3..ac83413 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ -import setuptools # pragma no cover +import setuptools # pragma no cover -with open("README.md", "r", encoding='UTF8') as fh: # pragma no cover +with open("README.md", "r", encoding="UTF8") as fh: # pragma no cover long_description = fh.read() @@ -18,11 +18,11 @@ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", - 'Topic :: Software Development', - 'Topic :: Scientific/Engineering', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8' + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", ], install_requires=[ "numpy>=1.17.0", @@ -35,5 +35,5 @@ "IPython>=7.13.0", "seaborn>=0.10.1", "matplotlib>=3.2.1", - ] + ], ) From 1982753b29dff45b06a2dd0735e93446d239c640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Thu, 3 Nov 2022 13:37:42 +0100 Subject: [PATCH 02/48] new tests directory --- multi_imbalance/datasets/_data_loader.py | 4 ++-- multi_imbalance/utils/data.py | 7 ++++--- {multi_imbalance/datasets/tests => tests}/__init__.py | 0 .../ensemble/tests => tests/datasets}/__init__.py | 0 .../datasets/tests => tests/datasets}/test_data_loader.py | 3 ++- .../resampling/tests => tests/ensemble}/__init__.py | 0 .../ensemble/tests => tests/ensemble}/test_ecoc.py | 0 .../ensemble/tests => tests/ensemble}/test_mrbbagging.py | 0 
.../ensemble/tests => tests/ensemble}/test_ovo.py | 0 .../ensemble/tests => tests/ensemble}/test_soupbagging.py | 0 tests/resampling/__init__.py | 0 .../resampling/tests => tests/resampling}/test_globalcs.py | 0 .../resampling/tests => tests/resampling}/test_mdo.py | 0 .../resampling/tests => tests/resampling}/test_soup.py | 0 .../resampling/tests => tests/resampling}/test_spider.py | 0 .../tests => tests/resampling}/test_static_smote.py | 0 tests/utils/__init__.py | 0 .../utils/tests => tests/utils}/ds_example.arrf | 0 {multi_imbalance/utils/tests => tests/utils}/test_data.py | 0 19 files changed, 8 insertions(+), 6 deletions(-) rename {multi_imbalance/datasets/tests => tests}/__init__.py (100%) rename {multi_imbalance/ensemble/tests => tests/datasets}/__init__.py (100%) rename {multi_imbalance/datasets/tests => tests/datasets}/test_data_loader.py (90%) rename {multi_imbalance/resampling/tests => tests/ensemble}/__init__.py (100%) rename {multi_imbalance/ensemble/tests => tests/ensemble}/test_ecoc.py (100%) rename {multi_imbalance/ensemble/tests => tests/ensemble}/test_mrbbagging.py (100%) rename {multi_imbalance/ensemble/tests => tests/ensemble}/test_ovo.py (100%) rename {multi_imbalance/ensemble/tests => tests/ensemble}/test_soupbagging.py (100%) create mode 100644 tests/resampling/__init__.py rename {multi_imbalance/resampling/tests => tests/resampling}/test_globalcs.py (100%) rename {multi_imbalance/resampling/tests => tests/resampling}/test_mdo.py (100%) rename {multi_imbalance/resampling/tests => tests/resampling}/test_soup.py (100%) rename {multi_imbalance/resampling/tests => tests/resampling}/test_spider.py (100%) rename {multi_imbalance/resampling/tests => tests/resampling}/test_static_smote.py (100%) create mode 100644 tests/utils/__init__.py rename {multi_imbalance/utils/tests => tests/utils}/ds_example.arrf (100%) rename {multi_imbalance/utils/tests => tests/utils}/test_data.py (100%) diff --git a/multi_imbalance/datasets/_data_loader.py 
b/multi_imbalance/datasets/_data_loader.py index c4abe2e..e6f1b6b 100644 --- a/multi_imbalance/datasets/_data_loader.py +++ b/multi_imbalance/datasets/_data_loader.py @@ -28,7 +28,7 @@ PRE_FILENAME = "x" POST_FILENAME = "data.npz" -DATA_HOME_BASIC = "./../../data/" +DATA_HOME_BASIC = join(".", "..", "..", "data") MAP_NAME_ID_KEYS = [ "1czysty-cut", @@ -84,7 +84,7 @@ def load_datasets(data_home=DATA_HOME_BASIC): if not available: makedirs(extracted_dir, exist_ok=True) - with open(f"{data_home}data.tar.gz", "rb") as fin: + with open(join(data_home, "data.tar.gz"), "rb") as fin: f = BytesIO(fin.read()) tar = tarfile.open(fileobj=f) tar.extractall(path=extracted_dir) diff --git a/multi_imbalance/utils/data.py b/multi_imbalance/utils/data.py index 522e479..9bd3b3d 100644 --- a/multi_imbalance/utils/data.py +++ b/multi_imbalance/utils/data.py @@ -8,6 +8,7 @@ from scipy.io import arff from sklearn.preprocessing import LabelEncoder from sklearn.utils import Bunch +import os def construct_flat_2pc_df(X, y) -> pd.DataFrame: @@ -86,12 +87,12 @@ def load_arff_dataset( def load_datasets_arff(return_non_cat_length=False, dataset_paths=None): if dataset_paths is None: - dataset_paths = glob.glob(f"{get_project_root()}/data/arff/*") + dataset_paths = glob.glob(os.path.join(get_project_root(), "data", "arff", "*")) datasets = OrderedDict() for path in sorted(dataset_paths): - dataset_file = path.split("/")[-1] - dataset_name = dataset_file.split(".")[0] + dataset_file = os.path.basename(path) + dataset_name = os.path.splitext(dataset_file)[0] if return_non_cat_length: X, y, cat_length = load_arff_dataset( path, return_non_cat_length=return_non_cat_length diff --git a/multi_imbalance/datasets/tests/__init__.py b/tests/__init__.py similarity index 100% rename from multi_imbalance/datasets/tests/__init__.py rename to tests/__init__.py diff --git a/multi_imbalance/ensemble/tests/__init__.py b/tests/datasets/__init__.py similarity index 100% rename from 
multi_imbalance/ensemble/tests/__init__.py rename to tests/datasets/__init__.py diff --git a/multi_imbalance/datasets/tests/test_data_loader.py b/tests/datasets/test_data_loader.py similarity index 90% rename from multi_imbalance/datasets/tests/test_data_loader.py rename to tests/datasets/test_data_loader.py index 6c734ad..2c9bde6 100644 --- a/multi_imbalance/datasets/tests/test_data_loader.py +++ b/tests/datasets/test_data_loader.py @@ -2,6 +2,7 @@ """ from multi_imbalance.datasets import load_datasets +from os.path import join DATASET_SHAPE = { "1czysty-cut": (1200, 2), @@ -26,7 +27,7 @@ def test_load_datasets(): print("Testing loading datasets") - datasets = load_datasets(data_home="./data/") + datasets = load_datasets(data_home=join(".", "data")) for k in DATASET_SHAPE.keys(): X = datasets[k].data assert DATASET_SHAPE[k] == X.shape diff --git a/multi_imbalance/resampling/tests/__init__.py b/tests/ensemble/__init__.py similarity index 100% rename from multi_imbalance/resampling/tests/__init__.py rename to tests/ensemble/__init__.py diff --git a/multi_imbalance/ensemble/tests/test_ecoc.py b/tests/ensemble/test_ecoc.py similarity index 100% rename from multi_imbalance/ensemble/tests/test_ecoc.py rename to tests/ensemble/test_ecoc.py diff --git a/multi_imbalance/ensemble/tests/test_mrbbagging.py b/tests/ensemble/test_mrbbagging.py similarity index 100% rename from multi_imbalance/ensemble/tests/test_mrbbagging.py rename to tests/ensemble/test_mrbbagging.py diff --git a/multi_imbalance/ensemble/tests/test_ovo.py b/tests/ensemble/test_ovo.py similarity index 100% rename from multi_imbalance/ensemble/tests/test_ovo.py rename to tests/ensemble/test_ovo.py diff --git a/multi_imbalance/ensemble/tests/test_soupbagging.py b/tests/ensemble/test_soupbagging.py similarity index 100% rename from multi_imbalance/ensemble/tests/test_soupbagging.py rename to tests/ensemble/test_soupbagging.py diff --git a/tests/resampling/__init__.py b/tests/resampling/__init__.py new file mode 
100644 index 0000000..e69de29 diff --git a/multi_imbalance/resampling/tests/test_globalcs.py b/tests/resampling/test_globalcs.py similarity index 100% rename from multi_imbalance/resampling/tests/test_globalcs.py rename to tests/resampling/test_globalcs.py diff --git a/multi_imbalance/resampling/tests/test_mdo.py b/tests/resampling/test_mdo.py similarity index 100% rename from multi_imbalance/resampling/tests/test_mdo.py rename to tests/resampling/test_mdo.py diff --git a/multi_imbalance/resampling/tests/test_soup.py b/tests/resampling/test_soup.py similarity index 100% rename from multi_imbalance/resampling/tests/test_soup.py rename to tests/resampling/test_soup.py diff --git a/multi_imbalance/resampling/tests/test_spider.py b/tests/resampling/test_spider.py similarity index 100% rename from multi_imbalance/resampling/tests/test_spider.py rename to tests/resampling/test_spider.py diff --git a/multi_imbalance/resampling/tests/test_static_smote.py b/tests/resampling/test_static_smote.py similarity index 100% rename from multi_imbalance/resampling/tests/test_static_smote.py rename to tests/resampling/test_static_smote.py diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/multi_imbalance/utils/tests/ds_example.arrf b/tests/utils/ds_example.arrf similarity index 100% rename from multi_imbalance/utils/tests/ds_example.arrf rename to tests/utils/ds_example.arrf diff --git a/multi_imbalance/utils/tests/test_data.py b/tests/utils/test_data.py similarity index 100% rename from multi_imbalance/utils/tests/test_data.py rename to tests/utils/test_data.py From 6cc9128604f10545cbd747c2c0bbfc48550d47a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Thu, 3 Nov 2022 13:58:02 +0100 Subject: [PATCH 03/48] fix warnings --- multi_imbalance/ensemble/ecoc.py | 11 +++++------ multi_imbalance/ensemble/mrbbagging.py | 8 ++++---- multi_imbalance/resampling/soup.py | 6 ++++-- 3 files changed, 13 
insertions(+), 12 deletions(-) diff --git a/multi_imbalance/ensemble/ecoc.py b/multi_imbalance/ensemble/ecoc.py index 6b111cd..a96e8d0 100644 --- a/multi_imbalance/ensemble/ecoc.py +++ b/multi_imbalance/ensemble/ecoc.py @@ -450,12 +450,11 @@ def _calc_weights(self, X_for_weights, y_for_weights): ][clf_idx] ): min_correct_pred[sample_label] += 1 - avg_tpr_min = np.mean( - [ - min_correct_pred[clazz] / min_counter[clazz] - for clazz in min_counter.keys() - ] - ) + tpr_min = [ + min_correct_pred[clazz] / min_counter[clazz] + for clazz in min_counter.keys() + ] + avg_tpr_min = np.mean(tpr_min) if tpr_min else np.nan dich_weights[clf_idx] = avg_tpr_min self.dich_weights = dich_weights diff --git a/multi_imbalance/ensemble/mrbbagging.py b/multi_imbalance/ensemble/mrbbagging.py index 6ac4bbc..64ac130 100644 --- a/multi_imbalance/ensemble/mrbbagging.py +++ b/multi_imbalance/ensemble/mrbbagging.py @@ -137,8 +137,8 @@ def _train(self, la_list, n, prob, classes, grouped_data): for i in range(len(la_list)): subset_x, subset_y = self._resample(n, prob, classes, grouped_data) - subset_x = np.array(subset_x).astype(np.float) - subset_y = np.array(subset_y).astype(np.float) + subset_x = np.array(subset_x).astype(np.float64) + subset_y = np.array(subset_y).astype(np.float64) self.classifiers[i] = la_list[i].fit(subset_x, subset_y) @@ -171,8 +171,8 @@ def _train_with_feature_selection(self, la_list, n, prob, classes, grouped_data) else: features_no = int(sqrt(labels_no)) - subset_x = np.array(subset_x).astype(np.float) - subset_y = np.array(subset_y).astype(np.float) + subset_x = np.array(subset_x).astype(np.float64) + subset_y = np.array(subset_y).astype(np.float64) if self.all_random: subset1, subset1_idx = self._find_random_features( diff --git a/multi_imbalance/resampling/soup.py b/multi_imbalance/resampling/soup.py index 4141744..12d3f99 100644 --- a/multi_imbalance/resampling/soup.py +++ b/multi_imbalance/resampling/soup.py @@ -176,9 +176,11 @@ def 
_calculate_goal_quantity(self, maj_int_min=None): } min_q = list(min_classes.values()) - if len(maj_q) == 0: + if len(min_q) == 0 and len(maj_q) == 0: + return np.nan + elif len(maj_q) == 0: return np.mean(min_q, dtype=int) - if len(min_q) == 0: + elif len(min_q) == 0: return np.mean(maj_q, dtype=int) return np.mean((max(min_q), min(maj_q)), dtype=int) From 88bb9b81bd99caafdb279e2bcd502a8d7b8f4859 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Thu, 3 Nov 2022 16:45:53 +0100 Subject: [PATCH 04/48] add typing --- multi_imbalance/datasets/_data_loader.py | 2 +- multi_imbalance/ensemble/ecoc.py | 58 ++++++++++++++-------- multi_imbalance/ensemble/mrbbagging.py | 57 +++++++++++++-------- multi_imbalance/ensemble/ovo.py | 35 ++++++++----- multi_imbalance/ensemble/soup_bagging.py | 18 ++++--- multi_imbalance/resampling/global_cs.py | 9 +++- multi_imbalance/resampling/mdo.py | 24 +++++++-- multi_imbalance/resampling/soup.py | 29 ++++++++--- multi_imbalance/resampling/spider.py | 44 +++++++++------- multi_imbalance/resampling/static_smote.py | 5 +- multi_imbalance/utils/array_util.py | 10 ++-- multi_imbalance/utils/data.py | 11 ++-- multi_imbalance/utils/metrics.py | 5 +- multi_imbalance/utils/plot.py | 14 ++++-- 14 files changed, 213 insertions(+), 108 deletions(-) diff --git a/multi_imbalance/datasets/_data_loader.py b/multi_imbalance/datasets/_data_loader.py index e6f1b6b..3dd5b81 100644 --- a/multi_imbalance/datasets/_data_loader.py +++ b/multi_imbalance/datasets/_data_loader.py @@ -57,7 +57,7 @@ MAP_ID_NAME[v + 1] = k -def load_datasets(data_home=DATA_HOME_BASIC): +def load_datasets(data_home: str = DATA_HOME_BASIC) -> OrderedDict: """ Load the benchmark datasets. 
diff --git a/multi_imbalance/ensemble/ecoc.py b/multi_imbalance/ensemble/ecoc.py index a96e8d0..067f9a8 100644 --- a/multi_imbalance/ensemble/ecoc.py +++ b/multi_imbalance/ensemble/ecoc.py @@ -2,6 +2,7 @@ from collections import Counter from collections import defaultdict from copy import deepcopy +from typing import Tuple, Union import numpy as np from imblearn.over_sampling import SMOTE @@ -31,13 +32,14 @@ class ECOC(BaggingClassifier): def __init__( self, - binary_classifier="KNN", - preprocessing="SOUP", - encoding="OVO", - n_neighbors=3, - weights=None, + binary_classifier: str = "KNN", + preprocessing: str = "SOUP", + encoding: str = "OVO", + n_neighbors: int = 3, + weights: Union[None, str] = None, ): """ + :param binary_classifier: binary classifier used by the algorithm. Possible classifiers: @@ -104,7 +106,9 @@ def __init__( self._labels = None self._dich_weights = None - def fit(self, X, y, minority_classes=None): + def fit( + self, X: np.ndarray, y: np.ndarray, minority_classes: Union[list, None] = None + ): """ :param X: @@ -136,7 +140,7 @@ def fit(self, X, y, minority_classes=None): self._calc_weights(X_for_weights, y_for_weights) return self - def predict(self, X): + def predict(self, X: np.ndarray) -> np.ndarray: """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -153,7 +157,7 @@ def predict(self, X): return predicted - def _learn_binary_classifiers(self, X, y): + def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray): for classifier_idx, classifier in enumerate(self._binary_classifiers): excluded_classes_indices = [ idx @@ -194,8 +198,11 @@ def _gen_code_matrix(self): ) def _encode_dense( - self, number_of_classes, random_state=0, number_of_code_generations=10000 - ): + self, + number_of_classes: int, + random_state: int = 0, + number_of_code_generations: int = 10000, + ) -> np.ndarray: try: dirname = os.path.dirname(__file__) matrix = np.load( @@ -234,8 +241,11 @@ def _encode_dense( 
return code_matrix def _encode_sparse( - self, number_of_classes, random_state=0, number_of_code_generations=10000 - ): + self, + number_of_classes: int, + random_state: int = 0, + number_of_code_generations: int = 10000, + ) -> np.ndarray: try: dirname = os.path.dirname(__file__) matrix = np.load( @@ -283,12 +293,12 @@ def _encode_sparse( return code_matrix - def _encode_ova(self, number_of_classes): + def _encode_ova(self, number_of_classes: int) -> np.ndarray: matrix = np.identity(number_of_classes) matrix[matrix == 0] = -1 return matrix - def _encode_ovo(self, number_of_classes): + def _encode_ovo(self, number_of_classes: int) -> np.ndarray: number_of_columns = int(number_of_classes * (number_of_classes - 1) / 2) matrix = np.zeros((number_of_classes, number_of_columns), dtype=int) indices_map = self._map_indices_to_class_pairs(number_of_classes) @@ -300,7 +310,7 @@ def _encode_ovo(self, number_of_classes): matrix[row, col] = -1 return matrix - def _map_indices_to_class_pairs(self, number_of_classes): + def _map_indices_to_class_pairs(self, number_of_classes: int) -> dict: indices_map = dict() idx = 0 for i in range(number_of_classes): @@ -309,7 +319,7 @@ def _map_indices_to_class_pairs(self, number_of_classes): idx += 1 return indices_map - def _encode_complete(self, number_of_classes): + def _encode_complete(self, number_of_classes: int) -> np.ndarray: code_length = 2 ** (number_of_classes - 1) - 1 matrix = np.ones((number_of_classes, code_length)) for row_idx in range(1, number_of_classes): @@ -320,13 +330,13 @@ def _encode_complete(self, number_of_classes): digit *= -1 return matrix - def _hamming_distance(self, v1, v2): + def _hamming_distance(self, v1: np.ndarray, v2: np.ndarray) -> int: return np.count_nonzero(v1 != v2) - def _has_matrix_all_zeros_column(self, matrix): + def _has_matrix_all_zeros_column(self, matrix: np.ndarray) -> bool: return (~matrix.any(axis=0)).any() - def _get_closest_class(self, row): + def _get_closest_class(self, row: np.ndarray) 
-> np.ndarray: if self.weights is not None: return self._labels[ np.argmin( @@ -346,7 +356,9 @@ def _get_closest_class(self, row): ) ] - def _oversample(self, X, y): + def _oversample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: if self.preprocessing is None: return X, y @@ -396,7 +408,9 @@ def _get_classifier(self): ) return deepcopy(self.binary_classifier) - def _smote_oversample(self, X, y): + def _smote_oversample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: n_neighbors = min(3, min(np.unique(y, return_counts=True)[1]) - 1) if n_neighbors == 0: raise ValueError( @@ -405,7 +419,7 @@ def _smote_oversample(self, X, y): smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) - def _calc_weights(self, X_for_weights, y_for_weights): + def _calc_weights(self, X_for_weights: np.ndarray, y_for_weights: np.ndarray): if self.weights not in ECOC._allowed_weights: raise ValueError( "Unknown weighting strategy: %s, expected to be one of %s." 
diff --git a/multi_imbalance/ensemble/mrbbagging.py b/multi_imbalance/ensemble/mrbbagging.py index 64ac130..ba12360 100644 --- a/multi_imbalance/ensemble/mrbbagging.py +++ b/multi_imbalance/ensemble/mrbbagging.py @@ -1,6 +1,7 @@ from collections import Counter from copy import deepcopy from math import sqrt +from typing import Any, Callable, Tuple, Union import numpy as np from scipy.stats import multinomial @@ -23,13 +24,13 @@ class MRBBagging(BaggingClassifier): def __init__( self, - k, - learning_algorithm, - undersampling=True, - feature_selection=False, - random_fs=False, - half_features=True, - random_state=None, + k: int, + learning_algorithm: Any, + undersampling: bool = True, + feature_selection: bool = False, + random_fs: bool = False, + half_features: bool = True, + random_state: Union[int, None] = None, ): """ :param k: @@ -62,7 +63,7 @@ def __init__( self.half_features = half_features self.random_state = random_state - def fit(self, x, y, **kwargs): + def fit(self, x: np.ndarray, y: np.ndarray, **kwargs): """ Build a MRBBagging ensemble of estimators from the training data. @@ -98,7 +99,7 @@ def fit(self, x, y, **kwargs): return self - def predict(self, data): + def predict(self, data: np.ndarray) -> list: """ Predict classes for examples in data. 
@@ -107,7 +108,7 @@ def predict(self, data): """ return self._select_classes(data) - def _group_data(self, x, y): + def _group_data(self, x: np.ndarray, y: np.ndarray) -> Tuple[set, dict]: classes = set(y) self.classes = {key: value for (key, value) in enumerate(classes)} data = [[x[i], y[i]] for i in range(len(x))] @@ -117,7 +118,9 @@ def _group_data(self, x, y): grouped_data[cl] = list(filter(lambda d: d[1] == cl, data)) return classes, grouped_data - def _resample(self, n, prob, classes, grouped_data): + def _resample( + self, n: int, prob: float, classes: set, grouped_data: dict + ) -> Tuple[np.ndarray, np.ndarray]: samples_no = multinomial.rvs(n=n, p=prob, random_state=self.random_state) subset_x, subset_y = [], [] for no, j in enumerate(classes): @@ -133,7 +136,9 @@ def _resample(self, n, prob, classes, grouped_data): subset_y.append(sample[1]) return np.array(subset_x), np.array(subset_y) - def _train(self, la_list, n, prob, classes, grouped_data): + def _train( + self, la_list: list, n: int, prob: float, classes: set, grouped_data: dict + ): for i in range(len(la_list)): subset_x, subset_y = self._resample(n, prob, classes, grouped_data) @@ -142,12 +147,16 @@ def _train(self, la_list, n, prob, classes, grouped_data): self.classifiers[i] = la_list[i].fit(subset_x, subset_y) - def _find_random_features(self, labels_no, features_no, subset_x): + def _find_random_features( + self, labels_no: int, features_no: int, subset_x: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: random_features_idx = sample_without_replacement(labels_no, features_no) random_features = self._get_features_array(subset_x, random_features_idx) return random_features, random_features_idx - def _get_features_array(self, subset_x, random_features_idx): + def _get_features_array( + self, subset_x: np.ndarray, random_features_idx: np.ndarray + ) -> np.ndarray: random_features = np.array(subset_x[:, random_features_idx[0]]) for f in range(1, len(random_features_idx)): random_features = 
np.vstack( @@ -157,12 +166,20 @@ def _get_features_array(self, subset_x, random_features_idx): return random_features[:, np.newaxis] return random_features.T - def _get_kbest_classifier(self, test, features_no, subset_x, subset_y): + def _get_kbest_classifier( + self, + test: Callable, + features_no: int, + subset_x: np.ndarray, + subset_y: np.ndarray, + ) -> Tuple[np.ndarray, SelectKBest]: kBest_estimator = SelectKBest(test, k=features_no) subset = kBest_estimator.fit_transform(subset_x, subset_y) return subset, kBest_estimator - def _train_with_feature_selection(self, la_list, n, prob, classes, grouped_data): + def _train_with_feature_selection( + self, la_list: list, n: int, prob: float, classes: set, grouped_data: dict + ): for i in range(0, len(la_list), 3): subset_x, subset_y = self._resample(n, prob, classes, grouped_data) labels_no = len(subset_x[0]) @@ -204,10 +221,10 @@ def _train_with_feature_selection(self, la_list, n, prob, classes, grouped_data) self.classifiers[i + 1] = la_list[i + 1].fit(subset2, subset_y) self.classifiers[i + 2] = la_list[i + 2].fit(subset3, subset_y) - def _set_classes_dict(self, classes): + def _set_classes_dict(self, classes: set): self.classifier_classes = dict(enumerate(classes)) - def _select_data(self, classifier_id, data): + def _select_data(self, classifier_id: int, data: np.ndarray) -> np.ndarray: if self.feature_selection: if self.all_random: new_data = self._get_features_array( @@ -225,7 +242,7 @@ def _select_data(self, classifier_id, data): return new_data return data - def _count_votes(self, data): + def _count_votes(self, data: np.ndarray) -> np.ndarray: voting_matrix = np.zeros((len(data), len(self.classes))) for classifier_id in range(len(self.classifiers)): new_data = self._select_data(classifier_id, data) @@ -238,7 +255,7 @@ def _count_votes(self, data): voting_matrix[i][idx] += max(probabilities[i]) return voting_matrix - def _select_classes(self, data): + def _select_classes(self, data: np.ndarray) -> list: 
voting_matrix = self._count_votes(data) selected_classes_ids = voting_matrix.argmax(axis=1) selected_classes = [] diff --git a/multi_imbalance/ensemble/ovo.py b/multi_imbalance/ensemble/ovo.py index c4ae5af..512fc61 100644 --- a/multi_imbalance/ensemble/ovo.py +++ b/multi_imbalance/ensemble/ovo.py @@ -1,4 +1,5 @@ from copy import deepcopy +from typing import Any, Tuple, Union import numpy as np from imblearn.over_sampling import SMOTE @@ -28,10 +29,10 @@ class OVO(BaggingClassifier): def __init__( self, - binary_classifier="tree", - n_neighbors=3, - preprocessing="SOUP", - preprocessing_between="all", + binary_classifier: str = "tree", + n_neighbors: int = 3, + preprocessing: str = "SOUP", + preprocessing_between: str = "all", ): """ :param binary_classifier: @@ -77,7 +78,9 @@ def __init__( self._labels = np.array([]) self._minority_classes = list() - def fit(self, X, y, minority_classes=None): + def fit( + self, X: np.ndarray, y: np.ndarray, minority_classes: Union[list, None] = None + ): """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -100,7 +103,7 @@ def fit(self, X, y, minority_classes=None): self._learn_binary_classifiers(X, y) return self - def predict(self, X): + def predict(self, X: np.ndarray) -> np.ndarray: """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -117,7 +120,9 @@ def predict(self, X): return np.array(predicted) - def _construct_binary_outputs_matrix(self, instance, num_of_classes): + def _construct_binary_outputs_matrix( + self, instance: np.ndarray, num_of_classes: int + ) -> np.ndarray: binary_outputs_matrix = np.zeros((num_of_classes, num_of_classes)) for class_idx1 in range(len(self._labels)): for class_idx2 in range(class_idx1): @@ -126,7 +131,7 @@ def _construct_binary_outputs_matrix(self, instance, num_of_classes): ] = self._binary_classifiers[class_idx1][class_idx2].predict([instance]) return binary_outputs_matrix - def 
_learn_binary_classifiers(self, X, y): + def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray): for row in range(len(self._labels)): for col in range(row): first_class, second_class = self._labels[row], self._labels[col] @@ -140,7 +145,7 @@ def _learn_binary_classifiers(self, X, y): X_filtered, y_filtered = self._oversample(X_filtered, y_filtered) self._binary_classifiers[row][col].fit(X_filtered, y_filtered) - def _get_classifier(self): + def _get_classifier(self) -> Any: if isinstance(self.binary_classifier, str): if self.binary_classifier not in OVO._allowed_classifiers: raise ValueError( @@ -165,7 +170,7 @@ def _get_classifier(self): ) return deepcopy(self.binary_classifier) - def _perform_max_voting(self, binary_outputs_matrix): + def _perform_max_voting(self, binary_outputs_matrix: np.ndarray) -> np.ndarray: scores = np.zeros(len(self._labels)) for clf_1 in range(len(binary_outputs_matrix)): for clf_2 in range(clf_1): @@ -174,7 +179,9 @@ def _perform_max_voting(self, binary_outputs_matrix): ] += 1 return self._labels[np.argmax(scores)] - def _oversample(self, X, y): + def _oversample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: if self.preprocessing is None: return X, y @@ -199,7 +206,9 @@ def _oversample(self, X, y): raise ValueError("Your resampler must implement fit_resample method") return self.preprocessing.fit_resample(X, y) - def _smote_oversample(self, X, y): + def _smote_oversample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: n_neighbors = min(3, min(np.unique(y, return_counts=True)[1]) - 1) if n_neighbors == 0: raise ValueError( @@ -208,7 +217,7 @@ def _smote_oversample(self, X, y): smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) - def should_perform_oversampling(self, first_class, second_class): + def should_perform_oversampling(self, first_class: int, second_class: int): if self.oversample_between not in 
OVO._allowed_preprocessing_between: raise ValueError( "Unknown strategy for oversampling: %s, expected to be one of %s." diff --git a/multi_imbalance/ensemble/soup_bagging.py b/multi_imbalance/ensemble/soup_bagging.py index f2e8ed5..0e6a2b5 100644 --- a/multi_imbalance/ensemble/soup_bagging.py +++ b/multi_imbalance/ensemble/soup_bagging.py @@ -1,6 +1,7 @@ import multiprocessing from collections import Counter from copy import deepcopy +from typing import Any, Tuple, Union import numpy as np from sklearn.ensemble import BaggingClassifier @@ -11,7 +12,7 @@ from multi_imbalance.utils.array_util import setdiff -def fit_clf(args): +def fit_clf(args: list): return SOUPBagging.fit_classifier(args) @@ -25,7 +26,12 @@ class SOUPBagging(BaggingClassifier): Inteligencji (2019). """ - def __init__(self, classifier=None, maj_int_min=None, n_classifiers=5): + def __init__( + self, + classifier: Union[Any, None] = None, + maj_int_min: Union[dict, None] = None, + n_classifiers: int = 5, + ): """ :param classifier: Instance of classifier @@ -48,7 +54,7 @@ def __init__(self, classifier=None, maj_int_min=None, n_classifiers=5): self.classifiers.append(KNeighborsClassifier()) @staticmethod - def fit_classifier(args): + def fit_classifier(args: list) -> Tuple[Any, np.ndarray]: clf, X, y, resampled, maj_int_min = args x_sampled, y_sampled = resampled @@ -78,7 +84,7 @@ def fit_classifier(args): ) return clf, global_weights - def fit(self, X, y, **kwargs): + def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): """ :param X: array-like, sparse matrix of shape = [n_samples, n_features] The training input samples. @@ -113,7 +119,7 @@ def fit(self, X, y, **kwargs): self.clf_weights = np.array(self.clf_weights) - def predict(self, X, strategy: str = "average"): + def predict(self, X: np.ndarray, strategy: str = "average") -> np.ndarray: """ Predict class for X. The predicted class of an input sample is computed as the class with the highest sum of predicted probability. 
@@ -166,7 +172,7 @@ def predict(self, X, strategy: str = "average"): y_result = np.argmax(p, axis=1) return y_result - def predict_proba(self, X): + def predict_proba(self, X: np.ndarray) -> np.ndarray: """ Predict class probabilities for X. diff --git a/multi_imbalance/resampling/global_cs.py b/multi_imbalance/resampling/global_cs.py index b9b9b8f..9c181a2 100644 --- a/multi_imbalance/resampling/global_cs.py +++ b/multi_imbalance/resampling/global_cs.py @@ -1,4 +1,5 @@ from collections import Counter +from typing import Tuple import numpy as np import sklearn @@ -17,7 +18,9 @@ def __init__(self, shuffle: bool = True): self.shuffle = shuffle self.quantities, self.max_quantity, self.X, self.y = [None] * 4 - def _fit_resample(self, X, y): + def _fit_resample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -49,7 +52,9 @@ def _fit_resample(self, X, y): return np.array(result_X), np.array(result_y) - def _equal_oversample(self, X, y, class_name): + def _equal_oversample( + self, X: np.ndarray, y: np.ndarray, class_name: str + ) -> Tuple[list, list]: indices_in_class = [ i for i, class_label in enumerate(y) if class_label == class_name ] diff --git a/multi_imbalance/resampling/mdo.py b/multi_imbalance/resampling/mdo.py index 30288bf..2f1ed31 100644 --- a/multi_imbalance/resampling/mdo.py +++ b/multi_imbalance/resampling/mdo.py @@ -1,4 +1,5 @@ from collections import Counter +from typing import Tuple, Union import numpy as np from imblearn.base import BaseSampler @@ -17,7 +18,14 @@ class MDO(BaseSampler): """ - def __init__(self, k=5, k1_frac=0.4, seed=0, prop=1, maj_int_min=None): + def __init__( + self, + k: int = 5, + k1_frac: float = 0.4, + seed: int = 0, + prop: int = 1, + maj_int_min: Union[dict, None] = None, + ): """ :param k: Number of neighbours considered during the neighbourhood analysis @@ -41,7 +49,9 @@ def __init__(self, k=5, 
k1_frac=0.4, seed=0, prop=1, maj_int_min=None): self.prop = prop self.class_balances = maj_int_min - def _fit_resample(self, X, y): + def _fit_resample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -117,7 +127,7 @@ def _fit_resample(self, X, y): return oversampled_X, oversampled_y - def _choose_samples(self, class_label): + def _choose_samples(self, class_label: str) -> Tuple[np.ndarray, np.ndarray]: minor_class_indices = [ i for i, value in enumerate(self.y) if value == class_label ] @@ -144,7 +154,9 @@ def _choose_samples(self, class_label): return chosen_minor_class_samples_to_oversample, weights - def _MDO_oversampling(self, T, v, oversampling_rate, weights): + def _MDO_oversampling( + self, T: np.ndarray, v: np.ndarray, oversampling_rate: int, weights: np.ndarray + ) -> np.ndarray: oversampled_set = list() V = np.clip(np.copy(v), a_min=0.001, a_max=None) for _ in range(oversampling_rate): @@ -173,7 +185,9 @@ def _MDO_oversampling(self, T, v, oversampling_rate, weights): return np.array(oversampled_set) - def calculate_same_class_neighbour_quantities(self, S_minor, S_minor_label): + def calculate_same_class_neighbour_quantities( + self, S_minor: np.ndarray, S_minor_label: str + ) -> np.ndarray: minority_class_neighbours_indices = self.knn.kneighbors( S_minor, return_distance=False ) diff --git a/multi_imbalance/resampling/soup.py b/multi_imbalance/resampling/soup.py index 12d3f99..04a493e 100644 --- a/multi_imbalance/resampling/soup.py +++ b/multi_imbalance/resampling/soup.py @@ -1,6 +1,7 @@ from collections import Counter, defaultdict from copy import deepcopy from operator import itemgetter +from typing import Tuple, Union import numpy as np import sklearn @@ -18,7 +19,9 @@ class SOUP(BaseSampler): which are in the safest area in space """ - def __init__(self, k: int = 7, shuffle=False, maj_int_min=None) -> None: + def 
__init__( + self, k: int = 7, shuffle: bool = False, maj_int_min: Union[dict, None] = None + ): """ :param k: number of neighbors @@ -36,7 +39,9 @@ def __init__(self, k: int = 7, shuffle=False, maj_int_min=None) -> None: self.dsc_maj_cls, self.asc_min_cls = None, None self._X, self._y = None, None - def _fit_resample(self, X, y): + def _fit_resample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: """ The method computes the metrics required for resampling based on the given set @@ -83,7 +88,9 @@ def _fit_resample(self, X, y): return np.array(self._X), np.array(self._y) - def _construct_class_safe_levels(self, X, y, class_name) -> defaultdict: + def _construct_class_safe_levels( + self, X: np.ndarray, y: np.ndarray, class_name: str + ) -> defaultdict: self.quantities = Counter(y) indices_in_class = [i for i, value in enumerate(y) if value == class_name] @@ -102,7 +109,9 @@ def _construct_class_safe_levels(self, X, y, class_name) -> defaultdict: return class_safe_levels - def _calculate_sample_safe_level(self, class_name, neighbours_quantities: Counter): + def _calculate_sample_safe_level( + self, class_name: str, neighbours_quantities: Counter + ) -> float: safe_level = 0 q: Counter = self.quantities @@ -119,7 +128,9 @@ def _calculate_sample_safe_level(self, class_name, neighbours_quantities: Counte return safe_level - def _undersample(self, X, y, class_name): + def _undersample( + self, X: np.ndarray, y: np.ndarray, class_name: str + ) -> Tuple[np.ndarray, np.ndarray]: safe_levels_of_samples_in_class = self._construct_class_safe_levels( X, y, class_name ) @@ -138,7 +149,9 @@ def _undersample(self, X, y, class_name): return X, y - def _oversample(self, X, y, class_name): + def _oversample( + self, X: np.ndarray, y: np.ndarray, class_name: str + ) -> Tuple[np.ndarray, np.ndarray]: safe_levels_of_samples_in_class = self._construct_class_safe_levels( X, y, class_name ) @@ -161,7 +174,9 @@ def _oversample(self, X, y, class_name): return X, y 
- def _calculate_goal_quantity(self, maj_int_min=None): + def _calculate_goal_quantity( + self, maj_int_min: Union[dict, None] = None + ) -> Union[int, float]: if maj_int_min is None: maj_q = max(list(self.quantities.values())) min_q = min(list(self.quantities.values())) diff --git a/multi_imbalance/resampling/spider.py b/multi_imbalance/resampling/spider.py index 91f2075..4b90041 100644 --- a/multi_imbalance/resampling/spider.py +++ b/multi_imbalance/resampling/spider.py @@ -1,4 +1,5 @@ from collections import Counter +from typing import Tuple, Union import numpy as np from imblearn.base import BaseSampler @@ -18,7 +19,12 @@ class SPIDER3(BaseSampler): on Computer Recognition Systems CORES 2017 """ - def __init__(self, k, maj_int_min=None, cost=None): + def __init__( + self, + k: int, + maj_int_min: Union[dict, None] = None, + cost: Union[np.ndarray, None] = None, + ): """ :param k: Number of nearest neighbors considered while resampling. @@ -37,7 +43,9 @@ def __init__(self, k, maj_int_min=None, cost=None): self.cost = cost self.AS, self.RS = np.array([]), np.array([]) - def _fit_resample(self, X, y): + def _fit_resample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: """ Performs resampling @@ -66,7 +74,7 @@ def _fit_resample(self, X, y): return self.DS[:, :-1], self.DS[:, -1] - def _initialize_algorithm(self, X, y): + def _initialize_algorithm(self, X: np.ndarray, y: np.ndarray): if self.maj_int_min is None: self.maj_int_min = construct_maj_int_min(y) self.majority_classes = self.maj_int_min["maj"] @@ -78,7 +86,7 @@ def _initialize_algorithm(self, X, y): self.cost = self._estimate_cost_matrix(y) @staticmethod - def _estimate_cost_matrix(y): + def _estimate_cost_matrix(y: Union[np.ndarray, list]) -> np.ndarray: """ Method that estimates cost matrix automatically. 
For example given imbalance ratios of 1:2:6, the estimated matrix will be: @@ -102,7 +110,7 @@ def _estimate_cost_matrix(y): np.fill_diagonal(cost, 0) return cost - def _sort_by_cardinality(self, y): + def _sort_by_cardinality(self, y: Union[list, np.ndarray]) -> Tuple[list, list]: class_cardinality = Counter(y) # to ensure looping over classes with decreasing cardinality. int_classes = sorted( @@ -113,14 +121,14 @@ def _sort_by_cardinality(self, y): ) return int_classes, min_classes - def amplify(self, int_min_class): + def amplify(self, int_min_class: str): self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] for x in int_min_ds: self._amplify_nn(x) self._restore_perspective() - def clean(self, int_min_class): + def clean(self, int_min_class: str): self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] int_min_as = self._calc_int_min_as(int_min_class) @@ -128,7 +136,7 @@ def clean(self, int_min_class): self._clean_nn(x) self._restore_perspective() - def relabel(self, int_min_class): + def relabel(self, int_min_class: str): self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] for x in int_min_ds: @@ -159,15 +167,15 @@ def _restore_perspective(self): if dataset.shape[0] > 0: self._denormalize(dataset) - def _normalize(self, dataset): + def _normalize(self, dataset: np.ndarray): for col in range(dataset.shape[1] - 1): dataset[:, col] = (dataset[:, col] - self.means[col]) / (4 * self.stds[col]) - def _denormalize(self, dataset): + def _denormalize(self, dataset: np.ndarray): for col in range(dataset.shape[1] - 1): dataset[:, col] = dataset[:, col] * self.stds[col] * 4 + self.means[col] - def _calc_int_min_as(self, int_min_class): + def _calc_int_min_as(self, int_min_class: str) -> np.ndarray: """ Helper method to calculate examples form AS that belong to int_min_class parameter class. 
:param int_min_class: @@ -193,7 +201,7 @@ def _calculate_weak_majority_examples(self): if majority_class not in self._min_cost_classes(x, self.DS): self.RS = union(self.RS, np.array([x])) - def _min_cost_classes(self, x, DS): + def _min_cost_classes(self, x: np.ndarray, DS: np.ndarray) -> np.ndarray: """ Utility function that aims to identify minimum-cost classes, i.e. classes leading to the minimum cost after being (mis)classified as classes appearing in the neighborhood of x. @@ -222,7 +230,7 @@ def _min_cost_classes(self, x, DS): vals = np.round(vals, 6) return C[vals == vals[np.argmin(vals)]] - def _relabel_nn(self, x): + def _relabel_nn(self, x: np.ndarray): """ Performs relabeling in the nearest neighborhood of x. @@ -241,7 +249,7 @@ def _relabel_nn(self, x): neighbor[-1] = x[-1] self.AS = union(self.AS, np.array([neighbor])) - def _clean_nn(self, x): + def _clean_nn(self, x: np.ndarray): """ Performs cleaning in the nearest neighborhood of x. @@ -256,7 +264,7 @@ def _clean_nn(self, x): self.DS = setdiff(self.DS, np.array([neighbor])) self.RS = setdiff(self.RS, np.array([neighbor])) - def _knn(self, x, DS): + def _knn(self, x: np.ndarray, DS: np.ndarray) -> np.ndarray: """ Returns k nearest neighbors of x in DS that belong to c class if specified. @@ -296,7 +304,7 @@ def _knn(self, x, DS): return DS[indices] - def _amplify_nn(self, x): + def _amplify_nn(self, x: np.ndarray): """ Artificially amplifies example x by adding a copy of it to the AS. 
@@ -311,8 +319,8 @@ def _amplify_nn(self, x): self.AS = union(self.AS, np.asarray([y])) @staticmethod - def _class_of(example): + def _class_of(example: np.ndarray): return example[-1] - def _ds_as_rs_union(self): + def _ds_as_rs_union(self) -> np.ndarray: return union(self.DS, union(self.AS, self.RS)) diff --git a/multi_imbalance/resampling/static_smote.py b/multi_imbalance/resampling/static_smote.py index 6f05f78..c91ae62 100644 --- a/multi_imbalance/resampling/static_smote.py +++ b/multi_imbalance/resampling/static_smote.py @@ -1,4 +1,5 @@ from collections import Counter +from typing import Tuple import numpy as np from imblearn.over_sampling import SMOTE @@ -19,7 +20,9 @@ def __init__(self): super().__init__() self._sampling_type = "over-sampling" - def _fit_resample(self, X, y): + def _fit_resample( + self, X: np.ndarray, y: np.ndarray + ) -> Tuple[np.ndarray, np.ndarray]: """ Performs resampling diff --git a/multi_imbalance/utils/array_util.py b/multi_imbalance/utils/array_util.py index 078a33e..fdda76f 100644 --- a/multi_imbalance/utils/array_util.py +++ b/multi_imbalance/utils/array_util.py @@ -1,7 +1,7 @@ import numpy as np -def setdiff(arr1, arr2): +def setdiff(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: """ Performs the difference over two numpy arrays. @@ -19,7 +19,7 @@ def setdiff(arr1, arr2): return arr1 -def union(arr1, arr2): +def union(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: """ Performs the union over two numpy arrays (not removing duplicates, as it's how the algorithm SPIDER3 actually works). @@ -40,7 +40,7 @@ def union(arr1, arr2): return np.append(arr1, arr2, axis=0) -def contains(dataset, example): +def contains(dataset: np.ndarray, example: np.ndarray) -> bool: """ Returns if dataset contains the example. :param dataset: @@ -53,7 +53,7 @@ def contains(dataset, example): return False -def index_of(arr, example): +def index_of(arr: np.ndarray, example: np.ndarray) -> int: """ :return: Index of learning exmaple in arr. 
""" @@ -63,7 +63,7 @@ def index_of(arr, example): return -1 -def intersect(arr1, arr2): +def intersect(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: """ Performs the intersection operation over two numpy arrays (not removing duplicates). diff --git a/multi_imbalance/utils/data.py b/multi_imbalance/utils/data.py index 9bd3b3d..892e317 100644 --- a/multi_imbalance/utils/data.py +++ b/multi_imbalance/utils/data.py @@ -2,6 +2,7 @@ from collections import OrderedDict, Counter from pathlib import Path from statistics import median +from typing import Tuple, Union import numpy as np import pandas as pd @@ -11,7 +12,7 @@ import os -def construct_flat_2pc_df(X, y) -> pd.DataFrame: +def construct_flat_2pc_df(X: np.ndarray, y: np.ndarray) -> pd.DataFrame: """ This function takes two dimensional X and one dimensional y arrays, concatenates and returns them as data frame @@ -37,7 +38,7 @@ def get_project_root() -> Path: # pragma no cover def load_arff_dataset( path: str, one_hot_encode: bool = True, return_non_cat_length: bool = False -): +) -> Union[Tuple[np.ndarray, np.ndarray, int], Tuple[np.ndarray, np.ndarray]]: """ Load and return the dataset saved in arff type file @@ -85,7 +86,9 @@ def load_arff_dataset( return X.to_numpy(), y -def load_datasets_arff(return_non_cat_length=False, dataset_paths=None): +def load_datasets_arff( + return_non_cat_length: bool = False, dataset_paths: Union[str, None] = None +) -> OrderedDict: if dataset_paths is None: dataset_paths = glob.glob(os.path.join(get_project_root(), "data", "arff", "*")) @@ -107,7 +110,7 @@ def load_datasets_arff(return_non_cat_length=False, dataset_paths=None): return datasets -def construct_maj_int_min(y: np.ndarray, strategy="median") -> OrderedDict: +def construct_maj_int_min(y: np.ndarray, strategy: str = "median") -> OrderedDict: """ This function creates dictionary with information which classes are minority or majority diff --git a/multi_imbalance/utils/metrics.py b/multi_imbalance/utils/metrics.py index 
bb4170b..ee260da 100644 --- a/multi_imbalance/utils/metrics.py +++ b/multi_imbalance/utils/metrics.py @@ -1,7 +1,10 @@ from imblearn.metrics import geometric_mean_score +import numpy as np -def gmean_score(y_test, y_pred, correction: float = 0.001) -> float: # pragma no cover +def gmean_score( + y_test: np.ndarray, y_pred: np.ndarray, correction: float = 0.001 +) -> float: # pragma no cover """ Calculate geometric mean score diff --git a/multi_imbalance/utils/plot.py b/multi_imbalance/utils/plot.py index 3c66d22..4501634 100644 --- a/multi_imbalance/utils/plot.py +++ b/multi_imbalance/utils/plot.py @@ -3,13 +3,16 @@ import matplotlib.pyplot as plt import seaborn as sns from sklearn.decomposition import PCA +import numpy as np from multi_imbalance.utils.data import construct_flat_2pc_df sns.set_style("darkgrid") -def plot_cardinality_and_2d_data(X, y, dataset_name="") -> None: # pragma no cover +def plot_cardinality_and_2d_data( + X: np.ndarray, y: np.ndarray, dataset_name: str = "" +): # pragma no cover """ Plots cardinality of classes from y as well as scatter plot of X transformed to two dimensions using PCA @@ -52,8 +55,13 @@ def plot_cardinality_and_2d_data(X, y, dataset_name="") -> None: # pragma no co def plot_visual_comparision_datasets( - X1, y1, X2, y2, dataset_name1="", dataset_name2="" -) -> None: # pragma no cover + X1: np.ndarray, + y1: np.ndarray, + X2: np.ndarray, + y2: np.ndarray, + dataset_name1: str = "", + dataset_name2: str = "", +): # pragma no cover """ Plots comparision of X1 y1 and X2 y2 using plot_cardinality_and_2d_data, which plots cardinality of classes from y as well as scatter plot of X transformed to two dimensions using PCA From 59d988d15e64aeab4c964fb57d7037216eb37538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Fri, 4 Nov 2022 18:25:38 +0100 Subject: [PATCH 05/48] add tests --- multi_imbalance/utils/array_util.py | 2 +- tests/datasets/test_data_loader.py | 6 +- tests/ensemble/test_ecoc.py | 29 ++++++- 
tests/ensemble/test_ovo.py | 20 +++++ tests/resampling/test_spider.py | 37 --------- tests/utils/test_array_util.py | 113 ++++++++++++++++++++++++++++ tests/utils/test_data.py | 11 +++ 7 files changed, 176 insertions(+), 42 deletions(-) create mode 100644 tests/utils/test_array_util.py diff --git a/multi_imbalance/utils/array_util.py b/multi_imbalance/utils/array_util.py index fdda76f..a7b26d1 100644 --- a/multi_imbalance/utils/array_util.py +++ b/multi_imbalance/utils/array_util.py @@ -55,7 +55,7 @@ def contains(dataset: np.ndarray, example: np.ndarray) -> bool: def index_of(arr: np.ndarray, example: np.ndarray) -> int: """ - :return: Index of learning exmaple in arr. + :return: Index of learning example in arr. """ for i, x in enumerate(arr): if all(x == example): diff --git a/tests/datasets/test_data_loader.py b/tests/datasets/test_data_loader.py index 2c9bde6..ff2bec4 100644 --- a/tests/datasets/test_data_loader.py +++ b/tests/datasets/test_data_loader.py @@ -1,6 +1,7 @@ """Test the datasets loader. 
""" +import shutil from multi_imbalance.datasets import load_datasets from os.path import join @@ -27,7 +28,10 @@ def test_load_datasets(): print("Testing loading datasets") - datasets = load_datasets(data_home=join(".", "data")) + data_home = join(".", "data") + datasets = load_datasets(data_home=data_home) for k in DATASET_SHAPE.keys(): X = datasets[k].data assert DATASET_SHAPE[k] == X.shape + + shutil.rmtree(join(data_home, "extracted")) diff --git a/tests/ensemble/test_ecoc.py b/tests/ensemble/test_ecoc.py index 6acdadb..edadf7d 100644 --- a/tests/ensemble/test_ecoc.py +++ b/tests/ensemble/test_ecoc.py @@ -96,10 +96,13 @@ def test_no_oversampling(): @pytest.mark.parametrize( "encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"] ) -@pytest.mark.parametrize("oversampling", [None, "globalCS", "SMOTE", "SOUP"]) -def test_encoding(encoding_strategy, oversampling): +@pytest.mark.parametrize( + "oversampling, minority_classes", + [(None, None), ("globalCS", None), ("SMOTE", None), ("SOUP", [0, 2])], +) +def test_encoding(encoding_strategy, oversampling, minority_classes): ecoc_clf = ecoc.ECOC(encoding=encoding_strategy, preprocessing=oversampling) - ecoc_clf.fit(X, y) + ecoc_clf.fit(X, y, minority_classes=minority_classes) matrix = ecoc_clf._code_matrix number_of_classes = len(np.unique(y)) @@ -167,6 +170,26 @@ def test_unknown_classifier(): assert "DUMMY_CLASSIFIER" in str(e.value) +def test_unknown_encoding(): + ecoc_clf = ecoc.ECOC(encoding="dummy") + with pytest.raises(ValueError) as e: + ecoc_clf.fit(X, y) + assert ( + e.value.args[0] + == "Unknown matrix generation encoding: dummy, expected to be one of ['dense', 'sparse', 'complete', 'OVA', 'OVO']." + ) + + +def test_unknown_weighting_strategy(): + ecoc_clf = ecoc.ECOC(weights="dummy") + with pytest.raises(ValueError) as e: + ecoc_clf.fit(X, y) + assert ( + e.value.args[0] + == "Unknown weighting strategy: dummy, expected to be one of [None, 'acc', 'avg_tpr_min']." 
+ ) + + def test_own_classifier_without_predict_and_fit(): class DummyClassifier: def foo(self, X, y): diff --git a/tests/ensemble/test_ovo.py b/tests/ensemble/test_ovo.py index 9d337ce..6b127ba 100644 --- a/tests/ensemble/test_ovo.py +++ b/tests/ensemble/test_ovo.py @@ -122,6 +122,26 @@ def test_predefined_classifiers_and_preprocessings_without_errors( assert len(predicted) == 3 +@pytest.mark.parametrize( + "clf, expected_exception", + [ + ( + "dummy", + "Unknown binary classifier: dummy, expected to be one of ['tree', 'NB', 'KNN'].", + ), + ( + lambda x: x, + "Your classifier must implement fit and predict methods", + ), + ], +) +def test_unknown_classifier(clf, expected_exception): + ovo_clf = ovo.OVO(binary_classifier=clf) + with pytest.raises(ValueError) as e: + ovo_clf.fit(X, y) + assert e.value.args[0] == expected_exception + + def test_unknown_preprocessing(): ovo_clf = ovo.OVO(preprocessing="DUMMY_OVERSAMPLING") with pytest.raises(ValueError) as e: diff --git a/tests/resampling/test_spider.py b/tests/resampling/test_spider.py index 5791f58..276f31c 100644 --- a/tests/resampling/test_spider.py +++ b/tests/resampling/test_spider.py @@ -3,7 +3,6 @@ import numpy as np from multi_imbalance.resampling.spider import SPIDER3 -from multi_imbalance.utils.array_util import union, intersect, setdiff cost = np.ones((3, 3)) np.fill_diagonal(cost, 0) @@ -12,42 +11,6 @@ ) -def test_union(): - arr1 = np.array([[1, 2, 3]]) - arr2 = np.array([[4, 5, 6]]) - actual = union(arr1, arr2) - expected = np.array([[1, 2, 3], [4, 5, 6]]) - assert (actual == expected).all() - - arr1 = np.array([[1, 2, 3], [4, 5, 6]]) - arr2 = np.array([[1, 2, 3]]) - - actual = union(arr1, arr2) - expected = np.array([[1, 2, 3], [4, 5, 6], [1, 2, 3]]) - - assert (actual == expected).all() - - -def test_intersect(): - arr1 = np.array([[1, 2, 3], [4, 5, 6]]) - arr2 = np.array([[1, 2, 3]]) - - actual = intersect(arr1, arr2) - expected = np.array([[1, 2, 3]]) - - assert (actual == expected).all() - - -def 
test_setdiff(): - arr1 = np.array([[1, 2, 3], [4, 5, 6]]) - arr2 = np.array([[1, 2, 3]]) - - actual = setdiff(arr1, arr2) - expected = np.array([[4, 5, 6]]) - - assert (actual == expected).all() - - def test_knn(): X = np.array([[1, 1], [1, -1], [-1, 1], [-1, -1], [0, 0]]).astype(object) diff --git a/tests/utils/test_array_util.py b/tests/utils/test_array_util.py new file mode 100644 index 0000000..b34ba36 --- /dev/null +++ b/tests/utils/test_array_util.py @@ -0,0 +1,113 @@ +import pytest +import numpy as np + + +from multi_imbalance.utils.array_util import setdiff, union, intersect, index_of + + +@pytest.mark.parametrize( + "arr1, arr2, expected", + [ + ( + [[1, 2, 3]], + [[4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + ), + ( + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3]], + [[1, 2, 3], [4, 5, 6], [1, 2, 3]], + ), + ( + [[1, 2, 3]], + [], + [[1, 2, 3]], + ), + ], +) +def test_union(arr1, arr2, expected): + arr1 = np.array(arr1) + arr2 = np.array(arr2) + actual = union(arr1, arr2) + expected = np.array(expected) + assert (actual == expected).all() + + +@pytest.mark.parametrize( + "arr1, arr2, expected", + [ + ( + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3]], + [[1, 2, 3]], + ), + ( + [[1, 2, 3], [4, 5, 6]], + [], + [], + ), + ], +) +def test_intersect(arr1, arr2, expected): + arr1 = np.array(arr1) + arr2 = np.array(arr2) + + actual = intersect(arr1, arr2) + expected = np.array(expected) + + assert (actual == expected).all() + + +@pytest.mark.parametrize( + "arr1, arr2, expected", + [ + ( + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3]], + [[4, 5, 6]], + ), + ( + [[1, 2, 3], [4, 5, 6]], + [], + [[1, 2, 3], [4, 5, 6]], + ), + ], +) +def test_setdiff(arr1, arr2, expected): + arr1 = np.array(arr1) + arr2 = np.array(arr2) + + actual = setdiff(arr1, arr2) + expected = np.array(expected) + + assert (actual == expected).all() + + +@pytest.mark.parametrize( + "arr1, arr2, expected", + [ + ( + [[1, 2, 3], [4, 5, 6]], + [1, 2, 3], + 0, + ), + ( + [[1, 2, 3], [4, 5, 6]], + [4, 5, 6], + 1, + ), + ( + [[1, 
2, 3], [4, 5, 6]], + [7, 8, 9], + -1, + ), + ], +) +def test_index_of(arr1, arr2, expected): + arr1 = np.array(arr1) + arr2 = np.array(arr2) + + actual = index_of(arr1, arr2) + expected = np.array(expected) + + assert (actual == expected).all() diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py index 459de9b..6502972 100644 --- a/tests/utils/test_data.py +++ b/tests/utils/test_data.py @@ -29,6 +29,17 @@ def test_preprocess(): assert x.shape == (7, 2) +def test_preprocess_without_one_hot_encode(): + dir_path = os.path.dirname(os.path.realpath(__file__)) + ds_path = os.path.join(dir_path, "ds_example.arrf") + x, y, non_cat = load_arff_dataset( + ds_path, return_non_cat_length=True, one_hot_encode=False + ) + assert all(y == np.array([0, 0, 0, 0, 0, 0, 0])) + assert non_cat == 2 + assert x.shape == (7, 2) + + def test_load_arff_datasets(): dir_path = os.path.dirname(os.path.realpath(__file__)) ds_paths = [os.path.join(dir_path, "ds_example.arrf")] From 71a6e3507f6096b3de86f3b46dcb51ca82df1c00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 5 Nov 2022 17:04:29 +0100 Subject: [PATCH 06/48] tox toml and fix bug with cross val --- .github/workflows/tests.yml | 25 +++ README.md | 36 ++-- examples/ensemble/ecoc_pipeline.ipynb | 61 ++++++ examples/ensemble/mrbbagging.ipynb | 33 +++- examples/ensemble/mrbbagging_pipeline.ipynb | 39 ++-- examples/ensemble/ovo_pipeline.ipynb | 61 ++++++ examples/resampling/GlobalCS.ipynb | 105 +++++----- examples/resampling/StaticSMOTE.ipynb | 209 ++++++++++++++++++++ multi_imbalance/ensemble/ecoc.py | 3 +- multi_imbalance/ensemble/mrbbagging.py | 6 +- multi_imbalance/ensemble/ovo.py | 10 +- multi_imbalance/ensemble/soup_bagging.py | 11 +- multi_imbalance/resampling/static_smote.py | 7 +- pyproject.toml | 70 +++++++ requirements.txt | 104 ++-------- setup.cfg | 2 + setup.py | 39 ---- tests/resampling/test_mdo.py | 4 +- tox.ini | 22 +++ 19 files changed, 600 insertions(+), 247 deletions(-) create mode 
100644 .github/workflows/tests.yml create mode 100644 examples/ensemble/ecoc_pipeline.ipynb create mode 100644 examples/ensemble/ovo_pipeline.ipynb create mode 100644 examples/resampling/StaticSMOTE.ipynb create mode 100644 pyproject.toml create mode 100644 setup.cfg delete mode 100644 setup.py create mode 100644 tox.ini diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..eb0efaf --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,25 @@ +name: Tests and code analysis +on: + - push + - pull_request + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + python-version: [['3.9', 'py39']] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version[0] }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version[0] }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test ${{ matrix.python-version[1] }} with tox + run: tox -e ${{ matrix.python-version[1] }} \ No newline at end of file diff --git a/README.md b/README.md index 0c4fc55..861ad0b 100644 --- a/README.md +++ b/README.md @@ -9,16 +9,16 @@ Multi-class imbalance is a common problem occurring in real-world supervised classifications tasks. While there has already been some research on the specialized methods aiming to tackle that challenging problem, most of them still lack coherent Python implementation that is simple, intuitive and easy to use. multi-imbalance is a python package tackling the problem of multi-class imbalanced datasets in machine learning. ## Requirements -Tha package has been tested under python 3.6, 3.7 and 3.8. It relies heavily on scikit-learn and typical scientific stack (numpy, scipy, pandas etc.). +Tha package has been tested under python 3.9. It relies heavily on scikit-learn and typical scientific stack (numpy, scipy, pandas etc.). 
Requirements include: -* numpy>=1.17.0, -* scikit-learn>=0.22.0, -* pandas>=0.25.1, -* pytest>=5.1.2, -* imbalanced-learn>=0.6.1 -* IPython>=7.13.0, -* seaborn>=0.10.1, -* matplotlib>=3.2.1 +* numpy>=1.23.4, +* scikit-learn>=1.1.3, +* pandas>=1.5.1, +* pytest>=7.2.0, +* imbalanced-learn>=0.9.1 +* IPython>=8.6.0, +* seaborn>=0.12.1, +* matplotlib>=3.6.2 ## Installation @@ -91,11 +91,23 @@ We use pytest as our unit tests framework. To use it, simply run: pytest ``` -If you would like to check the code coverage: +If you would like to check the code coverage just install and use tox: ```bash -coverage run -m pytest -coverage report -m # or coverage html +pip install tox +tox ``` +multi-imbalance uses flake8 as code linter. To use it, simply run: +```bash +pip install flake8 +flake8 multi_imbalance +``` +If you would use project for local tests (e.g. run `examples/` with your code changes) type: +``` +pip install -U -e ".[all]" +``` +After that you can use multi-imbalance as normal Python package. Your next changes will be automatically loaded. + +Our project uses GitHub actions on push and pull request. The action runs tox with configuration define in tox.ini. multi-imbalance uses reStructuredText markdown for docstrings. 
To build the documentation locally run: ```bash diff --git a/examples/ensemble/ecoc_pipeline.ipynb b/examples/ensemble/ecoc_pipeline.ipynb new file mode 100644 index 0000000..f51d7cc --- /dev/null +++ b/examples/ensemble/ecoc_pipeline.ipynb @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.88118812 0.81188119 0.77227723 0.8019802 0.86138614]\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "from sklearn.model_selection import cross_val_score, ShuffleSplit\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "from multi_imbalance.ensemble.ecoc import ECOC\n", + "from multi_imbalance.utils.data import load_arff_dataset\n", + "\n", + "# an example of how ecoc can be used in sklearn pipeline\n", + "X, y = load_arff_dataset(f\"{os.getcwd()}/../../data/arff/new_ecoli.arff\")\n", + "clf = make_pipeline(StandardScaler(), ECOC())\n", + "cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)\n", + "print(cross_val_score(clf, X, y, cv=cv))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 ('.test')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/ensemble/mrbbagging.ipynb b/examples/ensemble/mrbbagging.ipynb index df9d53f..51b8812 100644 --- a/examples/ensemble/mrbbagging.ipynb +++ b/examples/ensemble/mrbbagging.ipynb @@ -2,14 +2,16 @@ "cells": [ { "cell_type": "code", - 
"execution_count": 9, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { - "text/plain": "0.9772648059188251" + "text/plain": [ + "1.0" + ] }, - "execution_count": 9, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -18,7 +20,7 @@ "from imblearn.metrics import geometric_mean_score\n", "from sklearn.datasets import load_wine\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn.tree import tree\n", + "from sklearn import tree\n", "\n", "from multi_imbalance.ensemble.mrbbagging import MRBBagging\n", "\n", @@ -55,11 +57,28 @@ ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3.9.2 ('.project_venv')", "language": "python", - "display_name": "Python 3" + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "vscode": { + "interpreter": { + "hash": "23e9df2d9424db89a1bc7cf8b9f3a46204923f77702e66b6afb0e7a76a59f4cc" + } } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/examples/ensemble/mrbbagging_pipeline.ipynb b/examples/ensemble/mrbbagging_pipeline.ipynb index 70cf341..de0dc2f 100644 --- a/examples/ensemble/mrbbagging_pipeline.ipynb +++ b/examples/ensemble/mrbbagging_pipeline.ipynb @@ -2,32 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/plutasnyy/anaconda3/envs/multi-imbalance/lib/python3.7/site-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. 
Previously it would return None.\n", - " FutureWarning)\n", - "/home/plutasnyy/anaconda3/envs/multi-imbalance/lib/python3.7/site-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.\n", - " FutureWarning)\n", - "/home/plutasnyy/anaconda3/envs/multi-imbalance/lib/python3.7/site-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.\n", - " FutureWarning)\n", - "/home/plutasnyy/anaconda3/envs/multi-imbalance/lib/python3.7/site-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. Previously it would return None.\n", - " FutureWarning)\n", - "/home/plutasnyy/anaconda3/envs/multi-imbalance/lib/python3.7/site-packages/sklearn/base.py:197: FutureWarning: From version 0.24, get_params will raise an AttributeError if a parameter cannot be retrieved as an instance attribute. 
Previously it would return None.\n", - " FutureWarning)\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "[0.83168317 0.86138614 0.82178218 0.85148515 0.84158416]\n" + "[0.81188119 0.84158416 0.8019802 0.87128713 0.83168317]\n" ] } ], @@ -37,7 +21,7 @@ "from sklearn.model_selection import cross_val_score, ShuffleSplit\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.tree import tree\n", + "from sklearn import tree\n", "\n", "from multi_imbalance.ensemble.mrbbagging import MRBBagging\n", "from multi_imbalance.utils.data import load_arff_dataset\n", @@ -52,23 +36,28 @@ ], "metadata": { "kernelspec": { - "name": "python3", + "display_name": "Python 3.9.2 ('.test')", "language": "python", - "display_name": "Python 3" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "vscode": { + "interpreter": { + "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" + } } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/examples/ensemble/ovo_pipeline.ipynb b/examples/ensemble/ovo_pipeline.ipynb new file mode 100644 index 0000000..0f109b0 --- /dev/null +++ b/examples/ensemble/ovo_pipeline.ipynb @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.77227723 0.79207921 0.82178218 0.8019802 0.78217822]\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "from sklearn.model_selection import cross_val_score, ShuffleSplit\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + 
"\n", + "from multi_imbalance.ensemble.ovo import OVO\n", + "from multi_imbalance.utils.data import load_arff_dataset\n", + "\n", + "# an example of how ovo can be used in sklearn pipeline\n", + "X, y = load_arff_dataset(f\"{os.getcwd()}/../../data/arff/new_ecoli.arff\")\n", + "clf = make_pipeline(StandardScaler(), OVO())\n", + "cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)\n", + "print(cross_val_score(clf, X, y, cv=cv))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 ('.test')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/resampling/GlobalCS.ipynb b/examples/resampling/GlobalCS.ipynb index db44f63..737a536 100644 --- a/examples/resampling/GlobalCS.ipynb +++ b/examples/resampling/GlobalCS.ipynb @@ -14,7 +14,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "name": "stdout", @@ -27,14 +33,6 @@ " [0.23 0.32 0.48 0.5 0.55 0.25 0.35]]\n", "[0 0 0 0 0]\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/plutasnyy/anaconda3/envs/multi-imbalance/lib/python3.7/site-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.datasets.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.datasets. 
Anything that cannot be imported from sklearn.datasets is now part of the private API.\n", - " warnings.warn(message, FutureWarning)\n" - ] } ], "source": [ @@ -55,69 +53,73 @@ "X, y = df[\"data\"], df[\"target\"]\n", "print(X[:5])\n", "print(y[:5])" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "markdown", - "source": [ - "Resample data using Global CS algorithm" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "Resample data using Global CS algorithm" + ] }, { "cell_type": "code", - "execution_count": 2, - "outputs": [], - "source": [ - "clf = GlobalCS()\n", - "resampled_X, resampled_y = clf.fit_resample(X, y)" - ], + "execution_count": 5, "metadata": { "collapsed": false, "pycharm": { "name": "#%%\n" } - } + }, + "outputs": [], + "source": [ + "clf = GlobalCS()\n", + "resampled_X, resampled_y = clf.fit_resample(X, y)" + ] }, { "cell_type": "markdown", - "source": [ - "Compare results by plotting data in 2 dimensions" - ], "metadata": { "collapsed": false, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "Compare results by plotting data in 2 dimensions" + ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [ { "data": { - "text/plain": "" + "text/plain": [ + "" + ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { - "text/plain": "
", - "image/png": "\n" + "image/png": "", + "text/plain": [ + "
" + ] }, "metadata": {}, "output_type": "display_data" @@ -151,7 +153,7 @@ ")\n", "\n", "\n", - "axs[3].set_title(\"MDO\")\n", + "axs[3].set_title(\"GlobalCS\")\n", "sns.countplot(resampled_y, ax=axs[2], palette=p)\n", "resampled_X = pca.transform(resampled_X)\n", "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", @@ -166,34 +168,33 @@ " legend=\"full\",\n", " palette=p,\n", ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 ('.test')", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.9.2" }, - "kernelspec": { - "name": "python3", - "language": "python", - "display_name": "Python 3" + "vscode": { + "interpreter": { + "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" + } } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/examples/resampling/StaticSMOTE.ipynb b/examples/resampling/StaticSMOTE.ipynb new file mode 100644 index 0000000..3aa6a11 --- /dev/null +++ b/examples/resampling/StaticSMOTE.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Unzip datasets and prepare data:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.49 0.29 0.48 0.5 0.56 0.24 0.35]\n", + " [0.07 0.4 0.48 0.5 0.54 0.35 0.44]\n", + " [0.56 0.4 0.48 0.5 0.49 0.37 0.46]\n", + " [0.59 0.49 0.48 0.5 0.52 0.45 0.36]\n", + " [0.23 0.32 0.48 0.5 
0.55 0.25 0.35]]\n", + "[0 0 0 0 0]\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from multi_imbalance.datasets import load_datasets\n", + "from multi_imbalance.resampling.static_smote import StaticSMOTE\n", + "from multi_imbalance.utils.data import construct_flat_2pc_df\n", + "\n", + "%matplotlib inline\n", + "sns.set_style(\"darkgrid\")\n", + "\n", + "df = load_datasets()[\"new_ecoli\"]\n", + "X, y = df[\"data\"], df[\"target\"]\n", + "print(X[:5])\n", + "print(y[:5])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Resample data using StaticSMOTE algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (148) in class 2 will be larger than the number of samples in the majority class (class #0 -> 145)\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "clf = StaticSMOTE()\n", + "resampled_X, resampled_y = clf.fit_resample(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Compare results by plotting data in 2 dimensions" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "n = len(Counter(y).keys())\n", + "p = sns.color_palette(\"husl\", n)\n", + "\n", + "pca = PCA(n_components=2)\n", + "pca.fit(X)\n", + "\n", + "fig, axs = plt.subplots(ncols=2, nrows=2)\n", + "fig.set_size_inches(16, 10)\n", + "axs = axs.flatten()\n", + "\n", + "axs[1].set_title(\"Base\")\n", + "sns.countplot(y, ax=axs[0], palette=p)\n", + "X = pca.transform(X)\n", + "df = construct_flat_2pc_df(X, y)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[1],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")\n", + "\n", + "\n", + "axs[3].set_title(\"StaticSMOTE\")\n", + "sns.countplot(resampled_y, ax=axs[2], palette=p)\n", + "resampled_X = pca.transform(resampled_X)\n", + "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[3],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 ('.test')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "vscode": { + "interpreter": { + "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/multi_imbalance/ensemble/ecoc.py b/multi_imbalance/ensemble/ecoc.py index 067f9a8..cb01c3d 100644 --- a/multi_imbalance/ensemble/ecoc.py +++ b/multi_imbalance/ensemble/ecoc.py @@ -326,7 +326,8 @@ def _encode_complete(self, number_of_classes: int) -> np.ndarray: digit = -1 partial_code_len = 2 ** 
(number_of_classes - row_idx - 1) for idx in range(0, code_length, partial_code_len): - matrix[row_idx][idx : idx + partial_code_len] = digit + right_idx = idx + partial_code_len + matrix[row_idx][idx:right_idx] = digit digit *= -1 return matrix diff --git a/multi_imbalance/ensemble/mrbbagging.py b/multi_imbalance/ensemble/mrbbagging.py index ba12360..84161ba 100644 --- a/multi_imbalance/ensemble/mrbbagging.py +++ b/multi_imbalance/ensemble/mrbbagging.py @@ -59,7 +59,7 @@ def __init__( self.learning_algorithm = learning_algorithm self.undersampling = undersampling self.feature_selection = feature_selection - self.all_random = random_fs + self.random_fs = random_fs self.half_features = half_features self.random_state = random_state @@ -191,7 +191,7 @@ def _train_with_feature_selection( subset_x = np.array(subset_x).astype(np.float64) subset_y = np.array(subset_y).astype(np.float64) - if self.all_random: + if self.random_fs: subset1, subset1_idx = self._find_random_features( labels_no, features_no, subset_x ) @@ -226,7 +226,7 @@ def _set_classes_dict(self, classes: set): def _select_data(self, classifier_id: int, data: np.ndarray) -> np.ndarray: if self.feature_selection: - if self.all_random: + if self.random_fs: new_data = self._get_features_array( data, self.feature_selection_methods[classifier_id] ) diff --git a/multi_imbalance/ensemble/ovo.py b/multi_imbalance/ensemble/ovo.py index 512fc61..4729fb7 100644 --- a/multi_imbalance/ensemble/ovo.py +++ b/multi_imbalance/ensemble/ovo.py @@ -73,7 +73,7 @@ def __init__( self.binary_classifier = binary_classifier self.n_neighbors = n_neighbors self.preprocessing = preprocessing - self.oversample_between = preprocessing_between + self.preprocessing_between = preprocessing_between self._binary_classifiers = [] self._labels = np.array([]) self._minority_classes = list() @@ -218,14 +218,14 @@ def _smote_oversample( return smote.fit_resample(X, y) def should_perform_oversampling(self, first_class: int, second_class: int): - if 
self.oversample_between not in OVO._allowed_preprocessing_between: + if self.preprocessing_between not in OVO._allowed_preprocessing_between: raise ValueError( "Unknown strategy for oversampling: %s, expected to be one of %s." - % (self.oversample_between, OVO._allowed_preprocessing_between) + % (self.preprocessing_between, OVO._allowed_preprocessing_between) ) - elif self.oversample_between == "all": + elif self.preprocessing_between == "all": return True - elif self.oversample_between == "maj-min": + elif self.preprocessing_between == "maj-min": return ( first_class in self._minority_classes and second_class not in self._minority_classes diff --git a/multi_imbalance/ensemble/soup_bagging.py b/multi_imbalance/ensemble/soup_bagging.py index 0e6a2b5..427ac4e 100644 --- a/multi_imbalance/ensemble/soup_bagging.py +++ b/multi_imbalance/ensemble/soup_bagging.py @@ -12,10 +12,6 @@ from multi_imbalance.utils.array_util import setdiff -def fit_clf(args: list): - return SOUPBagging.fit_classifier(args) - - class SOUPBagging(BaggingClassifier): """ Version of Bagging that applies SOUP in each classifier @@ -42,6 +38,7 @@ def __init__( """ super().__init__() self.classifiers, self.clf_weights = list(), list() + self.classifier = classifier self.maj_int_min = maj_int_min self.num_core = multiprocessing.cpu_count() self.n_classifiers = n_classifiers @@ -84,6 +81,10 @@ def fit_classifier(args: list) -> Tuple[Any, np.ndarray]: ) return clf, global_weights + @staticmethod + def fit_clf(args: list): + return SOUPBagging.fit_classifier(args) + def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): """ :param X: @@ -99,7 +100,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): pool = multiprocessing.Pool(self.num_core) results = pool.map( - fit_clf, + self.fit_clf, [ ( clf, diff --git a/multi_imbalance/resampling/static_smote.py b/multi_imbalance/resampling/static_smote.py index c91ae62..8934bda 100644 --- a/multi_imbalance/resampling/static_smote.py +++ 
b/multi_imbalance/resampling/static_smote.py @@ -42,11 +42,10 @@ def _fit_resample( for _ in range(M): sm = SMOTE(sampling_strategy={min_class: cnt[min_class] * 2}) X_smote, y_smote = sm.fit_resample(X_original, y_original) - X_added_examples = X_smote[y_smote == min_class][cnt[min_class] :, :] + idx = cnt[min_class] + X_added_examples = X_smote[y_smote == min_class][idx:, :] X_resampled = np.vstack([X_resampled, X_added_examples]) - y_resampled = np.hstack( - [y_resampled, y_smote[y_smote == min_class][cnt[min_class] :]] - ) + y_resampled = np.hstack([y_resampled, y_smote[y_smote == min_class][idx:]]) cnt = Counter(y_resampled) min_class = min(cnt, key=cnt.get) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..eebb47a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,70 @@ +[build-system] +requires = [ + "setuptools>=42", + "wheel" +] +build-backend = "setuptools.build_meta" + +[project] +name = "multi-imbalance" +description = "Python package for tackling multiclass imbalance problems." 
+version = "0.1.0" +maintainers = [ + {name = "Damian Horna, Kamil Pluciński, Hanna Klimczak, Jacek Grycza", email = "horna.damian@gmail.com"} +] +readme = "README.md" +classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Programming Language :: Python :: 3.9", + +] +dependencies = [ + "numpy~=1.23.4", + "scikit-learn~=1.1.3", + "pandas~=1.5.1", + "imbalanced-learn~=0.9.1", + "seaborn~=0.12.1", + "matplotlib~=3.6.2", +] + +[project.optional-dependencies] +test = [ + "pytest~=7.2.0", + "pytest-cov~=4.0.0", + "coverage~=6.5.0" +] +lint = [ + "flake8~=5.0.4" +] +dev = [ + "tox~=3.27.0" +] +notebooks = [ + "ipython~=8.6.0", + "ipykernel~=6.17.0", + "tqdm~=4.64.1", + "jupyter~=1.0.0" +] +all = [ + "multi-imbalance[test]", + "multi-imbalance[lint]", + "multi-imbalance[dev]", + "multi-imbalance[notebooks]" +] + +[project.urls] +homepage="https://github.com/damian-horna/multi-imbalance" +documentation="https://github.com/damian-horna/multi-imbalance/blob/master/README.md" + +[tool.pytest.ini_options] +addopts = "--cov=multi_imbalance" +testpaths = [ + "tests", +] + +[tool.setuptools] +py-modules = ["multi_imbalance"] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 1370ff0..71e9137 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,91 +1,13 @@ -alabaster==0.7.12 -attrs==19.3.0 -Babel==2.8.0 -backcall==0.1.0 -bleach==3.1.4 -certifi==2020.4.5.1 -chardet==3.0.4 -commonmark==0.9.1 -coverage==5.1 -cycler==0.10.0 -decorator==4.4.2 -defusedxml==0.6.0 -docutils==0.16 -entrypoints==0.3 -idna==2.9 -imagesize==1.2.0 -imbalanced-learn==0.6.2 -importlib-metadata==1.6.0 -ipykernel==5.1.4 -ipython==7.13.0 -ipython-genutils==0.2.0 -ipywidgets==7.5.1 -jedi==0.16.0 -Jinja2==2.11.1 -joblib==0.14.1 -jsonschema==3.2.0 -jupyter==1.0.0 -jupyter-client==6.1.2 -jupyter-console==6.1.0 
-jupyter-core==4.6.3 -kiwisolver==1.1.0 -MarkupSafe==1.1.1 -matplotlib==3.1.3 -mistune==0.8.4 -more-itertools==8.2.0 -multi-imbalance==0.0.8 -nbconvert==5.6.1 -nbformat==5.0.4 -notebook==6.0.3 -numpy==1.18.2 -packaging==20.3 -pandas==1.0.3 -pandocfilters==1.4.2 -parso==0.6.2 -pexpect==4.8.0 -pickleshare==0.7.5 -pluggy==0.13.1 -prometheus-client==0.7.1 -prompt-toolkit==3.0.4 -ptyprocess==0.6.0 -py==1.8.1 -Pygments==2.6.1 -pyparsing==2.4.7 -pyrsistent==0.16.0 -pytest==5.4.1 -pytest-cov==2.8.1 -python-dateutil==2.8.1 -pytz==2019.3 -pyzmq==18.1.1 -qtconsole==4.7.2 -QtPy==1.9.0 -recommonmark==0.6.0 -requests==2.23.0 -rinoh-typeface-dejavuserif==0.1.1 -rinoh-typeface-texgyrecursor==0.1.1 -rinoh-typeface-texgyreheros==0.1.1 -rinoh-typeface-texgyrepagella==0.1.1 -rinohtype==0.4.0 -scikit-learn==0.22.2.post1 -scipy==1.4.1 -seaborn==0.10.0 -Send2Trash==1.5.0 -six==1.14.0 -snowballstemmer==2.0.0 -Sphinx==3.0.1 -sphinx-rtd-theme==0.4.3 -sphinxcontrib-applehelp==1.0.2 -sphinxcontrib-devhelp==1.0.2 -sphinxcontrib-htmlhelp==1.0.3 -sphinxcontrib-jsmath==1.0.1 -sphinxcontrib-qthelp==1.0.3 -sphinxcontrib-serializinghtml==1.1.4 -terminado==0.8.3 -testpath==0.4.4 -tornado==6.0.4 -traitlets==4.3.3 -urllib3==1.25.9 -wcwidth==0.1.9 -webencodings==0.5.1 -widgetsnbextension==3.5.1 -zipp==3.1.0 +numpy~=1.23.4 +scikit-learn~=1.1.3 +pandas~=1.5.1 +imbalanced-learn~=0.9.1 +seaborn~=0.12.1 +matplotlib~=3.6.2 +IPython~=8.6.0 +ipykernel~=6.17.0 +tqdm~=4.64.1 +jupyter~=1.0.0 +pytest~=7.2.0 +pytest-cov~=4.0.0 +coverage~=6.5.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..905c8bb --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 160 \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index ac83413..0000000 --- a/setup.py +++ /dev/null @@ -1,39 +0,0 @@ -import setuptools # pragma no cover - -with open("README.md", "r", encoding="UTF8") as fh: # pragma no cover - - long_description = 
fh.read() - -setuptools.setup( # pragma no cover - name="multi-imbalance", - version="0.0.14", - author="Damian Horna, Kamil Pluciński, Hanna Klimczak, Jacek Grycza", - author_email="horna.damian@gmail.com", - description="Python package for tackling multiclass imbalance problems.", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/damian-horna/multi-imbalance", - packages=setuptools.find_packages(), - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - ], - install_requires=[ - "numpy>=1.17.0", - "scikit-learn>=0.22", - "pandas>=0.25.1", - "pytest>=5.1.2", - "imbalanced-learn>=0.6.1", - "coverage>=5.1", - "pytest-cov>=2.8.1", - "IPython>=7.13.0", - "seaborn>=0.10.1", - "matplotlib>=3.2.1", - ], -) diff --git a/tests/resampling/test_mdo.py b/tests/resampling/test_mdo.py index 0e88012..551c794 100644 --- a/tests/resampling/test_mdo.py +++ b/tests/resampling/test_mdo.py @@ -1,8 +1,6 @@ -from collections import Counter, defaultdict - import numpy as np import pytest -from numpy.testing import assert_array_equal, assert_allclose, assert_array_almost_equal +from numpy.testing import assert_array_almost_equal from multi_imbalance.resampling.mdo import MDO diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..2a7842d --- /dev/null +++ b/tox.ini @@ -0,0 +1,22 @@ +[tox] +minversions = 3.9.0 +envlist = py39, flake8 +isolated_build = true + +[gh-actions] +python = + 3.9: py39, + flake8 + +[testenv] +setenv = + PYTHONPATH = {toxinidir} +deps = + -r{toxinidir}/requirements.txt +commands = + pytest --cov-report term-missing --basetemp={envtmpdir} + +[testenv:flake8] +basepython = python3.9 +deps = flake8 +commands = 
flake8 multi_imbalance tests From 86761c90b599933c7a8dbf27fe3126368cbb2b11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 5 Nov 2022 17:21:00 +0100 Subject: [PATCH 07/48] workflows --- .github/workflows/code_analysis.yml | 21 +++++++++++++++++++++ .github/workflows/tests.yml | 2 +- README.md | 3 +-- 3 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/code_analysis.yml diff --git a/.github/workflows/code_analysis.yml b/.github/workflows/code_analysis.yml new file mode 100644 index 0000000..663c44e --- /dev/null +++ b/.github/workflows/code_analysis.yml @@ -0,0 +1,21 @@ +name: Code analysis + +on: + - push + - pull_request + +jobs: + tools: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.9 + uses: actions/setup-python@v2 + with: + python-version: 3.9 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Run flake8 with tox + run: tox -e flake8 \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index eb0efaf..272c2ca 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,4 +1,4 @@ -name: Tests and code analysis +name: Tests on: - push - pull_request diff --git a/README.md b/README.md index 861ad0b..156797b 100644 --- a/README.md +++ b/README.md @@ -106,8 +106,7 @@ If you would use project for local tests (e.g. run `examples/` with your code ch pip install -U -e ".[all]" ``` After that you can use multi-imbalance as normal Python package. Your next changes will be automatically loaded. - -Our project uses GitHub actions on push and pull request. The action runs tox with configuration define in tox.ini. +Our project uses GitHub actions on push and pull request. The actions run tox with configuration define in tox.ini. multi-imbalance uses reStructuredText markdown for docstrings. 
To build the documentation locally run: ```bash From 17df20852c37374fe829fbe9ae421009c97a0697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 5 Nov 2022 17:51:32 +0100 Subject: [PATCH 08/48] different python version --- .github/workflows/tests.yml | 2 +- tox.ini | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 272c2ca..9b42d9d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, windows-latest] - python-version: [['3.9', 'py39']] + python-version: [['3.8', 'py38'], ['3.9', 'py39']] steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index 2a7842d..aa3ef17 100644 --- a/tox.ini +++ b/tox.ini @@ -1,10 +1,11 @@ [tox] -minversions = 3.9.0 -envlist = py39, flake8 +minversions = 3.8.0 +envlist = py38, py39, flake8 isolated_build = true [gh-actions] python = + 3.8: py38 3.9: py39, flake8 From fe68a2ce5823340475b575e9208cde62f02d3c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 5 Nov 2022 18:35:11 +0100 Subject: [PATCH 09/48] ignore ovo tests --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index aa3ef17..11ec619 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = deps = -r{toxinidir}/requirements.txt commands = - pytest --cov-report term-missing --basetemp={envtmpdir} + pytest --cov-report term-missing --basetemp={envtmpdir} --ignore=tests/ensemble/test_ovo.py [testenv:flake8] basepython = python3.9 From 83d2cc3a60f4e2e9617548fcf33d608ebea44216 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 5 Nov 2022 18:42:23 +0100 Subject: [PATCH 10/48] ignore soupbagging --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 11ec619..f159f3e 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = deps = 
-r{toxinidir}/requirements.txt commands = - pytest --cov-report term-missing --basetemp={envtmpdir} --ignore=tests/ensemble/test_ovo.py + pytest --cov-report term-missing --basetemp={envtmpdir} --ignore=tests/ensemble/test_soupbagging.py [testenv:flake8] basepython = python3.9 From ef256c4158d465f65b347dfcec606a8e32995e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sun, 6 Nov 2022 11:16:47 +0100 Subject: [PATCH 11/48] new typing + check soup bagging --- multi_imbalance/ensemble/ecoc.py | 24 ++++++++----- multi_imbalance/ensemble/mrbbagging.py | 41 +++++++++++++++------- multi_imbalance/ensemble/ovo.py | 16 +++++---- multi_imbalance/ensemble/soup_bagging.py | 35 +++++++++++++----- multi_imbalance/resampling/global_cs.py | 6 ++-- multi_imbalance/resampling/mdo.py | 6 ++-- multi_imbalance/resampling/soup.py | 11 +++--- multi_imbalance/resampling/spider.py | 36 ++++++++++--------- multi_imbalance/resampling/static_smote.py | 2 +- multi_imbalance/utils/data.py | 4 +-- tox.ini | 2 +- 11 files changed, 117 insertions(+), 66 deletions(-) diff --git a/multi_imbalance/ensemble/ecoc.py b/multi_imbalance/ensemble/ecoc.py index cb01c3d..9d09f22 100644 --- a/multi_imbalance/ensemble/ecoc.py +++ b/multi_imbalance/ensemble/ecoc.py @@ -2,7 +2,7 @@ from collections import Counter from collections import defaultdict from copy import deepcopy -from typing import Tuple, Union +from typing import Dict, List, Tuple, Union import numpy as np from imblearn.over_sampling import SMOTE @@ -12,6 +12,7 @@ from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.utils import check_random_state +from sklearn.base import ClassifierMixin from multi_imbalance.resampling.global_cs import GlobalCS from multi_imbalance.resampling.soup import SOUP @@ -37,7 +38,7 @@ def __init__( encoding: str = "OVO", n_neighbors: int = 3, weights: Union[None, str] = None, - ): + ) -> None: """ :param binary_classifier: @@ -107,7 +108,10 @@ 
def __init__( self._dich_weights = None def fit( - self, X: np.ndarray, y: np.ndarray, minority_classes: Union[list, None] = None + self, + X: np.ndarray, + y: np.ndarray, + minority_classes: Union[List[int], None] = None, ): """ @@ -157,7 +161,7 @@ def predict(self, X: np.ndarray) -> np.ndarray: return predicted - def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray): + def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray) -> None: for classifier_idx, classifier in enumerate(self._binary_classifiers): excluded_classes_indices = [ idx @@ -180,7 +184,7 @@ def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray): X_filtered, binary_labels = self._oversample(X_filtered, binary_labels) classifier.fit(X_filtered, binary_labels) - def _gen_code_matrix(self): + def _gen_code_matrix(self) -> None: if self.encoding == "dense": self._code_matrix = self._encode_dense(self._labels.shape[0]) elif self.encoding == "sparse": @@ -310,7 +314,9 @@ def _encode_ovo(self, number_of_classes: int) -> np.ndarray: matrix[row, col] = -1 return matrix - def _map_indices_to_class_pairs(self, number_of_classes: int) -> dict: + def _map_indices_to_class_pairs( + self, number_of_classes: int + ) -> Dict[int, Tuple[int, int]]: indices_map = dict() idx = 0 for i in range(number_of_classes): @@ -384,7 +390,7 @@ def _oversample( raise ValueError("Your resampler must implement fit_transform method") return self.preprocessing.fit_transform(X, y) - def _get_classifier(self): + def _get_classifier(self) -> ClassifierMixin: if isinstance(self.binary_classifier, str): if self.binary_classifier not in ECOC._allowed_classifiers: raise ValueError( @@ -420,7 +426,9 @@ def _smote_oversample( smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) - def _calc_weights(self, X_for_weights: np.ndarray, y_for_weights: np.ndarray): + def _calc_weights( + self, X_for_weights: np.ndarray, y_for_weights: np.ndarray + ) -> None: if self.weights not in 
ECOC._allowed_weights: raise ValueError( "Unknown weighting strategy: %s, expected to be one of %s." diff --git a/multi_imbalance/ensemble/mrbbagging.py b/multi_imbalance/ensemble/mrbbagging.py index 84161ba..8b180b2 100644 --- a/multi_imbalance/ensemble/mrbbagging.py +++ b/multi_imbalance/ensemble/mrbbagging.py @@ -1,7 +1,7 @@ from collections import Counter from copy import deepcopy from math import sqrt -from typing import Any, Callable, Tuple, Union +from typing import Callable, Dict, List, Set, Tuple, Union import numpy as np from scipy.stats import multinomial @@ -9,6 +9,7 @@ from sklearn.feature_selection import SelectKBest, chi2, f_classif from sklearn.utils import resample from sklearn.utils.random import sample_without_replacement +from sklearn.base import ClassifierMixin class MRBBagging(BaggingClassifier): @@ -25,13 +26,13 @@ class MRBBagging(BaggingClassifier): def __init__( self, k: int, - learning_algorithm: Any, + learning_algorithm: ClassifierMixin, undersampling: bool = True, feature_selection: bool = False, random_fs: bool = False, half_features: bool = True, random_state: Union[int, None] = None, - ): + ) -> None: """ :param k: number of classifiers (multiplied by 3 when choosing feature selection) @@ -99,7 +100,7 @@ def fit(self, x: np.ndarray, y: np.ndarray, **kwargs): return self - def predict(self, data: np.ndarray) -> list: + def predict(self, data: np.ndarray) -> List[int]: """ Predict classes for examples in data. 
@@ -108,7 +109,9 @@ def predict(self, data: np.ndarray) -> list: """ return self._select_classes(data) - def _group_data(self, x: np.ndarray, y: np.ndarray) -> Tuple[set, dict]: + def _group_data( + self, x: np.ndarray, y: np.ndarray + ) -> Tuple[Set[int], Dict[int, List[Tuple[np.ndarray, int]]]]: classes = set(y) self.classes = {key: value for (key, value) in enumerate(classes)} data = [[x[i], y[i]] for i in range(len(x))] @@ -119,7 +122,11 @@ def _group_data(self, x: np.ndarray, y: np.ndarray) -> Tuple[set, dict]: return classes, grouped_data def _resample( - self, n: int, prob: float, classes: set, grouped_data: dict + self, + n: int, + prob: float, + classes: Set[int], + grouped_data: Dict[int, List[Tuple[np.ndarray, int]]], ) -> Tuple[np.ndarray, np.ndarray]: samples_no = multinomial.rvs(n=n, p=prob, random_state=self.random_state) subset_x, subset_y = [], [] @@ -137,8 +144,13 @@ def _resample( return np.array(subset_x), np.array(subset_y) def _train( - self, la_list: list, n: int, prob: float, classes: set, grouped_data: dict - ): + self, + la_list: List[ClassifierMixin], + n: int, + prob: float, + classes: Set[int], + grouped_data: Dict[int, List[Tuple[np.ndarray, int]]], + ) -> None: for i in range(len(la_list)): subset_x, subset_y = self._resample(n, prob, classes, grouped_data) @@ -178,8 +190,13 @@ def _get_kbest_classifier( return subset, kBest_estimator def _train_with_feature_selection( - self, la_list: list, n: int, prob: float, classes: set, grouped_data: dict - ): + self, + la_list: List[ClassifierMixin], + n: int, + prob: float, + classes: Set[int], + grouped_data: Dict[int, List[Tuple[np.ndarray, int]]], + ) -> None: for i in range(0, len(la_list), 3): subset_x, subset_y = self._resample(n, prob, classes, grouped_data) labels_no = len(subset_x[0]) @@ -221,7 +238,7 @@ def _train_with_feature_selection( self.classifiers[i + 1] = la_list[i + 1].fit(subset2, subset_y) self.classifiers[i + 2] = la_list[i + 2].fit(subset3, subset_y) - def 
_set_classes_dict(self, classes: set): + def _set_classes_dict(self, classes: Set[int]) -> None: self.classifier_classes = dict(enumerate(classes)) def _select_data(self, classifier_id: int, data: np.ndarray) -> np.ndarray: @@ -255,7 +272,7 @@ def _count_votes(self, data: np.ndarray) -> np.ndarray: voting_matrix[i][idx] += max(probabilities[i]) return voting_matrix - def _select_classes(self, data: np.ndarray) -> list: + def _select_classes(self, data: np.ndarray) -> List[int]: voting_matrix = self._count_votes(data) selected_classes_ids = voting_matrix.argmax(axis=1) selected_classes = [] diff --git a/multi_imbalance/ensemble/ovo.py b/multi_imbalance/ensemble/ovo.py index 4729fb7..96e9d46 100644 --- a/multi_imbalance/ensemble/ovo.py +++ b/multi_imbalance/ensemble/ovo.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Any, Tuple, Union +from typing import List, Tuple, Union import numpy as np from imblearn.over_sampling import SMOTE @@ -7,6 +7,7 @@ from sklearn.naive_bayes import GaussianNB from sklearn.neighbors import KNeighborsClassifier from sklearn.tree import DecisionTreeClassifier +from sklearn.base import ClassifierMixin from multi_imbalance.resampling.global_cs import GlobalCS from multi_imbalance.resampling.soup import SOUP @@ -33,7 +34,7 @@ def __init__( n_neighbors: int = 3, preprocessing: str = "SOUP", preprocessing_between: str = "all", - ): + ) -> None: """ :param binary_classifier: binary classifier. 
Possible classifiers: @@ -79,7 +80,10 @@ def __init__( self._minority_classes = list() def fit( - self, X: np.ndarray, y: np.ndarray, minority_classes: Union[list, None] = None + self, + X: np.ndarray, + y: np.ndarray, + minority_classes: Union[List[int], None] = None, ): """ :param X: @@ -131,7 +135,7 @@ def _construct_binary_outputs_matrix( ] = self._binary_classifiers[class_idx1][class_idx2].predict([instance]) return binary_outputs_matrix - def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray): + def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray) -> None: for row in range(len(self._labels)): for col in range(row): first_class, second_class = self._labels[row], self._labels[col] @@ -145,7 +149,7 @@ def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray): X_filtered, y_filtered = self._oversample(X_filtered, y_filtered) self._binary_classifiers[row][col].fit(X_filtered, y_filtered) - def _get_classifier(self) -> Any: + def _get_classifier(self) -> ClassifierMixin: if isinstance(self.binary_classifier, str): if self.binary_classifier not in OVO._allowed_classifiers: raise ValueError( @@ -217,7 +221,7 @@ def _smote_oversample( smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) - def should_perform_oversampling(self, first_class: int, second_class: int): + def should_perform_oversampling(self, first_class: int, second_class: int) -> None: if self.preprocessing_between not in OVO._allowed_preprocessing_between: raise ValueError( "Unknown strategy for oversampling: %s, expected to be one of %s." 
diff --git a/multi_imbalance/ensemble/soup_bagging.py b/multi_imbalance/ensemble/soup_bagging.py index 427ac4e..90dd006 100644 --- a/multi_imbalance/ensemble/soup_bagging.py +++ b/multi_imbalance/ensemble/soup_bagging.py @@ -1,17 +1,30 @@ import multiprocessing from collections import Counter from copy import deepcopy -from typing import Any, Tuple, Union +from typing import Dict, List, Tuple, Union import numpy as np from sklearn.ensemble import BaggingClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.utils import resample +from sklearn.base import ClassifierMixin from multi_imbalance.resampling.soup import SOUP from multi_imbalance.utils.array_util import setdiff +def fit_clf( + args: Tuple[ + ClassifierMixin, + np.ndarray, + np.ndarray, + Tuple[np.ndarray, np.ndarray], + Union[Dict[str, List[int]], None], + ] +): + return SOUPBagging.fit_classifier(args) + + class SOUPBagging(BaggingClassifier): """ Version of Bagging that applies SOUP in each classifier @@ -24,8 +37,8 @@ class SOUPBagging(BaggingClassifier): def __init__( self, - classifier: Union[Any, None] = None, - maj_int_min: Union[dict, None] = None, + classifier: Union[ClassifierMixin, None] = None, + maj_int_min: Union[Dict[str, List[int]], None] = None, n_classifiers: int = 5, ): """ @@ -51,7 +64,15 @@ def __init__( self.classifiers.append(KNeighborsClassifier()) @staticmethod - def fit_classifier(args: list) -> Tuple[Any, np.ndarray]: + def fit_classifier( + args: Tuple[ + ClassifierMixin, + np.ndarray, + np.ndarray, + Tuple[np.ndarray, np.ndarray], + Union[Dict[str, List[int]], None], + ] + ) -> Tuple[ClassifierMixin, np.ndarray]: clf, X, y, resampled, maj_int_min = args x_sampled, y_sampled = resampled @@ -81,10 +102,6 @@ def fit_classifier(args: list) -> Tuple[Any, np.ndarray]: ) return clf, global_weights - @staticmethod - def fit_clf(args: list): - return SOUPBagging.fit_classifier(args) - def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): """ :param X: @@ -100,7 
+117,7 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): pool = multiprocessing.Pool(self.num_core) results = pool.map( - self.fit_clf, + fit_clf, [ ( clf, diff --git a/multi_imbalance/resampling/global_cs.py b/multi_imbalance/resampling/global_cs.py index 9c181a2..44578a1 100644 --- a/multi_imbalance/resampling/global_cs.py +++ b/multi_imbalance/resampling/global_cs.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Tuple +from typing import List, Tuple import numpy as np import sklearn @@ -12,7 +12,7 @@ class GlobalCS(BaseSampler): for each class to achieve majority class size """ - def __init__(self, shuffle: bool = True): + def __init__(self, shuffle: bool = True) -> None: super().__init__() self._sampling_type = "over-sampling" self.shuffle = shuffle @@ -54,7 +54,7 @@ def _fit_resample( def _equal_oversample( self, X: np.ndarray, y: np.ndarray, class_name: str - ) -> Tuple[list, list]: + ) -> Tuple[List[np.ndarray], List[np.ndarray]]: indices_in_class = [ i for i, class_label in enumerate(y) if class_label == class_name ] diff --git a/multi_imbalance/resampling/mdo.py b/multi_imbalance/resampling/mdo.py index 2f1ed31..6ccb85b 100644 --- a/multi_imbalance/resampling/mdo.py +++ b/multi_imbalance/resampling/mdo.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Tuple, Union +from typing import Dict, List, Tuple, Union import numpy as np from imblearn.base import BaseSampler @@ -24,8 +24,8 @@ def __init__( k1_frac: float = 0.4, seed: int = 0, prop: int = 1, - maj_int_min: Union[dict, None] = None, - ): + maj_int_min: Union[Dict[str, List[int]], None] = None, + ) -> None: """ :param k: Number of neighbours considered during the neighbourhood analysis diff --git a/multi_imbalance/resampling/soup.py b/multi_imbalance/resampling/soup.py index 04a493e..71e3842 100644 --- a/multi_imbalance/resampling/soup.py +++ b/multi_imbalance/resampling/soup.py @@ -1,7 +1,7 @@ from collections import Counter, defaultdict from copy 
import deepcopy from operator import itemgetter -from typing import Tuple, Union +from typing import Dict, List, Tuple, Union import numpy as np import sklearn @@ -20,8 +20,11 @@ class SOUP(BaseSampler): """ def __init__( - self, k: int = 7, shuffle: bool = False, maj_int_min: Union[dict, None] = None - ): + self, + k: int = 7, + shuffle: bool = False, + maj_int_min: Union[Dict[str, List[int]], None] = None, + ) -> None: """ :param k: number of neighbors @@ -175,7 +178,7 @@ def _oversample( return X, y def _calculate_goal_quantity( - self, maj_int_min: Union[dict, None] = None + self, maj_int_min: Union[Dict[str, List[int]], None] = None ) -> Union[int, float]: if maj_int_min is None: maj_q = max(list(self.quantities.values())) diff --git a/multi_imbalance/resampling/spider.py b/multi_imbalance/resampling/spider.py index 4b90041..9a57ee1 100644 --- a/multi_imbalance/resampling/spider.py +++ b/multi_imbalance/resampling/spider.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Tuple, Union +from typing import Dict, List, Tuple, Union import numpy as np from imblearn.base import BaseSampler @@ -22,9 +22,9 @@ class SPIDER3(BaseSampler): def __init__( self, k: int, - maj_int_min: Union[dict, None] = None, + maj_int_min: Union[Dict[str, List[int]], None] = None, cost: Union[np.ndarray, None] = None, - ): + ) -> None: """ :param k: Number of nearest neighbors considered while resampling. 
@@ -74,7 +74,7 @@ def _fit_resample( return self.DS[:, :-1], self.DS[:, -1] - def _initialize_algorithm(self, X: np.ndarray, y: np.ndarray): + def _initialize_algorithm(self, X: np.ndarray, y: np.ndarray) -> None: if self.maj_int_min is None: self.maj_int_min = construct_maj_int_min(y) self.majority_classes = self.maj_int_min["maj"] @@ -86,7 +86,7 @@ def _initialize_algorithm(self, X: np.ndarray, y: np.ndarray): self.cost = self._estimate_cost_matrix(y) @staticmethod - def _estimate_cost_matrix(y: Union[np.ndarray, list]) -> np.ndarray: + def _estimate_cost_matrix(y: Union[np.ndarray, List[int]]) -> np.ndarray: """ Method that estimates cost matrix automatically. For example given imbalance ratios of 1:2:6, the estimated matrix will be: @@ -110,7 +110,9 @@ def _estimate_cost_matrix(y: Union[np.ndarray, list]) -> np.ndarray: np.fill_diagonal(cost, 0) return cost - def _sort_by_cardinality(self, y: Union[list, np.ndarray]) -> Tuple[list, list]: + def _sort_by_cardinality( + self, y: Union[List[int], np.ndarray] + ) -> Tuple[List[int], List[int]]: class_cardinality = Counter(y) # to ensure looping over classes with decreasing cardinality. 
int_classes = sorted( @@ -121,14 +123,14 @@ def _sort_by_cardinality(self, y: Union[list, np.ndarray]) -> Tuple[list, list]: ) return int_classes, min_classes - def amplify(self, int_min_class: str): + def amplify(self, int_min_class: str) -> None: self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] for x in int_min_ds: self._amplify_nn(x) self._restore_perspective() - def clean(self, int_min_class: str): + def clean(self, int_min_class: str) -> None: self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] int_min_as = self._calc_int_min_as(int_min_class) @@ -136,14 +138,14 @@ def clean(self, int_min_class: str): self._clean_nn(x) self._restore_perspective() - def relabel(self, int_min_class: str): + def relabel(self, int_min_class: str) -> None: self._restart_perspective() int_min_ds = self.DS[self.DS[:, -1] == int_min_class] for x in int_min_ds: self._relabel_nn(x) self._restore_perspective() - def _restart_perspective(self): + def _restart_perspective(self) -> None: """ Performs normalization over resampled dataset. """ @@ -159,7 +161,7 @@ def _restart_perspective(self): if dataset.shape[0] > 0: self._normalize(dataset) - def _restore_perspective(self): + def _restore_perspective(self) -> None: """ Denormalizes for further processing. 
""" @@ -167,11 +169,11 @@ def _restore_perspective(self): if dataset.shape[0] > 0: self._denormalize(dataset) - def _normalize(self, dataset: np.ndarray): + def _normalize(self, dataset: np.ndarray) -> None: for col in range(dataset.shape[1] - 1): dataset[:, col] = (dataset[:, col] - self.means[col]) / (4 * self.stds[col]) - def _denormalize(self, dataset: np.ndarray): + def _denormalize(self, dataset: np.ndarray) -> None: for col in range(dataset.shape[1] - 1): dataset[:, col] = dataset[:, col] * self.stds[col] * 4 + self.means[col] @@ -190,7 +192,7 @@ def _calc_int_min_as(self, int_min_class: str) -> np.ndarray: int_min_as = np.array([]) return int_min_as - def _calculate_weak_majority_examples(self): + def _calculate_weak_majority_examples(self) -> None: """ Calculates weak majority examples and appends them to the RS set. """ @@ -230,7 +232,7 @@ def _min_cost_classes(self, x: np.ndarray, DS: np.ndarray) -> np.ndarray: vals = np.round(vals, 6) return C[vals == vals[np.argmin(vals)]] - def _relabel_nn(self, x: np.ndarray): + def _relabel_nn(self, x: np.ndarray) -> None: """ Performs relabeling in the nearest neighborhood of x. @@ -249,7 +251,7 @@ def _relabel_nn(self, x: np.ndarray): neighbor[-1] = x[-1] self.AS = union(self.AS, np.array([neighbor])) - def _clean_nn(self, x: np.ndarray): + def _clean_nn(self, x: np.ndarray) -> None: """ Performs cleaning in the nearest neighborhood of x. @@ -304,7 +306,7 @@ def _knn(self, x: np.ndarray, DS: np.ndarray) -> np.ndarray: return DS[indices] - def _amplify_nn(self, x: np.ndarray): + def _amplify_nn(self, x: np.ndarray) -> None: """ Artificially amplifies example x by adding a copy of it to the AS. 
diff --git a/multi_imbalance/resampling/static_smote.py b/multi_imbalance/resampling/static_smote.py index 8934bda..ddf437e 100644 --- a/multi_imbalance/resampling/static_smote.py +++ b/multi_imbalance/resampling/static_smote.py @@ -16,7 +16,7 @@ class StaticSMOTE(BaseSampler): (2011) """ - def __init__(self): + def __init__(self) -> None: super().__init__() self._sampling_type = "over-sampling" diff --git a/multi_imbalance/utils/data.py b/multi_imbalance/utils/data.py index 892e317..7e10a78 100644 --- a/multi_imbalance/utils/data.py +++ b/multi_imbalance/utils/data.py @@ -94,8 +94,8 @@ def load_datasets_arff( datasets = OrderedDict() for path in sorted(dataset_paths): - dataset_file = os.path.basename(path) - dataset_name = os.path.splitext(dataset_file)[0] + path = Path(path) + dataset_name = path.stem if return_non_cat_length: X, y, cat_length = load_arff_dataset( path, return_non_cat_length=return_non_cat_length diff --git a/tox.ini b/tox.ini index f159f3e..aa3ef17 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = deps = -r{toxinidir}/requirements.txt commands = - pytest --cov-report term-missing --basetemp={envtmpdir} --ignore=tests/ensemble/test_soupbagging.py + pytest --cov-report term-missing --basetemp={envtmpdir} [testenv:flake8] basepython = python3.9 From ded5cecda4b18461f222c1761324208ecb1209b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sun, 6 Nov 2022 12:03:59 +0100 Subject: [PATCH 12/48] black line length and delete pool close --- multi_imbalance/ensemble/ecoc.py | 138 +++++---------------- multi_imbalance/ensemble/mrbbagging.py | 56 +++------ multi_imbalance/ensemble/ovo.py | 62 +++------ multi_imbalance/ensemble/soup_bagging.py | 22 +--- multi_imbalance/resampling/global_cs.py | 20 +-- multi_imbalance/resampling/mdo.py | 77 +++--------- multi_imbalance/resampling/soup.py | 74 +++-------- multi_imbalance/resampling/spider.py | 36 ++---- multi_imbalance/resampling/static_smote.py | 4 +- 
multi_imbalance/utils/data.py | 16 +-- multi_imbalance/utils/metrics.py | 4 +- multi_imbalance/utils/plot.py | 4 +- pyproject.toml | 5 +- setup.cfg | 2 +- tests/ensemble/test_ecoc.py | 59 ++------- tests/ensemble/test_mrbbagging.py | 8 +- tests/ensemble/test_ovo.py | 16 +-- tests/resampling/test_mdo.py | 4 +- tests/resampling/test_soup.py | 24 +--- tests/resampling/test_spider.py | 4 +- tests/utils/test_data.py | 28 +---- 21 files changed, 152 insertions(+), 511 deletions(-) diff --git a/multi_imbalance/ensemble/ecoc.py b/multi_imbalance/ensemble/ecoc.py index 9d09f22..7113634 100644 --- a/multi_imbalance/ensemble/ecoc.py +++ b/multi_imbalance/ensemble/ecoc.py @@ -128,17 +128,13 @@ def fit( self.minority_classes = minority_classes if self.weights is not None: - X_train, X_for_weights, y_train, y_for_weights = train_test_split( - X, y, test_size=0.2, stratify=y, random_state=0 - ) + X_train, X_for_weights, y_train, y_for_weights = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0) else: X_train, y_train = X, y self._labels = np.unique(y) self._gen_code_matrix() - self._binary_classifiers = [ - self._get_classifier() for _ in range(self._code_matrix.shape[1]) - ] + self._binary_classifiers = [self._get_classifier() for _ in range(self._code_matrix.shape[1])] self._learn_binary_classifiers(X_train, y_train) if self.weights is not None: self._calc_weights(X_for_weights, y_for_weights) @@ -164,23 +160,11 @@ def predict(self, X: np.ndarray) -> np.ndarray: def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray) -> None: for classifier_idx, classifier in enumerate(self._binary_classifiers): excluded_classes_indices = [ - idx - for idx in range(len(y)) - if self._code_matrix[self._labels.tolist().index(y[idx])][ - classifier_idx - ] - == 0 + idx for idx in range(len(y)) if self._code_matrix[self._labels.tolist().index(y[idx])][classifier_idx] == 0 ] X_filtered = np.delete(X, excluded_classes_indices, 0) y_filtered = np.delete(y, 
excluded_classes_indices) - binary_labels = np.array( - [ - self._code_matrix[self._labels.tolist().index(clazz)][ - classifier_idx - ] - for clazz in y_filtered - ] - ) + binary_labels = np.array([self._code_matrix[self._labels.tolist().index(clazz)][classifier_idx] for clazz in y_filtered]) X_filtered, binary_labels = self._oversample(X_filtered, binary_labels) classifier.fit(X_filtered, binary_labels) @@ -196,10 +180,7 @@ def _gen_code_matrix(self) -> None: elif self.encoding == "OVA": self._code_matrix = self._encode_ova(self._labels.shape[0]) else: - raise ValueError( - "Unknown matrix generation encoding: %s, expected to be one of %s." - % (self.encoding, ECOC._allowed_encodings) - ) + raise ValueError("Unknown matrix generation encoding: %s, expected to be one of %s." % (self.encoding, ECOC._allowed_encodings)) def _encode_dense( self, @@ -209,14 +190,10 @@ def _encode_dense( ) -> np.ndarray: try: dirname = os.path.dirname(__file__) - matrix = np.load( - dirname + f"/cached_matrices/dense_{number_of_classes}.npy" - ) + matrix = np.load(dirname + f"/cached_matrices/dense_{number_of_classes}.npy") return matrix except IOError: - print( - f"Could not find cached matrix for dense code for {number_of_classes} classes, generating matrix..." 
- ) + print(f"Could not find cached matrix for dense code for {number_of_classes} classes, generating matrix...") number_of_columns = int(np.ceil(10 * np.log2(number_of_classes))) code_matrix = np.ones((number_of_classes, number_of_columns)) @@ -233,9 +210,7 @@ def _encode_dense( tmp_code_matrix[row, col] = -1 for compared_row in range(0, row): - dist = self._hamming_distance( - tmp_code_matrix[compared_row], tmp_code_matrix[row] - ) + dist = self._hamming_distance(tmp_code_matrix[compared_row], tmp_code_matrix[row]) if dist < min_dist: min_dist = dist @@ -252,14 +227,10 @@ def _encode_sparse( ) -> np.ndarray: try: dirname = os.path.dirname(__file__) - matrix = np.load( - dirname + f"/cached_matrices/sparse_{number_of_classes}.npy" - ) + matrix = np.load(dirname + f"/cached_matrices/sparse_{number_of_classes}.npy") return matrix except IOError: - print( - f"Could not find cached matrix for sparse code for {number_of_classes} classes, generating matrix..." - ) + print(f"Could not find cached matrix for sparse code for {number_of_classes} classes, generating matrix...") number_of_columns = int(np.ceil(15 * np.log2(number_of_classes))) code_matrix = np.ones((number_of_classes, number_of_columns)) @@ -282,9 +253,7 @@ def _encode_sparse( break for compared_row in range(0, row): - dist = self._hamming_distance( - tmp_code_matrix[compared_row], tmp_code_matrix[row] - ) + dist = self._hamming_distance(tmp_code_matrix[compared_row], tmp_code_matrix[row]) if dist < min_dist: min_dist = dist @@ -314,9 +283,7 @@ def _encode_ovo(self, number_of_classes: int) -> np.ndarray: matrix[row, col] = -1 return matrix - def _map_indices_to_class_pairs( - self, number_of_classes: int - ) -> Dict[int, Tuple[int, int]]: + def _map_indices_to_class_pairs(self, number_of_classes: int) -> Dict[int, Tuple[int, int]]: indices_map = dict() idx = 0 for i in range(number_of_classes): @@ -346,34 +313,19 @@ def _has_matrix_all_zeros_column(self, matrix: np.ndarray) -> bool: def 
_get_closest_class(self, row: np.ndarray) -> np.ndarray: if self.weights is not None: return self._labels[ - np.argmin( - [ - sum(np.multiply(self.dich_weights, (encoded_class - row) ** 2)) - for encoded_class in self._code_matrix - ] - ) + np.argmin([sum(np.multiply(self.dich_weights, (encoded_class - row) ** 2)) for encoded_class in self._code_matrix]) ] else: - return self._labels[ - np.argmin( - [ - self._hamming_distance(row, encoded_class) - for encoded_class in self._code_matrix - ] - ) - ] + return self._labels[np.argmin([self._hamming_distance(row, encoded_class) for encoded_class in self._code_matrix])] - def _oversample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _oversample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: if self.preprocessing is None: return X, y if isinstance(self.preprocessing, str): if self.preprocessing not in ECOC._allowed_oversampling: raise ValueError( - "Unknown preprocessing method: %s, expected to be one of %s." - % (self.preprocessing, ECOC._allowed_oversampling) + "Unknown preprocessing method: %s, expected to be one of %s." % (self.preprocessing, ECOC._allowed_oversampling) ) elif np.unique(y).size == 1: return X, y @@ -394,8 +346,7 @@ def _get_classifier(self) -> ClassifierMixin: if isinstance(self.binary_classifier, str): if self.binary_classifier not in ECOC._allowed_classifiers: raise ValueError( - "Unknown binary classifier: %s, expected to be one of %s." - % (self.binary_classifier, ECOC._allowed_classifiers) + "Unknown binary classifier: %s, expected to be one of %s." 
% (self.binary_classifier, ECOC._allowed_classifiers) ) elif self.binary_classifier == "tree": decision_tree_classifier = DecisionTreeClassifier(random_state=42) @@ -407,33 +358,20 @@ def _get_classifier(self) -> ClassifierMixin: knn = KNeighborsClassifier(n_neighbors=self.n_neighbors) return knn else: - if not hasattr(self.binary_classifier, "fit") or not hasattr( - self.binary_classifier, "predict" - ): - raise ValueError( - "Your classifier must implement fit and predict methods" - ) + if not hasattr(self.binary_classifier, "fit") or not hasattr(self.binary_classifier, "predict"): + raise ValueError("Your classifier must implement fit and predict methods") return deepcopy(self.binary_classifier) - def _smote_oversample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _smote_oversample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: n_neighbors = min(3, min(np.unique(y, return_counts=True)[1]) - 1) if n_neighbors == 0: - raise ValueError( - "In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class" - ) + raise ValueError("In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class") smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) - def _calc_weights( - self, X_for_weights: np.ndarray, y_for_weights: np.ndarray - ) -> None: + def _calc_weights(self, X_for_weights: np.ndarray, y_for_weights: np.ndarray) -> None: if self.weights not in ECOC._allowed_weights: - raise ValueError( - "Unknown weighting strategy: %s, expected to be one of %s." - % (self.weights, ECOC._allowed_weights) - ) + raise ValueError("Unknown weighting strategy: %s, expected to be one of %s." 
% (self.weights, ECOC._allowed_weights)) dich_weights = np.ones(self._code_matrix.shape[1]) if self.weights == "acc": @@ -441,42 +379,22 @@ def _calc_weights( samples_no = 0 correct_no = 0 for sample, sample_label in zip(X_for_weights, y_for_weights): - if ( - self._code_matrix[np.where(self._labels == sample_label)[0][0]][ - clf_idx - ] - != 0 - ): + if self._code_matrix[np.where(self._labels == sample_label)[0][0]][clf_idx] != 0: samples_no += 1 - if ( - clf.predict([sample])[0] - == self._code_matrix[ - np.where(self._labels == sample_label)[0][0] - ][clf_idx] - ): + if clf.predict([sample])[0] == self._code_matrix[np.where(self._labels == sample_label)[0][0]][clf_idx]: correct_no += 1 if samples_no != 0: acc = correct_no / samples_no dich_weights[clf_idx] = -1 + 2 * acc elif self.weights == "avg_tpr_min": - min_counter = Counter( - [y for y in y_for_weights if y in self.minority_classes] - ) + min_counter = Counter([y for y in y_for_weights if y in self.minority_classes]) for clf_idx, clf in enumerate(self._binary_classifiers): min_correct_pred = defaultdict(lambda: 0) for sample, sample_label in zip(X_for_weights, y_for_weights): - if ( - clf.predict([sample])[0] - == self._code_matrix[ - np.where(self._labels == sample_label)[0][0] - ][clf_idx] - ): + if clf.predict([sample])[0] == self._code_matrix[np.where(self._labels == sample_label)[0][0]][clf_idx]: min_correct_pred[sample_label] += 1 - tpr_min = [ - min_correct_pred[clazz] / min_counter[clazz] - for clazz in min_counter.keys() - ] + tpr_min = [min_correct_pred[clazz] / min_counter[clazz] for clazz in min_counter.keys()] avg_tpr_min = np.mean(tpr_min) if tpr_min else np.nan dich_weights[clf_idx] = avg_tpr_min diff --git a/multi_imbalance/ensemble/mrbbagging.py b/multi_imbalance/ensemble/mrbbagging.py index 8b180b2..819fab6 100644 --- a/multi_imbalance/ensemble/mrbbagging.py +++ b/multi_imbalance/ensemble/mrbbagging.py @@ -109,9 +109,7 @@ def predict(self, data: np.ndarray) -> List[int]: """ return 
self._select_classes(data) - def _group_data( - self, x: np.ndarray, y: np.ndarray - ) -> Tuple[Set[int], Dict[int, List[Tuple[np.ndarray, int]]]]: + def _group_data(self, x: np.ndarray, y: np.ndarray) -> Tuple[Set[int], Dict[int, List[Tuple[np.ndarray, int]]]]: classes = set(y) self.classes = {key: value for (key, value) in enumerate(classes)} data = [[x[i], y[i]] for i in range(len(x))] @@ -159,21 +157,15 @@ def _train( self.classifiers[i] = la_list[i].fit(subset_x, subset_y) - def _find_random_features( - self, labels_no: int, features_no: int, subset_x: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _find_random_features(self, labels_no: int, features_no: int, subset_x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: random_features_idx = sample_without_replacement(labels_no, features_no) random_features = self._get_features_array(subset_x, random_features_idx) return random_features, random_features_idx - def _get_features_array( - self, subset_x: np.ndarray, random_features_idx: np.ndarray - ) -> np.ndarray: + def _get_features_array(self, subset_x: np.ndarray, random_features_idx: np.ndarray) -> np.ndarray: random_features = np.array(subset_x[:, random_features_idx[0]]) for f in range(1, len(random_features_idx)): - random_features = np.vstack( - (random_features, subset_x[:, random_features_idx[f]]) - ) + random_features = np.vstack((random_features, subset_x[:, random_features_idx[f]])) if random_features.ndim == 1: return random_features[:, np.newaxis] return random_features.T @@ -209,26 +201,14 @@ def _train_with_feature_selection( subset_y = np.array(subset_y).astype(np.float64) if self.random_fs: - subset1, subset1_idx = self._find_random_features( - labels_no, features_no, subset_x - ) - subset2, subset2_idx = self._find_random_features( - labels_no, features_no, subset_x - ) - subset3, subset3_idx = self._find_random_features( - labels_no, features_no, subset_x - ) + subset1, subset1_idx = self._find_random_features(labels_no, features_no, 
subset_x) + subset2, subset2_idx = self._find_random_features(labels_no, features_no, subset_x) + subset3, subset3_idx = self._find_random_features(labels_no, features_no, subset_x) else: - subset1, subset1_idx = self._get_kbest_classifier( - chi2, features_no, subset_x, subset_y - ) - subset2, subset2_idx = self._get_kbest_classifier( - f_classif, features_no, subset_x, subset_y - ) - subset3, subset3_idx = self._find_random_features( - labels_no, features_no, subset_x - ) + subset1, subset1_idx = self._get_kbest_classifier(chi2, features_no, subset_x, subset_y) + subset2, subset2_idx = self._get_kbest_classifier(f_classif, features_no, subset_x, subset_y) + subset3, subset3_idx = self._find_random_features(labels_no, features_no, subset_x) self.feature_selection_methods[i] = subset1_idx self.feature_selection_methods[i + 1] = subset2_idx @@ -244,18 +224,12 @@ def _set_classes_dict(self, classes: Set[int]) -> None: def _select_data(self, classifier_id: int, data: np.ndarray) -> np.ndarray: if self.feature_selection: if self.random_fs: - new_data = self._get_features_array( - data, self.feature_selection_methods[classifier_id] - ) + new_data = self._get_features_array(data, self.feature_selection_methods[classifier_id]) else: if (classifier_id % 3) - 2 == 0: - new_data = self._get_features_array( - data, self.feature_selection_methods[classifier_id] - ) + new_data = self._get_features_array(data, self.feature_selection_methods[classifier_id]) else: - new_data = self.feature_selection_methods[classifier_id].transform( - data - ) + new_data = self.feature_selection_methods[classifier_id].transform(data) return new_data return data @@ -266,9 +240,7 @@ def _count_votes(self, data: np.ndarray) -> np.ndarray: classes = self.classifiers[classifier_id].predict(new_data) probabilities = self.classifiers[classifier_id].predict_proba(new_data) for i, cl in enumerate(classes): - idx = list(self.classifier_classes.keys())[ - list(self.classifier_classes.values()).index(int(cl)) 
- ] + idx = list(self.classifier_classes.keys())[list(self.classifier_classes.values()).index(int(cl))] voting_matrix[i][idx] += max(probabilities[i]) return voting_matrix diff --git a/multi_imbalance/ensemble/ovo.py b/multi_imbalance/ensemble/ovo.py index 96e9d46..6d9e83b 100644 --- a/multi_imbalance/ensemble/ovo.py +++ b/multi_imbalance/ensemble/ovo.py @@ -101,9 +101,7 @@ def fit( self._labels = np.unique(y) self._minority_classes = minority_classes num_of_classes = len(self._labels) - self._binary_classifiers = [ - [self._get_classifier() for _ in range(n)] for n in range(0, num_of_classes) - ] + self._binary_classifiers = [[self._get_classifier() for _ in range(n)] for n in range(0, num_of_classes)] self._learn_binary_classifiers(X, y) return self @@ -117,33 +115,23 @@ def predict(self, X: np.ndarray) -> np.ndarray: num_of_classes = len(self._labels) predicted = list() for instance in X: - binary_outputs_matrix = self._construct_binary_outputs_matrix( - instance, num_of_classes - ) + binary_outputs_matrix = self._construct_binary_outputs_matrix(instance, num_of_classes) predicted.append(self._perform_max_voting(binary_outputs_matrix)) return np.array(predicted) - def _construct_binary_outputs_matrix( - self, instance: np.ndarray, num_of_classes: int - ) -> np.ndarray: + def _construct_binary_outputs_matrix(self, instance: np.ndarray, num_of_classes: int) -> np.ndarray: binary_outputs_matrix = np.zeros((num_of_classes, num_of_classes)) for class_idx1 in range(len(self._labels)): for class_idx2 in range(class_idx1): - binary_outputs_matrix[class_idx1][ - class_idx2 - ] = self._binary_classifiers[class_idx1][class_idx2].predict([instance]) + binary_outputs_matrix[class_idx1][class_idx2] = self._binary_classifiers[class_idx1][class_idx2].predict([instance]) return binary_outputs_matrix def _learn_binary_classifiers(self, X: np.ndarray, y: np.ndarray) -> None: for row in range(len(self._labels)): for col in range(row): first_class, second_class = self._labels[row], 
self._labels[col] - filtered_indices = [ - idx - for idx in range(len(y)) - if y[idx] in (first_class, second_class) - ] + filtered_indices = [idx for idx in range(len(y)) if y[idx] in (first_class, second_class)] X_filtered, y_filtered = X[filtered_indices], y[filtered_indices] if self.should_perform_oversampling(first_class, second_class): X_filtered, y_filtered = self._oversample(X_filtered, y_filtered) @@ -153,8 +141,7 @@ def _get_classifier(self) -> ClassifierMixin: if isinstance(self.binary_classifier, str): if self.binary_classifier not in OVO._allowed_classifiers: raise ValueError( - "Unknown binary classifier: %s, expected to be one of %s." - % (self.binary_classifier, OVO._allowed_classifiers) + "Unknown binary classifier: %s, expected to be one of %s." % (self.binary_classifier, OVO._allowed_classifiers) ) elif self.binary_classifier == "tree": decision_tree_classifier = DecisionTreeClassifier(random_state=42) @@ -166,35 +153,24 @@ def _get_classifier(self) -> ClassifierMixin: knn = KNeighborsClassifier(n_neighbors=self.n_neighbors) return knn else: - if not hasattr(self.binary_classifier, "fit") or not hasattr( - self.binary_classifier, "predict" - ): - raise ValueError( - "Your classifier must implement fit and predict methods" - ) + if not hasattr(self.binary_classifier, "fit") or not hasattr(self.binary_classifier, "predict"): + raise ValueError("Your classifier must implement fit and predict methods") return deepcopy(self.binary_classifier) def _perform_max_voting(self, binary_outputs_matrix: np.ndarray) -> np.ndarray: scores = np.zeros(len(self._labels)) for clf_1 in range(len(binary_outputs_matrix)): for clf_2 in range(clf_1): - scores[ - self._labels.tolist().index(binary_outputs_matrix[clf_1][clf_2]) - ] += 1 + scores[self._labels.tolist().index(binary_outputs_matrix[clf_1][clf_2])] += 1 return self._labels[np.argmax(scores)] - def _oversample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _oversample(self, X: 
np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: if self.preprocessing is None: return X, y if isinstance(self.preprocessing, str): if self.preprocessing not in OVO._allowed_preprocessing: - raise ValueError( - "Unknown preprocessing: %s, expected to be one of %s." - % (self.preprocessing, OVO._allowed_preprocessing) - ) + raise ValueError("Unknown preprocessing: %s, expected to be one of %s." % (self.preprocessing, OVO._allowed_preprocessing)) elif np.unique(y).size == 1: return X, y elif self.preprocessing == "globalCS": @@ -210,14 +186,10 @@ def _oversample( raise ValueError("Your resampler must implement fit_resample method") return self.preprocessing.fit_resample(X, y) - def _smote_oversample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _smote_oversample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: n_neighbors = min(3, min(np.unique(y, return_counts=True)[1]) - 1) if n_neighbors == 0: - raise ValueError( - "In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class" - ) + raise ValueError("In order to use SMOTE preprocessing, the training set should contain at least 2 examples from each class") smote = SMOTE(k_neighbors=n_neighbors, random_state=42) return smote.fit_resample(X, y) @@ -230,10 +202,6 @@ def should_perform_oversampling(self, first_class: int, second_class: int) -> No elif self.preprocessing_between == "all": return True elif self.preprocessing_between == "maj-min": - return ( - first_class in self._minority_classes - and second_class not in self._minority_classes - ) or ( - second_class in self._minority_classes - and first_class not in self._minority_classes + return (first_class in self._minority_classes and second_class not in self._minority_classes) or ( + second_class in self._minority_classes and first_class not in self._minority_classes ) diff --git a/multi_imbalance/ensemble/soup_bagging.py 
b/multi_imbalance/ensemble/soup_bagging.py index 90dd006..79860c1 100644 --- a/multi_imbalance/ensemble/soup_bagging.py +++ b/multi_imbalance/ensemble/soup_bagging.py @@ -82,24 +82,18 @@ def fit_classifier( ) x_out, y_out = out_of_bag[:, :-1], out_of_bag[:, -1].astype(int) - x_resampled, y_resampled = SOUP(maj_int_min=maj_int_min).fit_resample( - x_sampled, y_sampled - ) + x_resampled, y_resampled = SOUP(maj_int_min=maj_int_min).fit_resample(x_sampled, y_sampled) clf.fit(x_resampled, y_resampled) result = clf.predict_proba(x_out) class_sum_prob = np.sum(result, axis=0) + 0.001 class_quantities = Counter(y_out) - expected_sum_prob = np.array( - [class_quantities[i] for i in range(len(Counter(y)))] - ) + expected_sum_prob = np.array([class_quantities[i] for i in range(len(Counter(y)))]) try: global_weights = expected_sum_prob / class_sum_prob except Exception: global_weights = np.ones(shape=len(Counter(y))) - print( - f"Exc {Counter(y)} {Counter(y_out)} {result.shape} {expected_sum_prob.shape} {class_sum_prob.shape}" - ) + print(f"Exc {Counter(y)} {Counter(y_out)} {result.shape} {expected_sum_prob.shape} {class_sum_prob.shape}") return clf, global_weights def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): @@ -129,8 +123,6 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): for i, clf in enumerate(self.classifiers) ], ) - pool.close() - pool.join() for i, (clf, weights) in enumerate(results): self.classifiers[i] = clf self.clf_weights[i] = weights @@ -169,15 +161,11 @@ def predict(self, X: np.ndarray, strategy: str = "average") -> np.ndarray: p = np.zeros(shape=(n_samples, n_classes)) - 1 for i in range(n_classes): - two_dim_class_vector = weights_sum[ - :, :, i - ] # [:,:,1] -> [classifiers x samples] + two_dim_class_vector = weights_sum[:, :, i] # [:,:,1] -> [classifiers x samples] if i in self.maj_int_min["min"]: squeeze_with_strategy = np.max(two_dim_class_vector, axis=0) else: - squeeze_with_strategy = np.min( - two_dim_class_vector, axis=0 - ) # [1, 
n_samples, 1] -> [n_samples] + squeeze_with_strategy = np.min(two_dim_class_vector, axis=0) # [1, n_samples, 1] -> [n_samples] p[:, i] = squeeze_with_strategy assert -1 not in p elif strategy == "global": diff --git a/multi_imbalance/resampling/global_cs.py b/multi_imbalance/resampling/global_cs.py index 44578a1..18b3483 100644 --- a/multi_imbalance/resampling/global_cs.py +++ b/multi_imbalance/resampling/global_cs.py @@ -18,9 +18,7 @@ def __init__(self, shuffle: bool = True) -> None: self.shuffle = shuffle self.quantities, self.max_quantity, self.X, self.y = [None] * 4 - def _fit_resample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -30,9 +28,7 @@ def _fit_resample( Resampled X (max class quantity * number of unique classes), y (number of rows in X) as numpy array """ assert len(X.shape) == 2, "X should have 2 dimension" - assert ( - X.shape[0] == y.shape[0] - ), "Number of labels must be equal to number of samples" + assert X.shape[0] == y.shape[0], "Number of labels must be equal to number of samples" self.quantities = Counter(y) self.max_quantity = int(np.max(list(self.quantities.values()))) @@ -52,17 +48,11 @@ def _fit_resample( return np.array(result_X), np.array(result_y) - def _equal_oversample( - self, X: np.ndarray, y: np.ndarray, class_name: str - ) -> Tuple[List[np.ndarray], List[np.ndarray]]: - indices_in_class = [ - i for i, class_label in enumerate(y) if class_label == class_name - ] + def _equal_oversample(self, X: np.ndarray, y: np.ndarray, class_name: str) -> Tuple[List[np.ndarray], List[np.ndarray]]: + indices_in_class = [i for i, class_label in enumerate(y) if class_label == class_name] desired_quantity = self.max_quantity - len(indices_in_class) - oversampled_X, oversampled_y = list(X[indices_in_class]), list( - 
y[indices_in_class] - ) + oversampled_X, oversampled_y = list(X[indices_in_class]), list(y[indices_in_class]) for i in range(desired_quantity): sample_index_to_duplicate: int = i % self.quantities[class_name] diff --git a/multi_imbalance/resampling/mdo.py b/multi_imbalance/resampling/mdo.py index 6ccb85b..aabe26a 100644 --- a/multi_imbalance/resampling/mdo.py +++ b/multi_imbalance/resampling/mdo.py @@ -49,9 +49,7 @@ def __init__( self.prop = prop self.class_balances = maj_int_min - def _fit_resample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ :param X: two dimensional numpy array (number of samples x number of features) with float numbers @@ -76,15 +74,11 @@ def _fit_resample( if minority_classes is not None and class_label not in minority_classes: continue - chosen_minor_class_samples_to_oversample, weights = self._choose_samples( - class_label - ) + chosen_minor_class_samples_to_oversample, weights = self._choose_samples(class_label) if len(chosen_minor_class_samples_to_oversample) == 0: continue - oversampling_rate = int( - (goal_quantity - quantities[class_label]) * self.prop - ) + oversampling_rate = int((goal_quantity - quantities[class_label]) * self.prop) if oversampling_rate > 0: if len(chosen_minor_class_samples_to_oversample) == 1: oversampled_set = np.repeat( @@ -93,21 +87,14 @@ def _fit_resample( axis=0, ) else: - chosen_samples_features_mean = np.mean( - chosen_minor_class_samples_to_oversample, axis=0 - ) - zero_mean_samples = ( - chosen_minor_class_samples_to_oversample - - chosen_samples_features_mean - ) + chosen_samples_features_mean = np.mean(chosen_minor_class_samples_to_oversample, axis=0) + zero_mean_samples = chosen_minor_class_samples_to_oversample - chosen_samples_features_mean n_components = min(zero_mean_samples.shape) pca = PCA(n_components=n_components).fit(zero_mean_samples) uncorrelated_samples = 
pca.transform(zero_mean_samples) - variables_variance = np.diag( - np.cov(uncorrelated_samples, rowvar=False) - ) + variables_variance = np.diag(np.cov(uncorrelated_samples, rowvar=False)) oversampled_set = self._MDO_oversampling( uncorrelated_samples, @@ -115,35 +102,21 @@ def _fit_resample( oversampling_rate, weights, ) - oversampled_set = ( - pca.inverse_transform(oversampled_set) - + chosen_samples_features_mean - ) + oversampled_set = pca.inverse_transform(oversampled_set) + chosen_samples_features_mean oversampled_X = np.vstack((oversampled_X, oversampled_set)) - oversampled_y = np.hstack( - (oversampled_y, np.array([class_label] * oversampling_rate)) - ) + oversampled_y = np.hstack((oversampled_y, np.array([class_label] * oversampling_rate))) return oversampled_X, oversampled_y def _choose_samples(self, class_label: str) -> Tuple[np.ndarray, np.ndarray]: - minor_class_indices = [ - i for i, value in enumerate(self.y) if value == class_label - ] + minor_class_indices = [i for i, value in enumerate(self.y) if value == class_label] minor_set = self.X[minor_class_indices] - quantity_same_class_neighbours = self.calculate_same_class_neighbour_quantities( - minor_set, class_label - ) - chosen_minor_class_samples_to_oversample = minor_set[ - quantity_same_class_neighbours >= self.k1 - ] - - weights = ( - quantity_same_class_neighbours[quantity_same_class_neighbours >= self.k1] - / self.k2 - ) + quantity_same_class_neighbours = self.calculate_same_class_neighbour_quantities(minor_set, class_label) + chosen_minor_class_samples_to_oversample = minor_set[quantity_same_class_neighbours >= self.k1] + + weights = quantity_same_class_neighbours[quantity_same_class_neighbours >= self.k1] / self.k2 weights_sum = np.sum(weights) if weights_sum != 0: @@ -154,9 +127,7 @@ def _choose_samples(self, class_label: str) -> Tuple[np.ndarray, np.ndarray]: return chosen_minor_class_samples_to_oversample, weights - def _MDO_oversampling( - self, T: np.ndarray, v: np.ndarray, 
oversampling_rate: int, weights: np.ndarray - ) -> np.ndarray: + def _MDO_oversampling(self, T: np.ndarray, v: np.ndarray, oversampling_rate: int, weights: np.ndarray) -> np.ndarray: oversampled_set = list() V = np.clip(np.copy(v), a_min=0.001, a_max=None) for _ in range(oversampling_rate): @@ -176,28 +147,18 @@ def _MDO_oversampling( last = (1 - s) * alpha_V[-1] last_feature = np.sqrt(last) if last > 0 else 0 - random_last_feature = self.random_state.choice( - [-last_feature, last_feature], 1 - )[0] + random_last_feature = self.random_state.choice([-last_feature, last_feature], 1)[0] features_vector.append(random_last_feature) oversampled_set.append(features_vector) return np.array(oversampled_set) - def calculate_same_class_neighbour_quantities( - self, S_minor: np.ndarray, S_minor_label: str - ) -> np.ndarray: - minority_class_neighbours_indices = self.knn.kneighbors( - S_minor, return_distance=False - ) + def calculate_same_class_neighbour_quantities(self, S_minor: np.ndarray, S_minor_label: str) -> np.ndarray: + minority_class_neighbours_indices = self.knn.kneighbors(S_minor, return_distance=False) quantity_with_same_label_in_neighbourhood = list() for i in range(len(S_minor)): sample_neighbours_indices = minority_class_neighbours_indices[i][1:] - quantity_sample_neighbours_indices_with_same_label = sum( - self.y[sample_neighbours_indices] == S_minor_label - ) - quantity_with_same_label_in_neighbourhood.append( - quantity_sample_neighbours_indices_with_same_label - ) + quantity_sample_neighbours_indices_with_same_label = sum(self.y[sample_neighbours_indices] == S_minor_label) + quantity_with_same_label_in_neighbourhood.append(quantity_sample_neighbours_indices_with_same_label) return np.array(quantity_with_same_label_in_neighbourhood) diff --git a/multi_imbalance/resampling/soup.py b/multi_imbalance/resampling/soup.py index 71e3842..3366d83 100644 --- a/multi_imbalance/resampling/soup.py +++ b/multi_imbalance/resampling/soup.py @@ -42,9 +42,7 @@ def __init__( 
self.dsc_maj_cls, self.asc_min_cls = None, None self._X, self._y = None, None - def _fit_resample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ The method computes the metrics required for resampling based on the given set @@ -63,9 +61,7 @@ def _fit_resample( self._y = deepcopy(y) assert len(self._X.shape) == 2, "X should have 2 dimension" - assert ( - self._X.shape[0] == self._y.shape[0] - ), "Number of labels must be equal to number of samples" + assert self._X.shape[0] == self._y.shape[0], "Number of labels must be equal to number of samples" self.quantities = Counter(self._y) self.goal_quantity = self._calculate_goal_quantity(self.maj_int_min) @@ -91,37 +87,27 @@ def _fit_resample( return np.array(self._X), np.array(self._y) - def _construct_class_safe_levels( - self, X: np.ndarray, y: np.ndarray, class_name: str - ) -> defaultdict: + def _construct_class_safe_levels(self, X: np.ndarray, y: np.ndarray, class_name: str) -> defaultdict: self.quantities = Counter(y) indices_in_class = [i for i, value in enumerate(y) if value == class_name] neigh_clf = NearestNeighbors(n_neighbors=self.k + 1).fit(X) - neighbour_indices = neigh_clf.kneighbors( - X[indices_in_class], return_distance=False - )[:, 1:] + neighbour_indices = neigh_clf.kneighbors(X[indices_in_class], return_distance=False)[:, 1:] neighbour_classes = y[neighbour_indices] class_safe_levels = defaultdict(float) for i, sample_id in enumerate(indices_in_class): neighbours_quantities = Counter(neighbour_classes[i]) - class_safe_levels[sample_id] = self._calculate_sample_safe_level( - class_name, neighbours_quantities - ) + class_safe_levels[sample_id] = self._calculate_sample_safe_level(class_name, neighbours_quantities) return class_safe_levels - def _calculate_sample_safe_level( - self, class_name: str, neighbours_quantities: Counter - ) -> float: + def _calculate_sample_safe_level(self, 
class_name: str, neighbours_quantities: Counter) -> float: safe_level = 0 q: Counter = self.quantities for neigh_label, neigh_q in neighbours_quantities.items(): - similarity_between_classes = min(q[class_name], q[neigh_label]) / max( - q[class_name], q[neigh_label] - ) + similarity_between_classes = min(q[class_name], q[neigh_label]) / max(q[class_name], q[neigh_label]) safe_level += neigh_q * similarity_between_classes safe_level /= self.k @@ -131,67 +117,43 @@ def _calculate_sample_safe_level( return safe_level - def _undersample( - self, X: np.ndarray, y: np.ndarray, class_name: str - ) -> Tuple[np.ndarray, np.ndarray]: - safe_levels_of_samples_in_class = self._construct_class_safe_levels( - X, y, class_name - ) + def _undersample(self, X: np.ndarray, y: np.ndarray, class_name: str) -> Tuple[np.ndarray, np.ndarray]: + safe_levels_of_samples_in_class = self._construct_class_safe_levels(X, y, class_name) class_quantity = self.quantities[class_name] - safe_levels_list = sorted( - safe_levels_of_samples_in_class.items(), key=itemgetter(1) - ) + safe_levels_list = sorted(safe_levels_of_samples_in_class.items(), key=itemgetter(1)) samples_to_remove_quantity = max(0, int(class_quantity - self.goal_quantity)) if samples_to_remove_quantity > 0: - remove_indices = list( - map(itemgetter(0), safe_levels_list[:samples_to_remove_quantity]) - ) + remove_indices = list(map(itemgetter(0), safe_levels_list[:samples_to_remove_quantity])) X = np.delete(X, remove_indices, axis=0) y = np.delete(y, remove_indices, axis=0) return X, y - def _oversample( - self, X: np.ndarray, y: np.ndarray, class_name: str - ) -> Tuple[np.ndarray, np.ndarray]: - safe_levels_of_samples_in_class = self._construct_class_safe_levels( - X, y, class_name - ) + def _oversample(self, X: np.ndarray, y: np.ndarray, class_name: str) -> Tuple[np.ndarray, np.ndarray]: + safe_levels_of_samples_in_class = self._construct_class_safe_levels(X, y, class_name) class_quantity = self.quantities[class_name] - 
safe_levels_list = list( - sorted( - safe_levels_of_samples_in_class.items(), key=itemgetter(1), reverse=True - ) - ) + safe_levels_list = list(sorted(safe_levels_of_samples_in_class.items(), key=itemgetter(1), reverse=True)) difference = self.goal_quantity - class_quantity while difference > 0: quantity_items_to_copy = min(difference, class_quantity) - indices_to_copy = list( - map(itemgetter(0), safe_levels_list[:quantity_items_to_copy]) - ) + indices_to_copy = list(map(itemgetter(0), safe_levels_list[:quantity_items_to_copy])) X = np.vstack((X, X[indices_to_copy])) y = np.hstack((y, y[indices_to_copy])) difference -= quantity_items_to_copy return X, y - def _calculate_goal_quantity( - self, maj_int_min: Union[Dict[str, List[int]], None] = None - ) -> Union[int, float]: + def _calculate_goal_quantity(self, maj_int_min: Union[Dict[str, List[int]], None] = None) -> Union[int, float]: if maj_int_min is None: maj_q = max(list(self.quantities.values())) min_q = min(list(self.quantities.values())) return np.mean((min_q, maj_q), dtype=int) else: - maj_classes = { - k: v for k, v in self.quantities.items() if k in maj_int_min["maj"] - } + maj_classes = {k: v for k, v in self.quantities.items() if k in maj_int_min["maj"]} maj_q = list(maj_classes.values()) - min_classes = { - k: v for k, v in self.quantities.items() if k in maj_int_min["min"] - } + min_classes = {k: v for k, v in self.quantities.items() if k in maj_int_min["min"]} min_q = list(min_classes.values()) if len(min_q) == 0 and len(maj_q) == 0: diff --git a/multi_imbalance/resampling/spider.py b/multi_imbalance/resampling/spider.py index 9a57ee1..d4de036 100644 --- a/multi_imbalance/resampling/spider.py +++ b/multi_imbalance/resampling/spider.py @@ -43,9 +43,7 @@ def __init__( self.cost = cost self.AS, self.RS = np.array([]), np.array([]) - def _fit_resample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, 
np.ndarray]: """ Performs resampling @@ -110,17 +108,11 @@ def _estimate_cost_matrix(y: Union[np.ndarray, List[int]]) -> np.ndarray: np.fill_diagonal(cost, 0) return cost - def _sort_by_cardinality( - self, y: Union[List[int], np.ndarray] - ) -> Tuple[List[int], List[int]]: + def _sort_by_cardinality(self, y: Union[List[int], np.ndarray]) -> Tuple[List[int], List[int]]: class_cardinality = Counter(y) # to ensure looping over classes with decreasing cardinality. - int_classes = sorted( - self.intermediate_classes, key=lambda clazz: -class_cardinality[clazz] - ) - min_classes = sorted( - self.minority_classes, key=lambda clazz: -class_cardinality[clazz] - ) + int_classes = sorted(self.intermediate_classes, key=lambda clazz: -class_cardinality[clazz]) + min_classes = sorted(self.minority_classes, key=lambda clazz: -class_cardinality[clazz]) return int_classes, min_classes def amplify(self, int_min_class: str) -> None: @@ -223,9 +215,7 @@ def _min_cost_classes(self, x: np.ndarray, DS: np.ndarray) -> np.ndarray: for cj in C: s = 0 for ci in C: - s += ((kneighbors[:, -1] == ci).astype(int).sum() / self.k) * self.cost[ - C.index(ci), C.index(cj) - ] + s += ((kneighbors[:, -1] == ci).astype(int).sum() / self.k) * self.cost[C.index(ci), C.index(cj)] vals.append(s) C = np.array(C) vals = np.array(vals) @@ -244,8 +234,7 @@ def _relabel_nn(self, x: np.ndarray) -> None: if ( contains(self.RS, neighbor) and self._class_of(neighbor) in self.majority_classes - and self._class_of(neighbor) - in self._min_cost_classes(x, self._ds_as_rs_union()) + and self._class_of(neighbor) in self._min_cost_classes(x, self._ds_as_rs_union()) ): self.RS = setdiff(self.RS, np.array([neighbor])) neighbor[-1] = x[-1] @@ -260,9 +249,9 @@ def _clean_nn(self, x: np.ndarray) -> None: """ nearest_neighbors = self._knn(x, self._ds_as_rs_union()) for neighbor in nearest_neighbors: - if self._class_of(neighbor) in self.majority_classes and self._class_of( - neighbor - ) in self._min_cost_classes(x, 
self._ds_as_rs_union()): + if self._class_of(neighbor) in self.majority_classes and self._class_of(neighbor) in self._min_cost_classes( + x, self._ds_as_rs_union() + ): self.DS = setdiff(self.DS, np.array([neighbor])) self.RS = setdiff(self.RS, np.array([neighbor])) @@ -291,8 +280,7 @@ def _knn(self, x: np.ndarray, DS: np.ndarray) -> np.ndarray: within_radius = self.neigh_clf.radius_neighbors( [x[:-1]], radius=self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1] - + 0.0001 - * self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1], + + 0.0001 * self.neigh_clf.kneighbors([x[:-1]], return_distance=True)[0][0][-1], return_distance=True, ) @@ -314,9 +302,7 @@ def _amplify_nn(self, x: np.ndarray) -> None: Single observation. """ - while self._class_of(x) not in self._min_cost_classes( - x, self._ds_as_rs_union() - ): + while self._class_of(x) not in self._min_cost_classes(x, self._ds_as_rs_union()): y = x.copy() self.AS = union(self.AS, np.asarray([y])) diff --git a/multi_imbalance/resampling/static_smote.py b/multi_imbalance/resampling/static_smote.py index ddf437e..f3f5ff6 100644 --- a/multi_imbalance/resampling/static_smote.py +++ b/multi_imbalance/resampling/static_smote.py @@ -20,9 +20,7 @@ def __init__(self) -> None: super().__init__() self._sampling_type = "over-sampling" - def _fit_resample( - self, X: np.ndarray, y: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + def _fit_resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ Performs resampling diff --git a/multi_imbalance/utils/data.py b/multi_imbalance/utils/data.py index 7e10a78..4a4212a 100644 --- a/multi_imbalance/utils/data.py +++ b/multi_imbalance/utils/data.py @@ -86,9 +86,7 @@ def load_arff_dataset( return X.to_numpy(), y -def load_datasets_arff( - return_non_cat_length: bool = False, dataset_paths: Union[str, None] = None -) -> OrderedDict: +def load_datasets_arff(return_non_cat_length: bool = False, dataset_paths: Union[str, None] = None) -> 
OrderedDict: if dataset_paths is None: dataset_paths = glob.glob(os.path.join(get_project_root(), "data", "arff", "*")) @@ -97,12 +95,8 @@ def load_datasets_arff( path = Path(path) dataset_name = path.stem if return_non_cat_length: - X, y, cat_length = load_arff_dataset( - path, return_non_cat_length=return_non_cat_length - ) - datasets[dataset_name] = Bunch( - data=X, target=y, non_cat_length=cat_length, DESCR=dataset_name - ) + X, y, cat_length = load_arff_dataset(path, return_non_cat_length=return_non_cat_length) + datasets[dataset_name] = Bunch(data=X, target=y, non_cat_length=cat_length, DESCR=dataset_name) else: X, y = load_arff_dataset(path, return_non_cat_length=return_non_cat_length) datasets[dataset_name] = Bunch(data=X, target=y, DESCR=dataset_name) @@ -135,9 +129,7 @@ def construct_maj_int_min(y: np.ndarray, strategy: str = "median") -> OrderedDic elif strategy == "average": middle_size = np.mean(list(class_sizes.values())) else: - raise ValueError( - f'Unrecognized {strategy}. Only "median" and "average" are allowed.' - ) + raise ValueError(f'Unrecognized {strategy}. 
Only "median" and "average" are allowed.') maj_int_min = OrderedDict({"maj": list(), "int": list(), "min": list()}) for class_label, class_size in class_sizes.items(): diff --git a/multi_imbalance/utils/metrics.py b/multi_imbalance/utils/metrics.py index ee260da..9431f54 100644 --- a/multi_imbalance/utils/metrics.py +++ b/multi_imbalance/utils/metrics.py @@ -2,9 +2,7 @@ import numpy as np -def gmean_score( - y_test: np.ndarray, y_pred: np.ndarray, correction: float = 0.001 -) -> float: # pragma no cover +def gmean_score(y_test: np.ndarray, y_pred: np.ndarray, correction: float = 0.001) -> float: # pragma no cover """ Calculate geometric mean score diff --git a/multi_imbalance/utils/plot.py b/multi_imbalance/utils/plot.py index 4501634..685474c 100644 --- a/multi_imbalance/utils/plot.py +++ b/multi_imbalance/utils/plot.py @@ -10,9 +10,7 @@ sns.set_style("darkgrid") -def plot_cardinality_and_2d_data( - X: np.ndarray, y: np.ndarray, dataset_name: str = "" -): # pragma no cover +def plot_cardinality_and_2d_data(X: np.ndarray, y: np.ndarray, dataset_name: str = ""): # pragma no cover """ Plots cardinality of classes from y as well as scatter plot of X transformed to two dimensions using PCA diff --git a/pyproject.toml b/pyproject.toml index eebb47a..010fee9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,4 +67,7 @@ testpaths = [ ] [tool.setuptools] -py-modules = ["multi_imbalance"] \ No newline at end of file +py-modules = ["multi_imbalance"] + +[tool.black] +line-length = 140 diff --git a/setup.cfg b/setup.cfg index 905c8bb..2b21f65 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,2 @@ [flake8] -max-line-length = 160 \ No newline at end of file +max-line-length = 140 \ No newline at end of file diff --git a/tests/ensemble/test_ecoc.py b/tests/ensemble/test_ecoc.py index edadf7d..79c1895 100644 --- a/tests/ensemble/test_ecoc.py +++ b/tests/ensemble/test_ecoc.py @@ -40,40 +40,7 @@ ] ) -y = np.array( - [ - 2, - 0, - 2, - 3, - 0, - 3, - 1, - 0, - 2, - 0, - 2, - 3, 
- 1, - 2, - 1, - 3, - 0, - 3, - 2, - 0, - 0, - 1, - 2, - 3, - 0, - 1, - 2, - 3, - 1, - 2, - ] -) +y = np.array([2, 0, 2, 3, 0, 3, 1, 0, 2, 0, 2, 3, 1, 2, 1, 3, 0, 3, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 1, 2]) def test_random_oversampling(): @@ -93,9 +60,7 @@ def test_no_oversampling(): assert y.shape == y_oversampled.shape -@pytest.mark.parametrize( - "encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"] -) +@pytest.mark.parametrize("encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"]) @pytest.mark.parametrize( "oversampling, minority_classes", [(None, None), ("globalCS", None), ("SMOTE", None), ("SOUP", [0, 2])], @@ -175,8 +140,7 @@ def test_unknown_encoding(): with pytest.raises(ValueError) as e: ecoc_clf.fit(X, y) assert ( - e.value.args[0] - == "Unknown matrix generation encoding: dummy, expected to be one of ['dense', 'sparse', 'complete', 'OVA', 'OVO']." + e.value.args[0] == "Unknown matrix generation encoding: dummy, expected to be one of ['dense', 'sparse', 'complete', 'OVA', 'OVO']." ) @@ -184,10 +148,7 @@ def test_unknown_weighting_strategy(): ecoc_clf = ecoc.ECOC(weights="dummy") with pytest.raises(ValueError) as e: ecoc_clf.fit(X, y) - assert ( - e.value.args[0] - == "Unknown weighting strategy: dummy, expected to be one of [None, 'acc', 'avg_tpr_min']." - ) + assert e.value.args[0] == "Unknown weighting strategy: dummy, expected to be one of [None, 'acc', 'avg_tpr_min']." 
def test_own_classifier_without_predict_and_fit(): @@ -211,9 +172,7 @@ def bar(self, X): def test_predefined_classifiers_and_weighting_without_exceptions(classifier, weights): ecoc_clf = ecoc.ECOC(binary_classifier=classifier, weights=weights) ecoc_clf.fit(X, y) - predicted = ecoc_clf.predict( - np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]) - ) + predicted = ecoc_clf.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) assert len(predicted) == 3 @@ -239,9 +198,7 @@ def bar(self, X): assert "fit_transform" in str(e.value) -@pytest.mark.parametrize( - "encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"] -) +@pytest.mark.parametrize("encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"]) @pytest.mark.parametrize("oversampling", [None, "globalCS", "SMOTE", "SOUP"]) def test_ecoc_with_sklearn_pipeline(encoding_strategy, oversampling): pipeline = Pipeline( @@ -251,7 +208,5 @@ def test_ecoc_with_sklearn_pipeline(encoding_strategy, oversampling): ] ) pipeline.fit(X, y) - y_hat = pipeline.predict( - np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]) - ) + y_hat = pipeline.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) assert len(y_hat) == 3 diff --git a/tests/ensemble/test_mrbbagging.py b/tests/ensemble/test_mrbbagging.py index c63c0dc..d5121b1 100644 --- a/tests/ensemble/test_mrbbagging.py +++ b/tests/ensemble/test_mrbbagging.py @@ -42,17 +42,13 @@ class TestMRBBagging(unittest.TestCase): def test_api(self): - mrbbagging = MRBBagging( - 1, DecisionTreeClassifier(random_state=0), random_state=0 - ) + mrbbagging = MRBBagging(1, DecisionTreeClassifier(random_state=0), random_state=0) mrbbagging.fit(X_train, y_train) y_pred = mrbbagging.predict(X_test) assert all(y_pred == y_test) def test_api_multiple_trees(self): - mrbbagging = MRBBagging( - 5, DecisionTreeClassifier(random_state=0), random_state=0 - ) + mrbbagging = MRBBagging(5, DecisionTreeClassifier(random_state=0), 
random_state=0) mrbbagging.fit(X_train, y_train) y_pred = mrbbagging.predict(X_test) assert all(y_pred == y_test) diff --git a/tests/ensemble/test_ovo.py b/tests/ensemble/test_ovo.py index 6b127ba..512bf53 100644 --- a/tests/ensemble/test_ovo.py +++ b/tests/ensemble/test_ovo.py @@ -35,9 +35,7 @@ ] ) -y = np.array( - [1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 1, 1, 2, 3, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1] -) +y = np.array([1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 1, 1, 2, 3, 2, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1]) def test_fit_predict(): @@ -107,18 +105,14 @@ def fit_resample(self, X, y): @pytest.mark.parametrize("preprocessing_btwn", ["all", "maj-min"]) @pytest.mark.parametrize("classifier", ["tree", "NB", "KNN"]) @pytest.mark.parametrize("preprocessing", [None, "globalCS", "SMOTE", "SOUP"]) -def test_predefined_classifiers_and_preprocessings_without_errors( - classifier, preprocessing, preprocessing_btwn -): +def test_predefined_classifiers_and_preprocessings_without_errors(classifier, preprocessing, preprocessing_btwn): ovo_clf = ovo.OVO( binary_classifier=classifier, preprocessing=preprocessing, preprocessing_between=preprocessing_btwn, ) ovo_clf.fit(X, y) - predicted = ovo_clf.predict( - np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]) - ) + predicted = ovo_clf.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) assert len(predicted) == 3 @@ -189,7 +183,5 @@ def test_ecoc_with_sklearn_pipeline(preprocessing_btwn, classifier, preprocessin ] ) pipeline.fit(X, y) - y_hat = pipeline.predict( - np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.99]]) - ) + y_hat = pipeline.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.99]])) assert len(y_hat) == 3 diff --git a/tests/resampling/test_mdo.py b/tests/resampling/test_mdo.py index 551c794..3a15be1 100644 --- a/tests/resampling/test_mdo.py +++ b/tests/resampling/test_mdo.py @@ -88,9 +88,7 @@ def _get_parametrized_mdo(X, y): return _get_parametrized_mdo -@pytest.mark.parametrize( - "X, y, 
sc_minor_expected, weights_expected", complete_test_data -) +@pytest.mark.parametrize("X, y, sc_minor_expected, weights_expected", complete_test_data) def test_choose_samples(X, y, sc_minor_expected, weights_expected, mdo_mock): clf = mdo_mock(X, y) SC_minor, weights = clf._choose_samples(1) diff --git a/tests/resampling/test_soup.py b/tests/resampling/test_soup.py index 16a3585..3ebe122 100644 --- a/tests/resampling/test_soup.py +++ b/tests/resampling/test_soup.py @@ -165,12 +165,8 @@ def _get_parametrized_soup(X, y): return _get_parametrized_soup -@pytest.mark.parametrize( - "X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data -) -def test_calculating_safe_levels_for_sample( - X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock -): +@pytest.mark.parametrize("X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data) +def test_calculating_safe_levels_for_sample(X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock): clf = soup_mock(X, y) neighbour_quantities = Counter({0: 3, 1: 1}) @@ -178,12 +174,8 @@ def test_calculating_safe_levels_for_sample( assert_array_almost_equal(safe_level, first_sample_safe) -@pytest.mark.parametrize( - "X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data -) -def test_calculating_safe_levels_for_class( - X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock -): +@pytest.mark.parametrize("X, y, zero_safe_levels, one_safe_levels, first_sample_safe", complete_test_data) +def test_calculating_safe_levels_for_class(X, y, zero_safe_levels, one_safe_levels, first_sample_safe, soup_mock): clf = soup_mock(X, y) zero_levels = clf._construct_class_safe_levels(X, y, 0) @@ -197,9 +189,7 @@ def test_calculating_safe_levels_for_class( "X, y, class_name, expected_undersampling, expected_oversampling", safe_levels_test_data, ) -def test_oversample( - X, y, class_name, expected_undersampling, expected_oversampling, soup_mock -): 
+def test_oversample(X, y, class_name, expected_undersampling, expected_oversampling, soup_mock): clf = soup_mock(X, y) oversampled_X, oversampled_y = clf._oversample(X, y, class_name) assert len(oversampled_X) == expected_oversampling @@ -210,9 +200,7 @@ def test_oversample( "X, y, class_name, expected_undersampling, expected_oversampling", safe_levels_test_data, ) -def test_undersample( - X, y, class_name, expected_undersampling, expected_oversampling, soup_mock -): +def test_undersample(X, y, class_name, expected_undersampling, expected_oversampling, soup_mock): clf = soup_mock(X, y) undersampled_X, undersampled_y = clf._undersample(X, y, class_name) assert len(undersampled_X) == expected_undersampling diff --git a/tests/resampling/test_spider.py b/tests/resampling/test_spider.py index 276f31c..39ff068 100644 --- a/tests/resampling/test_spider.py +++ b/tests/resampling/test_spider.py @@ -6,9 +6,7 @@ cost = np.ones((3, 3)) np.fill_diagonal(cost, 0) -spider = SPIDER3( - 1, maj_int_min={"maj": ["MAJ"], "int": ["INT"], "min": ["MIN"]}, cost=cost -) +spider = SPIDER3(1, maj_int_min={"maj": ["MAJ"], "int": ["INT"], "min": ["MIN"]}, cost=cost) def test_knn(): diff --git a/tests/utils/test_data.py b/tests/utils/test_data.py index 6502972..4d959fb 100644 --- a/tests/utils/test_data.py +++ b/tests/utils/test_data.py @@ -32,9 +32,7 @@ def test_preprocess(): def test_preprocess_without_one_hot_encode(): dir_path = os.path.dirname(os.path.realpath(__file__)) ds_path = os.path.join(dir_path, "ds_example.arrf") - x, y, non_cat = load_arff_dataset( - ds_path, return_non_cat_length=True, one_hot_encode=False - ) + x, y, non_cat = load_arff_dataset(ds_path, return_non_cat_length=True, one_hot_encode=False) assert all(y == np.array([0, 0, 0, 0, 0, 0, 0])) assert non_cat == 2 assert x.shape == (7, 2) @@ -75,13 +73,7 @@ def test_construct_maj_int_min_when_correct_and_median_strategy(): 5: 10, 8: 12, } - y = np.array( - [ - class_label - for class_label, class_size in 
class_sizes.items() - for _ in range(class_size) - ] - ) + y = np.array([class_label for class_label, class_size in class_sizes.items() for _ in range(class_size)]) np.random.shuffle(y) maj_int_dict = construct_maj_int_min(y, strategy="median") @@ -104,13 +96,7 @@ def test_construct_maj_int_min_when_correct_and_average_strategy(): 5: 10, 8: 2000, } - y = np.array( - [ - class_label - for class_label, class_size in class_sizes.items() - for _ in range(class_size) - ] - ) + y = np.array([class_label for class_label, class_size in class_sizes.items() for _ in range(class_size)]) np.random.shuffle(y) maj_int_dict = construct_maj_int_min(y, strategy="average") @@ -132,13 +118,7 @@ def test_construct_maj_int_min_when_wrong_strategy(): 5: 10, 8: 2000, } - y = np.array( - [ - class_label - for class_label, class_size in class_sizes.items() - for _ in range(class_size) - ] - ) + y = np.array([class_label for class_label, class_size in class_sizes.items() for _ in range(class_size)]) np.random.shuffle(y) with pytest.raises(ValueError): From 36edd37aaaa5f15aca863040b137920582d76b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sun, 6 Nov 2022 12:35:48 +0100 Subject: [PATCH 13/48] new authors and joblib instead of multiprocessing --- multi_imbalance/ensemble/soup_bagging.py | 13 ++++++------- pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/multi_imbalance/ensemble/soup_bagging.py b/multi_imbalance/ensemble/soup_bagging.py index 79860c1..8f7ad92 100644 --- a/multi_imbalance/ensemble/soup_bagging.py +++ b/multi_imbalance/ensemble/soup_bagging.py @@ -2,6 +2,7 @@ from collections import Counter from copy import deepcopy from typing import Dict, List, Tuple, Union +from joblib import Parallel, delayed import numpy as np from sklearn.ensemble import BaggingClassifier @@ -108,11 +109,9 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): self object """ self.classes = np.unique(y) - - pool = 
multiprocessing.Pool(self.num_core) - results = pool.map( - fit_clf, - [ + parallel = Parallel(n_jobs=self.num_core) + results = parallel( + delayed(fit_clf)( ( clf, X, @@ -120,8 +119,8 @@ def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): resample(X, y, stratify=y, random_state=i), self.maj_int_min, ) - for i, clf in enumerate(self.classifiers) - ], + ) + for i, clf in enumerate(self.classifiers) ) for i, (clf, weights) in enumerate(results): self.classifiers[i] = clf diff --git a/pyproject.toml b/pyproject.toml index 010fee9..59165d4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ name = "multi-imbalance" description = "Python package for tackling multiclass imbalance problems." version = "0.1.0" maintainers = [ - {name = "Damian Horna, Kamil Pluciński, Hanna Klimczak, Jacek Grycza", email = "horna.damian@gmail.com"} + {name = "Damian Horna, Kamil Pluciński, Hanna Klimczak, Jacek Grycza, Jan Kozłowski, Maciej Falbogowski, Adam Wojciechowski, Mateusz Woźny", email = "horna.damian@gmail.com"} ] readme = "README.md" classifiers=[ From 17514c4362065c285b17b03b31b64dd00a1a07bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sun, 4 Dec 2022 10:16:29 +0100 Subject: [PATCH 14/48] first version --- .gitignore | 1 + examples/datasets/analysis.ipynb | 457 +++++++++++++++++++++++ multi_imbalance/datasets/_data_loader.py | 13 +- multi_imbalance/datasets/analysis.py | 176 +++++++++ tests/conftest.py | 45 +++ tests/datasets/test_analysis.py | 196 ++++++++++ tests/ensemble/test_ecoc.py | 107 ++---- 7 files changed, 921 insertions(+), 74 deletions(-) create mode 100644 examples/datasets/analysis.ipynb create mode 100644 multi_imbalance/datasets/analysis.py create mode 100644 tests/conftest.py create mode 100644 tests/datasets/test_analysis.py diff --git a/.gitignore b/.gitignore index 00de997..6b66379 100644 --- a/.gitignore +++ b/.gitignore @@ -70,6 +70,7 @@ venv # Datasets data/extracted/ +data/csv/ docs/_build htmlcov \ No newline 
at end of file diff --git a/examples/datasets/analysis.ipynb b/examples/datasets/analysis.ipynb new file mode 100644 index 0000000..77c3583 --- /dev/null +++ b/examples/datasets/analysis.ipynb @@ -0,0 +1,457 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "from imblearn.metrics import geometric_mean_score\n", + "from multi_imbalance.datasets.analysis import AnalysisPipeline, Config\n", + "from pathlib import Path\n", + "from sklearn.metrics import accuracy_score\n", + "from multi_imbalance.datasets import load_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "load_datasets(save_to_csv=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "Path(\"results.csv\").unlink()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cwd = Path.cwd()\n", + "\n", + "config = {\n", + " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"glass.csv\"],\n", + " \"classifiers\": {\n", + " \"tree\": [{\"max_depth\" : 100}, {\"max_depth\" : 20}, {\"max_depth\" : None}],\n", + " },\n", + " \"resample_methods\": {\n", + " \"globalCS\": {\"shuffle\": True},\n", + " \"MDO\": dict(k1_frac=0.3, maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}),\n", + " \"StaticSMOTE\": {},\n", + " \"SOUP\": {\"shuffle\": True},\n", + " },\n", + " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}},\n", + " \"n_repeats\": 10,\n", + " \"train_test_split_kwargs\": dict(test_size=0.35),\n", + "}\n", + "\n", + "config2 = {\n", + " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"new_ecoli.csv\"],\n", + " \"classifiers\": {\n", + " \"tree\": [{\"max_depth\" : 100}, {\"max_depth\" : 20}, {\"max_depth\" : None}],\n", + " },\n", + " \"resample_methods\": {\n", + " 
\"globalCS\": {\"shuffle\": True},\n", + " \"MDO\": dict(k1_frac=0.3, maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}),\n", + " \"StaticSMOTE\": {},\n", + " \"SOUP\": {\"shuffle\": True},\n", + " },\n", + " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}},\n", + " \"n_repeats\": 10,\n", + " \"train_test_split_kwargs\": dict(test_size=0.35),\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + 
"c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + 
"c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + 
"c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + 
"c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "c = Config.from_dict(config)\n", + "c2 = Config.from_dict(config2)\n", + "pipeline = AnalysisPipeline([c, c2])\n", + "pipeline.run_analysis(\"results.csv\", explode_clf_kwargs = False, train_without_resampling = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "df=pd.read_csv(\"results.csv\")\n", + "df.fillna(0, inplace=True)\n", + "group_df = df[[\"dataset_name\", \"classifier\", \"resampling_method\", \"metric_value\", \"metric_name\", \"kwargs\"]].groupby(by=[\"classifier\",\"metric_name\" ,\"dataset_name\", \"resampling_method\", \"kwargs\"]).agg({'metric_value': ['mean', 'std']})" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_value
meanstd
dataset_nameresampling_methodkwargs
glassMDO{'max_depth': 100}0.6438550.070864
{'max_depth': 20}0.5946530.158337
{'max_depth': None}0.6364540.094362
Not defined{'max_depth': 100}0.5347180.175402
{'max_depth': 20}0.5729390.147061
{'max_depth': None}0.6315420.062332
SOUP{'max_depth': 100}0.6602340.052351
{'max_depth': 20}0.6194250.095159
{'max_depth': None}0.6560910.073783
StaticSMOTE{'max_depth': 100}0.6216950.147462
{'max_depth': 20}0.6499090.053088
{'max_depth': None}0.5319510.171160
globalCS{'max_depth': 100}0.6327910.070711
{'max_depth': 20}0.5066500.160718
{'max_depth': None}0.5801050.099831
new_ecoliMDO{'max_depth': 100}0.7777600.052380
{'max_depth': 20}0.7560260.041473
{'max_depth': None}0.7542410.050392
Not defined{'max_depth': 100}0.6965990.043062
{'max_depth': 20}0.7067520.076627
{'max_depth': None}0.7205470.062143
SOUP{'max_depth': 100}0.7498570.028518
{'max_depth': 20}0.7471580.046176
{'max_depth': None}0.7397590.075347
StaticSMOTE{'max_depth': 100}0.7194960.048702
{'max_depth': 20}0.7211930.050183
{'max_depth': None}0.6940620.066351
globalCS{'max_depth': 100}0.6671230.049824
{'max_depth': 20}0.6943780.075182
{'max_depth': None}0.6714510.058819
\n", + "
" + ], + "text/plain": [ + " metric_value \n", + " mean std\n", + "dataset_name resampling_method kwargs \n", + "glass MDO {'max_depth': 100} 0.643855 0.070864\n", + " {'max_depth': 20} 0.594653 0.158337\n", + " {'max_depth': None} 0.636454 0.094362\n", + " Not defined {'max_depth': 100} 0.534718 0.175402\n", + " {'max_depth': 20} 0.572939 0.147061\n", + " {'max_depth': None} 0.631542 0.062332\n", + " SOUP {'max_depth': 100} 0.660234 0.052351\n", + " {'max_depth': 20} 0.619425 0.095159\n", + " {'max_depth': None} 0.656091 0.073783\n", + " StaticSMOTE {'max_depth': 100} 0.621695 0.147462\n", + " {'max_depth': 20} 0.649909 0.053088\n", + " {'max_depth': None} 0.531951 0.171160\n", + " globalCS {'max_depth': 100} 0.632791 0.070711\n", + " {'max_depth': 20} 0.506650 0.160718\n", + " {'max_depth': None} 0.580105 0.099831\n", + "new_ecoli MDO {'max_depth': 100} 0.777760 0.052380\n", + " {'max_depth': 20} 0.756026 0.041473\n", + " {'max_depth': None} 0.754241 0.050392\n", + " Not defined {'max_depth': 100} 0.696599 0.043062\n", + " {'max_depth': 20} 0.706752 0.076627\n", + " {'max_depth': None} 0.720547 0.062143\n", + " SOUP {'max_depth': 100} 0.749857 0.028518\n", + " {'max_depth': 20} 0.747158 0.046176\n", + " {'max_depth': None} 0.739759 0.075347\n", + " StaticSMOTE {'max_depth': 100} 0.719496 0.048702\n", + " {'max_depth': 20} 0.721193 0.050183\n", + " {'max_depth': None} 0.694062 0.066351\n", + " globalCS {'max_depth': 100} 0.667123 0.049824\n", + " {'max_depth': 20} 0.694378 0.075182\n", + " {'max_depth': None} 0.671451 0.058819" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "group_df.loc[\"tree\", \"geometric_mean_score\", :, :]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.2 ('.test')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/multi_imbalance/datasets/_data_loader.py b/multi_imbalance/datasets/_data_loader.py index 3dd5b81..fa9f29a 100644 --- a/multi_imbalance/datasets/_data_loader.py +++ b/multi_imbalance/datasets/_data_loader.py @@ -23,7 +23,7 @@ from os.path import join, isfile import numpy as np - +import pandas as pd from sklearn.datasets._base import Bunch PRE_FILENAME = "x" @@ -57,7 +57,7 @@ MAP_ID_NAME[v + 1] = k -def load_datasets(data_home: str = DATA_HOME_BASIC) -> OrderedDict: +def load_datasets(data_home: str = DATA_HOME_BASIC, save_to_csv: bool = False) -> OrderedDict: """ Load the benchmark datasets. @@ -73,6 +73,7 @@ def load_datasets(data_home: str = DATA_HOME_BASIC) -> OrderedDict: string Description of the each dataset. 
""" extracted_dir = join(data_home, "extracted") + csv_dir = join(data_home, "csv") datasets = OrderedDict() filter_data_ = MAP_NAME_ID.keys() @@ -92,6 +93,14 @@ def load_datasets(data_home: str = DATA_HOME_BASIC) -> OrderedDict: data = np.load(filename) X, y = data["data"], data["label"] + if save_to_csv: + csv_filename = it + ".csv" + csv_filename = join(csv_dir, csv_filename) + df = pd.DataFrame(X) + df.rename(columns=lambda x: f"X{x}", inplace=True) + df["y"] = y + df.to_csv(csv_filename, index=False) + datasets[it] = Bunch(data=X, target=y, DESCR=it) return datasets diff --git a/multi_imbalance/datasets/analysis.py b/multi_imbalance/datasets/analysis.py new file mode 100644 index 0000000..74947a2 --- /dev/null +++ b/multi_imbalance/datasets/analysis.py @@ -0,0 +1,176 @@ +from copy import deepcopy +from dataclasses import dataclass +import json +from pathlib import Path +from typing import Callable, Dict, List, Tuple, Union +import pandas as pd +from sklearn.base import ClassifierMixin +from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import GaussianNB +from sklearn.neighbors import KNeighborsClassifier +from sklearn.tree import DecisionTreeClassifier +from imblearn.base import BaseSampler +from sklearn.pipeline import _name_estimators +import logging + +from multi_imbalance.resampling.global_cs import GlobalCS +from multi_imbalance.resampling.soup import SOUP +from multi_imbalance.resampling.spider import SPIDER3 +from multi_imbalance.resampling.mdo import MDO +from multi_imbalance.resampling.static_smote import StaticSMOTE + +logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s %(name)s %(message)s", datefmt="%d.%m.%Y %H:%M:%S") + + +@dataclass +class Config: + datasets: List[str] + classifiers: Dict[Union[str, ClassifierMixin], List[Dict]] + resample_methods: Dict[Union[str, BaseSampler], Dict] + metrics: Dict[Callable, Dict] + n_repeats: int + train_test_split_kwargs: Dict + + @classmethod + def 
from_dict(cls, config: Dict) -> "Config": + return cls(**config) + + +@dataclass +class Result: + dataset_name: str + classifier: str + resampling_method: str + metric_name: str + metric_value: float + no_repeat: int + kwargs: Dict + + +class AnalysisPipeline: + _allowed_resampling = ["globalCS", "StaticSMOTE", "SOUP", "spider3", "MDO"] + _allowed_classifiers = ["tree", "NB", "KNN"] + + def __init__(self, configs: List[Config]) -> None: + self.__logger = logging.getLogger("AnalysisPipeline") + self._configs = configs + self.__resampling_methods = {"globalCS": GlobalCS, "StaticSMOTE": StaticSMOTE, "SOUP": SOUP, "spider3": SPIDER3, "MDO": MDO} + self.__classifiers = {"tree": DecisionTreeClassifier, "NB": GaussianNB, "KNN": KNeighborsClassifier} + + def run_analysis(self, output_path: str, explode_clf_kwargs: bool, train_without_resampling: bool): + self._output_path = Path(output_path) + for config in self._configs: + self._config = config + self.__metrics = self._config.metrics + self.__n_repeats = self._config.n_repeats + self.__tts_kwargs = self._config.train_test_split_kwargs + for clf_name, clf, clf_kwargs in self._get_classifier(): + for n in range(1, self.__n_repeats + 1): + for dataset_name, dataset in self._get_dataset(): + for resampler_name, resampler in self._get_resampler(): + tmp_clf = deepcopy(clf) + X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] + X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **self.__tts_kwargs) + + try: + X_train_res, y_train_res = resampler.fit_resample(X_train, y_train) + results = [] + tmp_clf.fit(X_train_res, y_train_res) + y_pred = tmp_clf.predict(X_test) + + for metric, kwargs in self.__metrics.items(): + results.append( + Result( + dataset_name, + clf_name, + resampler_name, + metric.__name__, + metric(y_test, y_pred, **kwargs), + n, + clf_kwargs, + ) + ) + df_results = pd.DataFrame(results) + if self._output_path.exists(): + df_results.to_csv(output_path, mode="a", index=False, header=False) + else: 
+ df_results.to_csv(output_path, index=False) + except Exception as e: + self.__logger.error(f"Raised exception '{e}' for {dataset_name=}, {resampler_name=} and {clf_name=}") + + if train_without_resampling: + tmp_clf = deepcopy(clf) + results = [] + X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] + X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **self.__tts_kwargs) + tmp_clf.fit(X_train, y_train) + y_pred = tmp_clf.predict(X_test) + + for metric, kwargs in self.__metrics.items(): + results.append( + Result( + dataset_name, + clf_name, + "Not defined", + metric.__name__, + metric(y_test, y_pred, **kwargs), + n, + clf_kwargs, + ) + ) + df_results = pd.DataFrame(results) + if self._output_path.exists(): + df_results.to_csv(output_path, mode="a", index=False, header=False) + else: + df_results.to_csv(output_path, index=False) + + if explode_clf_kwargs: + df_results = pd.read_csv(output_path) + df_results = pd.concat( + [df_results.drop(columns="kwargs"), df_results["kwargs"].apply(lambda x: dict(eval(x))).apply(pd.Series)], axis=1 + ) + df_results.to_csv(output_path, index=False) + + def _get_dataset(self) -> Tuple[str, pd.DataFrame]: + for dataset_path in self._config.datasets: + path = Path(dataset_path) + + if path.is_file() and path.suffix == ".csv": + yield path.stem, pd.read_csv(str(path)) + elif path.is_dir(): + dataset_dir = path + for path in dataset_dir.glob("**/*.csv"): + yield path.stem, pd.read_csv(str(path)) + else: + raise Exception("Wrong dataset path, should be csv file or dir with csv files") + + def _get_resampler(self) -> List[Tuple[str, BaseSampler]]: + for resampler, kwargs in self._config.resample_methods.items(): + if isinstance(resampler, str): + if resampler not in AnalysisPipeline._allowed_resampling: + raise ValueError( + "Unknown resample method: %s, expected to be one of %s" % (resampler, AnalysisPipeline._allowed_resampling) + ) + yield resampler, self.__resampling_methods[resampler](**kwargs) + else: + if not 
hasattr(resampler, "fit_resample"): + raise ValueError("Your resampler must implement fit_resample method") + yield self._get_name(resampler(**kwargs)) + + def _get_classifier(self) -> List[Tuple[str, ClassifierMixin, Dict]]: + for classifier, kwargs_list in self._config.classifiers.items(): + if isinstance(classifier, str): + if classifier not in AnalysisPipeline._allowed_classifiers: + raise ValueError( + "Unknown classifier: %s, expected to be one of %s" % (classifier, AnalysisPipeline._allowed_classifiers) + ) + for kwargs in kwargs_list: + yield classifier, self.__classifiers[classifier](**kwargs), kwargs + else: + if not hasattr(classifier, "fit") or not hasattr(classifier, "predict"): + raise ValueError("Your classifier must implement fit and predict methods") + for kwargs in kwargs_list: + yield *self._get_name(classifier(**kwargs)), kwargs + + def _get_name(self, estimator: Union[ClassifierMixin, BaseSampler]) -> Tuple[str, Union[ClassifierMixin, BaseSampler]]: + return _name_estimators([estimator])[0] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..1421117 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,45 @@ +import numpy as np +import pytest + + +@pytest.fixture +def X_ecoc(): + return np.array( + [ + [1.8938566839198983, 0.7347724642028586, 1.5817290619305417], + [1.6893330472771877, 1.3729481360429043, 0.1779576959347715], + [1.1103882804642866, 0.2684931500114267, 0.24565717871603532], + [0.9635120154904986, 0.44338438370111577, 1.6559238383999697], + [0.6525827502237067, 0.8978087724631425, 1.5056794207545134], + [0.8232009732859464, 0.5270243940630088, 1.434695372722657], + [0.519304726338536, 0.4635228434262648, 0.014170648565480004], + [1.3938520002157688, 1.524670776643407, 0.9011189423913637], + [0.09993454781831534, 0.5991188594563008, 0.6462181194010983], + [1.5300019511124079, 0.08177359763506553, 1.7642527715349894], + [1.1770688242955876, 0.9604049547799067, 0.6989025594835503], + 
[1.5143651712498534, 1.4914673103908214, 1.3377704178955587], + [1.1299009013495136, 0.700540900007983, 1.071829181951729], + [1.530652133805449, 0.2992536048983532, 1.957731948975865], + [1.6236761570974148, 0.5919033806975751, 1.6334065904199757], + [0.9365056250644108, 1.526475631725099, 1.420298571686271], + [0.9063995770780813, 1.0248369545634513, 1.36911505163145], + [0.3861789635773656, 0.5758917834278445, 0.910187724154228], + [0.7165380621896438, 1.494299618627891, 0.521854931610239], + [1.3621939213912113, 0.387219837127391, 1.321376123618781], + [1.6764775993219296, 0.15364096535456317, 1.3219739817389], + [3.6764775993219296, 0.15364096535456317, 1.3789731273891], + [4.6764775993219296, 2.15364096535456317, 1.9830281123211], + [5.6764775993219296, 3.15364096535456317, 1.3213121322321], + [6.321312, 11.15364096535456317, 1.0998908320132], + [8.414132131, 2.15364096535456317, 5.0998908320132], + [10.6764775993219296, -2.15364096535456317, 3.0998908320132], + [4.6764775993219296, 0.15364096535456317, 2.0998908320132], + [-4.6764775993219296, -1.15364096535456317, 9.0998908320132], + [-6.6764775993219296, 11.15364096535456317, 5.0998908320132], + ] + ) + + +@pytest.fixture +def y_ecoc(): + return np.array([2, 0, 2, 3, 0, 3, 1, 0, 2, 0, 2, 3, 1, 2, 1, 3, 0, 3, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 1, 2]) diff --git a/tests/datasets/test_analysis.py b/tests/datasets/test_analysis.py new file mode 100644 index 0000000..5d726d2 --- /dev/null +++ b/tests/datasets/test_analysis.py @@ -0,0 +1,196 @@ +import pytest +from sklearn.metrics import accuracy_score +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import LinearRegression +import pandas as pd +from imblearn.metrics import geometric_mean_score +import numpy as np + + +from multi_imbalance.datasets.analysis import AnalysisPipeline, Config +from multi_imbalance.resampling.global_cs import GlobalCS +from multi_imbalance.resampling.soup import SOUP + + +def get_dummy_config(): + return { + 
"datasets": [], + "classifiers": {}, + "resample_methods": {}, + "metrics": {}, + "n_repeats": 2, + "train_test_split_kwargs": {}, + } + + +@pytest.fixture +def dataset_file(tmp_path): + filename = tmp_path / "dataset.csv" + filename.touch() + + return str(filename) + + +@pytest.fixture +def output_file(tmp_path): + filename = tmp_path / "output.csv" + + return str(filename) + + +@pytest.mark.parametrize( + "config_dict", + [ + { + "datasets": ["path/to/data"], + "classifiers": { + "tree": [{}], + }, + "resample_methods": { + "globalCS": {}, + }, + "metrics": {lambda x, y: (x, y): {}}, + "n_repeats": 2, + "train_test_split_kwargs": dict(test_size=0.2), + } + ], +) +def test_config_from_dict(config_dict): + config = Config.from_dict(config_dict) + + assert config.datasets == config_dict["datasets"] + assert config.classifiers == config_dict["classifiers"] + assert config.resample_methods == config_dict["resample_methods"] + assert config.metrics == config_dict["metrics"] + assert config.n_repeats == config_dict["n_repeats"] + assert config.train_test_split_kwargs == config_dict["train_test_split_kwargs"] + + +@pytest.mark.parametrize( + "classifier, expected_name, expected_clf", + [("tree", "tree", DecisionTreeClassifier), (LinearRegression, "linearregression", LinearRegression)], +) +def test_get_classifier(classifier, expected_name, expected_clf): + config_dict = get_dummy_config() + config_dict["classifiers"].update({classifier: [{}]}) + + config = Config.from_dict(config_dict) + + pipeline = AnalysisPipeline(config) + + clf_name, clf = next(pipeline._get_classifier()) + assert clf_name == expected_name + assert isinstance(clf, expected_clf) + + +@pytest.mark.parametrize( + "resampler, expected_name, expected_resampler", + [("globalCS", "globalCS", GlobalCS), (SOUP, "soup", SOUP)], +) +def test_get_resampler(resampler, expected_name, expected_resampler): + config_dict = get_dummy_config() + config_dict["resample_methods"].update({resampler: {}}) + + config = 
Config.from_dict(config_dict) + + pipeline = AnalysisPipeline(config) + + resampler_name, resampler = next(pipeline._get_resampler()) + assert resampler_name == expected_name + assert isinstance(resampler, expected_resampler) + + +@pytest.mark.parametrize("data, columns", [([[1, 2, 0]], ["X1", "X2", "y"]), ([[1, 2, 0], [4, 2, 1]], ["X1", "X2", "y"])]) +def test_get_dataset(data, columns, tmp_path, dataset_file): + config_dict = get_dummy_config() + config_dict["datasets"].append(str(tmp_path)) + + config = Config.from_dict(config_dict) + expected_df = pd.DataFrame(data, columns=columns) + expected_df.to_csv(dataset_file, index=False) + + pipeline = AnalysisPipeline(config) + + dataset_name, df = next(pipeline._get_dataset()) + assert dataset_name == "dataset" + pd.testing.assert_frame_equal(df, expected_df) + + +def test_run_analysis(X_ecoc, y_ecoc, dataset_file, output_file): + df = pd.DataFrame(X_ecoc, columns=["X1", "X2", "X3"]) + df["y"] = y_ecoc + df.to_csv(dataset_file, index=False) + config_dict = { + "datasets": [dataset_file], + "classifiers": dict(tree=[{"max_depth": 30}]), + "resample_methods": dict(globalCS={"shuffle": True}), + "metrics": {geometric_mean_score: {"correction": 0.005}, accuracy_score: {}}, + "n_repeats": 2, + "train_test_split_kwargs": dict(test_size=0.2, random_state=42), + } + config = Config.from_dict(config_dict) + + pipeline = AnalysisPipeline(config) + pipeline.run_analysis(output_file) + + result_df = pd.read_csv(output_file) + assert (result_df["dataset_name"] == "dataset").all() + assert (result_df["classifier"] == "tree").all() + np.testing.assert_array_equal(result_df["metric_name"].unique(), ["geometric_mean_score", "accuracy_score"]) + np.testing.assert_array_almost_equal(result_df["metric_value"].unique(), [0.018803, 0.166667]) + + +def test_get_dataset_wrong_path(): + config_dict = get_dummy_config() + config_dict["datasets"].append("bad/path/to/file.ext") + + config = Config.from_dict(config_dict) + + pipeline = 
AnalysisPipeline(config) + + with pytest.raises(Exception) as ex: + next(pipeline._get_dataset()) + + assert ex.value.args[0] == "Wrong dataset path, should be csv file or dir with csv files" + + +@pytest.mark.parametrize( + "wrong_resampler, expected_exception", + [ + ("wrong", "Unknown resample method: wrong, expected to be one of ['globalCS', 'StaticSMOTE', 'SOUP', 'spider3', 'MDO']"), + (lambda x: x, "Your resampler must implement fit_resample method"), + ], +) +def test_get_resampler_wrong(wrong_resampler, expected_exception): + config_dict = get_dummy_config() + config_dict["resample_methods"].update({wrong_resampler: {}}) + + config = Config.from_dict(config_dict) + + pipeline = AnalysisPipeline(config) + + with pytest.raises(ValueError) as ex: + next(pipeline._get_resampler()) + + assert ex.value.args[0] == expected_exception + + +@pytest.mark.parametrize( + "wrong_clf, expected_exception", + [ + ("wrong", "Unknown classifier: wrong, expected to be one of ['tree', 'NB', 'KNN']"), + (lambda x: x, "Your classifier must implement fit and predict methods"), + ], +) +def test_get_classifier_wrong(wrong_clf, expected_exception): + config_dict = get_dummy_config() + config_dict["classifiers"].update({wrong_clf: {}}) + + config = Config.from_dict(config_dict) + + pipeline = AnalysisPipeline(config) + + with pytest.raises(ValueError) as ex: + next(pipeline._get_classifier()) + + assert ex.value.args[0] == expected_exception diff --git a/tests/ensemble/test_ecoc.py b/tests/ensemble/test_ecoc.py index 79c1895..9f8718b 100644 --- a/tests/ensemble/test_ecoc.py +++ b/tests/ensemble/test_ecoc.py @@ -5,59 +5,22 @@ import multi_imbalance.ensemble.ecoc as ecoc -X = np.array( - [ - [1.8938566839198983, 0.7347724642028586, 1.5817290619305417], - [1.6893330472771877, 1.3729481360429043, 0.1779576959347715], - [1.1103882804642866, 0.2684931500114267, 0.24565717871603532], - [0.9635120154904986, 0.44338438370111577, 1.6559238383999697], - [0.6525827502237067, 0.8978087724631425, 
1.5056794207545134], - [0.8232009732859464, 0.5270243940630088, 1.434695372722657], - [0.519304726338536, 0.4635228434262648, 0.014170648565480004], - [1.3938520002157688, 1.524670776643407, 0.9011189423913637], - [0.09993454781831534, 0.5991188594563008, 0.6462181194010983], - [1.5300019511124079, 0.08177359763506553, 1.7642527715349894], - [1.1770688242955876, 0.9604049547799067, 0.6989025594835503], - [1.5143651712498534, 1.4914673103908214, 1.3377704178955587], - [1.1299009013495136, 0.700540900007983, 1.071829181951729], - [1.530652133805449, 0.2992536048983532, 1.957731948975865], - [1.6236761570974148, 0.5919033806975751, 1.6334065904199757], - [0.9365056250644108, 1.526475631725099, 1.420298571686271], - [0.9063995770780813, 1.0248369545634513, 1.36911505163145], - [0.3861789635773656, 0.5758917834278445, 0.910187724154228], - [0.7165380621896438, 1.494299618627891, 0.521854931610239], - [1.3621939213912113, 0.387219837127391, 1.321376123618781], - [1.6764775993219296, 0.15364096535456317, 1.3219739817389], - [3.6764775993219296, 0.15364096535456317, 1.3789731273891], - [4.6764775993219296, 2.15364096535456317, 1.9830281123211], - [5.6764775993219296, 3.15364096535456317, 1.3213121322321], - [6.321312, 11.15364096535456317, 1.0998908320132], - [8.414132131, 2.15364096535456317, 5.0998908320132], - [10.6764775993219296, -2.15364096535456317, 3.0998908320132], - [4.6764775993219296, 0.15364096535456317, 2.0998908320132], - [-4.6764775993219296, -1.15364096535456317, 9.0998908320132], - [-6.6764775993219296, 11.15364096535456317, 5.0998908320132], - ] -) - -y = np.array([2, 0, 2, 3, 0, 3, 1, 0, 2, 0, 2, 3, 1, 2, 1, 3, 0, 3, 2, 0, 0, 1, 2, 3, 0, 1, 2, 3, 1, 2]) - -def test_random_oversampling(): +def test_random_oversampling(X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(preprocessing="globalCS") - X_oversampled, y_oversampled = ecoc_clf._oversample(X, y) + X_oversampled, y_oversampled = ecoc_clf._oversample(X_ecoc, y_ecoc) assert len(X_oversampled) == 
len(y_oversampled) assert len(set(np.unique(y_oversampled, return_counts=True)[1])) == 1 - assert set(y_oversampled).issubset(set(y)) + assert set(y_oversampled).issubset(set(y_ecoc)) -def test_no_oversampling(): +def test_no_oversampling(X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(preprocessing=None) - X_oversampled, y_oversampled = ecoc_clf._oversample(X, y) + X_oversampled, y_oversampled = ecoc_clf._oversample(X_ecoc, y_ecoc) - assert X.shape == X_oversampled.shape - assert y.shape == y_oversampled.shape + assert X_ecoc.shape == X_oversampled.shape + assert y_ecoc.shape == y_oversampled.shape @pytest.mark.parametrize("encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"]) @@ -65,12 +28,12 @@ def test_no_oversampling(): "oversampling, minority_classes", [(None, None), ("globalCS", None), ("SMOTE", None), ("SOUP", [0, 2])], ) -def test_encoding(encoding_strategy, oversampling, minority_classes): +def test_encoding(encoding_strategy, oversampling, minority_classes, X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(encoding=encoding_strategy, preprocessing=oversampling) - ecoc_clf.fit(X, y, minority_classes=minority_classes) + ecoc_clf.fit(X_ecoc, y_ecoc, minority_classes=minority_classes) matrix = ecoc_clf._code_matrix - number_of_classes = len(np.unique(y)) + number_of_classes = len(np.unique(y_ecoc)) assert matrix.shape[0] == number_of_classes assert len(np.unique(matrix, axis=0)) == number_of_classes @@ -78,9 +41,9 @@ def test_encoding(encoding_strategy, oversampling, minority_classes): @pytest.mark.parametrize("encoding_strategy", ["dense", "sparse"]) -def test_dense_and_sparse_with_not_cached_matrices(encoding_strategy): - X1 = np.concatenate((X, 2 * X, 3 * X, 4 * X, 5 * X), axis=0) - y1 = np.concatenate((y + 4, y + 8, y + 12, y + 16, y + 20)) +def test_dense_and_sparse_with_not_cached_matrices(encoding_strategy, X_ecoc, y_ecoc): + X1 = np.concatenate((X_ecoc, 2 * X_ecoc, 3 * X_ecoc, 4 * X_ecoc, 5 * X_ecoc), axis=0) + y1 = np.concatenate((y_ecoc + 4, y_ecoc + 8, 
y_ecoc + 12, y_ecoc + 16, y_ecoc + 20)) ecoc_clf = ecoc.ECOC(encoding=encoding_strategy) ecoc_clf.fit(X1, y1) @@ -101,7 +64,7 @@ def test_hamming_distance(): assert distance == 5 -def test_with_own_classifier(): +def test_with_own_classifier(X_ecoc, y_ecoc): class DummyClassifier: def fit(self, X, y): pass @@ -111,47 +74,47 @@ def predict(self, X): dummy_clf = DummyClassifier() ecoc_clf = ecoc.ECOC(binary_classifier=dummy_clf, preprocessing=None) - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) predicted = ecoc_clf.predict(np.array([[1.0, 2.0], [4.0, 5.5], [6.7, 8.8]])) assert np.all(predicted == 0) -def test_with_own_preprocessing(): +def test_with_own_preprocessing(X_ecoc, y_ecoc): class DummyResampler: def fit_transform(self, X, y): return np.concatenate((X, X), axis=0), np.concatenate((y, y), axis=None) dummy_resampler = DummyResampler() ecoc_clf = ecoc.ECOC(preprocessing=dummy_resampler) - X_oversampled, y_oversampled = ecoc_clf._oversample(X, y) - assert len(X_oversampled) == 2 * len(X) - assert len(y_oversampled) == 2 * len(y) + X_oversampled, y_oversampled = ecoc_clf._oversample(X_ecoc, y_ecoc) + assert len(X_oversampled) == 2 * len(X_ecoc) + assert len(y_oversampled) == 2 * len(y_ecoc) -def test_unknown_classifier(): +def test_unknown_classifier(X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(binary_classifier="DUMMY_CLASSIFIER", preprocessing=None) with pytest.raises(ValueError) as e: - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) assert "DUMMY_CLASSIFIER" in str(e.value) -def test_unknown_encoding(): +def test_unknown_encoding(X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(encoding="dummy") with pytest.raises(ValueError) as e: - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) assert ( e.value.args[0] == "Unknown matrix generation encoding: dummy, expected to be one of ['dense', 'sparse', 'complete', 'OVA', 'OVO']." 
) -def test_unknown_weighting_strategy(): +def test_unknown_weighting_strategy(X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(weights="dummy") with pytest.raises(ValueError) as e: - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) assert e.value.args[0] == "Unknown weighting strategy: dummy, expected to be one of [None, 'acc', 'avg_tpr_min']." -def test_own_classifier_without_predict_and_fit(): +def test_own_classifier_without_predict_and_fit(X_ecoc, y_ecoc): class DummyClassifier: def foo(self, X, y): pass @@ -162,28 +125,28 @@ def bar(self, X): dummy_clf = DummyClassifier() ecoc_clf = ecoc.ECOC(binary_classifier=dummy_clf, preprocessing=None) with pytest.raises(ValueError) as e: - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) assert "predict" in str(e.value) assert "fit" in str(e.value) @pytest.mark.parametrize("classifier", ["tree", "NB", "KNN"]) @pytest.mark.parametrize("weights", [None, "acc", "avg_tpr_min"]) -def test_predefined_classifiers_and_weighting_without_exceptions(classifier, weights): +def test_predefined_classifiers_and_weighting_without_exceptions(classifier, weights, X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(binary_classifier=classifier, weights=weights) - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) predicted = ecoc_clf.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) assert len(predicted) == 3 -def test_unknown_preprocessing(): +def test_unknown_preprocessing(X_ecoc, y_ecoc): ecoc_clf = ecoc.ECOC(preprocessing="DUMMY_OVERSAMPLING") with pytest.raises(ValueError) as e: - ecoc_clf.fit(X, y) + ecoc_clf.fit(X_ecoc, y_ecoc) assert "DUMMY_OVERSAMPLING" in str(e.value) -def test_own_preprocessing_without_fit_transform(): +def test_own_preprocessing_without_fit_transform(X_ecoc, y_ecoc): class DummyOversampler: def foo(self, X, y): pass @@ -194,19 +157,19 @@ def bar(self, X): dummy_oversampler = DummyOversampler() ecoc_clf = ecoc.ECOC(preprocessing=dummy_oversampler) with pytest.raises(ValueError) as e: - ecoc_clf.fit(X, y) + 
ecoc_clf.fit(X_ecoc, y_ecoc) assert "fit_transform" in str(e.value) @pytest.mark.parametrize("encoding_strategy", ["dense", "sparse", "OVO", "OVA", "complete"]) @pytest.mark.parametrize("oversampling", [None, "globalCS", "SMOTE", "SOUP"]) -def test_ecoc_with_sklearn_pipeline(encoding_strategy, oversampling): +def test_ecoc_with_sklearn_pipeline(encoding_strategy, oversampling, X_ecoc, y_ecoc): pipeline = Pipeline( [ ("scaler", StandardScaler()), ("ecoc", ecoc.ECOC(encoding=encoding_strategy, preprocessing=oversampling)), ] ) - pipeline.fit(X, y) + pipeline.fit(X_ecoc, y_ecoc) y_hat = pipeline.predict(np.array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]])) assert len(y_hat) == 3 From 2f6c2557f833ede40a3e2ef2ed02a2133dd3bca8 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Sun, 4 Dec 2022 19:37:49 +0100 Subject: [PATCH 15/48] Change dtype of resampled_y to match the dtype of original y in SPIDER3 --- multi_imbalance/resampling/spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multi_imbalance/resampling/spider.py b/multi_imbalance/resampling/spider.py index df403d4..4e69e9d 100644 --- a/multi_imbalance/resampling/spider.py +++ b/multi_imbalance/resampling/spider.py @@ -64,7 +64,7 @@ def _fit_resample(self, X, y): self.DS = union(self.DS, self.AS) - return self.DS[:, :-1], self.DS[:, -1] + return self.DS[:, :-1], self.DS[:, -1].astype(y.dtype) def _initialize_algorithm(self, X, y): if self.maj_int_min is None: From b7d8d7cdc14a6dbd0b2ee6faacb43728b2a08ee7 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Sun, 4 Dec 2022 19:45:38 +0100 Subject: [PATCH 16/48] Add "threshold" strategy to utils.construct_maj_int_min Add a new "threshold" strategy which separates minority and majority classes based on the given threshold value --- multi_imbalance/utils/data.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/multi_imbalance/utils/data.py b/multi_imbalance/utils/data.py index bbb53e4..6733e6a 
100644 --- a/multi_imbalance/utils/data.py +++ b/multi_imbalance/utils/data.py @@ -2,6 +2,7 @@ from collections import OrderedDict, Counter from pathlib import Path from statistics import median +from typing import Optional import numpy as np import pandas as pd @@ -100,7 +101,7 @@ def load_datasets_arff(return_non_cat_length=False, dataset_paths=None): return datasets -def construct_maj_int_min(y: np.ndarray, strategy='median') -> OrderedDict: +def construct_maj_int_min(y: np.ndarray, strategy='median', threshold: Optional[float] = None) -> OrderedDict: """ This function creates dictionary with information which classes are minority or majority @@ -114,8 +115,13 @@ def construct_maj_int_min(y: np.ndarray, strategy='median') -> OrderedDict: * 'average': The average class size will be calculated, all classes that are smaller will be considered as minority and the rest will be considered majority + * 'threshold': + All classes that are smaller than given threshold value will be considered as minority and + the rest will be considered majority. + :param threshold: + A mandatory threshold value for the "threshold" strategy :return: - dictionary with keys 'maj', 'int', 'min. The value for each key is a list containing the class labels belonging + dictionary with keys 'maj', 'int', 'min'. The value for each key is a list containing the class labels belonging to the given group """ class_sizes = Counter(y) @@ -124,8 +130,12 @@ def construct_maj_int_min(y: np.ndarray, strategy='median') -> OrderedDict: middle_size = median(list(class_sizes.values())) elif strategy == 'average': middle_size = np.mean(list(class_sizes.values())) + elif strategy == 'threshold': + if threshold is None: + raise ValueError('Missing threshold value for "threshold" strategy') + middle_size = threshold else: - raise ValueError(f'Unrecognized {strategy}. Only "median" and "average" are allowed.') + raise ValueError(f'Unrecognized {strategy}. 
Only "median", "average" or "threshold" are allowed.') maj_int_min = OrderedDict({ 'maj': list(), From b941eb561fb007d2b4c14ffa0d2671ca55d7f537 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Sun, 4 Dec 2022 19:55:05 +0100 Subject: [PATCH 17/48] Add shuffle function to utils.array_utils which shuffles many arrays at once --- multi_imbalance/utils/array_util.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/multi_imbalance/utils/array_util.py b/multi_imbalance/utils/array_util.py index 078a33e..5910ac7 100644 --- a/multi_imbalance/utils/array_util.py +++ b/multi_imbalance/utils/array_util.py @@ -1,4 +1,6 @@ import numpy as np +from typing import Optional, Tuple +import sklearn def setdiff(arr1, arr2): @@ -83,3 +85,24 @@ def intersect(arr1, arr2): if contains(arr2, x): result = union(result, np.array([x])) return result + + +def shuffle(*arrs: np.ndarray, + state: Optional[np.random.RandomState] = None) -> Tuple[ + np.ndarray, ...]: + """ + Shuffles rows of many arrays at once. + + Shuffles given arrays using a shuffled matrix of row indices. + The number of rows in the given arrays should be the same. + + :param arrs: + Numpy arrays to shuffle. + :param state: + Optional RandomState used to shuffle. + :return: + A tuple of shuffled copies of given arrays. 
+ """ + indices = np.arange(arrs[0].shape[0]) + sklearn.utils.check_random_state(state).shuffle(indices) + return tuple(arr[indices] for arr in arrs) From 30e12297f1c7e51be12489c1c494b24147581449 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Sun, 4 Dec 2022 21:16:56 +0100 Subject: [PATCH 18/48] Add initial implementation of SMOM algorithm --- multi_imbalance/resampling/smom.py | 341 +++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 multi_imbalance/resampling/smom.py diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py new file mode 100644 index 0000000..a7962b9 --- /dev/null +++ b/multi_imbalance/resampling/smom.py @@ -0,0 +1,341 @@ +from collections import Counter, defaultdict + +import numpy as np + +from typing import List, Dict, Optional, Sequence + +from sklearn import neighbors +from multi_imbalance.utils import array_util +from imblearn.base import BaseSampler + + +def _expand_cluster(self, sfSc, i, curId): + sfC = {i} + self._xcl[i] = curId + while sfC: + j = next(iter(sfC)) + for L in self._hck[j]: + self._xcl[L] = curId + if L in sfSc: + sfC.add(L) + sfC.remove(j) + +def _nbdos(Sc: List[int], k: int, + sNk: Dict[int, List[int]], rTh: float, + nTh: int): + """ + NBDOS clustering algorithm implementation. + + Reference: + Zhu, Tuanfei, Yaping Lin, and Yonghe Liu. "Synthetic minority oversampling + technique for multiclass imbalance problems." Pattern Recognition 72 + (2017): 327-340. 
+ + :param Sc: + Indices of items to perform clustering on + :param k: + The number of nearest neighbors + :param sNk: + The k-nearest neighbors lists + :param rTh: + The minimal proportion of c class instances which should be + achieved for the soft core instances in their k-nearest neighbors + :param nTh: + The minimal number of members required for the discovered clusters + """ + xcl = defaultdict(int) + hck = dict() + sfSc = set() + + for i in Sc: + Tem = sNk[i] + if round(Tem.size / k) >= rTh: + sfSc.add(i) + hck[i] = set(Tem) + + for i in sfSc: + Tem = set((j for j, tem in hck.items() if i in tem)) + hck[i].update(Tem & sfSc) + + curId = 0 + for i in sfSc: + if xcl[i] == 0: + curId += 1 + _expand_cluster(sfSc, i, curId) + for i in range(1, curId + 1): + ci = [j for j, cluster in xcl.items() if cluster == i] + if len(ci) < nTh: + for j in ci: + xcl[j] = 0 + Scl = np.array([xcl[i] for i in Sc]) + return Scl + + + + +def _dpn(X, i, j, Ss): + N_pn = [i, j] + mid_ij = (X[i] + X[j]) * 0.5 + dis_ij = np.linalg.norm(X[i] - X[j]) + for l in Ss: + dis_lm = np.linalg.norm(X[l] - mid_ij) + if 2 * dis_lm <= dis_ij: + N_pn.append(l) + return N_pn + + +def _compute_ss(Fs_i, Fs_d, i, dst): + Ss = [] + for i_, d_ in zip(Fs_i[i], Fs_d[i]): + if d_ <= dst: + Ss.append(i_) + return np.array(Ss, dtype=np.int64) + + +def _normalized_entropy(classes_counts: Sequence[int]): + if len(classes_counts) <= 1: + E = 0 + else: + total = sum(classes_counts) + E_min = np.log(1. / total) + E = sum(count / total * np.log(count / total) for count in + classes_counts) / E_min + assert 0 <= E <= 1, f'{E=} not in range [0, 1]' + return E + + +class SMOM(BaseSampler): + """ + SMOM technique implementation for synthetic minority oversampling for multiclass imbalanced problems. + + Reference: + Zhu, Tuanfei, Yaping Lin, and Yonghe Liu. "Synthetic minority oversampling + technique for multiclass imbalance problems." Pattern Recognition 72 + (2017): 327-340. 
+ """ + + def __init__(self, c: int, zeta: int, k1: int = 12, k2: int = 8, + rTh: float = 5 / 8, + nTh: int = 10, w1: float = 0.2, w2: float = 0.5, + r1: float = 1 / 3, r2: float = 0.2, + maj_int_min: Optional[Dict[str, List[int]]] = None, + shuffle: bool = False, + metric: str = 'minkowski', + p: int = 2) -> None: + """ + :param maj_int_min: + dict {'maj': majority class labels, 'min': minority class labels} + :param c: + The minority class under consideration + :param zeta: + Number of synthetic instances to be generated + :param k1: + Number of nearest neighbors used to generate the synthetic instances + :params k2, rTh, nTh: + The parameters used in clustering algorithm NBDOS + :params w1, w2, r1, r2: + The parameters used for calculating the selection weights + :param maj_int_min: + Dict that contains lists of majority, intermediate and minority classes labels. + :param shuffle: + Shuffle resampled data + :param metric: + Metric to use for distance computation. + :param p: + Power parameter for Minkowski metric. + """ + super().__init__() + self._sampling_type = 'over-sampling' + self.maj_int_min = maj_int_min + self.c = c + self.zeta = zeta + self.k1 = k1 + self.k2 = k2 + self.rTh = rTh + self.nTh = nTh + self.w1 = w1 + self.w2 = w2 + self.r1 = r1 + self.r2 = r2 + self.shuffle = shuffle + if metric == 'minkowski': + self._metric = neighbors.DistanceMetric.get_metric(metric, p=p) + else: + self._metric = neighbors.metrics.DistanceMetric.get_metric(metric) + + def _pairwise_distance(self, x1, x2): + return self._metric.pairwise([x1], [x2])[0, 0] + + def _fit_resample(self, X, y): + """ + Performs resampling + + :param X: + Numpy array of examples that is the subject of resampling. + :param y: + Numpy array of labels corresponding to examples from X. 
+ :return: + Resampled X along with accordingly modified labels, resampled y + """ + self.k1, self.k2, self.r1, self.r2 + k3 = max(self.k1, self.k2) + + # 1 + Sc = np.array([i for i, _ in enumerate(X) if y[i] == self.c]) + Sct = np.array([i for i, _ in enumerate(X) if y[i] != self.c]) + + if self.maj_int_min is None: + M = y.size + L = len(set(y)) + cnt = Counter(y[y != self.c]) + y_maj_classes = {k: v for k, v in cnt.items() if v >= M / L} + y_min_classes = {k: v for k, v in cnt.items() if v < M / L} + else: + cnt = Counter(y) + if 'int' not in self.maj_int_min: + self.maj_int_min['int'] = [] + y_maj_classes = {cls: cnt[cls] for cls in + self.maj_int_min['maj'] + self.maj_int_min['int']} + y_min_classes = {cls: cnt[cls] for cls in self.maj_int_min['min']} + + # nearest k3 instances from x_i in Sc + N_c_k3_i = dict() + N_c_k3_d = dict() + # nearest k1 instances from x_i in Sc + N_c_k1_i = dict() + N_c_k1_d = dict() + # nearest k3 instances from x_i in Sct + N_ct_k3_i = dict() + N_ct_k3_d = dict() + # nearest k2 instances from x_i in union of N_c_k3[i] and N_ct_k3[i] + N_k2_i = dict() + N_k2_d = dict() + + r_k1_d = dict() + r_k1_i = dict() + + Fs_i = dict() + Fs_d = dict() + + kdt_c = neighbors.KDTree(X[Sc]) + kdt_ct = neighbors.KDTree(X[Sct]) + for i in Sc: + # 2: 1) + dist_, ind_ = kdt_c.query([X[i]], k3 + 1) + ind, dist = ind_[0][1:], dist_[0][1:] + N_c_k3_i[i] = ind + N_c_k3_d[i] = dist + N_c_k1_i[i] = ind[:self.k1] + N_c_k1_d[i] = dist[:self.k1] + r_k1_i[i] = ind[self.k1 - 1] + r_k1_d[i] = dist[self.k1 - 1] + + # 2: 2) + dist_, ind_ = kdt_ct.query([X[i]], k3) + ind, dist = ind_[0], dist_[0] + N_ct_k3_i[i] = ind + N_ct_k3_d[i] = dist + T_ind, T_dist = [], [] + for t_ind, t_dist in zip(ind, dist): + if t_dist <= r_k1_d[i]: + T_ind.append(t_ind) + T_dist.append(t_dist) + Fs_i[i] = np.concatenate([T_ind, N_c_k1_i[i]], axis=0) + Fs_d[i] = np.concatenate([T_dist, N_c_k1_d[i]], axis=0) + + # 2: 3) + nc_nct_union_i = np.concatenate([N_c_k3_i[i], N_ct_k3_i[i]], + 
axis=0) + nc_nct_union_d = np.concatenate([N_c_k3_d[i], N_ct_k3_d[i]], + axis=0) + dist_, ind_ = neighbors.KDTree(X[nc_nct_union_i]).query([X[i]]) + ind, dist = ind_[0], dist_[0] + N_k2_i[i] = nc_nct_union_i[ind[0]] + N_k2_d[i] = nc_nct_union_d[ind[0]] + + # 3: + N_k2_dct = {index: neighbors for index, neighbors in N_k2_i.items()} + Sc_cl = _nbdos(Sc, self.k2, N_k2_dct, self.rTh, self.nTh) + + # 4: + OiC = Sc[Sc_cl != 0] + TiC = Sc[Sc_cl == 0] + + # 5: + Sw = defaultdict(dict) + for i in TiC: + for j in N_c_k1_i[i]: + # 5: 1) + if j in OiC and j in N_c_k1_i and i in N_c_k1_i[j]: + Sw[i][j] = 1 + self.w1 / np.e + # 5: 2) + elif j in N_c_k1_i and i in N_c_k1_i[j] and j in Sw and i in \ + Sw[j]: + Sw[i][j] = Sw[j][i] + # 5: 3) + else: + dis_ij = self._pairwise_distance(X[i], X[j]) + Ss = _compute_ss(Fs_i, Fs_d, i, dis_ij) + N_pn = _dpn(X, i, j, Ss) + y_mi_pn = {k: v for k, v in y_min_classes.items() if + k in N_pn} + y_ma_pn = {k: v for k, v in y_maj_classes.items() if + k in N_pn} + y_mi = sum(y_mi_pn.values()) + y_ma = sum(y_ma_pn.values()) + E_mi = _normalized_entropy(y_mi_pn.values()) + E_ma = _normalized_entropy(y_ma_pn.values()) + + y_c = len(Sc) + exponent1 = self.r1 * y_mi / y_c + self.r2 * E_mi + self.w2 * ( + self.r1 * y_ma / y_c + self.r2 * E_ma) + exponent2 = -y_c / (y_ma + y_mi + y_c) + Sw[i][j] = 1.0 / np.exp(exponent1) + self.w1 * np.exp( + exponent2) + + # 6: + P = defaultdict(dict) + for i in TiC: + # 6: 1) + for j in Sw[i]: + dis_ij = self._pairwise_distance(X[i], X[j]) + Ss = _compute_ss(Fs_i, Fs_d, i, dis_ij) + PN = _dpn(X, i, j, Ss) + if (y[PN] == self.c).all(): + break + else: + if i not in N_c_k1_i[i]: + N_c_k1_i[i] = np.append(N_c_k1_i[i], i) + N_c_k1_d[i] = np.append(N_c_k1_d[i], 0.) 
+ Sw[i][i] = 1 + self.w1 / np.e + # 6: 2) + for j in Sw[i]: + P[i][j] = Sw[i][j] / sum(Sw[i][l] for l in N_c_k1_i[i]) + + # 7: + N_syn = dict() + div, mod = divmod(self.zeta, Sc.shape[0]) + for i in Sc: + N_syn[i] = div + (mod > 0) + mod -= 1 + + # 8: + SI = [] + for i in Sc: + # 8: 3) + for _ in range(N_syn[i]): + # 8: 1) + if i in TiC: + p = [P[i][j] for j in N_c_k1_i[i]] + j = np.random.choice(N_c_k1_i[i], size=1, p=p) + else: + j = np.random.choice(N_c_k1_i, size=1) + # 8: 2) + si = X[i] + (X[j] - X[i]) * np.random.rand(*X[j].shape) + SI.append(si) + X_resampled = np.concatenate([X, np.concatenate(SI, 0)], 0) + y_resampled = np.concatenate([y, [self.c] * len(SI)], 0) + if self.shuffle: + X_resampled, y_resampled = array_util.shuffle(X_resampled, + y_resampled) + return X_resampled, y_resampled From 1ea29c7a619725809bccf8afa1a8a54bbe1abb49 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Sun, 4 Dec 2022 21:45:00 +0100 Subject: [PATCH 19/48] Refactor initial implementation of SMOM algorithm --- multi_imbalance/resampling/smom.py | 396 ++++++++++++++++------------- 1 file changed, 219 insertions(+), 177 deletions(-) diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py index a7962b9..68fdb3a 100644 --- a/multi_imbalance/resampling/smom.py +++ b/multi_imbalance/resampling/smom.py @@ -1,28 +1,20 @@ +"""Implementations of SMOM tqchnique and DBOS clustering algorithm.""" + from collections import Counter, defaultdict import numpy as np from typing import List, Dict, Optional, Sequence +import sklearn.utils from sklearn import neighbors from multi_imbalance.utils import array_util from imblearn.base import BaseSampler -def _expand_cluster(self, sfSc, i, curId): - sfC = {i} - self._xcl[i] = curId - while sfC: - j = next(iter(sfC)) - for L in self._hck[j]: - self._xcl[L] = curId - if L in sfSc: - sfC.add(L) - sfC.remove(j) - def _nbdos(Sc: List[int], k: int, - sNk: Dict[int, List[int]], rTh: float, - nTh: int): + sNk: Dict[int, 
List[int]], rTh: float, + nTh: int): """ NBDOS clustering algorithm implementation. @@ -32,21 +24,35 @@ def _nbdos(Sc: List[int], k: int, (2017): 327-340. :param Sc: - Indices of items to perform clustering on + Indices of items to perform clustering on. :param k: - The number of nearest neighbors + The number of nearest neighbors. :param sNk: - The k-nearest neighbors lists + The k-nearest neighbors lists. :param rTh: The minimal proportion of c class instances which should be - achieved for the soft core instances in their k-nearest neighbors + achieved for the soft core instances in their k-nearest neighbors. :param nTh: - The minimal number of members required for the discovered clusters + The minimal number of members required for the discovered clusters. """ + + # Naming of variables follows the pseudo code of nbdos xcl = defaultdict(int) hck = dict() sfSc = set() + def _expand_cluster(j, cur_id): + sfC = {j} + xcl[j] = cur_id + while sfC: + j = next(iter(sfC)) + for L in hck[j]: + if xcl[L] == 0: + xcl[L] = cur_id + if L in sfSc: + sfC.add(L) + sfC.remove(j) + for i in Sc: Tem = sNk[i] if round(Tem.size / k) >= rTh: @@ -61,7 +67,7 @@ def _nbdos(Sc: List[int], k: int, for i in sfSc: if xcl[i] == 0: curId += 1 - _expand_cluster(sfSc, i, curId) + _expand_cluster(i, curId) for i in range(1, curId + 1): ci = [j for j, cluster in xcl.items() if cluster == i] if len(ci) < nTh: @@ -71,28 +77,16 @@ def _nbdos(Sc: List[int], k: int, return Scl - - -def _dpn(X, i, j, Ss): - N_pn = [i, j] - mid_ij = (X[i] + X[j]) * 0.5 - dis_ij = np.linalg.norm(X[i] - X[j]) - for l in Ss: - dis_lm = np.linalg.norm(X[l] - mid_ij) - if 2 * dis_lm <= dis_ij: - N_pn.append(l) - return N_pn - - def _compute_ss(Fs_i, Fs_d, i, dst): Ss = [] - for i_, d_ in zip(Fs_i[i], Fs_d[i]): - if d_ <= dst: - Ss.append(i_) + for idx, dst_ in zip(Fs_i[i], Fs_d[i]): + if dst_ <= dst: + Ss.append(idx) return np.array(Ss, dtype=np.int64) def _normalized_entropy(classes_counts: Sequence[int]): + # TODO: compare 
with the pseudo code in the article (falbogowski) if len(classes_counts) <= 1: E = 0 else: @@ -114,35 +108,43 @@ class SMOM(BaseSampler): (2017): 327-340. """ - def __init__(self, c: int, zeta: int, k1: int = 12, k2: int = 8, + def __init__(self, + c: int, + zeta: int, + k1: int = 12, + k2: int = 8, rTh: float = 5 / 8, - nTh: int = 10, w1: float = 0.2, w2: float = 0.5, - r1: float = 1 / 3, r2: float = 0.2, - maj_int_min: Optional[Dict[str, List[int]]] = None, + nTh: int = 10, + w1: float = 0.2, + w2: float = 0.5, + r1: float = 1 / 3, + r2: float = 0.2, + maj_int_min: Optional[Dict[str, Sequence[int]]] = None, shuffle: bool = False, metric: str = 'minkowski', - p: int = 2) -> None: + p: int = 2, + seed: Optional[int] = None) -> None: """ - :param maj_int_min: - dict {'maj': majority class labels, 'min': minority class labels} :param c: - The minority class under consideration + The minority class under consideration. :param zeta: - Number of synthetic instances to be generated + Number of synthetic instances to be generated. :param k1: - Number of nearest neighbors used to generate the synthetic instances + Number of nearest neighbors used to generate the synthetic instances. :params k2, rTh, nTh: - The parameters used in clustering algorithm NBDOS + The parameters used in clustering algorithm NBDOS. :params w1, w2, r1, r2: - The parameters used for calculating the selection weights + The parameters used for calculating the selection weights. :param maj_int_min: Dict that contains lists of majority, intermediate and minority classes labels. :param shuffle: - Shuffle resampled data + Shuffle resampled data. :param metric: Metric to use for distance computation. :param p: Power parameter for Minkowski metric. + :param seed: + Seed for random state. 
""" super().__init__() self._sampling_type = 'over-sampling' @@ -151,6 +153,7 @@ def __init__(self, c: int, zeta: int, k1: int = 12, k2: int = 8, self.zeta = zeta self.k1 = k1 self.k2 = k2 + self.k3 = max(k1, k2) self.rTh = rTh self.nTh = nTh self.w1 = w1 @@ -158,6 +161,7 @@ def __init__(self, c: int, zeta: int, k1: int = 12, k2: int = 8, self.r1 = r1 self.r2 = r2 self.shuffle = shuffle + self.random_state = sklearn.utils.check_random_state(seed) if metric == 'minkowski': self._metric = neighbors.DistanceMetric.get_metric(metric, p=p) else: @@ -166,24 +170,53 @@ def __init__(self, c: int, zeta: int, k1: int = 12, k2: int = 8, def _pairwise_distance(self, x1, x2): return self._metric.pairwise([x1], [x2])[0, 0] - def _fit_resample(self, X, y): - """ - Performs resampling - - :param X: - Numpy array of examples that is the subject of resampling. - :param y: - Numpy array of labels corresponding to examples from X. - :return: - Resampled X along with accordingly modified labels, resampled y - """ - self.k1, self.k2, self.r1, self.r2 - k3 = max(self.k1, self.k2) - - # 1 - Sc = np.array([i for i, _ in enumerate(X) if y[i] == self.c]) - Sct = np.array([i for i, _ in enumerate(X) if y[i] != self.c]) - + def _dpn(self, X, i, j, Ss, dis_ij): + # Naming of variables follows the pseudo code of dPN + N_pn = [i, j] + mid_ij = (X[i] + X[j]) * 0.5 + for L in Ss: + dis_lm = self._pairwise_distance(X[L], mid_ij) + if 2 * dis_lm <= dis_ij: + N_pn.append(L) + return N_pn + + def _find_nearest_k3_in_sc(self, X, Sc, i): + dist_, ind_ = self.kdt_c.query([X[i]], self.k3 + 1) + ind, dist = ind_[0][1:], dist_[0][1:] + ind = Sc[ind] + self.N_c_k3_i[i] = ind + self.N_c_k3_d[i] = dist + self.N_c_k1_i[i] = ind[:self.k1] + self.N_c_k1_d[i] = dist[:self.k1] + self.r_k1_i[i] = ind[self.k1 - 1] + self.r_k1_d[i] = dist[self.k1 - 1] + + def _find_nearest_k3_in_sct(self, X, Sct, i): + dist_, ind_ = self.kdt_ct.query([X[i]], self.k3) + ind, dist = ind_[0], dist_[0] + ind = Sct[ind] + self.N_ct_k3_i[i] 
= ind + self.N_ct_k3_d[i] = dist + T_ind, T_dist = [], [] + for t_ind, t_dist in zip(ind, dist): + if t_dist <= self.r_k1_d[i]: + T_ind.append(t_ind) + T_dist.append(t_dist) + self.Fs_i[i] = np.concatenate([T_ind, self.N_c_k1_i[i]], axis=0) + self.Fs_d[i] = np.concatenate([T_dist, self.N_c_k1_d[i]], axis=0) + + def _find_k2_nearest_in_neighbor(self, X, i): + nc_nct_union_i = np.concatenate([self.N_c_k3_i[i], self.N_ct_k3_i[i]], + axis=0) + nc_nct_union_d = np.concatenate([self.N_c_k3_d[i], self.N_ct_k3_d[i]], + axis=0) + dist_, ind_ = neighbors.KDTree(X[nc_nct_union_i], + metric=self._metric).query([X[i]], self.k2) + ind, dist = ind_[0], dist_[0] + self.N_k2_i[i] = nc_nct_union_i[ind] + self.N_k2_d[i] = nc_nct_union_d[ind] + + def _compute_min_maj(self, y): if self.maj_int_min is None: M = y.size L = len(set(y)) @@ -197,145 +230,154 @@ def _fit_resample(self, X, y): y_maj_classes = {cls: cnt[cls] for cls in self.maj_int_min['maj'] + self.maj_int_min['int']} y_min_classes = {cls: cnt[cls] for cls in self.maj_int_min['min']} - - # nearest k3 instances from x_i in Sc - N_c_k3_i = dict() - N_c_k3_d = dict() - # nearest k1 instances from x_i in Sc - N_c_k1_i = dict() - N_c_k1_d = dict() - # nearest k3 instances from x_i in Sct - N_ct_k3_i = dict() - N_ct_k3_d = dict() - # nearest k2 instances from x_i in union of N_c_k3[i] and N_ct_k3[i] - N_k2_i = dict() - N_k2_d = dict() - - r_k1_d = dict() - r_k1_i = dict() - - Fs_i = dict() - Fs_d = dict() - - kdt_c = neighbors.KDTree(X[Sc]) - kdt_ct = neighbors.KDTree(X[Sct]) - for i in Sc: - # 2: 1) - dist_, ind_ = kdt_c.query([X[i]], k3 + 1) - ind, dist = ind_[0][1:], dist_[0][1:] - N_c_k3_i[i] = ind - N_c_k3_d[i] = dist - N_c_k1_i[i] = ind[:self.k1] - N_c_k1_d[i] = dist[:self.k1] - r_k1_i[i] = ind[self.k1 - 1] - r_k1_d[i] = dist[self.k1 - 1] - - # 2: 2) - dist_, ind_ = kdt_ct.query([X[i]], k3) - ind, dist = ind_[0], dist_[0] - N_ct_k3_i[i] = ind - N_ct_k3_d[i] = dist - T_ind, T_dist = [], [] - for t_ind, t_dist in zip(ind, 
dist): - if t_dist <= r_k1_d[i]: - T_ind.append(t_ind) - T_dist.append(t_dist) - Fs_i[i] = np.concatenate([T_ind, N_c_k1_i[i]], axis=0) - Fs_d[i] = np.concatenate([T_dist, N_c_k1_d[i]], axis=0) - - # 2: 3) - nc_nct_union_i = np.concatenate([N_c_k3_i[i], N_ct_k3_i[i]], - axis=0) - nc_nct_union_d = np.concatenate([N_c_k3_d[i], N_ct_k3_d[i]], - axis=0) - dist_, ind_ = neighbors.KDTree(X[nc_nct_union_i]).query([X[i]]) - ind, dist = ind_[0], dist_[0] - N_k2_i[i] = nc_nct_union_i[ind[0]] - N_k2_d[i] = nc_nct_union_d[ind[0]] - - # 3: - N_k2_dct = {index: neighbors for index, neighbors in N_k2_i.items()} - Sc_cl = _nbdos(Sc, self.k2, N_k2_dct, self.rTh, self.nTh) - - # 4: - OiC = Sc[Sc_cl != 0] - TiC = Sc[Sc_cl == 0] - - # 5: + return y_maj_classes, y_min_classes + + def _run_nbdos(self, Sc): + N_k2_dct = {index: neighbors_ for index, neighbors_ in + self.N_k2_i.items()} + return _nbdos(Sc, self.k2, N_k2_dct, self.rTh, self.nTh) + + def _compute_selection_weight(self, X, Sc, i, j, y_min_classes, + y_maj_classes): + dis_ij = self._pairwise_distance(X[i], X[j]) + Ss = _compute_ss(self.Fs_i, self.Fs_d, i, dis_ij) + N_pn = self._dpn(X, i, j, Ss, dis_ij) + y_mi_pn = {k: v for k, v in y_min_classes.items() if + k in N_pn} + y_ma_pn = {k: v for k, v in y_maj_classes.items() if + k in N_pn} + y_mi = sum(y_mi_pn.values()) + y_ma = sum(y_ma_pn.values()) + E_mi = _normalized_entropy(y_mi_pn.values()) + E_ma = _normalized_entropy(y_ma_pn.values()) + + y_c = len(Sc) + exponent1 = self.r1 * y_mi / y_c + self.r2 * E_mi + self.w2 * ( + self.r1 * y_ma / y_c + self.r2 * E_ma) + exponent2 = -y_c / (y_ma + y_mi + y_c) + return 1.0 / np.exp(exponent1) + self.w1 * np.exp( + exponent2) + + def _compute_selection_weights(self, X, Sc, TiC, OiC, y_min_classes, + y_maj_classes): Sw = defaultdict(dict) for i in TiC: - for j in N_c_k1_i[i]: - # 5: 1) - if j in OiC and j in N_c_k1_i and i in N_c_k1_i[j]: + for j in self.N_c_k1_i[i]: + if j in OiC and j in self.N_c_k1_i and i in self.N_c_k1_i[j]: 
Sw[i][j] = 1 + self.w1 / np.e - # 5: 2) - elif j in N_c_k1_i and i in N_c_k1_i[j] and j in Sw and i in \ + elif j in self.N_c_k1_i and i in self.N_c_k1_i[ + j] and j in Sw and i in \ Sw[j]: Sw[i][j] = Sw[j][i] - # 5: 3) else: - dis_ij = self._pairwise_distance(X[i], X[j]) - Ss = _compute_ss(Fs_i, Fs_d, i, dis_ij) - N_pn = _dpn(X, i, j, Ss) - y_mi_pn = {k: v for k, v in y_min_classes.items() if - k in N_pn} - y_ma_pn = {k: v for k, v in y_maj_classes.items() if - k in N_pn} - y_mi = sum(y_mi_pn.values()) - y_ma = sum(y_ma_pn.values()) - E_mi = _normalized_entropy(y_mi_pn.values()) - E_ma = _normalized_entropy(y_ma_pn.values()) - - y_c = len(Sc) - exponent1 = self.r1 * y_mi / y_c + self.r2 * E_mi + self.w2 * ( - self.r1 * y_ma / y_c + self.r2 * E_ma) - exponent2 = -y_c / (y_ma + y_mi + y_c) - Sw[i][j] = 1.0 / np.exp(exponent1) + self.w1 * np.exp( - exponent2) - - # 6: + Sw[i][j] = self._compute_selection_weight(X, Sc, i, j, + y_min_classes, + y_maj_classes) + return Sw + + def _obtain_probability_distribution(self, X, y, Sw, TiC): P = defaultdict(dict) for i in TiC: - # 6: 1) for j in Sw[i]: dis_ij = self._pairwise_distance(X[i], X[j]) - Ss = _compute_ss(Fs_i, Fs_d, i, dis_ij) - PN = _dpn(X, i, j, Ss) + Ss = _compute_ss(self.Fs_i, self.Fs_d, i, dis_ij) + PN = self._dpn(X, i, j, Ss, dis_ij) if (y[PN] == self.c).all(): break else: - if i not in N_c_k1_i[i]: - N_c_k1_i[i] = np.append(N_c_k1_i[i], i) - N_c_k1_d[i] = np.append(N_c_k1_d[i], 0.) + if i not in self.N_c_k1_i[i]: + self.N_c_k1_i[i] = np.append(self.N_c_k1_i[i], i) + self.N_c_k1_d[i] = np.append(self.N_c_k1_d[i], 0.) 
Sw[i][i] = 1 + self.w1 / np.e - # 6: 2) for j in Sw[i]: - P[i][j] = Sw[i][j] / sum(Sw[i][l] for l in N_c_k1_i[i]) + P[i][j] = Sw[i][j] / sum(Sw[i][k] for k in self.N_c_k1_i[i]) + return P - # 7: + def _compute_number_of_synthetic_instances(self, Sc): N_syn = dict() div, mod = divmod(self.zeta, Sc.shape[0]) for i in Sc: N_syn[i] = div + (mod > 0) mod -= 1 + return N_syn - # 8: + def _generate_synthetic_instances(self, X, Sc, N_syn, TiC, P): SI = [] for i in Sc: - # 8: 3) for _ in range(N_syn[i]): - # 8: 1) if i in TiC: - p = [P[i][j] for j in N_c_k1_i[i]] - j = np.random.choice(N_c_k1_i[i], size=1, p=p) + p = [P[i][j] for j in self.N_c_k1_i[i]] + j = self.random_state.choice(self.N_c_k1_i[i], size=1, p=p) else: - j = np.random.choice(N_c_k1_i, size=1) - # 8: 2) - si = X[i] + (X[j] - X[i]) * np.random.rand(*X[j].shape) + j = self.random_state.choice(self.N_c_k1_i[i], size=1) + si = X[i] + (X[j] - X[i]) * self.random_state.rand(*X[j].shape) SI.append(si) - X_resampled = np.concatenate([X, np.concatenate(SI, 0)], 0) - y_resampled = np.concatenate([y, [self.c] * len(SI)], 0) + return np.concatenate(SI, 0) + + def _setup(self): + # nearest k3 instances from x_i in Sc + self.N_c_k3_i = dict() + self.N_c_k3_d = dict() + # nearest k1 instances from x_i in Sc + self.N_c_k1_i = dict() + self.N_c_k1_d = dict() + # nearest k3 instances from x_i in Sct + self.N_ct_k3_i = dict() + self.N_ct_k3_d = dict() + # nearest k2 instances from x_i in union of N_c_k3[i] and N_ct_k3[i] + self.N_k2_i = dict() + self.N_k2_d = dict() + + self.r_k1_d = dict() + self.r_k1_i = dict() + + self.Fs_i = dict() + self.Fs_d = dict() + + def _fit_resample(self, X, y): + """ + Performs resampling + + :param X: + Numpy array of examples that is the subject of resampling. + :param y: + Numpy array of labels corresponding to examples from X. 
+ :return: + Resampled X along with accordingly modified labels, resampled y + """ + + # 1 + Sc = np.array([i for i, _ in enumerate(X) if y[i] == self.c]) + Sct = np.array([i for i, _ in enumerate(X) if y[i] != self.c]) + + y_maj_classes, y_min_classes = self._compute_min_maj(y) + + self._setup() + + self.kdt_c = neighbors.KDTree(X[Sc], metric=self._metric) + self.kdt_ct = neighbors.KDTree(X[Sct], metric=self._metric) + + for i in Sc: + self._find_nearest_k3_in_sc(X, Sc, i) + self._find_nearest_k3_in_sct(X, Sct, i) + self._find_k2_nearest_in_neighbor(X, i) + + Sc_cl = self._run_nbdos(Sc) + + OiC = Sc[Sc_cl != 0] + TiC = Sc[Sc_cl == 0] + + Sw = self._compute_selection_weights(X, Sc, TiC, OiC, y_min_classes, + y_maj_classes) + P = self._obtain_probability_distribution(X, y, Sw, TiC) + N_syn = self._compute_number_of_synthetic_instances(Sc) + SI = self._generate_synthetic_instances(X, Sc, N_syn, TiC, P) + + X_resampled = np.concatenate([X, SI], 0) + y_resampled = np.concatenate([y, [self.c] * SI.shape[0]], 0) + if self.shuffle: X_resampled, y_resampled = array_util.shuffle(X_resampled, - y_resampled) + y_resampled, + state=self.random_state) return X_resampled, y_resampled From 6119e01244617c969e09315b5f4f7d7aceb0dc03 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Mon, 5 Dec 2022 09:29:33 +0100 Subject: [PATCH 20/48] Add example notebook for SMOM --- examples/resampling/SMOM.ipynb | 218 +++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) create mode 100644 examples/resampling/SMOM.ipynb diff --git a/examples/resampling/SMOM.ipynb b/examples/resampling/SMOM.ipynb new file mode 100644 index 0000000..b9b9d6e --- /dev/null +++ b/examples/resampling/SMOM.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 28, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + 
"metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Unzip datasets and prepare data:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(336, 7)\n", + "(336,)\n", + "Counter({0: 145, 1: 77, 4: 52, 2: 37, 3: 25})\n", + "[[0.49 0.29 0.48 0.5 0.56 0.24 0.35]\n", + " [0.07 0.4 0.48 0.5 0.54 0.35 0.44]\n", + " [0.56 0.4 0.48 0.5 0.49 0.37 0.46]\n", + " [0.59 0.49 0.48 0.5 0.52 0.45 0.36]\n", + " [0.23 0.32 0.48 0.5 0.55 0.25 0.35]]\n", + "[0 0 0 0 0]\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from multi_imbalance.datasets import load_datasets\n", + "from multi_imbalance.utils.data import construct_flat_2pc_df\n", + "from multi_imbalance.utils.min_int_maj import maj_int_min\n", + "\n", + "%matplotlib inline\n", + "sns.set_style('darkgrid')\n", + "dataset_name = 'new_ecoli'\n", + "dataset = load_datasets()[dataset_name]\n", + "\n", + "X, y = dataset.data, dataset.target\n", + "print(X.shape)\n", + "print(y.shape)\n", + "print(Counter(y))\n", + "print(X[:5])\n", + "print(y[:5])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Resample data using SMOM algorithm" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 30, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'maj': [0, 1], 'int': [], 'min': [4, 2, 3]}\n", + "(386, 7) (386,)\n" + ] + } + ], + "source": [ + "from multi_imbalance.resampling.smom import SMOM\n", + "clf = SMOM(maj_int_min=maj_int_min[dataset_name], c=3, zeta=50, shuffle=True, seed=1234)\n", + "print(maj_int_min[dataset_name])\n", + "resampled_X, resampled_y = 
clf.fit_resample(X, y)\n", + "print(resampled_X.shape, resampled_y.shape)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Compare results by plotting data in 2 dimensions" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 31, + "outputs": [ + { + "data": { + "text/plain": "" + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "text/plain": "
", + "image/png": "\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "n = len(Counter(y).keys())\n", + "p = sns.color_palette(\"husl\", n)\n", + "\n", + "pca = PCA(n_components=2)\n", + "pca.fit(X)\n", + "\n", + "fig, axs = plt.subplots(ncols=2, nrows=2)\n", + "fig.set_size_inches( 16, 10)\n", + "axs = axs.flatten()\n", + "\n", + "axs[1].set_title(\"Base\")\n", + "sns.countplot(y, ax=axs[0], palette=p)\n", + "X = pca.transform(X)\n", + "df = construct_flat_2pc_df(X, y)\n", + "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[1], legend='full', palette=p)\n", + "\n", + "\n", + "axs[3].set_title(\"SMOM\")\n", + "sns.countplot(resampled_y, ax=axs[2],palette=p)\n", + "resampled_X = pca.transform(resampled_X)\n", + "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", + "sns.scatterplot(x='x1', y='x2', hue='y', style='y', data=df, alpha=0.7, ax=axs[3], legend='full', palette=p)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": 31, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + }, + "kernelspec": { + "name": "python3", + "language": "python", + "display_name": "Python 3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From a258372eb2fd9e8c0d8f9af93dffe28d27ba3976 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Mon, 5 Dec 2022 10:50:06 +0100 Subject: [PATCH 21/48] Omit class under consideration in majority/minority classes in SMOM --- examples/resampling/SMOM.ipynb | 14 +++++++------- multi_imbalance/resampling/smom.py | 11 +++++++---- 2 
files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/resampling/SMOM.ipynb b/examples/resampling/SMOM.ipynb index b9b9d6e..c4b8d27 100644 --- a/examples/resampling/SMOM.ipynb +++ b/examples/resampling/SMOM.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 28, + "execution_count": 32, "outputs": [], "source": [ "%reload_ext autoreload\n", @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 33, "outputs": [ { "name": "stdout", @@ -91,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "outputs": [ { "name": "stdout", @@ -130,13 +130,13 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "outputs": [ { "data": { - "text/plain": "" + "text/plain": "" }, - "execution_count": 31, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" }, @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 35, "outputs": [], "source": [], "metadata": { diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py index 68fdb3a..c353f54 100644 --- a/multi_imbalance/resampling/smom.py +++ b/multi_imbalance/resampling/smom.py @@ -1,4 +1,4 @@ -"""Implementations of SMOM tqchnique and DBOS clustering algorithm.""" +"""Implementations of SMOM technique and NBDOS clustering algorithm.""" from collections import Counter, defaultdict @@ -211,7 +211,8 @@ def _find_k2_nearest_in_neighbor(self, X, i): nc_nct_union_d = np.concatenate([self.N_c_k3_d[i], self.N_ct_k3_d[i]], axis=0) dist_, ind_ = neighbors.KDTree(X[nc_nct_union_i], - metric=self._metric).query([X[i]], self.k2) + metric=self._metric).query([X[i]], + self.k2) ind, dist = ind_[0], dist_[0] self.N_k2_i[i] = nc_nct_union_i[ind] self.N_k2_d[i] = nc_nct_union_d[ind] @@ -228,8 +229,10 @@ def _compute_min_maj(self, y): if 'int' not in self.maj_int_min: self.maj_int_min['int'] = [] y_maj_classes = {cls: cnt[cls] for cls in - 
self.maj_int_min['maj'] + self.maj_int_min['int']} - y_min_classes = {cls: cnt[cls] for cls in self.maj_int_min['min']} + self.maj_int_min['maj'] + self.maj_int_min['int'] + if cls != self.c} + y_min_classes = {cls: cnt[cls] for cls in self.maj_int_min['min'] + if cls != self.c} return y_maj_classes, y_min_classes def _run_nbdos(self, Sc): From addc99fd9d9c30b79eee3b2cf116787b49d53ea7 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Mon, 5 Dec 2022 19:49:10 +0100 Subject: [PATCH 22/48] Remove TODO in SMOM --- multi_imbalance/resampling/smom.py | 1 - 1 file changed, 1 deletion(-) diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py index c353f54..c7c5aff 100644 --- a/multi_imbalance/resampling/smom.py +++ b/multi_imbalance/resampling/smom.py @@ -86,7 +86,6 @@ def _compute_ss(Fs_i, Fs_d, i, dst): def _normalized_entropy(classes_counts: Sequence[int]): - # TODO: compare with the pseudo code in the article (falbogowski) if len(classes_counts) <= 1: E = 0 else: From 11f0f47b1b3006863e2555550633d17f52410667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Fri, 9 Dec 2022 20:09:36 +0100 Subject: [PATCH 23/48] second version --- examples/datasets/analysis.ipynb | 519 +++++++++++---------------- multi_imbalance/datasets/analysis.py | 272 +++++++------- tests/datasets/test_analysis.py | 19 +- 3 files changed, 368 insertions(+), 442 deletions(-) diff --git a/examples/datasets/analysis.ipynb b/examples/datasets/analysis.ipynb index 77c3583..69a99d2 100644 --- a/examples/datasets/analysis.ipynb +++ b/examples/datasets/analysis.ipynb @@ -6,12 +6,21 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.gaussian_process import GaussianProcessClassifier\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", "from imblearn.metrics import geometric_mean_score\n", - "from 
multi_imbalance.datasets.analysis import AnalysisPipeline, Config\n", "from pathlib import Path\n", "from sklearn.metrics import accuracy_score\n", - "from multi_imbalance.datasets import load_datasets" + "from tempfile import NamedTemporaryFile\n", + "\n", + "from multi_imbalance.datasets.analysis import AnalysisPipeline, Config, Result\n", + "from multi_imbalance.datasets import load_datasets\n", + "from multi_imbalance.resampling.soup import SOUP\n", + "from multi_imbalance.resampling.spider import SPIDER3\n", + "from multi_imbalance.resampling.static_smote import StaticSMOTE\n", + "from multi_imbalance.resampling.global_cs import GlobalCS\n", + "from multi_imbalance.resampling.mdo import MDO\n" ] }, { @@ -23,15 +32,6 @@ "load_datasets(save_to_csv=True)" ] }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "Path(\"results.csv\").unlink()" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -43,34 +43,22 @@ "config = {\n", " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"glass.csv\"],\n", " \"classifiers\": {\n", - " \"tree\": [{\"max_depth\" : 100}, {\"max_depth\" : 20}, {\"max_depth\" : None}],\n", + " DecisionTreeClassifier: [{\"max_depth\" : 100}, {}],\n", + " KNeighborsClassifier: [{\"n_neighbors\": 7}, {}],\n", + " GaussianProcessClassifier: [ {\"max_iter_predict\": 250}, {}]\n", " },\n", " \"resample_methods\": {\n", - " \"globalCS\": {\"shuffle\": True},\n", - " \"MDO\": dict(k1_frac=0.3, maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}),\n", - " \"StaticSMOTE\": {},\n", - " \"SOUP\": {\"shuffle\": True},\n", + " GlobalCS: {\"all\": {\"shuffle\": True}},\n", + " MDO: {\"all\": {\"k1_frac\": 0.3, \"maj_int_min\":{\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}}},\n", + " StaticSMOTE: {\"all\":{}},\n", + " SOUP: {\"all\" : {\"shuffle\": True}},\n", + " SPIDER3: {\"all\": {\"k\":5}}\n", " },\n", " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}},\n", - " \"n_repeats\": 10,\n", - " 
\"train_test_split_kwargs\": dict(test_size=0.35),\n", + " \"n_repeats\": 1,\n", + " \"stratifiedkfold_params\": dict(n_splits=2, shuffle=True),\n", "}\n", - "\n", - "config2 = {\n", - " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"new_ecoli.csv\"],\n", - " \"classifiers\": {\n", - " \"tree\": [{\"max_depth\" : 100}, {\"max_depth\" : 20}, {\"max_depth\" : None}],\n", - " },\n", - " \"resample_methods\": {\n", - " \"globalCS\": {\"shuffle\": True},\n", - " \"MDO\": dict(k1_frac=0.3, maj_int_min={\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}),\n", - " \"StaticSMOTE\": {},\n", - " \"SOUP\": {\"shuffle\": True},\n", - " },\n", - " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}},\n", - " \"n_repeats\": 10,\n", - " \"train_test_split_kwargs\": dict(test_size=0.35),\n", - "}\n" + "\n" ] }, { @@ -82,92 +70,26 @@ "name": "stderr", "output_type": "stream", "text": [ - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " 
warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", 
- "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - 
"c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - 
"c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.test\\lib\\site-packages\\imblearn\\utils\\_validation.py:299: UserWarning: After over-sampling, the number of samples (96) in class 2 will be larger than the number of samples in the majority class (class #0 -> 94)\n", - " warnings.warn(\n" + "[09.12.2022 20:06:30] ERROR AnalysisPipeline Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='decisiontreeclassifier'\n", + "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='decisiontreeclassifier'\n", + "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and 
clf_name='kneighborsclassifier'\n", + "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='kneighborsclassifier'\n", + "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='gaussianprocessclassifier'\n", + "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='gaussianprocessclassifier'\n" ] } ], "source": [ + "result_file = NamedTemporaryFile(suffix=\".csv\")\n", + "result_file.close()\n", "c = Config.from_dict(config)\n", - "c2 = Config.from_dict(config2)\n", - "pipeline = AnalysisPipeline([c, c2])\n", - "pipeline.run_analysis(\"results.csv\", explode_clf_kwargs = False, train_without_resampling = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "df=pd.read_csv(\"results.csv\")\n", - "df.fillna(0, inplace=True)\n", - "group_df = df[[\"dataset_name\", \"classifier\", \"resampling_method\", \"metric_value\", \"metric_name\", \"kwargs\"]].groupby(by=[\"classifier\",\"metric_name\" ,\"dataset_name\", \"resampling_method\", \"kwargs\"]).agg({'metric_value': ['mean', 'std']})" + "pipeline = AnalysisPipeline(c)\n", + "pipeline.run_analysis(result_file.name, train_without_resampling = True)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -197,233 +119,212 @@ " \n", " \n", " \n", - " metric_value\n", + " \n", + " metric_value\n", " \n", " \n", " \n", " \n", " \n", + " \n", " mean\n", " std\n", + " min\n", + " max\n", + " median\n", " \n", " \n", " dataset_name\n", + " classifier\n", " resampling_method\n", - " kwargs\n", + " metric_name\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", - " glass\n", - " MDO\n", - " {'max_depth': 100}\n", - " 0.643855\n", - " 0.070864\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.594653\n", - " 0.158337\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.636454\n", - " 0.094362\n", - " \n", - " \n", - " Not defined\n", - " {'max_depth': 100}\n", - " 0.534718\n", - " 0.175402\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.572939\n", - " 0.147061\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.631542\n", - " 0.062332\n", - " \n", - " \n", - " SOUP\n", - " {'max_depth': 100}\n", - " 0.660234\n", - " 0.052351\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.619425\n", - " 0.095159\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.656091\n", - " 0.073783\n", - " \n", - " \n", - " StaticSMOTE\n", - " {'max_depth': 100}\n", - " 0.621695\n", - " 0.147462\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.649909\n", - " 0.053088\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.531951\n", - " 0.171160\n", - " \n", - " \n", - " globalCS\n", - " {'max_depth': 100}\n", - " 0.632791\n", - " 0.070711\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.506650\n", - " 0.160718\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.580105\n", - " 0.099831\n", - " \n", - " \n", - " new_ecoli\n", - " MDO\n", - " {'max_depth': 100}\n", - " 0.777760\n", - " 0.052380\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.756026\n", - " 0.041473\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.754241\n", - " 0.050392\n", - " \n", - " \n", - " Not defined\n", - " {'max_depth': 100}\n", - " 0.696599\n", - " 0.043062\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.706752\n", - " 0.076627\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.720547\n", - " 0.062143\n", - " \n", - " \n", - " SOUP\n", - " {'max_depth': 100}\n", - " 0.749857\n", - " 0.028518\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.747158\n", - " 0.046176\n", - " \n", - " \n", - " 
{'max_depth': None}\n", - " 0.739759\n", - " 0.075347\n", - " \n", - " \n", - " StaticSMOTE\n", - " {'max_depth': 100}\n", - " 0.719496\n", - " 0.048702\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.721193\n", - " 0.050183\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.694062\n", - " 0.066351\n", - " \n", - " \n", - " globalCS\n", - " {'max_depth': 100}\n", - " 0.667123\n", - " 0.049824\n", - " \n", - " \n", - " {'max_depth': 20}\n", - " 0.694378\n", - " 0.075182\n", - " \n", - " \n", - " {'max_depth': None}\n", - " 0.671451\n", - " 0.058819\n", + " glass\n", + " decisiontreeclassifier\n", + " Not defined\n", + " geometric_mean_score\n", + " 0.423840\n", + " 0.219519\n", + " 0.224079\n", + " 0.632994\n", + " 0.419144\n", + " \n", + " \n", + " globalcs\n", + " geometric_mean_score\n", + " 0.460604\n", + " 0.173743\n", + " 0.235397\n", + " 0.641363\n", + " 0.482827\n", + " \n", + " \n", + " mdo\n", + " geometric_mean_score\n", + " 0.496419\n", + " 0.207675\n", + " 0.198813\n", + " 0.670846\n", + " 0.558009\n", + " \n", + " \n", + " soup\n", + " geometric_mean_score\n", + " 0.629931\n", + " 0.018804\n", + " 0.607990\n", + " 0.649186\n", + " 0.631274\n", + " \n", + " \n", + " spider3\n", + " geometric_mean_score\n", + " 0.454395\n", + " 0.155110\n", + " 0.231247\n", + " 0.583301\n", + " 0.501516\n", + " \n", + " \n", + " kneighborsclassifier\n", + " Not defined\n", + " geometric_mean_score\n", + " 0.150199\n", + " 0.056244\n", + " 0.071968\n", + " 0.205163\n", + " 0.161833\n", + " \n", + " \n", + " globalcs\n", + " geometric_mean_score\n", + " 0.611671\n", + " 0.084948\n", + " 0.551014\n", + " 0.732382\n", + " 0.581645\n", + " \n", + " \n", + " mdo\n", + " geometric_mean_score\n", + " 0.381856\n", + " 0.197294\n", + " 0.205791\n", + " 0.607216\n", + " 0.357208\n", + " \n", + " \n", + " soup\n", + " geometric_mean_score\n", + " 0.553234\n", + " 0.036744\n", + " 0.514661\n", + " 0.598835\n", + " 0.549719\n", + " \n", + " \n", + " spider3\n", + " 
geometric_mean_score\n", + " 0.472230\n", + " 0.050068\n", + " 0.408403\n", + " 0.516181\n", + " 0.482168\n", " \n", " \n", "\n", "" ], "text/plain": [ - " metric_value \n", - " mean std\n", - "dataset_name resampling_method kwargs \n", - "glass MDO {'max_depth': 100} 0.643855 0.070864\n", - " {'max_depth': 20} 0.594653 0.158337\n", - " {'max_depth': None} 0.636454 0.094362\n", - " Not defined {'max_depth': 100} 0.534718 0.175402\n", - " {'max_depth': 20} 0.572939 0.147061\n", - " {'max_depth': None} 0.631542 0.062332\n", - " SOUP {'max_depth': 100} 0.660234 0.052351\n", - " {'max_depth': 20} 0.619425 0.095159\n", - " {'max_depth': None} 0.656091 0.073783\n", - " StaticSMOTE {'max_depth': 100} 0.621695 0.147462\n", - " {'max_depth': 20} 0.649909 0.053088\n", - " {'max_depth': None} 0.531951 0.171160\n", - " globalCS {'max_depth': 100} 0.632791 0.070711\n", - " {'max_depth': 20} 0.506650 0.160718\n", - " {'max_depth': None} 0.580105 0.099831\n", - "new_ecoli MDO {'max_depth': 100} 0.777760 0.052380\n", - " {'max_depth': 20} 0.756026 0.041473\n", - " {'max_depth': None} 0.754241 0.050392\n", - " Not defined {'max_depth': 100} 0.696599 0.043062\n", - " {'max_depth': 20} 0.706752 0.076627\n", - " {'max_depth': None} 0.720547 0.062143\n", - " SOUP {'max_depth': 100} 0.749857 0.028518\n", - " {'max_depth': 20} 0.747158 0.046176\n", - " {'max_depth': None} 0.739759 0.075347\n", - " StaticSMOTE {'max_depth': 100} 0.719496 0.048702\n", - " {'max_depth': 20} 0.721193 0.050183\n", - " {'max_depth': None} 0.694062 0.066351\n", - " globalCS {'max_depth': 100} 0.667123 0.049824\n", - " {'max_depth': 20} 0.694378 0.075182\n", - " {'max_depth': None} 0.671451 0.058819" + " metric_value \\\n", + " mean \n", + "dataset_name classifier resampling_method metric_name \n", + "glass decisiontreeclassifier Not defined geometric_mean_score 0.423840 \n", + " globalcs geometric_mean_score 0.460604 \n", + " mdo geometric_mean_score 0.496419 \n", + " soup geometric_mean_score 0.629931 \n", + " 
spider3 geometric_mean_score 0.454395 \n", + " kneighborsclassifier Not defined geometric_mean_score 0.150199 \n", + " globalcs geometric_mean_score 0.611671 \n", + " mdo geometric_mean_score 0.381856 \n", + " soup geometric_mean_score 0.553234 \n", + " spider3 geometric_mean_score 0.472230 \n", + "\n", + " \\\n", + " std \n", + "dataset_name classifier resampling_method metric_name \n", + "glass decisiontreeclassifier Not defined geometric_mean_score 0.219519 \n", + " globalcs geometric_mean_score 0.173743 \n", + " mdo geometric_mean_score 0.207675 \n", + " soup geometric_mean_score 0.018804 \n", + " spider3 geometric_mean_score 0.155110 \n", + " kneighborsclassifier Not defined geometric_mean_score 0.056244 \n", + " globalcs geometric_mean_score 0.084948 \n", + " mdo geometric_mean_score 0.197294 \n", + " soup geometric_mean_score 0.036744 \n", + " spider3 geometric_mean_score 0.050068 \n", + "\n", + " \\\n", + " min \n", + "dataset_name classifier resampling_method metric_name \n", + "glass decisiontreeclassifier Not defined geometric_mean_score 0.224079 \n", + " globalcs geometric_mean_score 0.235397 \n", + " mdo geometric_mean_score 0.198813 \n", + " soup geometric_mean_score 0.607990 \n", + " spider3 geometric_mean_score 0.231247 \n", + " kneighborsclassifier Not defined geometric_mean_score 0.071968 \n", + " globalcs geometric_mean_score 0.551014 \n", + " mdo geometric_mean_score 0.205791 \n", + " soup geometric_mean_score 0.514661 \n", + " spider3 geometric_mean_score 0.408403 \n", + "\n", + " \\\n", + " max \n", + "dataset_name classifier resampling_method metric_name \n", + "glass decisiontreeclassifier Not defined geometric_mean_score 0.632994 \n", + " globalcs geometric_mean_score 0.641363 \n", + " mdo geometric_mean_score 0.670846 \n", + " soup geometric_mean_score 0.649186 \n", + " spider3 geometric_mean_score 0.583301 \n", + " kneighborsclassifier Not defined geometric_mean_score 0.205163 \n", + " globalcs geometric_mean_score 0.732382 \n", + " mdo 
geometric_mean_score 0.607216 \n", + " soup geometric_mean_score 0.598835 \n", + " spider3 geometric_mean_score 0.516181 \n", + "\n", + " \n", + " median \n", + "dataset_name classifier resampling_method metric_name \n", + "glass decisiontreeclassifier Not defined geometric_mean_score 0.419144 \n", + " globalcs geometric_mean_score 0.482827 \n", + " mdo geometric_mean_score 0.558009 \n", + " soup geometric_mean_score 0.631274 \n", + " spider3 geometric_mean_score 0.501516 \n", + " kneighborsclassifier Not defined geometric_mean_score 0.161833 \n", + " globalcs geometric_mean_score 0.581645 \n", + " mdo geometric_mean_score 0.357208 \n", + " soup geometric_mean_score 0.549719 \n", + " spider3 geometric_mean_score 0.482168 " ] }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "group_df.loc[\"tree\", \"geometric_mean_score\", :, :]" + "d = {\"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"]}\n", + "import pandas as pd\n", + "\n", + "pd.concat(pipeline.generate_summary(d, [min, max]))" ] } ], diff --git a/multi_imbalance/datasets/analysis.py b/multi_imbalance/datasets/analysis.py index 74947a2..10ef771 100644 --- a/multi_imbalance/datasets/analysis.py +++ b/multi_imbalance/datasets/analysis.py @@ -1,23 +1,15 @@ from copy import deepcopy from dataclasses import dataclass -import json from pathlib import Path -from typing import Callable, Dict, List, Tuple, Union +from typing import Callable, Dict, Iterable, List, Tuple, Union import pandas as pd from sklearn.base import ClassifierMixin -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import GaussianNB -from sklearn.neighbors import KNeighborsClassifier -from sklearn.tree import DecisionTreeClassifier +from sklearn.model_selection import StratifiedKFold from imblearn.base import BaseSampler from sklearn.pipeline import _name_estimators import logging +from itertools import product -from 
multi_imbalance.resampling.global_cs import GlobalCS -from multi_imbalance.resampling.soup import SOUP -from multi_imbalance.resampling.spider import SPIDER3 -from multi_imbalance.resampling.mdo import MDO -from multi_imbalance.resampling.static_smote import StaticSMOTE logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s %(name)s %(message)s", datefmt="%d.%m.%Y %H:%M:%S") @@ -25,11 +17,11 @@ @dataclass class Config: datasets: List[str] - classifiers: Dict[Union[str, ClassifierMixin], List[Dict]] - resample_methods: Dict[Union[str, BaseSampler], Dict] + classifiers: Dict[ClassifierMixin, List[Dict]] + resample_methods: Dict[BaseSampler, Dict[str, Dict]] metrics: Dict[Callable, Dict] n_repeats: int - train_test_split_kwargs: Dict + stratifiedkfold_params: Dict @classmethod def from_dict(cls, config: Dict) -> "Config": @@ -44,94 +36,135 @@ class Result: metric_name: str metric_value: float no_repeat: int - kwargs: Dict + clf_params: Dict class AnalysisPipeline: - _allowed_resampling = ["globalCS", "StaticSMOTE", "SOUP", "spider3", "MDO"] - _allowed_classifiers = ["tree", "NB", "KNN"] - - def __init__(self, configs: List[Config]) -> None: + def __init__(self, config: Config) -> None: self.__logger = logging.getLogger("AnalysisPipeline") - self._configs = configs - self.__resampling_methods = {"globalCS": GlobalCS, "StaticSMOTE": StaticSMOTE, "SOUP": SOUP, "spider3": SPIDER3, "MDO": MDO} - self.__classifiers = {"tree": DecisionTreeClassifier, "NB": GaussianNB, "KNN": KNeighborsClassifier} - - def run_analysis(self, output_path: str, explode_clf_kwargs: bool, train_without_resampling: bool): + self._config = config + self.__metrics = self._config.metrics + self.__n_repeats = self._config.n_repeats + self.__stratifiedkfold_params = self._config.stratifiedkfold_params + self.__iter = 0 + self.__chunksize = 10000 + + def run_analysis(self, output_path: str, train_without_resampling: bool) -> None: self._output_path = Path(output_path) - for config in 
self._configs: - self._config = config - self.__metrics = self._config.metrics - self.__n_repeats = self._config.n_repeats - self.__tts_kwargs = self._config.train_test_split_kwargs - for clf_name, clf, clf_kwargs in self._get_classifier(): - for n in range(1, self.__n_repeats + 1): - for dataset_name, dataset in self._get_dataset(): - for resampler_name, resampler in self._get_resampler(): - tmp_clf = deepcopy(clf) - X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] - X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **self.__tts_kwargs) - - try: - X_train_res, y_train_res = resampler.fit_resample(X_train, y_train) - results = [] - tmp_clf.fit(X_train_res, y_train_res) - y_pred = tmp_clf.predict(X_test) - - for metric, kwargs in self.__metrics.items(): - results.append( - Result( - dataset_name, - clf_name, - resampler_name, - metric.__name__, - metric(y_test, y_pred, **kwargs), - n, - clf_kwargs, - ) - ) - df_results = pd.DataFrame(results) - if self._output_path.exists(): - df_results.to_csv(output_path, mode="a", index=False, header=False) - else: - df_results.to_csv(output_path, index=False) - except Exception as e: - self.__logger.error(f"Raised exception '{e}' for {dataset_name=}, {resampler_name=} and {clf_name=}") - - if train_without_resampling: - tmp_clf = deepcopy(clf) - results = [] - X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] - X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, **self.__tts_kwargs) - tmp_clf.fit(X_train, y_train) - y_pred = tmp_clf.predict(X_test) - - for metric, kwargs in self.__metrics.items(): - results.append( - Result( - dataset_name, - clf_name, - "Not defined", - metric.__name__, - metric(y_test, y_pred, **kwargs), - n, - clf_kwargs, - ) - ) - df_results = pd.DataFrame(results) - if self._output_path.exists(): - df_results.to_csv(output_path, mode="a", index=False, header=False) - else: - df_results.to_csv(output_path, index=False) - - if explode_clf_kwargs: - df_results = 
pd.read_csv(output_path) - df_results = pd.concat( - [df_results.drop(columns="kwargs"), df_results["kwargs"].apply(lambda x: dict(eval(x))).apply(pd.Series)], axis=1 - ) - df_results.to_csv(output_path, index=False) - - def _get_dataset(self) -> Tuple[str, pd.DataFrame]: + list_of_errors = [] + + for clf_data, n, dataset_data in product(self._get_classifier(), range(self.__n_repeats), self._get_dataset()): + for resampler_data in self._get_resampler(train_without_resampling, dataset_name=dataset_data[0]): + self._prepare_result(clf_data, dataset_data, resampler_data, output_path, n, list_of_errors) + self.__iter += 1 + + self.__iter = 0 + if list_of_errors: + self.__logger.error("\n".join(list_of_errors)) + + def explode_clf_params(self, input_path: str, output_path: str) -> None: + df_results = pd.read_csv(input_path) + df_results = pd.concat( + [df_results.drop(columns="clf_params"), df_results["clf_params"].apply(lambda x: dict(eval(x))).apply(pd.Series)], axis=1 + ) + df_results.to_csv(output_path, index=False) + + def generate_summary(self, query_dict: Dict[str, List[str]], aggregate_func: Union[List[Callable], None] = None) -> List[pd.DataFrame]: + selected_columns = self.column_names + selected_columns.remove("no_repeat") + selected_columns.remove("metric_value") + agg_func_list = ["mean", "std"] if aggregate_func is None else ["mean", "std", *aggregate_func] + df_list = [] + for i in product(*query_dict.values()): + tmp_dict = dict(zip(query_dict.keys(), i)) + query = " & ".join(map(lambda x: f"{x[0]}=='{x[1]}'", tmp_dict.items())) + + gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + df = pd.concat([df.query(query) for df in gen]) + + group_df = df[[*selected_columns, "metric_value"]].groupby(by=selected_columns[:-1]).agg({"metric_value": agg_func_list}) + df_list.append(group_df) + + return df_list + + @property + def dataset_names(self) -> List[str]: + gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + return 
list(set(i for df in gen for i in df["dataset_name"].unique())) + + @property + def metric_names(self) -> List[str]: + gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + return list(set(i for df in gen for i in df["metric_name"].unique())) + + @property + def clf_names(self) -> List[str]: + gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + return list(set(i for df in gen for i in df["classifier"].unique())) + + @property + def resampling_methods(self) -> List[str]: + gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + return list(set(i for df in gen for i in df["resampling_method"].unique())) + + @property + def column_names(self) -> List[str]: + gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + return list(next(gen).columns) + + def _prepare_result( + self, + clf_data: Tuple[str, ClassifierMixin, Dict], + dataset_data: Tuple[str, pd.DataFrame], + resampler_data: Tuple[str, BaseSampler], + output_path: str, + n: int, + list_of_errors: List[str], + ) -> None: + try: + clf_name, clf, clf_params = clf_data + dataset_name, dataset = dataset_data + resampler_name, resampler = resampler_data + + tmp_clf = deepcopy(clf) + + X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] + + skf = StratifiedKFold(**self.__stratifiedkfold_params, random_state=self.__iter) + for train_index, test_index in skf.split(X, y): + X_train, X_test = X.loc[train_index, :], X.loc[test_index, :] + y_train, y_test = y[train_index], y[test_index] + + if resampler is not None: + X_train, y_train = resampler.fit_resample(X_train, y_train) + + results = [] + tmp_clf.fit(X_train, y_train) + y_pred = tmp_clf.predict(X_test) + + for metric, params in self.__metrics.items(): + results.append( + Result( + dataset_name, + clf_name, + resampler_name, + metric.__name__, + metric(y_test, y_pred, **params), + n, + clf_params, + ) + ) + df_results = pd.DataFrame(results) + + if self._output_path.exists(): + 
df_results.to_csv(output_path, mode="a", index=False, header=False) + else: + df_results.to_csv(output_path, index=False) + + except Exception as e: + list_of_errors.append(f"Raised exception '{e}' for {dataset_name=}, {resampler_name=} and {clf_name=}") + + def _get_dataset(self) -> Iterable[Tuple[str, pd.DataFrame]]: for dataset_path in self._config.datasets: path = Path(dataset_path) @@ -144,33 +177,24 @@ def _get_dataset(self) -> Tuple[str, pd.DataFrame]: else: raise Exception("Wrong dataset path, should be csv file or dir with csv files") - def _get_resampler(self) -> List[Tuple[str, BaseSampler]]: - for resampler, kwargs in self._config.resample_methods.items(): - if isinstance(resampler, str): - if resampler not in AnalysisPipeline._allowed_resampling: - raise ValueError( - "Unknown resample method: %s, expected to be one of %s" % (resampler, AnalysisPipeline._allowed_resampling) - ) - yield resampler, self.__resampling_methods[resampler](**kwargs) - else: - if not hasattr(resampler, "fit_resample"): - raise ValueError("Your resampler must implement fit_resample method") - yield self._get_name(resampler(**kwargs)) - - def _get_classifier(self) -> List[Tuple[str, ClassifierMixin, Dict]]: - for classifier, kwargs_list in self._config.classifiers.items(): - if isinstance(classifier, str): - if classifier not in AnalysisPipeline._allowed_classifiers: - raise ValueError( - "Unknown classifier: %s, expected to be one of %s" % (classifier, AnalysisPipeline._allowed_classifiers) - ) - for kwargs in kwargs_list: - yield classifier, self.__classifiers[classifier](**kwargs), kwargs - else: - if not hasattr(classifier, "fit") or not hasattr(classifier, "predict"): - raise ValueError("Your classifier must implement fit and predict methods") - for kwargs in kwargs_list: - yield *self._get_name(classifier(**kwargs)), kwargs + def _get_resampler(self, train_without_resampling: bool, dataset_name: str) -> Iterable[Tuple[str, Union[BaseSampler, None]]]: + for resampler, 
params_dict in self._config.resample_methods.items(): + if not hasattr(resampler, "fit_resample"): + raise ValueError("Your resampler must implement fit_resample method") + if dataset_name and "all" not in params_dict: + raise KeyError("Must define params for all datasets or for the specific dataset") + + params = params_dict.get(dataset_name, params_dict.get("all")) + yield self._get_name(resampler(**params)) + if train_without_resampling: + yield "Not defined", None + + def _get_classifier(self) -> Iterable[Tuple[str, ClassifierMixin, Dict]]: + for classifier, params_list in self._config.classifiers.items(): + if not hasattr(classifier, "fit") or not hasattr(classifier, "predict"): + raise ValueError("Your classifier must implement fit and predict methods") + for params in params_list: + yield *self._get_name(classifier(**params)), params def _get_name(self, estimator: Union[ClassifierMixin, BaseSampler]) -> Tuple[str, Union[ClassifierMixin, BaseSampler]]: return _name_estimators([estimator])[0] diff --git a/tests/datasets/test_analysis.py b/tests/datasets/test_analysis.py index 5d726d2..786ac57 100644 --- a/tests/datasets/test_analysis.py +++ b/tests/datasets/test_analysis.py @@ -19,7 +19,7 @@ def get_dummy_config(): "resample_methods": {}, "metrics": {}, "n_repeats": 2, - "train_test_split_kwargs": {}, + "train_test_split_params": {}, } @@ -51,7 +51,7 @@ def output_file(tmp_path): }, "metrics": {lambda x, y: (x, y): {}}, "n_repeats": 2, - "train_test_split_kwargs": dict(test_size=0.2), + "train_test_split_params": dict(test_size=0.2), } ], ) @@ -63,7 +63,7 @@ def test_config_from_dict(config_dict): assert config.resample_methods == config_dict["resample_methods"] assert config.metrics == config_dict["metrics"] assert config.n_repeats == config_dict["n_repeats"] - assert config.train_test_split_kwargs == config_dict["train_test_split_kwargs"] + assert config.train_test_split_params == config_dict["train_test_split_params"] @pytest.mark.parametrize( @@ -78,9 
+78,10 @@ def test_get_classifier(classifier, expected_name, expected_clf): pipeline = AnalysisPipeline(config) - clf_name, clf = next(pipeline._get_classifier()) + clf_name, clf, params = next(pipeline._get_classifier()) assert clf_name == expected_name assert isinstance(clf, expected_clf) + assert params == {} @pytest.mark.parametrize( @@ -95,7 +96,7 @@ def test_get_resampler(resampler, expected_name, expected_resampler): pipeline = AnalysisPipeline(config) - resampler_name, resampler = next(pipeline._get_resampler()) + resampler_name, resampler = next(pipeline._get_resampler(train_without_resampling=False)) assert resampler_name == expected_name assert isinstance(resampler, expected_resampler) @@ -126,18 +127,18 @@ def test_run_analysis(X_ecoc, y_ecoc, dataset_file, output_file): "resample_methods": dict(globalCS={"shuffle": True}), "metrics": {geometric_mean_score: {"correction": 0.005}, accuracy_score: {}}, "n_repeats": 2, - "train_test_split_kwargs": dict(test_size=0.2, random_state=42), + "train_test_split_params": dict(test_size=0.2, random_state=42), } config = Config.from_dict(config_dict) pipeline = AnalysisPipeline(config) - pipeline.run_analysis(output_file) + pipeline.run_analysis(output_file, train_without_resampling=True) result_df = pd.read_csv(output_file) assert (result_df["dataset_name"] == "dataset").all() assert (result_df["classifier"] == "tree").all() np.testing.assert_array_equal(result_df["metric_name"].unique(), ["geometric_mean_score", "accuracy_score"]) - np.testing.assert_array_almost_equal(result_df["metric_value"].unique(), [0.018803, 0.166667]) + np.testing.assert_array_almost_equal(result_df["metric_value"].unique(), [0.018803, 0.166667, 0.005, 0.0]) def test_get_dataset_wrong_path(): @@ -170,7 +171,7 @@ def test_get_resampler_wrong(wrong_resampler, expected_exception): pipeline = AnalysisPipeline(config) with pytest.raises(ValueError) as ex: - next(pipeline._get_resampler()) + 
next(pipeline._get_resampler(train_without_resampling=False)) assert ex.value.args[0] == expected_exception From 59f88928066856a42adcf102d321904baf467ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Fri, 6 Jan 2023 13:50:34 +0100 Subject: [PATCH 24/48] Final version --- examples/datasets/analysis.ipynb | 358 --------------- multi_imbalance/datasets/_data_loader.py | 2 + multi_imbalance/datasets/analysis.py | 559 +++++++++++++++++++---- multi_imbalance/datasets/helpers.py | 108 +++++ pyproject.toml | 2 + tests/datasets/test_analysis.py | 375 ++++++++++++--- tests/datasets/test_data_loader.py | 3 +- 7 files changed, 898 insertions(+), 509 deletions(-) delete mode 100644 examples/datasets/analysis.ipynb create mode 100644 multi_imbalance/datasets/helpers.py diff --git a/examples/datasets/analysis.ipynb b/examples/datasets/analysis.ipynb deleted file mode 100644 index 69a99d2..0000000 --- a/examples/datasets/analysis.ipynb +++ /dev/null @@ -1,358 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.gaussian_process import GaussianProcessClassifier\n", - "from sklearn.neighbors import KNeighborsClassifier\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "from imblearn.metrics import geometric_mean_score\n", - "from pathlib import Path\n", - "from sklearn.metrics import accuracy_score\n", - "from tempfile import NamedTemporaryFile\n", - "\n", - "from multi_imbalance.datasets.analysis import AnalysisPipeline, Config, Result\n", - "from multi_imbalance.datasets import load_datasets\n", - "from multi_imbalance.resampling.soup import SOUP\n", - "from multi_imbalance.resampling.spider import SPIDER3\n", - "from multi_imbalance.resampling.static_smote import StaticSMOTE\n", - "from multi_imbalance.resampling.global_cs import GlobalCS\n", - "from multi_imbalance.resampling.mdo import MDO\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "load_datasets(save_to_csv=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "cwd = Path.cwd()\n", - "\n", - "config = {\n", - " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"glass.csv\"],\n", - " \"classifiers\": {\n", - " DecisionTreeClassifier: [{\"max_depth\" : 100}, {}],\n", - " KNeighborsClassifier: [{\"n_neighbors\": 7}, {}],\n", - " GaussianProcessClassifier: [ {\"max_iter_predict\": 250}, {}]\n", - " },\n", - " \"resample_methods\": {\n", - " GlobalCS: {\"all\": {\"shuffle\": True}},\n", - " MDO: {\"all\": {\"k1_frac\": 0.3, \"maj_int_min\":{\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}}},\n", - " StaticSMOTE: {\"all\":{}},\n", - " SOUP: {\"all\" : {\"shuffle\": True}},\n", - " SPIDER3: {\"all\": {\"k\":5}}\n", - " },\n", - " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}},\n", - " \"n_repeats\": 1,\n", - " \"stratifiedkfold_params\": dict(n_splits=2, shuffle=True),\n", - "}\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[09.12.2022 20:06:30] ERROR AnalysisPipeline Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='decisiontreeclassifier'\n", - "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='decisiontreeclassifier'\n", - "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='kneighborsclassifier'\n", - "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='kneighborsclassifier'\n", - 
"Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='gaussianprocessclassifier'\n", - "Raised exception 'Expected n_neighbors <= n_samples, but n_samples = 5, n_neighbors = 6' for dataset_name='glass', resampler_name='staticsmote' and clf_name='gaussianprocessclassifier'\n" - ] - } - ], - "source": [ - "result_file = NamedTemporaryFile(suffix=\".csv\")\n", - "result_file.close()\n", - "c = Config.from_dict(config)\n", - "pipeline = AnalysisPipeline(c)\n", - "pipeline.run_analysis(result_file.name, train_without_resampling = True)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
metric_value
meanstdminmaxmedian
dataset_nameclassifierresampling_methodmetric_name
glassdecisiontreeclassifierNot definedgeometric_mean_score0.4238400.2195190.2240790.6329940.419144
globalcsgeometric_mean_score0.4606040.1737430.2353970.6413630.482827
mdogeometric_mean_score0.4964190.2076750.1988130.6708460.558009
soupgeometric_mean_score0.6299310.0188040.6079900.6491860.631274
spider3geometric_mean_score0.4543950.1551100.2312470.5833010.501516
kneighborsclassifierNot definedgeometric_mean_score0.1501990.0562440.0719680.2051630.161833
globalcsgeometric_mean_score0.6116710.0849480.5510140.7323820.581645
mdogeometric_mean_score0.3818560.1972940.2057910.6072160.357208
soupgeometric_mean_score0.5532340.0367440.5146610.5988350.549719
spider3geometric_mean_score0.4722300.0500680.4084030.5161810.482168
\n", - "
" - ], - "text/plain": [ - " metric_value \\\n", - " mean \n", - "dataset_name classifier resampling_method metric_name \n", - "glass decisiontreeclassifier Not defined geometric_mean_score 0.423840 \n", - " globalcs geometric_mean_score 0.460604 \n", - " mdo geometric_mean_score 0.496419 \n", - " soup geometric_mean_score 0.629931 \n", - " spider3 geometric_mean_score 0.454395 \n", - " kneighborsclassifier Not defined geometric_mean_score 0.150199 \n", - " globalcs geometric_mean_score 0.611671 \n", - " mdo geometric_mean_score 0.381856 \n", - " soup geometric_mean_score 0.553234 \n", - " spider3 geometric_mean_score 0.472230 \n", - "\n", - " \\\n", - " std \n", - "dataset_name classifier resampling_method metric_name \n", - "glass decisiontreeclassifier Not defined geometric_mean_score 0.219519 \n", - " globalcs geometric_mean_score 0.173743 \n", - " mdo geometric_mean_score 0.207675 \n", - " soup geometric_mean_score 0.018804 \n", - " spider3 geometric_mean_score 0.155110 \n", - " kneighborsclassifier Not defined geometric_mean_score 0.056244 \n", - " globalcs geometric_mean_score 0.084948 \n", - " mdo geometric_mean_score 0.197294 \n", - " soup geometric_mean_score 0.036744 \n", - " spider3 geometric_mean_score 0.050068 \n", - "\n", - " \\\n", - " min \n", - "dataset_name classifier resampling_method metric_name \n", - "glass decisiontreeclassifier Not defined geometric_mean_score 0.224079 \n", - " globalcs geometric_mean_score 0.235397 \n", - " mdo geometric_mean_score 0.198813 \n", - " soup geometric_mean_score 0.607990 \n", - " spider3 geometric_mean_score 0.231247 \n", - " kneighborsclassifier Not defined geometric_mean_score 0.071968 \n", - " globalcs geometric_mean_score 0.551014 \n", - " mdo geometric_mean_score 0.205791 \n", - " soup geometric_mean_score 0.514661 \n", - " spider3 geometric_mean_score 0.408403 \n", - "\n", - " \\\n", - " max \n", - "dataset_name classifier resampling_method metric_name \n", - "glass decisiontreeclassifier Not defined 
geometric_mean_score 0.632994 \n", - " globalcs geometric_mean_score 0.641363 \n", - " mdo geometric_mean_score 0.670846 \n", - " soup geometric_mean_score 0.649186 \n", - " spider3 geometric_mean_score 0.583301 \n", - " kneighborsclassifier Not defined geometric_mean_score 0.205163 \n", - " globalcs geometric_mean_score 0.732382 \n", - " mdo geometric_mean_score 0.607216 \n", - " soup geometric_mean_score 0.598835 \n", - " spider3 geometric_mean_score 0.516181 \n", - "\n", - " \n", - " median \n", - "dataset_name classifier resampling_method metric_name \n", - "glass decisiontreeclassifier Not defined geometric_mean_score 0.419144 \n", - " globalcs geometric_mean_score 0.482827 \n", - " mdo geometric_mean_score 0.558009 \n", - " soup geometric_mean_score 0.631274 \n", - " spider3 geometric_mean_score 0.501516 \n", - " kneighborsclassifier Not defined geometric_mean_score 0.161833 \n", - " globalcs geometric_mean_score 0.581645 \n", - " mdo geometric_mean_score 0.357208 \n", - " soup geometric_mean_score 0.549719 \n", - " spider3 geometric_mean_score 0.482168 " - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "d = {\"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"]}\n", - "import pandas as pd\n", - "\n", - "pd.concat(pipeline.generate_summary(d, [min, max]))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.9.2 ('.test')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.2" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "354fe7bbb08ce19365ae7e9dc9251db0b8655780cc27fe67a2a3ffff5cc90304" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/multi_imbalance/datasets/_data_loader.py 
b/multi_imbalance/datasets/_data_loader.py index fa9f29a..dadf033 100644 --- a/multi_imbalance/datasets/_data_loader.py +++ b/multi_imbalance/datasets/_data_loader.py @@ -74,6 +74,8 @@ def load_datasets(data_home: str = DATA_HOME_BASIC, save_to_csv: bool = False) - """ extracted_dir = join(data_home, "extracted") csv_dir = join(data_home, "csv") + makedirs(csv_dir, exist_ok=True) + datasets = OrderedDict() filter_data_ = MAP_NAME_ID.keys() diff --git a/multi_imbalance/datasets/analysis.py b/multi_imbalance/datasets/analysis.py index 10ef771..c1a765f 100644 --- a/multi_imbalance/datasets/analysis.py +++ b/multi_imbalance/datasets/analysis.py @@ -1,56 +1,64 @@ from copy import deepcopy -from dataclasses import dataclass +import json from pathlib import Path -from typing import Callable, Dict, Iterable, List, Tuple, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union +import click import pandas as pd from sklearn.base import ClassifierMixin -from sklearn.model_selection import StratifiedKFold +from sklearn.model_selection import StratifiedKFold, train_test_split from imblearn.base import BaseSampler from sklearn.pipeline import _name_estimators import logging from itertools import product +import numpy as np +from multi_imbalance.datasets.helpers import Result, Config, import_from_string -logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s %(name)s %(message)s", datefmt="%d.%m.%Y %H:%M:%S") +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s %(name)s %(message)s", + datefmt="%d.%m.%Y %H:%M:%S", +) -@dataclass -class Config: - datasets: List[str] - classifiers: Dict[ClassifierMixin, List[Dict]] - resample_methods: Dict[BaseSampler, Dict[str, Dict]] - metrics: Dict[Callable, Dict] - n_repeats: int - stratifiedkfold_params: Dict - - @classmethod - def from_dict(cls, config: Dict) -> "Config": - return cls(**config) - +class AnalysisPipeline: + """ + This is a class for an analysis 
pipeline. + The __init__() method initializes the object, taking a Config object containing the pipeline configuration and an optional csv_path to the results CSV file. + The run_analysis() method runs the analysis on a set of classifiers, datasets, and resamplers, saving the results to the specified output_path. + The explode_clf_params() method takes a input_path to a CSV file and explodes the clf_params column into individual columns, saving the result to output_path. + Finally, the generate_summary() method takes a query_dict specifying which results to include in the summary and an optional aggregate_func to apply to the metric_value column + and returns a list of Pandas DataFrames containing the summary of the results. + """ -@dataclass -class Result: - dataset_name: str - classifier: str - resampling_method: str - metric_name: str - metric_value: float - no_repeat: int - clf_params: Dict + def __init__(self, config: Config) -> None: + """ + Initializes the AnalysisPipeline object. + :param: config: + Config object containing the configuration for the pipeline. + """ + __split_funcs = {"Kfold": self.__KFold_split, "train_test": self.__train_test_split} -class AnalysisPipeline: - def __init__(self, config: Config) -> None: self.__logger = logging.getLogger("AnalysisPipeline") self._config = config self.__metrics = self._config.metrics self.__n_repeats = self._config.n_repeats - self.__stratifiedkfold_params = self._config.stratifiedkfold_params + self.__split_params = self._config.split_method[1] + self.__split_func = __split_funcs[self._config.split_method[0]] self.__iter = 0 self.__chunksize = 10000 def run_analysis(self, output_path: str, train_without_resampling: bool) -> None: - self._output_path = Path(output_path) + """ + This function runs a specified analysis on a set of classifiers, datasets, and resamplers. The results of the analysis are saved to the specified output path. 
+ + :param output_path: + str, the location where the results of the analysis will be saved as a CSV file + :param train_without_resampling: + bool, if `True`, the analysis will be run without using resampling on the training set + """ + self._csv_path = Path(output_path) list_of_errors = [] for clf_data, n, dataset_data in product(self._get_classifier(), range(self.__n_repeats), self._get_dataset()): @@ -62,56 +70,141 @@ def run_analysis(self, output_path: str, train_without_resampling: bool) -> None if list_of_errors: self.__logger.error("\n".join(list_of_errors)) - def explode_clf_params(self, input_path: str, output_path: str) -> None: - df_results = pd.read_csv(input_path) - df_results = pd.concat( - [df_results.drop(columns="clf_params"), df_results["clf_params"].apply(lambda x: dict(eval(x))).apply(pd.Series)], axis=1 - ) - df_results.to_csv(output_path, index=False) - - def generate_summary(self, query_dict: Dict[str, List[str]], aggregate_func: Union[List[Callable], None] = None) -> List[pd.DataFrame]: - selected_columns = self.column_names + @staticmethod + def generate_summary( + query_dict: Dict[str, List[str]], + csv_path: str, + save_to_csv: bool = False, + save_path: Optional[str] = None, + aggregate_func: Optional[List[Callable]] = None, + ) -> List[pd.DataFrame]: + """ + Generate summary of analysis results based on specified query parameters. + + This method generates a summary of the results of an analysis based on the specified query parameters. The `csv_path` + should be the path to the CSV file containing the results of the analysis. By default, the mean and std functions + are used for aggregation. If `save_to_csv` is `True`, the summary will be saved to a CSV file. The optional + `aggregate_func` parameter allows specifying a list of functions that will be applied to the `metric_value` column + of the results to generate the summary. 
+ + :param query_dict: + Dict[str, List[str]], a dictionary that specifies the values of different columns in the results to include in the summary + :param csv_path: + str, the path to the CSV file containing the results of the analysis + :param save_to_csv: + bool, optional, if `True`, the summary will be saved to a CSV file + :param save_path: + str, optional, the location where the summary csv files should be saved + :param aggregate_func: + Optional[List[Callable]], optional, a list of functions that will be applied to the `metric_value` column of the results to generate the summary + :return: + List[pd.DataFrame], a list of Pandas DataFrames containing the summary of the results of the analysis + """ + chunksize = 1000 + gen = pd.read_csv(csv_path, chunksize=chunksize) + selected_columns = list(next(gen).columns) selected_columns.remove("no_repeat") selected_columns.remove("metric_value") agg_func_list = ["mean", "std"] if aggregate_func is None else ["mean", "std", *aggregate_func] df_list = [] for i in product(*query_dict.values()): - tmp_dict = dict(zip(query_dict.keys(), i)) - query = " & ".join(map(lambda x: f"{x[0]}=='{x[1]}'", tmp_dict.items())) + df = AnalysisPipeline._search_df_by_query(query_dict, combination=i, csv_path=csv_path, chunksize=chunksize) - gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) - df = pd.concat([df.query(query) for df in gen]) - - group_df = df[[*selected_columns, "metric_value"]].groupby(by=selected_columns[:-1]).agg({"metric_value": agg_func_list}) + group_df = df[[*selected_columns, "metric_value"]].groupby(by=selected_columns).agg({"metric_value": agg_func_list}) + if save_to_csv: + if save_path is None: + save_path = Path(csv_path).parent + group_df.reset_index().to_csv(Path(save_path) / ("_".join(i) + ".csv"), index=False) df_list.append(group_df) return df_list + @staticmethod + def generate_posthoc_analysis( + query_dict: Dict[str, List[str]], + csv_path: str, + posthoc_func_list: 
List[Tuple[Callable, Dict]], + save_to_csv: bool = False, + save_path: Optional[str] = None, + ) -> List[pd.DataFrame]: + """ + Generates a posthoc analysis of the results of the analysis based on the specified query parameters and posthoc functions. + `csv_path` should be the path to the CSV file containing the results of the analysis. + + :param query_dict: + Dict[str, List[str]], a dictionary that specifies the values of different columns in the results to include in the posthoc analysis + :param csv_path: + str, the path to the CSV file containing the results of the analysis + :param posthoc_func_list: + List[Tuple[Callable, Dict]], a list of tuples containing the posthoc functions and their parameters to be applied to the results + :param save_to_csv: + bool, optional, if `True`, the posthoc analysis will be saved to a CSV file + :param save_path: + str, optional, the location where the summary csv files should be saved + :return: + List[pd.DataFrame], a list of Pandas DataFrames containing the posthoc analysis of the results + """ + chunksize = 1000 + gen = pd.read_csv(csv_path, chunksize=chunksize) + selected_columns = list(next(gen).columns) + selected_columns.remove("no_repeat") + selected_columns.remove("metric_value") + + df_list = [] + for i in product(*query_dict.values()): + df = AnalysisPipeline._search_df_by_query(query_dict, combination=i, csv_path=csv_path, chunksize=chunksize) + + for posthoc_func, params in posthoc_func_list: + posthoc_df = posthoc_func(df, "metric_value", "resampling_method", **params) + df_name = posthoc_func.__name__ + "_" + "_".join(i) + posthoc_df.columns.name = df_name + if save_to_csv: + if save_path is None: + save_path = Path(csv_path).parent + df_path = save_path / (df_name + ".csv") + posthoc_df.to_csv(Path(df_path)) + df_list.append(posthoc_df) + + return df_list + @property def dataset_names(self) -> List[str]: - gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + """Returns a list of unique 
dataset names in the CSV file""" + gen = pd.read_csv(self._csv_path, chunksize=self.__chunksize) return list(set(i for df in gen for i in df["dataset_name"].unique())) @property def metric_names(self) -> List[str]: - gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + """Returns a list of unique metric names in the CSV file""" + gen = pd.read_csv(self._csv_path, chunksize=self.__chunksize) return list(set(i for df in gen for i in df["metric_name"].unique())) @property def clf_names(self) -> List[str]: - gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + """Returns a list of unique classifier names in the CSV file""" + gen = pd.read_csv(self._csv_path, chunksize=self.__chunksize) return list(set(i for df in gen for i in df["classifier"].unique())) @property def resampling_methods(self) -> List[str]: - gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + """Returns a list of unique resample method names in the CSV file""" + gen = pd.read_csv(self._csv_path, chunksize=self.__chunksize) return list(set(i for df in gen for i in df["resampling_method"].unique())) @property def column_names(self) -> List[str]: - gen = pd.read_csv(str(self._output_path), chunksize=self.__chunksize) + """Returns a list of column names in the CSV file""" + gen = pd.read_csv(self._csv_path, chunksize=self.__chunksize) return list(next(gen).columns) + @staticmethod + def _search_df_by_query(query_dict: Dict, combination: Tuple[str, ...], csv_path: str, chunksize: int = 10000) -> pd.DataFrame: + query_dict = dict(zip(query_dict.keys(), combination)) + query = " & ".join(map(lambda x: f"{x[0]}=='{x[1]}'", query_dict.items())) + + gen = pd.read_csv(csv_path, chunksize=chunksize) + return pd.concat([df.query(query) for df in gen]) + def _prepare_result( self, clf_data: Tuple[str, ClassifierMixin, Dict], @@ -121,75 +214,253 @@ def _prepare_result( n: int, list_of_errors: List[str], ) -> None: - try: - clf_name, clf, clf_params = clf_data - 
dataset_name, dataset = dataset_data - resampler_name, resampler = resampler_data - - tmp_clf = deepcopy(clf) - - X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] - - skf = StratifiedKFold(**self.__stratifiedkfold_params, random_state=self.__iter) - for train_index, test_index in skf.split(X, y): - X_train, X_test = X.loc[train_index, :], X.loc[test_index, :] - y_train, y_test = y[train_index], y[test_index] - - if resampler is not None: - X_train, y_train = resampler.fit_resample(X_train, y_train) - - results = [] - tmp_clf.fit(X_train, y_train) - y_pred = tmp_clf.predict(X_test) - - for metric, params in self.__metrics.items(): - results.append( - Result( - dataset_name, - clf_name, - resampler_name, - metric.__name__, - metric(y_test, y_pred, **params), - n, - clf_params, - ) - ) - df_results = pd.DataFrame(results) - - if self._output_path.exists(): - df_results.to_csv(output_path, mode="a", index=False, header=False) - else: - df_results.to_csv(output_path, index=False) - - except Exception as e: - list_of_errors.append(f"Raised exception '{e}' for {dataset_name=}, {resampler_name=} and {clf_name=}") + """ + This method prepares the results of running a classifier on a dataset which is resampling by resampler. It will compute the specified metrics for each repeat. 
+ + :param clf_data: + A tuple containing the name of the classifier, the classifier and a dictionary of classifier parameters + :param dataset_data: + A tuple containing the name of the dataset and the dataset + :param resampler_data: + A tuple containing the name of the resampler and the resampler + :param output_path: + str, the path to the output CSV file + :param n: + int, the current number of repeat + :param list_of_errors: + List[str], a list of error messages to append to if an exception is raised + """ + clf_name, clf, clf_params = clf_data + dataset_name, dataset = dataset_data + resampler_name, resampler = resampler_data + tmp_clf = deepcopy(clf) + metric_to_check = {} + for metric in self.__metrics.keys(): + if not self._check_if_exist_in_csv(output_path, dataset_name, clf_name, resampler_name, metric.__name__, n, clf_params): + metric_to_check.update({metric: self.__metrics[metric]}) + + if metric_to_check != {}: + try: + X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1] + self.__split_func( + X, + y, + resampler, + tmp_clf, + metrics=metric_to_check, + dataset_name=dataset_name, + clf_name=clf_name, + resampler_name=resampler_name, + n=n, + clf_params=clf_params, + output_path=output_path, + ) + + except Exception as e: + list_of_errors.append(f"Raised exception: '{e}' for {dataset_name=}, {resampler_name=} and {clf_name=}") + + def __KFold_split(self, X: np.ndarray, y: np.ndarray, resampler: BaseSampler, clf: ClassifierMixin, **kwargs) -> None: + """ + This method performs a K-Fold split of the data and computes the specified metrics on the test set for each split. 
+ + :param X: + numpy.ndarray, the feature matrix + :param y: + numpy.ndarray, the target vector + :param resampler: + BaseSampler, the resampling method to use + :param clf: + ClassifierMixin, the classifier to be evaluated + :param kwargs: + Additional keyword arguments, including the list of metrics to be computed, the name of the dataset, classifier, and resampler, + the current number of repeat, and the parameters of the classifier + """ + skf = StratifiedKFold(**self.__split_params, random_state=self.__iter) + for train_index, test_index in skf.split(X, y): + X_train, X_test = X.loc[train_index, :], X.loc[test_index, :] + y_train, y_test = y[train_index], y[test_index] + + if resampler is not None: + X_train, y_train = resampler.fit_resample(X_train, y_train) + + results = [] + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + self._processing_result(results=results, y_test=y_test, y_pred=y_pred, **kwargs) + + def __train_test_split(self, X: np.ndarray, y: np.ndarray, resampler: BaseSampler, clf: ClassifierMixin, **kwargs): + """ + This method performs a train-test split of the data and computes the specified metrics on the test set. 
+ + :param X: + numpy.ndarray, the feature matrix + :param y: + numpy.ndarray, the target vector + :param resampler: + BaseSampler, the resampling method to use + :param clf: + ClassifierMixin, the classifier to be evaluated + :param kwargs: + Additional keyword arguments, including the list of metrics to be computed, the name of the dataset, classifier, and resampler, + the current number of repeat, and the parameters of the classifier + """ + X_train, X_test, y_train, y_test = train_test_split( + X, + y, + random_state=self.__iter, + **self.__split_params, + ) + if resampler is not None: + X_train, y_train = resampler.fit_resample(X_train, y_train) + + results = [] + clf.fit(X_train, y_train) + y_pred = clf.predict(X_test) + self._processing_result(results=results, y_test=y_test, y_pred=y_pred, **kwargs) + + def _processing_result( + self, + results: List, + metrics: Dict[Callable, Dict], + dataset_name: str, + clf_name: str, + resampler_name: str, + y_test: np.ndarray, + y_pred: np.ndarray, + n: int, + clf_params: Dict, + output_path: str, + ): + """ + This method processes the results of applying a classifier to a dataset using a resampler. 
+ + :param results: + List, a list to append the computed results to + :param metrics: + Dict[Callable, Dict], a dictionary of metrics to be computed, along with any additional parameters for each metric + :param dataset_name: + str, the name of the dataset + :param clf_name: + str, the name of the classifier + :param resampler_name: + str, the name of the resampler + :param y_test: + numpy.ndarray, the target vector of the test set + :param y_pred: + numpy.ndarray, the predicted target vector of the test set + :param n: + int, the current number of repeat + :param clf_params: + Dict, the parameters of the classifier + :param output_path: + str, the path to the output CSV file + + """ + for metric, params in metrics.items(): + results.append( + Result( + dataset_name=dataset_name, + classifier=clf_name, + resampling_method=resampler_name, + metric_name=metric.__name__, + metric_value=metric(y_test, y_pred, **params), + no_repeat=n, + clf_params=clf_params, + ) + ) + df_results = pd.DataFrame(results) + + if self._csv_path.exists(): + df_results.to_csv(output_path, mode="a", index=False, header=False) + else: + df_results.to_csv(output_path, index=False) + + def _check_if_exist_in_csv( + self, + path: str, + dataset_name: str, + clf_name: str, + resampler_name: str, + metric_name: str, + n: int, + clf_params: str, + ) -> bool: + """ + Check if a combination of parameters already exists in a CSV file. 
+ + :param path: + str, The path to the CSV file + :param dataset_name: + str, The name of the dataset + :param clf_name: + str, The name of the classifier + :param resampler_name: + str, The name of the resampler + :param metric_name: + str, The name of the metric + :param n: + int, Current number of repeat + :param clf_params: + str, The classifier parameters + :return: + bool, True if the results already exist, False otherwise + """ + query = ( + f"dataset_name=='{dataset_name}' & classifier=='{clf_name}' & resampling_method=='{resampler_name}'" + f"""& metric_name=='{metric_name}' & clf_params=="{clf_params}" & no_repeat=={n}""" + ) + if Path(path).exists(): + gen = pd.read_csv(path, chunksize=self.__chunksize) + return any([not df.query(query).empty for df in gen]) + return False def _get_dataset(self) -> Iterable[Tuple[str, pd.DataFrame]]: + """ + This method retrieves the datasets specified in the configuration object. + + :return: + Iterable[Tuple[str, pd.DataFrame]], An iterable of tuples containing the dataset name and the dataset + """ for dataset_path in self._config.datasets: path = Path(dataset_path) if path.is_file() and path.suffix == ".csv": - yield path.stem, pd.read_csv(str(path)) + yield path.stem, pd.read_csv(path) elif path.is_dir(): dataset_dir = path for path in dataset_dir.glob("**/*.csv"): - yield path.stem, pd.read_csv(str(path)) + yield path.stem, pd.read_csv(path) else: raise Exception("Wrong dataset path, should be csv file or dir with csv files") def _get_resampler(self, train_without_resampling: bool, dataset_name: str) -> Iterable[Tuple[str, Union[BaseSampler, None]]]: - for resampler, params_dict in self._config.resample_methods.items(): + """ + This method retrieves the resamplers specified in the configuration object. 
+ + :param train_without_resampling: + bool, A flag indicating whether to include an option to not use a resampler + :param dataset_name: + str, The name of the dataset for which use specific resampler configuration + + :return: + Iterable[Tuple[str, Union[BaseSampler, None]]], An iterable of tuples containing the resampler name and the resampler + """ + for resampler, params_dict in self._config.resampling_methods.items(): if not hasattr(resampler, "fit_resample"): raise ValueError("Your resampler must implement fit_resample method") - if dataset_name and "all" not in params_dict: - raise KeyError("Must define params for all datasets or for the specific dataset") + if dataset_name and "default" not in params_dict: + raise KeyError("Must define default params for all datasets or for the specific dataset") - params = params_dict.get(dataset_name, params_dict.get("all")) + params = params_dict.get(dataset_name, params_dict.get("default")) yield self._get_name(resampler(**params)) if train_without_resampling: yield "Not defined", None def _get_classifier(self) -> Iterable[Tuple[str, ClassifierMixin, Dict]]: + """This method retrieves the classifiers specified in the configuration object. 
+ + :return: + Iterable[Tuple[str, ClassifierMixin, Dict]], An iterable of tuples containing the classifier name, the classifier and a dictionary of classifier parameters""" for classifier, params_list in self._config.classifiers.items(): if not hasattr(classifier, "fit") or not hasattr(classifier, "predict"): raise ValueError("Your classifier must implement fit and predict methods") @@ -198,3 +469,95 @@ def _get_classifier(self) -> Iterable[Tuple[str, ClassifierMixin, Dict]]: def _get_name(self, estimator: Union[ClassifierMixin, BaseSampler]) -> Tuple[str, Union[ClassifierMixin, BaseSampler]]: return _name_estimators([estimator])[0] + + +@click.command() +@click.argument("output_path") +@click.option( + "--run-analysis", + is_flag=True, + help="Option specifying whether it should be run analysis pipeline", +) +@click.option( + "--summary", + is_flag=True, + help="Option specifying whether it should be run summary", +) +@click.option( + "--posthoc-analysis", + is_flag=True, + help="Option specifying whether it should be run posthoc analysis", +) +@click.option("--config-json", help="Path to json file which contain config for pipeline analysis") +@click.option("--query-json", help="Path to json file which contain query dict for generating summary") +@click.option("--posthoc-query-json", help="Path to json file which contain query dict for posthoc analysis") +@click.option( + "--aggregate-json", help="Optional, path to json file which contain paths (in list) to aggregate functions, e.g. ['numpy.max', ...]" +) +@click.option( + "--posthoc-func-json", + help="Path to json file which contain dict with paths to posthoc analysis functions and their params, e.g. 
{'scikit_posthoc.posthoc_dunn':{}}", +) +@click.option( + "--train-without-resampling", + is_flag=True, + help="Option specifying if the analysis would be run without using resampling", +) +@click.option( + "--save-to-csv", + is_flag=True, + help="Option defines if results from summary should be save to csv", +) +def main( + output_path, + run_analysis, + summary, + posthoc_analysis, + config_json, + query_json, + posthoc_query_json, + aggregate_json, + posthoc_func_json, + train_without_resampling, + save_to_csv, +): + """ + This function helps to use pipeline analysis, summary and posthoc tests by CLI. Output path is path to result csv file from analysis pipeline. + """ + print("Start") + if run_analysis: + print("Run analysis pipeline") + config = Config.from_json(config_json) + pipeline = AnalysisPipeline(config) + pipeline.run_analysis(output_path, train_without_resampling) + + if summary: + print("Run generate summary") + with open(query_json, "r") as f: + query_dict = json.load(f) + + aggregate_func = [] + if aggregate_json is not None: + with open(aggregate_json, "r") as f: + aggregate_func_paths = json.load(f) + aggregate_func = list(map(import_from_string, aggregate_func_paths)) + + AnalysisPipeline.generate_summary(query_dict, output_path, save_to_csv, aggregate_func=aggregate_func) + + if posthoc_analysis: + print("Run generate posthoc analysis") + + with open(posthoc_query_json, "r") as f: + query_dict = json.load(f) + + with open(posthoc_func_json, "r") as f: + posthoc_func_paths = json.load(f) + posthoc_func = [[import_from_string(func_path), params] for func_path, params in posthoc_func_paths.items()] + + AnalysisPipeline.generate_posthoc_analysis(query_dict, output_path, posthoc_func_list=posthoc_func, save_to_csv=save_to_csv) + + print("Done") + + +if __name__ == "__main__": # pragma no cover + main() diff --git a/multi_imbalance/datasets/helpers.py b/multi_imbalance/datasets/helpers.py new file mode 100644 index 0000000..88832d6 --- /dev/null +++ 
b/multi_imbalance/datasets/helpers.py @@ -0,0 +1,108 @@ +from dataclasses import dataclass +import importlib +import json +from typing import Callable, Dict, List, Tuple, Union +from sklearn.base import ClassifierMixin +from imblearn.base import BaseSampler + + +def import_from_string(cls_path: str) -> Union[BaseSampler, ClassifierMixin, Callable]: + module_name, class_name = cls_path.rsplit(".", 1) + module = importlib.import_module(module_name) + return getattr(module, class_name) + + +@dataclass +class Config: + """A class representing the configuration for an analysis pipeline. + + Attributes: + ---------- + datasets: A list of paths to the dataset csv files to use in the analysis pipeline. + classifiers: A dictionary mapping classifier objects to lists of dictionaries containing the hyperparameters to use for each classifier. + resampling_methods: A dictionary mapping resampling objects to dictionaries keyed by `default` or a dataset name, each holding the hyperparameters to use for that resampling method. + metrics: A dictionary mapping metric functions to dictionaries of hyperparameters to use for each metric. + n_repeats: The number of times to repeat the experiment for datasets. + split_method: A two-element list: the name of the split method followed by a dictionary of additional parameters. + There are two options to choose: `Kfold` (StratifiedKFold) and `train_test` (train_test_split). + """ + + datasets: List[str] + classifiers: Dict[ClassifierMixin, List[Dict]] + resampling_methods: Dict[BaseSampler, Dict[str, Dict]] + metrics: Dict[Callable, Dict] + n_repeats: int + split_method: List[Tuple[str, Dict]] + + @classmethod + def from_dict(cls, config: Dict) -> "Config": + """Load configuration from a dict. + + :param: config: + The dict containing the configuration. + + :return: + A Config object representing the configuration from dict. 
+ """ + return cls(**config) + + @classmethod + def from_json(cls, json_path: str) -> "Config": + """Load configuration from a JSON file. 
+ + :param: json_path: + The path to the JSON file containing the configuration to load. + + :return: + A Config object representing the configuration from the JSON file. + """ + with open(json_path, "r") as f: + dict_config = json.load(f) + + tmp_dict = {} + for clf_path, clf_params in dict_config["classifiers"].items(): + tmp_dict[import_from_string(clf_path)] = clf_params + + dict_config["classifiers"].clear() + dict_config["classifiers"].update(tmp_dict) + tmp_dict.clear() + + for resample_path, resample_params in dict_config["resampling_methods"].items(): + tmp_dict[import_from_string(resample_path)] = resample_params + + dict_config["resampling_methods"].clear() + dict_config["resampling_methods"].update(tmp_dict) + tmp_dict.clear() + + for metric_path, metric_params in dict_config["metrics"].items(): + tmp_dict[import_from_string(metric_path)] = metric_params + + dict_config["metrics"].clear() + dict_config["metrics"].update(tmp_dict) + tmp_dict.clear() + + return cls(**dict_config) + + +@dataclass +class Result: + """ + Result class is used to store the results of a model evaluation in the analysis pipeline. + + Attributes: + ---------- + metric_name (str): The name of the evaluation metric. + classifier (str): The name of the classifier used. + dataset_name (str): The name of the dataset used. + resampling_method (str): The method used for resampling the data. + metric_value (float): The value of the evaluation metric. + no_repeat (int): The number of times the model was trained and evaluated. 
+ clf_params (Dict): The parameters used for the classifier.""" + + metric_name: str + classifier: str + dataset_name: str + resampling_method: str + metric_value: float + no_repeat: int + clf_params: Dict diff --git a/pyproject.toml b/pyproject.toml index 59165d4..9ee1317 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ dependencies = [ "imbalanced-learn~=0.9.1", "seaborn~=0.12.1", "matplotlib~=3.6.2", + "click~=8.1.3", + "scikit-posthocs~=0.7.0" ] [project.optional-dependencies] diff --git a/tests/datasets/test_analysis.py b/tests/datasets/test_analysis.py index 786ac57..d60aafd 100644 --- a/tests/datasets/test_analysis.py +++ b/tests/datasets/test_analysis.py @@ -1,13 +1,15 @@ +import json +from pathlib import Path +from click.testing import CliRunner import pytest from sklearn.metrics import accuracy_score from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import LinearRegression import pandas as pd from imblearn.metrics import geometric_mean_score -import numpy as np +from scikit_posthocs import posthoc_dunn - -from multi_imbalance.datasets.analysis import AnalysisPipeline, Config +from multi_imbalance.datasets.analysis import AnalysisPipeline, Config, main from multi_imbalance.resampling.global_cs import GlobalCS from multi_imbalance.resampling.soup import SOUP @@ -16,10 +18,10 @@ def get_dummy_config(): return { "datasets": [], "classifiers": {}, - "resample_methods": {}, + "resampling_methods": {}, "metrics": {}, "n_repeats": 2, - "train_test_split_params": {}, + "split_method": ["train_test", {}], } @@ -28,47 +30,144 @@ def dataset_file(tmp_path): filename = tmp_path / "dataset.csv" filename.touch() - return str(filename) + return filename @pytest.fixture def output_file(tmp_path): filename = tmp_path / "output.csv" - return str(filename) + return filename + + +@pytest.fixture +def output_csv(tmp_path): + filename = tmp_path / "output.csv" + filename.touch() + columns = ["metric_name", "classifier", "dataset_name", 
"resampling_method", "metric_value", "no_repeat", "clf_params"] + data = [ + ["geometric_mean_score", "decisiontreeclassifier", "glass", "globalcs", 0.5, 0, {}], + ["geometric_mean_score", "decisiontreeclassifier", "glass", "globalcs", 0.5, 1, {}], + ["geometric_mean_score", "decisiontreeclassifier", "glass", "globalcs", 1, 2, {}], + ["geometric_mean_score", "decisiontreeclassifier", "glass", "globalcs", 1, 3, {}], + ] + df = pd.DataFrame(data, columns=columns) + df.to_csv(filename, index=False) + return filename + + +@pytest.fixture +def run_analysis_config(dataset_file): + return { + "datasets": [dataset_file], + "classifiers": {DecisionTreeClassifier: [{"max_depth": 30}]}, + "resampling_methods": {GlobalCS: {"default": {"shuffle": True}}}, + "metrics": {geometric_mean_score: {"correction": 0.005}, accuracy_score: {}}, + "n_repeats": 2, + "split_method": ["Kfold", dict(n_splits=2, shuffle=True)], + } + + +@pytest.fixture +def run_analysis_config_json(dataset_file, tmp_path): + config = { + "datasets": [str(dataset_file)], + "classifiers": {"sklearn.tree.DecisionTreeClassifier": [{"max_depth": 30}]}, + "resampling_methods": {"multi_imbalance.resampling.global_cs.GlobalCS": {"default": {"shuffle": True}}}, + "metrics": {"imblearn.metrics.geometric_mean_score": {"correction": 0.005}, "sklearn.metrics.accuracy_score": {}}, + "n_repeats": 2, + "split_method": ["Kfold", dict(n_splits=2, shuffle=True)], + } + json_path = tmp_path / "config.json" + with open(json_path, "w") as f: + json.dump(config, f) + + return json_path + + +@pytest.fixture +def query_dict(): + return { + "classifier": ["decisiontreeclassifier"], + "metric_name": ["geometric_mean_score"], + "dataset_name": ["dataset"], + "resampling_method": ["globalcs"], + } + + +@pytest.fixture +def query_dict_json(query_dict, tmp_path): + json_path = tmp_path / "query.json" + with open(json_path, "w") as f: + json.dump(query_dict, f) + + return json_path + + +@pytest.fixture +def prepare_dataset_file(dataset_file, 
X_ecoc, y_ecoc): + df = pd.DataFrame(X_ecoc, columns=["X1", "X2", "X3"]) + df["y"] = y_ecoc + df.to_csv(dataset_file, index=False) + + +@pytest.fixture +def config_dict(): + return { + "datasets": ["path/to/data"], + "classifiers": { + "tree": [{}], + }, + "resampling_methods": { + "globalCS": {}, + }, + "metrics": {lambda x, y: (x, y): {}}, + "n_repeats": 2, + "split_method": ["train_test", {}], + } + + +@pytest.fixture +def config_json(tmp_path): + config_dict = { + "datasets": [], + "classifiers": {"sklearn.tree.DecisionTreeClassifier": [{}]}, + "resampling_methods": {"multi_imbalance.resampling.global_cs.GlobalCS": {"default": {}}}, + "metrics": {"imblearn.metrics.geometric_mean_score": {}}, + "n_repeats": 2, + "split_method": ["train_test", {}], + } + json_path = tmp_path / "config.json" + with open(json_path, "w") as f: + json.dump(config_dict, f) + + return json_path -@pytest.mark.parametrize( - "config_dict", - [ - { - "datasets": ["path/to/data"], - "classifiers": { - "tree": [{}], - }, - "resample_methods": { - "globalCS": {}, - }, - "metrics": {lambda x, y: (x, y): {}}, - "n_repeats": 2, - "train_test_split_params": dict(test_size=0.2), - } - ], -) def test_config_from_dict(config_dict): config = Config.from_dict(config_dict) assert config.datasets == config_dict["datasets"] assert config.classifiers == config_dict["classifiers"] - assert config.resample_methods == config_dict["resample_methods"] + assert config.resampling_methods == config_dict["resampling_methods"] assert config.metrics == config_dict["metrics"] assert config.n_repeats == config_dict["n_repeats"] - assert config.train_test_split_params == config_dict["train_test_split_params"] + assert config.split_method == config_dict["split_method"] + + +def test_config_from_json(config_json): + config = Config.from_json(config_json) + for clf, clf_params in config.classifiers.items(): + for param in clf_params: + assert isinstance(clf(**param), DecisionTreeClassifier) + + for resample, 
resample_params in config.resampling_methods.items(): + assert isinstance(resample(**resample_params["default"]), GlobalCS) @pytest.mark.parametrize( "classifier, expected_name, expected_clf", - [("tree", "tree", DecisionTreeClassifier), (LinearRegression, "linearregression", LinearRegression)], + [(DecisionTreeClassifier, "decisiontreeclassifier", DecisionTreeClassifier), (LinearRegression, "linearregression", LinearRegression)], ) def test_get_classifier(classifier, expected_name, expected_clf): config_dict = get_dummy_config() @@ -86,17 +185,17 @@ def test_get_classifier(classifier, expected_name, expected_clf): @pytest.mark.parametrize( "resampler, expected_name, expected_resampler", - [("globalCS", "globalCS", GlobalCS), (SOUP, "soup", SOUP)], + [(GlobalCS, "globalcs", GlobalCS), (SOUP, "soup", SOUP)], ) def test_get_resampler(resampler, expected_name, expected_resampler): config_dict = get_dummy_config() - config_dict["resample_methods"].update({resampler: {}}) + config_dict["resampling_methods"].update({resampler: {"default": {}}}) config = Config.from_dict(config_dict) pipeline = AnalysisPipeline(config) - resampler_name, resampler = next(pipeline._get_resampler(train_without_resampling=False)) + resampler_name, resampler = next(pipeline._get_resampler(train_without_resampling=False, dataset_name="dataset")) assert resampler_name == expected_name assert isinstance(resampler, expected_resampler) @@ -117,28 +216,165 @@ def test_get_dataset(data, columns, tmp_path, dataset_file): pd.testing.assert_frame_equal(df, expected_df) -def test_run_analysis(X_ecoc, y_ecoc, dataset_file, output_file): - df = pd.DataFrame(X_ecoc, columns=["X1", "X2", "X3"]) - df["y"] = y_ecoc - df.to_csv(dataset_file, index=False) - config_dict = { - "datasets": [dataset_file], - "classifiers": dict(tree=[{"max_depth": 30}]), - "resample_methods": dict(globalCS={"shuffle": True}), - "metrics": {geometric_mean_score: {"correction": 0.005}, accuracy_score: {}}, - "n_repeats": 2, - 
"train_test_split_params": dict(test_size=0.2, random_state=42), - } +@pytest.mark.parametrize( + "kwargs, expected_output", + [ + ( + { + "dataset_name": "glass", + "clf_name": "decisiontreeclassifier", + "resampler_name": "globalcs", + "metric_name": "geometric_mean_score", + "n": 0, + "clf_params": {}, + }, + True, + ), + ( + { + "dataset_name": "dataset", + "clf_name": "clf", + "resampler_name": "resampler", + "metric_name": "metric", + "n": 0, + "clf_params": {}, + }, + False, + ), + ], +) +def test_check_if_exist_in_csv(output_csv, kwargs, expected_output): + config_dict = get_dummy_config() + config = Config.from_dict(config_dict) + pipeline = AnalysisPipeline(config) + + output = pipeline._check_if_exist_in_csv(path=output_csv, **kwargs) + assert output == expected_output + + +@pytest.mark.parametrize("split_method", [["Kfold", dict(n_splits=4, shuffle=True)], ["train_test", {}]]) +def test_run_analysis(prepare_dataset_file, output_file, run_analysis_config, split_method): + run_analysis_config["split_method"] = split_method + config = Config.from_dict(run_analysis_config) pipeline = AnalysisPipeline(config) pipeline.run_analysis(output_file, train_without_resampling=True) - result_df = pd.read_csv(output_file) - assert (result_df["dataset_name"] == "dataset").all() - assert (result_df["classifier"] == "tree").all() - np.testing.assert_array_equal(result_df["metric_name"].unique(), ["geometric_mean_score", "accuracy_score"]) - np.testing.assert_array_almost_equal(result_df["metric_value"].unique(), [0.018803, 0.166667, 0.005, 0.0]) + assert pipeline.dataset_names == ["dataset"] + assert pipeline.clf_names == ["decisiontreeclassifier"] + assert sorted(pipeline.resampling_methods) == ["Not defined", "globalcs"] + assert sorted(pipeline.metric_names) == ["accuracy_score", "geometric_mean_score"] + assert sorted(pipeline.column_names) == [ + "classifier", + "clf_params", + "dataset_name", + "metric_name", + "metric_value", + "no_repeat", + "resampling_method", 
+ ] + + +def test_run_analysis_cli(prepare_dataset_file, output_file, run_analysis_config_json): + runner = CliRunner() + result = runner.invoke( + main, + [ + str(output_file), + "--run-analysis", + "--config-json", + str(run_analysis_config_json), + ], + ) + assert result.exit_code == 0 + assert result.output == "Start\nRun analysis pipeline\nDone\n" + + assert Path(output_file).exists() + + +def test_generate_summary(prepare_dataset_file, query_dict, output_file, run_analysis_config, tmp_path): + config = Config.from_dict(run_analysis_config) + pipeline = AnalysisPipeline(config) + pipeline.run_analysis(output_file, train_without_resampling=False) + + list_of_df = pipeline.generate_summary(query_dict, csv_path=output_file, save_to_csv=True) + + assert len(list_of_df) == 1 + df = list_of_df[0] + assert df.shape[0] == 1 + assert (tmp_path / ("_".join([j for i in query_dict.values() for j in i]) + ".csv")).exists() is True + + +def test_generate_summary_cli(prepare_dataset_file, query_dict_json, output_file, run_analysis_config, tmp_path): + config = Config.from_dict(run_analysis_config) + pipeline = AnalysisPipeline(config) + pipeline.run_analysis(output_file, train_without_resampling=False) + + aggr_func_path = tmp_path / "aggr_func.json" + with open(aggr_func_path, "w") as f: + json.dump(["numpy.median"], f) + + runner = CliRunner() + result = runner.invoke( + main, + [str(output_file), "--summary", "--query-json", str(query_dict_json), "--save-to-csv", "--aggregate-json", str(aggr_func_path)], + ) + assert result.exit_code == 0 + assert result.output == "Start\nRun generate summary\nDone\n" + + with open(query_dict_json, "r") as f: + query_dict = json.load(f) + + assert (tmp_path / ("_".join([j for i in query_dict.values() for j in i]) + ".csv")).exists() is True + + +def test_generate_posthoc_analysis(prepare_dataset_file, query_dict, output_file, run_analysis_config, tmp_path): + config = Config.from_dict(run_analysis_config) + pipeline = 
AnalysisPipeline(config) + pipeline.run_analysis(output_file, train_without_resampling=False) + query_dict.pop("resampling_method") + list_of_df = pipeline.generate_posthoc_analysis( + query_dict, csv_path=output_file, posthoc_func_list=[[posthoc_dunn, {}]], save_to_csv=True + ) + + assert len(list_of_df) == 1 + df = list_of_df[0] + assert df.shape[0] == 1 + assert (tmp_path / ("_".join([posthoc_dunn.__name__, *[j for i in query_dict.values() for j in i]]) + ".csv")).exists() is True + + +def test_generate_posthoc_analysis_cli(prepare_dataset_file, query_dict, output_file, run_analysis_config, tmp_path): + config = Config.from_dict(run_analysis_config) + pipeline = AnalysisPipeline(config) + pipeline.run_analysis(output_file, train_without_resampling=False) + query_dict.pop("resampling_method") + + posthoc_func_path = tmp_path / "posthoc_func.json" + with open(posthoc_func_path, "w") as f: + json.dump({"scikit_posthocs.posthoc_dunn": {}}, f) + + query_dict_path = tmp_path / "query_dict.json" + with open(query_dict_path, "w") as f: + json.dump(query_dict, f) + + runner = CliRunner() + result = runner.invoke( + main, + [ + str(output_file), + "--posthoc-analysis", + "--posthoc-query-json", + str(query_dict_path), + "--save-to-csv", + "--posthoc-func-json", + str(posthoc_func_path), + ], + ) + assert result.exit_code == 0 + assert result.output == "Start\nRun generate posthoc analysis\nDone\n" + + assert (tmp_path / ("_".join([posthoc_dunn.__name__, *[j for i in query_dict.values() for j in i]]) + ".csv")).exists() is True def test_get_dataset_wrong_path(): @@ -158,20 +394,20 @@ def test_get_dataset_wrong_path(): @pytest.mark.parametrize( "wrong_resampler, expected_exception", [ - ("wrong", "Unknown resample method: wrong, expected to be one of ['globalCS', 'StaticSMOTE', 'SOUP', 'spider3', 'MDO']"), (lambda x: x, "Your resampler must implement fit_resample method"), + (GlobalCS, "Must define default params for all datasets or for the specific dataset"), ], ) def 
test_get_resampler_wrong(wrong_resampler, expected_exception): config_dict = get_dummy_config() - config_dict["resample_methods"].update({wrong_resampler: {}}) + config_dict["resampling_methods"].update({wrong_resampler: {}}) config = Config.from_dict(config_dict) pipeline = AnalysisPipeline(config) - with pytest.raises(ValueError) as ex: - next(pipeline._get_resampler(train_without_resampling=False)) + with pytest.raises(Exception) as ex: + next(pipeline._get_resampler(train_without_resampling=False, dataset_name="dataset")) assert ex.value.args[0] == expected_exception @@ -179,7 +415,6 @@ def test_get_resampler_wrong(wrong_resampler, expected_exception): @pytest.mark.parametrize( "wrong_clf, expected_exception", [ - ("wrong", "Unknown classifier: wrong, expected to be one of ['tree', 'NB', 'KNN']"), (lambda x: x, "Your classifier must implement fit and predict methods"), ], ) @@ -195,3 +430,39 @@ def test_get_classifier_wrong(wrong_clf, expected_exception): next(pipeline._get_classifier()) assert ex.value.args[0] == expected_exception + + +def test_run_analysis_exception(X_ecoc, y_ecoc, dataset_file, output_file, caplog): + class DummyClf: + def __init__(*args, **kwargs): + pass + + def fit(*args): + raise Exception("Error during fit") + + def predict(*args): + pass + + df = pd.DataFrame(X_ecoc, columns=["X1", "X2", "X3"]) + df["y"] = y_ecoc + df.to_csv(dataset_file, index=False) + config_dict = { + "datasets": [dataset_file], + "classifiers": {DummyClf: [{"max_depth": 30}]}, + "resampling_methods": {}, + "metrics": {geometric_mean_score: {"correction": 0.005}, accuracy_score: {}}, + "n_repeats": 2, + "split_method": ["train_test", {}], + } + config = Config.from_dict(config_dict) + + pipeline = AnalysisPipeline(config) + pipeline.run_analysis(output_file, train_without_resampling=True) + + for record in caplog.records: + if record.levelname == "ERROR": + assert ( + record.message + == "Raised exception: 'Error during fit' for dataset_name='dataset', 
resampler_name='Not defined' and clf_name='dummyclf'\n" + "Raised exception: 'Error during fit' for dataset_name='dataset', resampler_name='Not defined' and clf_name='dummyclf'" + ) diff --git a/tests/datasets/test_data_loader.py b/tests/datasets/test_data_loader.py index ff2bec4..634680e 100644 --- a/tests/datasets/test_data_loader.py +++ b/tests/datasets/test_data_loader.py @@ -29,9 +29,10 @@ def test_load_datasets(): print("Testing loading datasets") data_home = join(".", "data") - datasets = load_datasets(data_home=data_home) + datasets = load_datasets(data_home=data_home, save_to_csv=True) for k in DATASET_SHAPE.keys(): X = datasets[k].data assert DATASET_SHAPE[k] == X.shape shutil.rmtree(join(data_home, "extracted")) + shutil.rmtree(join(data_home, "csv")) From ba99e3df46c39dc72b79240c6d0ce1ece2f7539a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Wed, 11 Jan 2023 15:00:11 +0100 Subject: [PATCH 25/48] continue --- examples/datasets/analysis.ipynb | 1083 ++++++++++++++++++++++++++ multi_imbalance/datasets/analysis.py | 4 +- 2 files changed, 1085 insertions(+), 2 deletions(-) create mode 100644 examples/datasets/analysis.ipynb diff --git a/examples/datasets/analysis.ipynb b/examples/datasets/analysis.ipynb new file mode 100644 index 0000000..2723e88 --- /dev/null +++ b/examples/datasets/analysis.ipynb @@ -0,0 +1,1083 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Example usage for analysis pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we will be exploring the functionality that allows to run various classification and resampling methods of different datasets using both Python code and the command line interface (CLI). We will be able to compare the results and efficiency of these methods in order to determine the best approach for our specific use case." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Python code" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from imblearn.metrics import geometric_mean_score\n", + "from pathlib import Path\n", + "from sklearn.metrics import accuracy_score\n", + "from tempfile import NamedTemporaryFile\n", + "from scikit_posthocs import posthoc_wilcoxon\n", + "\n", + "from multi_imbalance.datasets.analysis import AnalysisPipeline, Config, Result\n", + "from multi_imbalance.datasets import load_datasets\n", + "from multi_imbalance.resampling.soup import SOUP\n", + "from multi_imbalance.resampling.spider import SPIDER3\n", + "from multi_imbalance.resampling.static_smote import StaticSMOTE\n", + "from multi_imbalance.resampling.global_cs import GlobalCS\n", + "from multi_imbalance.resampling.mdo import MDO" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load datasets from `data.tar.gz` file to csv files" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "_ = load_datasets(save_to_csv=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare configuration for analysis pipeline" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example we will be working with three datasets: glass, new_ecoli, and dermatology. We will be applying two classifiers (Decision Tree, K-Nearest Neighbors) with two different configurations each to these datasets. 
We will be using three resampling methods (GlobalCS, MDO, SOUP) with default configurations as well as a special configuration for MDO on the glass dataset. These combinations will be evaluated using two metrics: geometric mean score and accuracy score. We will be repeating each combination ten times and using the train_test_split method to divide the data into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "cwd = Path.cwd()\n", + "\n", + "config = {\n", + " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"glass.csv\", cwd.parents[1] / \"data\" / \"csv\" / \"new_ecoli.csv\", cwd.parents[1] / \"data\" / \"csv\" / \"dermatology.csv\"],\n", + " \"classifiers\": {\n", + " DecisionTreeClassifier: [{\"max_depth\" : 100}, {}],\n", + " KNeighborsClassifier: [{\"n_neighbors\": 7}, {}],\n", + " },\n", + " \"resampling_methods\": {\n", + " GlobalCS: {\"default\": {\"shuffle\": True}},\n", + " MDO: {\"default\": {\"k1_frac\": 0.3, \"maj_int_min\":{\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}}, \"glass\": {\"k1_frac\": 0.5}},\n", + " SOUP: {\"default\" : {\"shuffle\": True}},\n", + " },\n", + " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}, accuracy_score: {}},\n", + " \"n_repeats\": 10, \n", + " \"split_method\": [\"train_test\",{}],\n", + "}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare temporary file for result" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "result_file = NamedTemporaryFile(suffix=\".csv\")\n", + "result_file.close()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create an AnalysisPipeline object and run the analysis, including a comparison of training without resampling to demonstrate the effectiveness of the resampling methods." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "c = Config.from_dict(config)\n", + "pipeline = AnalysisPipeline(c)\n", + "pipeline.run_analysis(result_file.name, train_without_resampling = True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate a summary for the selected classifiers, metric and datasets. If you don't know which names to use, the corresponding pipeline property lists them, e.g. `dataset_names`." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(['dermatology', 'glass', 'new_ecoli'],\n", + " ['kneighborsclassifier', 'decisiontreeclassifier'])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.dataset_names, pipeline.clf_names" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "query_dict = {\"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"], \"metric_name\": [\"geometric_mean_score\"], \"dataset_name\": [\"glass\", \"new_ecoli\"]}\n", + "\n", + "results = pipeline.generate_summary(query_dict,save_to_csv=False, csv_path=result_file.name, aggregate_func=[min])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are two classifiers, one metric and two datasets, so the length of results will be $2\\cdot1\\cdot2=4$" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### First result" + ] + }, + { + "cell_type": "code", + 
"execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_value
meanstdmin
metric_nameclassifierdataset_nameresampling_methodclf_params
geometric_mean_scoredecisiontreeclassifierglassNot defined{'max_depth': 100}0.3565120.2225120.177231
{}0.5669070.1859090.225857
globalcs{'max_depth': 100}0.4778720.2176230.179135
{}0.4470950.2373630.057638
mdo{'max_depth': 100}0.5891950.1709550.205291
{}0.5059580.2399500.068824
soup{'max_depth': 100}0.6131240.2189050.204827
{}0.5826460.1985820.209060
\n", + "
" + ], + "text/plain": [ + " metric_value \\\n", + " mean \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier glass Not defined {'max_depth': 100} 0.356512 \n", + " {} 0.566907 \n", + " globalcs {'max_depth': 100} 0.477872 \n", + " {} 0.447095 \n", + " mdo {'max_depth': 100} 0.589195 \n", + " {} 0.505958 \n", + " soup {'max_depth': 100} 0.613124 \n", + " {} 0.582646 \n", + "\n", + " \\\n", + " std \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier glass Not defined {'max_depth': 100} 0.222512 \n", + " {} 0.185909 \n", + " globalcs {'max_depth': 100} 0.217623 \n", + " {} 0.237363 \n", + " mdo {'max_depth': 100} 0.170955 \n", + " {} 0.239950 \n", + " soup {'max_depth': 100} 0.218905 \n", + " {} 0.198582 \n", + "\n", + " \n", + " min \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier glass Not defined {'max_depth': 100} 0.177231 \n", + " {} 0.225857 \n", + " globalcs {'max_depth': 100} 0.179135 \n", + " {} 0.057638 \n", + " mdo {'max_depth': 100} 0.205291 \n", + " {} 0.068824 \n", + " soup {'max_depth': 100} 0.204827 \n", + " {} 0.209060 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Second result" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_value
meanstdmin
metric_nameclassifierdataset_nameresampling_methodclf_params
geometric_mean_scoredecisiontreeclassifiernew_ecoliNot defined{'max_depth': 100}0.7497840.0664260.602696
{}0.6989340.0528910.620988
globalcs{'max_depth': 100}0.7179900.0348460.670200
{}0.7008890.0800180.565038
mdo{'max_depth': 100}0.7677890.0455420.713386
{}0.7713180.0489290.703493
soup{'max_depth': 100}0.7382330.0537730.649839
{}0.7146000.0635480.583691
\n", + "
" + ], + "text/plain": [ + " metric_value \\\n", + " mean \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier new_ecoli Not defined {'max_depth': 100} 0.749784 \n", + " {} 0.698934 \n", + " globalcs {'max_depth': 100} 0.717990 \n", + " {} 0.700889 \n", + " mdo {'max_depth': 100} 0.767789 \n", + " {} 0.771318 \n", + " soup {'max_depth': 100} 0.738233 \n", + " {} 0.714600 \n", + "\n", + " \\\n", + " std \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier new_ecoli Not defined {'max_depth': 100} 0.066426 \n", + " {} 0.052891 \n", + " globalcs {'max_depth': 100} 0.034846 \n", + " {} 0.080018 \n", + " mdo {'max_depth': 100} 0.045542 \n", + " {} 0.048929 \n", + " soup {'max_depth': 100} 0.053773 \n", + " {} 0.063548 \n", + "\n", + " \n", + " min \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier new_ecoli Not defined {'max_depth': 100} 0.602696 \n", + " {} 0.620988 \n", + " globalcs {'max_depth': 100} 0.670200 \n", + " {} 0.565038 \n", + " mdo {'max_depth': 100} 0.713386 \n", + " {} 0.703493 \n", + " soup {'max_depth': 100} 0.649839 \n", + " {} 0.583691 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[1]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Third result" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_value
meanstdmin
metric_nameclassifierdataset_nameresampling_methodclf_params
geometric_mean_scorekneighborsclassifierglassNot defined{'n_neighbors': 7}0.2046970.1916630.068485
{}0.1718390.1263950.075119
globalcs{'n_neighbors': 7}0.5430430.1772860.202031
{}0.6723870.0882900.536591
mdo{'n_neighbors': 7}0.3870750.2069100.078940
{}0.1456100.0616340.074212
soup{'n_neighbors': 7}0.5854210.1548680.206894
{}0.3545660.2069000.153585
\n", + "
" + ], + "text/plain": [ + " metric_value \\\n", + " mean \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score kneighborsclassifier glass Not defined {'n_neighbors': 7} 0.204697 \n", + " {} 0.171839 \n", + " globalcs {'n_neighbors': 7} 0.543043 \n", + " {} 0.672387 \n", + " mdo {'n_neighbors': 7} 0.387075 \n", + " {} 0.145610 \n", + " soup {'n_neighbors': 7} 0.585421 \n", + " {} 0.354566 \n", + "\n", + " \\\n", + " std \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score kneighborsclassifier glass Not defined {'n_neighbors': 7} 0.191663 \n", + " {} 0.126395 \n", + " globalcs {'n_neighbors': 7} 0.177286 \n", + " {} 0.088290 \n", + " mdo {'n_neighbors': 7} 0.206910 \n", + " {} 0.061634 \n", + " soup {'n_neighbors': 7} 0.154868 \n", + " {} 0.206900 \n", + "\n", + " \n", + " min \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score kneighborsclassifier glass Not defined {'n_neighbors': 7} 0.068485 \n", + " {} 0.075119 \n", + " globalcs {'n_neighbors': 7} 0.202031 \n", + " {} 0.536591 \n", + " mdo {'n_neighbors': 7} 0.078940 \n", + " {} 0.074212 \n", + " soup {'n_neighbors': 7} 0.206894 \n", + " {} 0.153585 " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[2]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Fourth result" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_value
meanstdmin
metric_nameclassifierdataset_nameresampling_methodclf_params
geometric_mean_scorekneighborsclassifiernew_ecoliNot defined{'n_neighbors': 7}0.8480280.0390820.805196
{}0.8292930.0259650.777511
globalcs{'n_neighbors': 7}0.7792140.0326800.736093
{}0.7890730.0633140.724630
mdo{'n_neighbors': 7}0.7847510.0409920.722135
{}0.8240050.0410230.755261
soup{'n_neighbors': 7}0.8397630.0349170.789633
{}0.8149690.0591390.710178
\n", + "
" + ], + "text/plain": [ + " metric_value \\\n", + " mean \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score kneighborsclassifier new_ecoli Not defined {'n_neighbors': 7} 0.848028 \n", + " {} 0.829293 \n", + " globalcs {'n_neighbors': 7} 0.779214 \n", + " {} 0.789073 \n", + " mdo {'n_neighbors': 7} 0.784751 \n", + " {} 0.824005 \n", + " soup {'n_neighbors': 7} 0.839763 \n", + " {} 0.814969 \n", + "\n", + " \\\n", + " std \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score kneighborsclassifier new_ecoli Not defined {'n_neighbors': 7} 0.039082 \n", + " {} 0.025965 \n", + " globalcs {'n_neighbors': 7} 0.032680 \n", + " {} 0.063314 \n", + " mdo {'n_neighbors': 7} 0.040992 \n", + " {} 0.041023 \n", + " soup {'n_neighbors': 7} 0.034917 \n", + " {} 0.059139 \n", + "\n", + " \n", + " min \n", + "metric_name classifier dataset_name resampling_method clf_params \n", + "geometric_mean_score kneighborsclassifier new_ecoli Not defined {'n_neighbors': 7} 0.805196 \n", + " {} 0.777511 \n", + " globalcs {'n_neighbors': 7} 0.736093 \n", + " {} 0.724630 \n", + " mdo {'n_neighbors': 7} 0.722135 \n", + " {} 0.755261 \n", + " soup {'n_neighbors': 7} 0.789633 \n", + " {} 0.710178 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[3]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate posthoc analysis for Wilcoxon test. You have to define names of classifiers, dataset names and metric names in query dict." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "query_dict = {\"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"], \"metric_name\": [\"geometric_mean_score\"], \"dataset_name\": [\"glass\", \"new_ecoli\"]}\n", + "\n", + "results = AnalysisPipeline.generate_posthoc_analysis(query_dict, save_to_csv=False, csv_path=result_file.name ,posthoc_func_list=[[posthoc_wilcoxon, {}]])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
posthoc_wilcoxon_decisiontreeclassifier_geometric_mean_score_glassglobalcsmdosoupNot defined
globalcs1.0000000.2454870.0582580.898317
mdo0.2454871.0000000.6215130.329983
soup0.0582580.6215131.0000000.044054
Not defined0.8983170.3299830.0440541.000000
\n", + "
" + ], + "text/plain": [ + "posthoc_wilcoxon_decisiontreeclassifier_geometric_mean_score_glass globalcs \\\n", + "globalcs 1.000000 \n", + "mdo 0.245487 \n", + "soup 0.058258 \n", + "Not defined 0.898317 \n", + "\n", + "posthoc_wilcoxon_decisiontreeclassifier_geometric_mean_score_glass mdo \\\n", + "globalcs 0.245487 \n", + "mdo 1.000000 \n", + "soup 0.621513 \n", + "Not defined 0.329983 \n", + "\n", + "posthoc_wilcoxon_decisiontreeclassifier_geometric_mean_score_glass soup \\\n", + "globalcs 0.058258 \n", + "mdo 0.621513 \n", + "soup 1.000000 \n", + "Not defined 0.044054 \n", + "\n", + "posthoc_wilcoxon_decisiontreeclassifier_geometric_mean_score_glass Not defined \n", + "globalcs 0.898317 \n", + "mdo 0.329983 \n", + "soup 0.044054 \n", + "Not defined 1.000000 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results[0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".project_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2 (tags/v3.9.2:1a79785, Feb 19 2021, 13:44:55) [MSC v.1928 64 bit (AMD64)]" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "23e9df2d9424db89a1bc7cf8b9f3a46204923f77702e66b6afb0e7a76a59f4cc" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/multi_imbalance/datasets/analysis.py b/multi_imbalance/datasets/analysis.py index c1a765f..7f78737 100644 --- a/multi_imbalance/datasets/analysis.py +++ b/multi_imbalance/datasets/analysis.py @@ -94,7 +94,7 @@ def generate_summary( :param save_to_csv: bool, optional, if `True`, the summary will be saved to a CSV file :param save_path: - str, optional, the location where the summary csv files should be saved + str, optional, the location 
where the summary csv files should be saved. If None summary csv files will be saved in the same location as csv_path :param aggregate_func: Optional[List[Callable]], optional, a list of functions that will be applied to the `metric_value` column of the results to generate the summary :return: @@ -140,7 +140,7 @@ def generate_posthoc_analysis( :param save_to_csv: bool, optional, if `True`, the posthoc analysis will be saved to a CSV file :param save_path: - str, optional, the location where the summary csv files should be saved + str, optional, the location where the summary csv files should be saved. If None summary csv files will be saved in the same location as csv_path :return: List[pd.DataFrame], a list of Pandas DataFrames containing the posthoc analysis of the results """ From bb0a0e36c3a87641e8dfbfc1d26dfeb72fa6ae78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 13:06:28 +0100 Subject: [PATCH 26/48] gh action --- .github/workflows/code_analysis.yml | 4 ++-- .github/workflows/tests.yml | 30 ++++++++++++++--------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/code_analysis.yml b/.github/workflows/code_analysis.yml index 663c44e..45f2ca4 100644 --- a/.github/workflows/code_analysis.yml +++ b/.github/workflows/code_analysis.yml @@ -8,7 +8,7 @@ jobs: tools: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v1 - name: Set up Python 3.9 uses: actions/setup-python@v2 with: @@ -16,6 +16,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install tox tox-gh-actions + python -m pip install tox tox-gh-actions - name: Run flake8 with tox run: tox -e flake8 \ No newline at end of file diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9b42d9d..02797ce 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,25 +1,25 @@ name: Tests + on: - push - pull_request jobs: - 
test: - runs-on: ${{ matrix.os }} + build: + runs-on: ubuntu-latest strategy: matrix: - os: [ubuntu-latest, windows-latest] - python-version: [['3.8', 'py38'], ['3.9', 'py39']] + python-version: ['3.7', '3.8', '3.9', '3.10'] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version[0] }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version[0] }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test ${{ matrix.python-version[1] }} with tox - run: tox -e ${{ matrix.python-version[1] }} \ No newline at end of file + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh-actions + - name: Test with tox + run: tox \ No newline at end of file From 18c22d8378dd8c8d50cbaec10b709edf4959531a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 13:12:50 +0100 Subject: [PATCH 27/48] tox ini edit --- .github/workflows/code_analysis.yml | 4 ++-- pyproject.toml | 2 +- tox.ini | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/code_analysis.yml b/.github/workflows/code_analysis.yml index 45f2ca4..663c44e 100644 --- a/.github/workflows/code_analysis.yml +++ b/.github/workflows/code_analysis.yml @@ -8,7 +8,7 @@ jobs: tools: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v2 - name: Set up Python 3.9 uses: actions/setup-python@v2 with: @@ -16,6 +16,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install tox tox-gh-actions + pip install tox tox-gh-actions - name: Run flake8 with tox run: tox -e flake8 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml 
index 9ee1317..5e4427b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ dependencies = [ "seaborn~=0.12.1", "matplotlib~=3.6.2", "click~=8.1.3", - "scikit-posthoc~=0.7.0" + "scikit-posthocs~=0.7.0" ] [project.optional-dependencies] diff --git a/tox.ini b/tox.ini index aa3ef17..5d3af3e 100644 --- a/tox.ini +++ b/tox.ini @@ -6,8 +6,8 @@ isolated_build = true [gh-actions] python = 3.8: py38 - 3.9: py39, - flake8 + 3.9: py39 + [testenv] setenv = From 5b1d2600a498bd4f94d83619b2d6f6a5dccdbddb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 13:22:34 +0100 Subject: [PATCH 28/48] flake8 --- multi_imbalance/datasets/analysis.py | 42 ++++++++++++++++++---------- multi_imbalance/datasets/helpers.py | 6 ++-- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/multi_imbalance/datasets/analysis.py b/multi_imbalance/datasets/analysis.py index 7f78737..c9cb722 100644 --- a/multi_imbalance/datasets/analysis.py +++ b/multi_imbalance/datasets/analysis.py @@ -24,10 +24,14 @@ class AnalysisPipeline: """ This is a class for an analysis pipeline. - The __init__() method initializes the object, taking a Config object containing the pipeline configuration and an optional csv_path to the results CSV file. - The run_analysis() method runs the analysis on a set of classifiers, datasets, and resamplers, saving the results to the specified output_path. - The explode_clf_params() method takes a input_path to a CSV file and explodes the clf_params column into individual columns, saving the result to output_path. - Finally, the generate_summary() method takes a query_dict specifying which results to include in the summary and an optional aggregate_func to apply to the metric_value column + The __init__() method initializes the object, taking a Config object containing the pipeline configuration + and an optional csv_path to the results CSV file. 
+ The run_analysis() method runs the analysis on a set of classifiers, datasets, and resamplers, + saving the results to the specified output_path. + The explode_clf_params() method takes a input_path to a CSV file and explodes the clf_params column + into individual columns, saving the result to output_path. + Finally, the generate_summary() method takes a query_dict specifying which results to include in the summary + and an optional aggregate_func to apply to the metric_value column and returns a list of Pandas DataFrames containing the summary of the results. """ @@ -51,7 +55,8 @@ def __init__(self, config: Config) -> None: def run_analysis(self, output_path: str, train_without_resampling: bool) -> None: """ - This function runs a specified analysis on a set of classifiers, datasets, and resamplers. The results of the analysis are saved to the specified output path. + This function runs a specified analysis on a set of classifiers, datasets, and resamplers. + The results of the analysis are saved to the specified output path. :param output_path: str, the location where the results of the analysis will be saved as a CSV file @@ -88,15 +93,18 @@ def generate_summary( of the results to generate the summary. :param query_dict: - Dict[str, List[str]], a dictionary that specifies the values of different columns in the results to include in the summary + Dict[str, List[str]], a dictionary that specifies the values of + different columns in the results to include in the summary :param csv_path: str, the path to the CSV file containing the results of the analysis :param save_to_csv: bool, optional, if `True`, the summary will be saved to a CSV file :param save_path: - str, optional, the location where the summary csv files should be saved. If None summary csv files will be saved in the same location as csv_path + str, optional, the location where the summary csv files should be saved. 
+ If None summary csv files will be saved in the same location as csv_path :param aggregate_func: - Optional[List[Callable]], optional, a list of functions that will be applied to the `metric_value` column of the results to generate the summary + Optional[List[Callable]], optional, a list of functions that will be applied + to the `metric_value` column of the results to generate the summary :return: List[pd.DataFrame], a list of Pandas DataFrames containing the summary of the results of the analysis """ @@ -132,7 +140,8 @@ def generate_posthoc_analysis( `csv_path` should be the path to the CSV file containing the results of the analysis. :param query_dict: - Dict[str, List[str]], a dictionary that specifies the values of different columns in the results to include in the posthoc analysis + Dict[str, List[str]], a dictionary that specifies the values of different + columns in the results to include in the posthoc analysis :param csv_path: str, the path to the CSV file containing the results of the analysis :param posthoc_func_list: @@ -140,7 +149,8 @@ def generate_posthoc_analysis( :param save_to_csv: bool, optional, if `True`, the posthoc analysis will be saved to a CSV file :param save_path: - str, optional, the location where the summary csv files should be saved. If None summary csv files will be saved in the same location as csv_path + str, optional, the location where the summary csv files should be saved. + If None summary csv files will be saved in the same location as csv_path :return: List[pd.DataFrame], a list of Pandas DataFrames containing the posthoc analysis of the results """ @@ -215,7 +225,8 @@ def _prepare_result( list_of_errors: List[str], ) -> None: """ - This method prepares the results of running a classifier on a dataset which is resampling by resampler. It will compute the specified metrics for each repeat. + This method prepares the results of running a classifier on a dataset which is resampling by resampler. 
+ It will compute the specified metrics for each repeat. :param clf_data: A tuple containing the name of the classifier, the classifier and a dictionary of classifier parameters @@ -460,7 +471,8 @@ def _get_classifier(self) -> Iterable[Tuple[str, ClassifierMixin, Dict]]: """This method retrieves the classifiers specified in the configuration object. :return: - Iterable[Tuple[str, ClassifierMixin, Dict]], An iterable of tuples containing the classifier name, the classifier and a dictionary of classifier parameters""" + Iterable[Tuple[str, ClassifierMixin, Dict]], An iterable of tuples containing the classifier name, + the classifier and a dictionary of classifier parameters""" for classifier, params_list in self._config.classifiers.items(): if not hasattr(classifier, "fit") or not hasattr(classifier, "predict"): raise ValueError("Your classifier must implement fit and predict methods") @@ -496,7 +508,8 @@ def _get_name(self, estimator: Union[ClassifierMixin, BaseSampler]) -> Tuple[str ) @click.option( "--posthoc-func-json", - help="Path to json file which contain dict with paths to posthoc analysis functions and their params, e.g. {'scikit_posthoc.posthoc_dunn':{}}", + help="Path to json file which contain dict with paths to posthoc analysis" + "functions and their params, e.g. {'scikit_posthoc.posthoc_dunn':{}}", ) @click.option( "--train-without-resampling", @@ -522,7 +535,8 @@ def main( save_to_csv, ): """ - This function helps to use pipeline analysis, summary and posthoc tests by CLI. Output path is path to result csv file from analysis pipeline. + This function helps to use pipeline analysis, summary and posthoc tests by CLI. + Output path is path to result csv file from analysis pipeline. 
""" print("Start") if run_analysis: diff --git a/multi_imbalance/datasets/helpers.py b/multi_imbalance/datasets/helpers.py index 88832d6..090beef 100644 --- a/multi_imbalance/datasets/helpers.py +++ b/multi_imbalance/datasets/helpers.py @@ -19,8 +19,10 @@ class Config: Attributes: ---------- datasets: A list of dataset names to use in the analysis pipeline. - classifiers: A dictionary mapping classifier objects to lists of dictionaries containing the hyperparameters to use for each classifier. - resampling_methods: A dictionary mapping resampling objects to dictionaries of hyperparameters to use for each resampling method. + classifiers: A dictionary mapping classifier objects to lists of dictionaries + containing the hyperparameters to use for each classifier. + resampling_methods: A dictionary mapping resampling objects to dictionaries + of hyperparameters to use for each resampling method. metrics: A dictionary mapping metric functions to dictionaries of hyperparameters to use for each metric. n_repeats: The number of times to repeat the experiment for datasets. split_method: A dictionary mapping split method to dictionaries of additional parameters. 
From 89134e29470606434df0c4b91dbbb1f400570406 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 13:27:07 +0100 Subject: [PATCH 29/48] python version --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 02797ce..b0266c0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.8', '3.9', '3.10'] steps: - uses: actions/checkout@v1 From 05153e10f3434aad1953d41c26afb4edd42e2131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 13:30:19 +0100 Subject: [PATCH 30/48] fix --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index aa3ef17..51f4878 100644 --- a/tox.ini +++ b/tox.ini @@ -6,8 +6,7 @@ isolated_build = true [gh-actions] python = 3.8: py38 - 3.9: py39, - flake8 + 3.9: py39 [testenv] setenv = From a731fc753f4a1b771fd6675e5485d03c1add9b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 13:33:27 +0100 Subject: [PATCH 31/48] old version --- .github/workflows/tests.yml | 30 +++++++++++++++--------------- tox.ini | 4 ++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b0266c0..9b42d9d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,25 +1,25 @@ name: Tests - on: - push - pull_request jobs: - build: - runs-on: ubuntu-latest + test: + runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.8', '3.9', '3.10'] + os: [ubuntu-latest, windows-latest] + python-version: [['3.8', 'py38'], ['3.9', 'py39']] steps: - - uses: actions/checkout@v1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - 
python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - python -m pip install tox tox-gh-actions - - name: Test with tox - run: tox \ No newline at end of file + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version[0] }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version[0] }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test ${{ matrix.python-version[1] }} with tox + run: tox -e ${{ matrix.python-version[1] }} \ No newline at end of file diff --git a/tox.ini b/tox.ini index 5d3af3e..aa3ef17 100644 --- a/tox.ini +++ b/tox.ini @@ -6,8 +6,8 @@ isolated_build = true [gh-actions] python = 3.8: py38 - 3.9: py39 - + 3.9: py39, + flake8 [testenv] setenv = From 8c8a25609783af52ddef51aa9d7076cf2b3390e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sat, 14 Jan 2023 19:28:00 +0100 Subject: [PATCH 32/48] Final version --- examples/datasets/analysis.ipynb | 1717 ++++++++++++++++++-------- multi_imbalance/datasets/analysis.py | 48 +- multi_imbalance/datasets/helpers.py | 8 + tests/datasets/test_analysis.py | 28 +- 4 files changed, 1288 insertions(+), 513 deletions(-) diff --git a/examples/datasets/analysis.ipynb b/examples/datasets/analysis.ipynb index 2723e88..0432909 100644 --- a/examples/datasets/analysis.ipynb +++ b/examples/datasets/analysis.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -44,7 +44,8 @@ "from pathlib import Path\n", "from sklearn.metrics import accuracy_score\n", "from tempfile import NamedTemporaryFile\n", - "from scikit_posthocs import posthoc_wilcoxon\n", + "from scikit_posthocs import posthoc_wilcoxon, posthoc_mannwhitney\n", + "import pandas as pd\n", "\n", "from multi_imbalance.datasets.analysis import AnalysisPipeline, Config, 
Result\n", "from multi_imbalance.datasets import load_datasets\n", @@ -65,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -90,26 +91,29 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "cwd = Path.cwd()\n", "\n", "config = {\n", - " \"datasets\": [cwd.parents[1] / \"data\" / \"csv\" / \"glass.csv\", cwd.parents[1] / \"data\" / \"csv\" / \"new_ecoli.csv\", cwd.parents[1] / \"data\" / \"csv\" / \"dermatology.csv\"],\n", + " \"datasets\": [\n", + " cwd.parents[1] / \"data\" / \"csv\" / \"cmc.csv\",\n", + " cwd.parents[1] / \"data\" / \"csv\" / \"new_ecoli.csv\",\n", + " cwd.parents[1] / \"data\" / \"csv\" / \"cleveland.csv\",\n", + " ],\n", " \"classifiers\": {\n", - " DecisionTreeClassifier: [{\"max_depth\" : 100}, {}],\n", + " DecisionTreeClassifier: [{\"max_depth\": 100}, {}],\n", " KNeighborsClassifier: [{\"n_neighbors\": 7}, {}],\n", " },\n", " \"resampling_methods\": {\n", " GlobalCS: {\"default\": {\"shuffle\": True}},\n", - " MDO: {\"default\": {\"k1_frac\": 0.3, \"maj_int_min\":{\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}}, \"glass\": {\"k1_frac\": 0.5}},\n", - " SOUP: {\"default\" : {\"shuffle\": True}},\n", + " MDO: {\"default\": {\"k1_frac\": 0.3, \"maj_int_min\": {\"maj\": [0, 1], \"min\": [2, 3, 4, 5]}}, \"cmc\": {\"k1_frac\": 0.5}},\n", " },\n", " \"metrics\": {geometric_mean_score: {\"correction\": 0.001}, accuracy_score: {}},\n", - " \"n_repeats\": 10, \n", - " \"split_method\": [\"train_test\",{}],\n", + " \"n_repeats\": 20,\n", + " \"split_method\": [\"train_test\", {\"test_size\": 0.3}],\n", "}" ] }, @@ -123,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -141,40 +145,13 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 78, "metadata": {}, - "outputs": [ - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n", - "c:\\Users\\Mateusz\\Desktop\\venvy\\.project_venv\\lib\\site-packages\\sklearn\\metrics\\_classification.py:1334: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", - " _warn_prf(average, modifier, msg_start, len(result))\n" - ] - } - ], + "outputs": [], "source": [ "c = Config.from_dict(config)\n", "pipeline = AnalysisPipeline(c)\n", - "pipeline.run_analysis(result_file.name, train_without_resampling = True)" + "pipeline.run_analysis(result_file.name, train_without_resampling=True)" ] }, { @@ -187,17 +164,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(['dermatology', 'glass', 'new_ecoli'],\n", - " ['kneighborsclassifier', 'decisiontreeclassifier'])" + "(['new_ecoli', 'cmc', 'cleveland'],\n", + " ['decisiontreeclassifier', 'kneighborsclassifier'])" ] }, - "execution_count": 21, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -208,13 +185,19 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ - "query_dict = {\"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"], \"metric_name\": [\"geometric_mean_score\"], \"dataset_name\": [\"glass\", \"new_ecoli\"]}\n", + "query_dict = {\n", + " \"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"],\n", + " \"metric_name\": [\"geometric_mean_score\"],\n", + " \"dataset_name\": [\"cmc\", \"new_ecoli\", \"cleveland\"],\n", + "}\n", "\n", - "results = pipeline.generate_summary(query_dict,save_to_csv=False, csv_path=result_file.name, aggregate_func=[min])" + "summary_results = pipeline.generate_summary(\n", + " query_dict, save_to_csv=False, csv_path=result_file.name, aggregate_func=[min], concat_results=False\n", + ")" ] }, { @@ -222,27 +205,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There is two classifiers, one metric and two datasets, so the length of results will be $2\\cdot1\\cdot2=4$" + "There is two classifiers, one metric and three datasets, so the length of results will be 
$2\\cdot1\\cdot3=6$" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4" + "6" ] }, - "execution_count": 30, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(results)" + "len(summary_results)" ] }, { @@ -255,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 82, "metadata": {}, "outputs": [ { @@ -312,59 +295,46 @@ " \n", " \n", " \n", - " geometric_mean_score\n", - " decisiontreeclassifier\n", - " glass\n", + " geometric_mean_score\n", + " decisiontreeclassifier\n", + " cmc\n", " Not defined\n", " {'max_depth': 100}\n", - " 0.356512\n", - " 0.222512\n", - " 0.177231\n", + " 0.451510\n", + " 0.021547\n", + " 0.402010\n", " \n", " \n", " {}\n", - " 0.566907\n", - " 0.185909\n", - " 0.225857\n", + " 0.441469\n", + " 0.029395\n", + " 0.385481\n", " \n", " \n", " globalcs\n", " {'max_depth': 100}\n", - " 0.477872\n", - " 0.217623\n", - " 0.179135\n", + " 0.452274\n", + " 0.020277\n", + " 0.417400\n", " \n", " \n", " {}\n", - " 0.447095\n", - " 0.237363\n", - " 0.057638\n", + " 0.456366\n", + " 0.018096\n", + " 0.412834\n", " \n", " \n", " mdo\n", " {'max_depth': 100}\n", - " 0.589195\n", - " 0.170955\n", - " 0.205291\n", - " \n", - " \n", - " {}\n", - " 0.505958\n", - " 0.239950\n", - " 0.068824\n", - " \n", - " \n", - " soup\n", - " {'max_depth': 100}\n", - " 0.613124\n", - " 0.218905\n", - " 0.204827\n", + " 0.443627\n", + " 0.025639\n", + " 0.395393\n", " \n", " \n", " {}\n", - " 0.582646\n", - " 0.198582\n", - " 0.209060\n", + " 0.444472\n", + " 0.020072\n", + " 0.410953\n", " \n", " \n", "\n", @@ -374,47 +344,66 @@ " metric_value \\\n", " mean \n", "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score decisiontreeclassifier glass Not defined {'max_depth': 100} 0.356512 \n", - " {} 0.566907 \n", - " globalcs {'max_depth': 100} 0.477872 \n", - " {} 0.447095 
\n", - " mdo {'max_depth': 100} 0.589195 \n", - " {} 0.505958 \n", - " soup {'max_depth': 100} 0.613124 \n", - " {} 0.582646 \n", + "geometric_mean_score decisiontreeclassifier cmc Not defined {'max_depth': 100} 0.451510 \n", + " {} 0.441469 \n", + " globalcs {'max_depth': 100} 0.452274 \n", + " {} 0.456366 \n", + " mdo {'max_depth': 100} 0.443627 \n", + " {} 0.444472 \n", "\n", " \\\n", " std \n", "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score decisiontreeclassifier glass Not defined {'max_depth': 100} 0.222512 \n", - " {} 0.185909 \n", - " globalcs {'max_depth': 100} 0.217623 \n", - " {} 0.237363 \n", - " mdo {'max_depth': 100} 0.170955 \n", - " {} 0.239950 \n", - " soup {'max_depth': 100} 0.218905 \n", - " {} 0.198582 \n", + "geometric_mean_score decisiontreeclassifier cmc Not defined {'max_depth': 100} 0.021547 \n", + " {} 0.029395 \n", + " globalcs {'max_depth': 100} 0.020277 \n", + " {} 0.018096 \n", + " mdo {'max_depth': 100} 0.025639 \n", + " {} 0.020072 \n", "\n", " \n", " min \n", "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score decisiontreeclassifier glass Not defined {'max_depth': 100} 0.177231 \n", - " {} 0.225857 \n", - " globalcs {'max_depth': 100} 0.179135 \n", - " {} 0.057638 \n", - " mdo {'max_depth': 100} 0.205291 \n", - " {} 0.068824 \n", - " soup {'max_depth': 100} 0.204827 \n", - " {} 0.209060 " + "geometric_mean_score decisiontreeclassifier cmc Not defined {'max_depth': 100} 0.402010 \n", + " {} 0.385481 \n", + " globalcs {'max_depth': 100} 0.417400 \n", + " {} 0.412834 \n", + " mdo {'max_depth': 100} 0.395393 \n", + " {} 0.410953 " ] }, - "execution_count": 31, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results[0]" + "summary_results[0]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate posthoc analysis for Wilcoxon test. 
You have to define names of classifiers, dataset names and metric names in query dict." + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "query_dict = {\n", + " \"metric_name\": [\"geometric_mean_score\"],\n", + " \"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"],\n", + " \"dataset_name\": [\"cmc\", \"new_ecoli\"],\n", + "}\n", + "\n", + "analysis_results, param_comb = AnalysisPipeline.generate_posthoc_analysis(\n", + " query_dict, save_to_csv=False, csv_path=result_file.name, posthoc_func_list=[[posthoc_wilcoxon, {}], [posthoc_mannwhitney, {}]]\n", + ")" ] }, { @@ -422,12 +411,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### Second result" + "Search for analysis for specific test (Wilcoxon), metric name, classifier and dataset using name of analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "func_name = posthoc_wilcoxon.__name__\n", + "metric_name = \"geometric_mean_score\" # you can find it using pipeline.metric_names\n", + "clf_name = \"kneighborsclassifier\" # you can find it using pipeline.clf_names\n", + "dataset_name = \"cmc\" # you can find it using pipeline.dataset_names\n", + "\n", + "analysis_name = \"_\".join([func_name, metric_name, clf_name, dataset_name])" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 85, "metadata": {}, "outputs": [ { @@ -443,163 +446,120 @@ " vertical-align: top;\n", " }\n", "\n", - " .dataframe thead tr th {\n", - " text-align: left;\n", - " }\n", - "\n", - " .dataframe thead tr:last-of-type th {\n", + " .dataframe thead th {\n", " text-align: right;\n", " }\n", "\n", "\n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
metric_valuep-value
meanstdminglobalcs_0mdo_00.430433
metric_nameclassifierdataset_nameresampling_methodclf_paramsNot defined_00.009436
geometric_mean_scoredecisiontreeclassifiernew_ecoliNot defined{'max_depth': 100}0.7497840.0664260.602696globalcs_10.001209
{}0.6989340.0528910.620988mdo_10.000134
globalcs{'max_depth': 100}0.7179900.0348460.670200Not defined_10.000004
{}0.7008890.0800180.565038mdo_0Not defined_00.388376
mdo{'max_depth': 100}0.7677890.0455420.713386globalcs_10.004221
{}0.7713180.0489290.703493mdo_10.004860
soup{'max_depth': 100}0.7382330.0537730.649839Not defined_10.000210
{}0.7146000.0635480.583691Not defined_0globalcs_10.044054
mdo_10.021484
Not defined_10.000134
globalcs_1mdo_10.674223
Not defined_10.132727
mdo_1Not defined_10.026642
\n", "" ], "text/plain": [ - " metric_value \\\n", - " mean \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score decisiontreeclassifier new_ecoli Not defined {'max_depth': 100} 0.749784 \n", - " {} 0.698934 \n", - " globalcs {'max_depth': 100} 0.717990 \n", - " {} 0.700889 \n", - " mdo {'max_depth': 100} 0.767789 \n", - " {} 0.771318 \n", - " soup {'max_depth': 100} 0.738233 \n", - " {} 0.714600 \n", - "\n", - " \\\n", - " std \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score decisiontreeclassifier new_ecoli Not defined {'max_depth': 100} 0.066426 \n", - " {} 0.052891 \n", - " globalcs {'max_depth': 100} 0.034846 \n", - " {} 0.080018 \n", - " mdo {'max_depth': 100} 0.045542 \n", - " {} 0.048929 \n", - " soup {'max_depth': 100} 0.053773 \n", - " {} 0.063548 \n", - "\n", - " \n", - " min \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score decisiontreeclassifier new_ecoli Not defined {'max_depth': 100} 0.602696 \n", - " {} 0.620988 \n", - " globalcs {'max_depth': 100} 0.670200 \n", - " {} 0.565038 \n", - " mdo {'max_depth': 100} 0.713386 \n", - " {} 0.703493 \n", - " soup {'max_depth': 100} 0.649839 \n", - " {} 0.583691 " + " p-value\n", + "globalcs_0 mdo_0 0.430433\n", + " Not defined_0 0.009436\n", + " globalcs_1 0.001209\n", + " mdo_1 0.000134\n", + " Not defined_1 0.000004\n", + "mdo_0 Not defined_0 0.388376\n", + " globalcs_1 0.004221\n", + " mdo_1 0.004860\n", + " Not defined_1 0.000210\n", + "Not defined_0 globalcs_1 0.044054\n", + " mdo_1 0.021484\n", + " Not defined_1 0.000134\n", + "globalcs_1 mdo_1 0.674223\n", + " Not defined_1 0.132727\n", + "mdo_1 Not defined_1 0.026642" ] }, - "execution_count": 27, + "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results[1]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 
Third result" + "analysis_result = pd.DataFrame(analysis_results[analysis_name])\n", + "analysis_result" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -615,163 +575,99 @@ " vertical-align: top;\n", " }\n", "\n", - " .dataframe thead tr th {\n", - " text-align: left;\n", - " }\n", - "\n", - " .dataframe thead tr:last-of-type th {\n", + " .dataframe thead th {\n", " text-align: right;\n", " }\n", "\n", "\n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
metric_valuep-value
meanstdminglobalcs_0Not defined_00.009436
metric_nameclassifierdataset_nameresampling_methodclf_paramsglobalcs_10.001209
geometric_mean_scorekneighborsclassifierglassNot defined{'n_neighbors': 7}0.2046970.1916630.068485mdo_10.000134
{}0.1718390.1263950.075119Not defined_10.000004
globalcs{'n_neighbors': 7}0.5430430.1772860.202031mdo_0globalcs_10.004221
{}0.6723870.0882900.536591mdo_10.004860
mdo{'n_neighbors': 7}0.3870750.2069100.078940Not defined_10.000210
{}0.1456100.0616340.074212Not defined_0globalcs_10.044054
soup{'n_neighbors': 7}0.5854210.1548680.206894mdo_10.021484
{}0.3545660.2069000.153585Not defined_10.000134
mdo_1Not defined_10.026642
\n", "" ], "text/plain": [ - " metric_value \\\n", - " mean \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score kneighborsclassifier glass Not defined {'n_neighbors': 7} 0.204697 \n", - " {} 0.171839 \n", - " globalcs {'n_neighbors': 7} 0.543043 \n", - " {} 0.672387 \n", - " mdo {'n_neighbors': 7} 0.387075 \n", - " {} 0.145610 \n", - " soup {'n_neighbors': 7} 0.585421 \n", - " {} 0.354566 \n", - "\n", - " \\\n", - " std \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score kneighborsclassifier glass Not defined {'n_neighbors': 7} 0.191663 \n", - " {} 0.126395 \n", - " globalcs {'n_neighbors': 7} 0.177286 \n", - " {} 0.088290 \n", - " mdo {'n_neighbors': 7} 0.206910 \n", - " {} 0.061634 \n", - " soup {'n_neighbors': 7} 0.154868 \n", - " {} 0.206900 \n", - "\n", - " \n", - " min \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score kneighborsclassifier glass Not defined {'n_neighbors': 7} 0.068485 \n", - " {} 0.075119 \n", - " globalcs {'n_neighbors': 7} 0.202031 \n", - " {} 0.536591 \n", - " mdo {'n_neighbors': 7} 0.078940 \n", - " {} 0.074212 \n", - " soup {'n_neighbors': 7} 0.206894 \n", - " {} 0.153585 " + " p-value\n", + "globalcs_0 Not defined_0 0.009436\n", + " globalcs_1 0.001209\n", + " mdo_1 0.000134\n", + " Not defined_1 0.000004\n", + "mdo_0 globalcs_1 0.004221\n", + " mdo_1 0.004860\n", + " Not defined_1 0.000210\n", + "Not defined_0 globalcs_1 0.044054\n", + " mdo_1 0.021484\n", + " Not defined_1 0.000134\n", + "mdo_1 Not defined_1 0.026642" ] }, - "execution_count": 33, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results[2]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Fourth result" + "alpha = 0.05\n", + "analysis_result[analysis_result[\"p-value\"] < alpha]" ] }, { "cell_type": "code", - 
"execution_count": 34, + "execution_count": 87, "metadata": {}, "outputs": [ { @@ -800,137 +696,80 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " metric_value\n", + " metric_value\n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", " mean\n", " std\n", - " min\n", " \n", " \n", - " metric_name\n", - " classifier\n", - " dataset_name\n", " resampling_method\n", " clf_params\n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " geometric_mean_score\n", - " kneighborsclassifier\n", - " new_ecoli\n", " Not defined\n", " {'n_neighbors': 7}\n", - " 0.848028\n", - " 0.039082\n", - " 0.805196\n", + " 0.494294\n", + " 0.017129\n", " \n", " \n", " {}\n", - " 0.829293\n", - " 0.025965\n", - " 0.777511\n", + " 0.465963\n", + " 0.021218\n", " \n", " \n", " globalcs\n", " {'n_neighbors': 7}\n", - " 0.779214\n", - " 0.032680\n", - " 0.736093\n", + " 0.508187\n", + " 0.016241\n", " \n", " \n", " {}\n", - " 0.789073\n", - " 0.063314\n", - " 0.724630\n", + " 0.476802\n", + " 0.023789\n", " \n", " \n", " mdo\n", " {'n_neighbors': 7}\n", - " 0.784751\n", - " 0.040992\n", - " 0.722135\n", - " \n", - " \n", - " {}\n", - " 0.824005\n", - " 0.041023\n", - " 0.755261\n", - " \n", - " \n", - " soup\n", - " {'n_neighbors': 7}\n", - " 0.839763\n", - " 0.034917\n", - " 0.789633\n", + " 0.500966\n", + " 0.021062\n", " \n", " \n", " {}\n", - " 0.814969\n", - " 0.059139\n", - " 0.710178\n", + " 0.481425\n", + " 0.017707\n", " \n", " \n", "\n", "" ], "text/plain": [ - " metric_value \\\n", - " mean \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score kneighborsclassifier new_ecoli Not defined {'n_neighbors': 7} 0.848028 \n", - " {} 0.829293 \n", - " globalcs {'n_neighbors': 7} 0.779214 \n", - " {} 0.789073 \n", - " mdo {'n_neighbors': 7} 0.784751 \n", - " {} 0.824005 \n", - " soup {'n_neighbors': 7} 0.839763 \n", - " {} 0.814969 \n", - "\n", - " \\\n", - " std \n", - "metric_name classifier dataset_name resampling_method 
clf_params \n", - "geometric_mean_score kneighborsclassifier new_ecoli Not defined {'n_neighbors': 7} 0.039082 \n", - " {} 0.025965 \n", - " globalcs {'n_neighbors': 7} 0.032680 \n", - " {} 0.063314 \n", - " mdo {'n_neighbors': 7} 0.040992 \n", - " {} 0.041023 \n", - " soup {'n_neighbors': 7} 0.034917 \n", - " {} 0.059139 \n", - "\n", - " \n", - " min \n", - "metric_name classifier dataset_name resampling_method clf_params \n", - "geometric_mean_score kneighborsclassifier new_ecoli Not defined {'n_neighbors': 7} 0.805196 \n", - " {} 0.777511 \n", - " globalcs {'n_neighbors': 7} 0.736093 \n", - " {} 0.724630 \n", - " mdo {'n_neighbors': 7} 0.722135 \n", - " {} 0.755261 \n", - " soup {'n_neighbors': 7} 0.789633 \n", - " {} 0.710178 " + " metric_value \n", + " mean std\n", + "resampling_method clf_params \n", + "Not defined {'n_neighbors': 7} 0.494294 0.017129\n", + " {} 0.465963 0.021218\n", + "globalcs {'n_neighbors': 7} 0.508187 0.016241\n", + " {} 0.476802 0.023789\n", + "mdo {'n_neighbors': 7} 0.500966 0.021062\n", + " {} 0.481425 0.017707" ] }, - "execution_count": 34, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results[3]" + "concat_summary_df = pipeline.generate_summary(query_dict, save_to_csv=False, csv_path=result_file.name, concat_results=True)\n", + "\n", + "concat_summary_df.loc[metric_name, clf_name, dataset_name]" ] }, { @@ -938,31 +777,806 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Generate posthoc analysis for Wilcoxon test. You have to define names of classifiers, dataset names and metric names in query dict." + "## 2. CLI" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also use the CLI to run the pipeline for analysis. To do this, you need to prepare JSON files that will contain configurations for the given functions." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ - "query_dict = {\"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"], \"metric_name\": [\"geometric_mean_score\"], \"dataset_name\": [\"glass\", \"new_ecoli\"]}\n", + "import json\n", + "from tempfile import NamedTemporaryFile, TemporaryDirectory\n", + "import pandas as pd\n", + "import os\n", + "from pathlib import Path\n", "\n", - "results = AnalysisPipeline.generate_posthoc_analysis(query_dict, save_to_csv=False, csv_path=result_file.name ,posthoc_func_list=[[posthoc_wilcoxon, {}]])" + "from multi_imbalance.datasets.helpers import read_summary_from_csv" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we specify path to file which contain the defition of AnalysisPipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "cwd = Path.cwd()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "path_to_analysis_file = str(cwd.parents[1] / \"multi_imbalance\" / \"datasets\" / \"analysis.py\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we print help with descriptions of options" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 91, "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_nameclassifierdataset_nameresampling_methodmetric_valueno_repeatclf_params
0geometric_mean_scoredecisiontreeclassifiercmcglobalcs0.4394040{'max_depth': 100}
1accuracy_scoredecisiontreeclassifiercmcglobalcs0.4677970{'max_depth': 100}
2geometric_mean_scoredecisiontreeclassifiercmcmdo0.4682480{'max_depth': 100}
3accuracy_scoredecisiontreeclassifiercmcmdo0.5152540{'max_depth': 100}
4geometric_mean_scoredecisiontreeclassifiercmcNot defined0.4228630{'max_depth': 100}
5accuracy_scoredecisiontreeclassifiercmcNot defined0.4610170{'max_depth': 100}
\n", + "
" + ], + "text/plain": [ + " metric_name classifier dataset_name \\\n", + "0 geometric_mean_score decisiontreeclassifier cmc \n", + "1 accuracy_score decisiontreeclassifier cmc \n", + "2 geometric_mean_score decisiontreeclassifier cmc \n", + "3 accuracy_score decisiontreeclassifier cmc \n", + "4 geometric_mean_score decisiontreeclassifier cmc \n", + "5 accuracy_score decisiontreeclassifier cmc \n", + "\n", + " resampling_method metric_value no_repeat clf_params \n", + "0 globalcs 0.439404 0 {'max_depth': 100} \n", + "1 globalcs 0.467797 0 {'max_depth': 100} \n", + "2 mdo 0.468248 0 {'max_depth': 100} \n", + "3 mdo 0.515254 0 {'max_depth': 100} \n", + "4 Not defined 0.422863 0 {'max_depth': 100} \n", + "5 Not defined 0.461017 0 {'max_depth': 100} " + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.read_csv(result_file.name).head(6)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As before if we already have the results prepared we can generate a summary for them. Again, create a JSON file that will contain a query dictionary of specific combinations of classifiers, datasets etc." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "query_dict = {\n", + " \"metric_name\": [\"geometric_mean_score\"],\n", + " \"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"],\n", + " \"dataset_name\": [\"cmc\", \"new_ecoli\", \"cleveland\"],\n", + "}\n", + "\n", + "query_json = NamedTemporaryFile(suffix=\".json\")\n", + "query_json.close()\n", + "with open(query_json.name, \"w\") as f:\n", + " json.dump(query_dict, f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we will create an optional JSON file, which will contain a list of paths to aggregate functions" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "aggr_func_list = [\"numpy.min\"]\n", + "\n", + "aggr_func_json = NamedTemporaryFile(suffix=\".json\")\n", + "aggr_func_json.close()\n", + "with open(aggr_func_json.name, \"w\") as f:\n", + " json.dump(aggr_func_list, f)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will also specify the destination path where the files generated from the summary will be located" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "temp_dir = TemporaryDirectory()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can run summary" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start\n", + "Run generate summary\n", + "Done\n" + ] + } + ], + "source": [ + "!python $path_to_analysis_file $result_file.name --summary --query-json $query_json.name --save-path $temp_dir.name --aggregate-json $aggr_func_json.name" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We 
can check to see if as many files have been generated as expected (i.e. 6)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(['geometric_mean_score_decisiontreeclassifier_cleveland.csv',\n", + " 'geometric_mean_score_decisiontreeclassifier_cmc.csv',\n", + " 'geometric_mean_score_decisiontreeclassifier_new_ecoli.csv',\n", + " 'geometric_mean_score_kneighborsclassifier_cleveland.csv',\n", + " 'geometric_mean_score_kneighborsclassifier_cmc.csv',\n", + " 'geometric_mean_score_kneighborsclassifier_new_ecoli.csv'],\n", + " 6)" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_dir_files = os.listdir(temp_dir.name)\n", + "\n", + "csv_dir_files, len(csv_dir_files)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will open the same file that was shown previously as the first result" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metric_value
meanstdamin
metric_nameclf_namedataset_nameresampling_methodclf_params
geometric_mean_scoredecisiontreeclassifiercmcNot defined{'max_depth': 100}0.4447810.0326000.375016
{}0.4460820.0323150.358658
globalcs{'max_depth': 100}0.4575940.0305090.407711
{}0.4619980.0353990.379794
mdo{'max_depth': 100}0.4438740.0315240.380103
{}0.4465910.0283590.377730
\n", + "
" + ], + "text/plain": [ + " metric_value \\\n", + " mean \n", + "metric_name clf_name dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier cmc Not defined {'max_depth': 100} 0.444781 \n", + " {} 0.446082 \n", + " globalcs {'max_depth': 100} 0.457594 \n", + " {} 0.461998 \n", + " mdo {'max_depth': 100} 0.443874 \n", + " {} 0.446591 \n", + "\n", + " \\\n", + " std \n", + "metric_name clf_name dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier cmc Not defined {'max_depth': 100} 0.032600 \n", + " {} 0.032315 \n", + " globalcs {'max_depth': 100} 0.030509 \n", + " {} 0.035399 \n", + " mdo {'max_depth': 100} 0.031524 \n", + " {} 0.028359 \n", + "\n", + " \n", + " amin \n", + "metric_name clf_name dataset_name resampling_method clf_params \n", + "geometric_mean_score decisiontreeclassifier cmc Not defined {'max_depth': 100} 0.375016 \n", + " {} 0.358658 \n", + " globalcs {'max_depth': 100} 0.407711 \n", + " {} 0.379794 \n", + " mdo {'max_depth': 100} 0.380103 \n", + " {} 0.377730 " + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = read_summary_from_csv(os.path.join(temp_dir.name, \"geometric_mean_score_decisiontreeclassifier_cmc.csv\"))\n", + "df" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The resulting dataframe is the same in terms of structure, only some values, for example, for the mean are minimally different." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The last thing to do is to perform statistical tests. As before, you will need a JSON file containing the query and a second JSON file containing a dictionary with the paths to the functions and their possible parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "query_dict = {\n", + " \"metric_name\": [\"geometric_mean_score\"],\n", + " \"classifier\": [\"decisiontreeclassifier\", \"kneighborsclassifier\"],\n", + " \"dataset_name\": [\"cmc\", \"new_ecoli\"],\n", + "}\n", + "\n", + "\n", + "query_json = NamedTemporaryFile(suffix=\".json\")\n", + "query_json.close()\n", + "with open(query_json.name, \"w\") as f:\n", + " json.dump(query_dict, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "posthoc_func_dict = {\"scikit_posthocs.posthoc_wilcoxon\": {}, \"scikit_posthocs.posthoc_mannwhitney\": {}}\n", + "\n", + "posthoc_func_json = NamedTemporaryFile(suffix=\".json\")\n", + "posthoc_func_json.close()\n", + "with open(posthoc_func_json.name, \"w\") as f:\n", + " json.dump(posthoc_func_dict, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "temp_dir = TemporaryDirectory()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start\n", + "Run generate posthoc analysis\n", + "Done\n" + ] + } + ], + "source": [ + "!python $path_to_analysis_file $result_file.name --posthoc-analysis --posthoc-query-json $query_json.name --save-path $temp_dir.name --posthoc-func-json $posthoc_func_json.name" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, let's check if as many files as expected have been obtained (this time 8)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(['posthoc_mannwhitney_geometric_mean_score_decisiontreeclassifier_cmc.csv',\n", + " 'posthoc_mannwhitney_geometric_mean_score_decisiontreeclassifier_new_ecoli.csv',\n", + " 
'posthoc_mannwhitney_geometric_mean_score_kneighborsclassifier_cmc.csv',\n", + " 'posthoc_mannwhitney_geometric_mean_score_kneighborsclassifier_new_ecoli.csv',\n", + " 'posthoc_wilcoxon_geometric_mean_score_decisiontreeclassifier_cmc.csv',\n", + " 'posthoc_wilcoxon_geometric_mean_score_decisiontreeclassifier_new_ecoli.csv',\n", + " 'posthoc_wilcoxon_geometric_mean_score_kneighborsclassifier_cmc.csv',\n", + " 'posthoc_wilcoxon_geometric_mean_score_kneighborsclassifier_new_ecoli.csv'],\n", + " 8)" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_dir_files = os.listdir(temp_dir.name)\n", + "\n", + "csv_dir_files, len(csv_dir_files)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now open the same file as before" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "func_name = posthoc_wilcoxon.__name__\n", + "metric_name = \"geometric_mean_score\"\n", + "clf_name = \"kneighborsclassifier\"\n", + "dataset_name = \"cmc\"\n", + "\n", + "analysis_name_file = \"_\".join([func_name, metric_name, clf_name, dataset_name]) + \".csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
p-value
globalcs_0globalcs_10.000851
mdo_10.000048
Not defined_10.000168
mdo_0globalcs_10.000586
mdo_10.004860
Not defined_10.000851
Not defined_0globalcs_10.000708
mdo_10.004221
Not defined_10.000105
\n", + "
" + ], + "text/plain": [ + " p-value\n", + "globalcs_0 globalcs_1 0.000851\n", + " mdo_1 0.000048\n", + " Not defined_1 0.000168\n", + "mdo_0 globalcs_1 0.000586\n", + " mdo_1 0.004860\n", + " Not defined_1 0.000851\n", + "Not defined_0 globalcs_1 0.000708\n", + " mdo_1 0.004221\n", + " Not defined_1 0.000105" ] }, - "execution_count": 37, + "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "results[0]" + "analysis_result[analysis_result[\"p-value\"] < alpha]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In both examples shown (using Python and CLI) for cmc dataset and KNN classifier, the resampling methods used (or not using any) differ significantly in some cases, e.g. GlobalCS with MDO." ] } ], @@ -1069,7 +1818,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2 (tags/v3.9.2:1a79785, Feb 19 2021, 13:44:55) [MSC v.1928 64 bit (AMD64)]" + "version": "3.9.2" }, "orig_nbformat": 4, "vscode": { diff --git a/multi_imbalance/datasets/analysis.py b/multi_imbalance/datasets/analysis.py index c9cb722..0ad4f53 100644 --- a/multi_imbalance/datasets/analysis.py +++ b/multi_imbalance/datasets/analysis.py @@ -82,7 +82,8 @@ def generate_summary( save_to_csv: bool = False, save_path: Optional[str] = None, aggregate_func: Optional[List[Callable]] = None, - ) -> List[pd.DataFrame]: + concat_results: bool = False, + ) -> Union[List[pd.DataFrame], pd.DataFrame]: """ Generate summary of analysis results based on specified query parameters. 
@@ -105,6 +106,8 @@ def generate_summary( :param aggregate_func: Optional[List[Callable]], optional, a list of functions that will be applied to the `metric_value` column of the results to generate the summary + :param concat_results: + bool, optional, if `True`, the results of summary will be concatenated into one DataFrame :return: List[pd.DataFrame], a list of Pandas DataFrames containing the summary of the results of the analysis """ @@ -125,6 +128,9 @@ def generate_summary( group_df.reset_index().to_csv(Path(save_path) / ("_".join(i) + ".csv"), index=False) df_list.append(group_df) + if concat_results: + df_list = pd.concat(df_list).sort_index() + return df_list @staticmethod @@ -134,7 +140,7 @@ def generate_posthoc_analysis( posthoc_func_list: List[Tuple[Callable, Dict]], save_to_csv: bool = False, save_path: Optional[str] = None, - ) -> List[pd.DataFrame]: + ) -> Tuple[Dict[str, pd.DataFrame], Dict[str, Dict]]: """ Generates a posthoc analysis of the results of the analysis based on the specified query parameters and posthoc functions. `csv_path` should be the path to the CSV file containing the results of the analysis. @@ -152,7 +158,8 @@ def generate_posthoc_analysis( str, optional, the location where the summary csv files should be saved. 
If None summary csv files will be saved in the same location as csv_path :return: - List[pd.DataFrame], a list of Pandas DataFrames containing the posthoc analysis of the results + Dict[str, pd.DataFrame], a dictionary of Pandas DataFrames containing the posthoc analysis of the results + , Dict[str, Dict], a dictionary of params combinations (dicts) for defined classifiers """ chunksize = 1000 gen = pd.read_csv(csv_path, chunksize=chunksize) @@ -160,22 +167,27 @@ def generate_posthoc_analysis( selected_columns.remove("no_repeat") selected_columns.remove("metric_value") - df_list = [] + df_dict = {} + param_comb_dict = {} for i in product(*query_dict.values()): df = AnalysisPipeline._search_df_by_query(query_dict, combination=i, csv_path=csv_path, chunksize=chunksize) - + param_comb = {param: number for number, param in enumerate(df["clf_params"].unique())} + df["resampling_method"] += "_" + df["clf_params"].apply(lambda x: param_comb[x]).astype(str) for posthoc_func, params in posthoc_func_list: - posthoc_df = posthoc_func(df, "metric_value", "resampling_method", **params) - df_name = posthoc_func.__name__ + "_" + "_".join(i) - posthoc_df.columns.name = df_name + posthoc_df: pd.DataFrame = posthoc_func(df, "metric_value", "resampling_method", **params) + name = posthoc_func.__name__ + "_" + "_".join(i) + keep = (np.triu(np.ones_like(posthoc_df), k=1)).astype(bool).flatten() + posthoc_df = posthoc_df.stack()[keep] + posthoc_df.name = "p-value" if save_to_csv: if save_path is None: save_path = Path(csv_path).parent - df_path = save_path / (df_name + ".csv") + df_path = Path(save_path) / (name + ".csv") posthoc_df.to_csv(Path(df_path)) - df_list.append(posthoc_df) + df_dict[name] = posthoc_df + param_comb_dict[name] = param_comb - return df_list + return df_dict, param_comb_dict @property def dataset_names(self) -> List[str]: @@ -517,9 +529,9 @@ def _get_name(self, estimator: Union[ClassifierMixin, BaseSampler]) -> Tuple[str help="Option specifying if the analysis
would be run without using resampling", ) @click.option( - "--save-to-csv", - is_flag=True, - help="Option defines if results from summary should be save to csv", + "--save-path", + help="Option defines where results from summary and posthoc analysis should be saved." + "If not specified files will be saved in the same directory as file from analysis pipeline", ) def main( output_path, @@ -532,7 +544,7 @@ def main( aggregate_json, posthoc_func_json, train_without_resampling, - save_to_csv, + save_path, ): """ This function helps to use pipeline analysis, summary and posthoc tests by CLI. @@ -556,7 +568,7 @@ def main( aggregate_func_paths = json.load(f) aggregate_func = list(map(import_from_string, aggregate_func_paths)) - AnalysisPipeline.generate_summary(query_dict, output_path, save_to_csv, aggregate_func=aggregate_func) + AnalysisPipeline.generate_summary(query_dict, output_path, save_to_csv=True, aggregate_func=aggregate_func, save_path=save_path) if posthoc_analysis: print("Run generate posthoc analysis") @@ -568,7 +580,9 @@ def main( posthoc_func_paths = json.load(f) posthoc_func = [[import_from_string(func_path), params] for func_path, params in posthoc_func_paths.items()] - AnalysisPipeline.generate_posthoc_analysis(query_dict, output_path, posthoc_func_list=posthoc_func, save_to_csv=save_to_csv) + AnalysisPipeline.generate_posthoc_analysis( + query_dict, output_path, posthoc_func_list=posthoc_func, save_to_csv=True, save_path=save_path + ) print("Done") diff --git a/multi_imbalance/datasets/helpers.py b/multi_imbalance/datasets/helpers.py index 090beef..a1af50f 100644 --- a/multi_imbalance/datasets/helpers.py +++ b/multi_imbalance/datasets/helpers.py @@ -1,6 +1,7 @@ from dataclasses import dataclass import importlib import json +import pandas as pd from typing import Callable, Dict, List, Tuple, Union from sklearn.base import ClassifierMixin from imblearn.base import BaseSampler @@ -12,6 +13,13 @@ def import_from_string(cls_path: str) -> Union[BaseSampler, 
ClassifierMixin, Cal return getattr(module, class_name) +def read_summary_from_csv(csv_path: str) -> pd.DataFrame: + df = pd.read_csv(csv_path, index_col=[0, 1, 2, 3, 4], header=[0, 1]) + df.index.names = ["metric_name", "clf_name", "dataset_name", "resampling_method", "clf_params"] + df.columns.names = [None, None] + return df + + @dataclass class Config: """A class representing the configuration for an analysis pipeline. diff --git a/tests/datasets/test_analysis.py b/tests/datasets/test_analysis.py index d60aafd..4c0cc80 100644 --- a/tests/datasets/test_analysis.py +++ b/tests/datasets/test_analysis.py @@ -293,15 +293,18 @@ def test_run_analysis_cli(prepare_dataset_file, output_file, run_analysis_config assert Path(output_file).exists() -def test_generate_summary(prepare_dataset_file, query_dict, output_file, run_analysis_config, tmp_path): +@pytest.mark.parametrize("concat_results", [True, False]) +def test_generate_summary(concat_results, prepare_dataset_file, query_dict, output_file, run_analysis_config, tmp_path): config = Config.from_dict(run_analysis_config) pipeline = AnalysisPipeline(config) pipeline.run_analysis(output_file, train_without_resampling=False) - list_of_df = pipeline.generate_summary(query_dict, csv_path=output_file, save_to_csv=True) - - assert len(list_of_df) == 1 - df = list_of_df[0] + list_of_df = pipeline.generate_summary(query_dict, csv_path=output_file, save_to_csv=True, concat_results=concat_results) + if concat_results: + df = list_of_df + else: + assert len(list_of_df) == 1 + df = list_of_df[0] assert df.shape[0] == 1 assert (tmp_path / ("_".join([j for i in query_dict.values() for j in i]) + ".csv")).exists() is True @@ -318,7 +321,7 @@ def test_generate_summary_cli(prepare_dataset_file, query_dict_json, output_file runner = CliRunner() result = runner.invoke( main, - [str(output_file), "--summary", "--query-json", str(query_dict_json), "--save-to-csv", "--aggregate-json", str(aggr_func_path)], + [str(output_file), "--summary", 
"--query-json", str(query_dict_json), "--aggregate-json", str(aggr_func_path)], ) assert result.exit_code == 0 assert result.output == "Start\nRun generate summary\nDone\n" @@ -334,14 +337,16 @@ def test_generate_posthoc_analysis(prepare_dataset_file, query_dict, output_file pipeline = AnalysisPipeline(config) pipeline.run_analysis(output_file, train_without_resampling=False) query_dict.pop("resampling_method") - list_of_df = pipeline.generate_posthoc_analysis( + df_dict, param_comb_dict = pipeline.generate_posthoc_analysis( query_dict, csv_path=output_file, posthoc_func_list=[[posthoc_dunn, {}]], save_to_csv=True ) - assert len(list_of_df) == 1 - df = list_of_df[0] - assert df.shape[0] == 1 - assert (tmp_path / ("_".join([posthoc_dunn.__name__, *[j for i in query_dict.values() for j in i]]) + ".csv")).exists() is True + assert len(df_dict) == 1 + name = "_".join([posthoc_dunn.__name__, *[j for i in query_dict.values() for j in i]]) + df = df_dict[name] + assert df.shape[0] == 0 + assert (tmp_path / (name + ".csv")).exists() is True + assert param_comb_dict == {name: {"{'max_depth': 30}": 0}} def test_generate_posthoc_analysis_cli(prepare_dataset_file, query_dict, output_file, run_analysis_config, tmp_path): @@ -366,7 +371,6 @@ def test_generate_posthoc_analysis_cli(prepare_dataset_file, query_dict, output_ "--posthoc-analysis", "--posthoc-query-json", str(query_dict_path), - "--save-to-csv", "--posthoc-func-json", str(posthoc_func_path), ], From 9462046b48cd97d64239782db0eb877fc74bf9ce Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Sun, 15 Jan 2023 19:55:27 +0100 Subject: [PATCH 33/48] Rename parameters in SMOM --- multi_imbalance/resampling/smom.py | 24 ++++++++++++------------ multi_imbalance/utils/array_util.py | 21 --------------------- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py index c7c5aff..b1ec450 100644 --- a/multi_imbalance/resampling/smom.py +++ 
b/multi_imbalance/resampling/smom.py @@ -108,8 +108,8 @@ class SMOM(BaseSampler): """ def __init__(self, - c: int, - zeta: int, + minority_class: int, + num_synth: int, k1: int = 12, k2: int = 8, rTh: float = 5 / 8, @@ -122,11 +122,11 @@ def __init__(self, shuffle: bool = False, metric: str = 'minkowski', p: int = 2, - seed: Optional[int] = None) -> None: + random_state: Optional[int, np.random.RandomState] = None) -> None: """ - :param c: + :param minority_class: The minority class under consideration. - :param zeta: + :param num_synth: Number of synthetic instances to be generated. :param k1: Number of nearest neighbors used to generate the synthetic instances. @@ -142,14 +142,14 @@ def __init__(self, Metric to use for distance computation. :param p: Power parameter for Minkowski metric. - :param seed: - Seed for random state. + :param random_state: + Optional seed for random state or a np.random.RandomState instance. """ super().__init__() self._sampling_type = 'over-sampling' self.maj_int_min = maj_int_min - self.c = c - self.zeta = zeta + self.c = minority_class + self.zeta = num_synth self.k1 = k1 self.k2 = k2 self.k3 = max(k1, k2) @@ -160,7 +160,7 @@ def __init__(self, self.r1 = r1 self.r2 = r2 self.shuffle = shuffle - self.random_state = sklearn.utils.check_random_state(seed) + self.random_state = sklearn.utils.check_random_state(random_state) if metric == 'minkowski': self._metric = neighbors.DistanceMetric.get_metric(metric, p=p) else: @@ -379,7 +379,7 @@ def _fit_resample(self, X, y): y_resampled = np.concatenate([y, [self.c] * SI.shape[0]], 0) if self.shuffle: - X_resampled, y_resampled = array_util.shuffle(X_resampled, + X_resampled, y_resampled = sklearn.utils.shuffle(X_resampled, y_resampled, - state=self.random_state) + state=self.random_state) return X_resampled, y_resampled diff --git a/multi_imbalance/utils/array_util.py b/multi_imbalance/utils/array_util.py index 5910ac7..e89a921 100644 --- a/multi_imbalance/utils/array_util.py +++ 
b/multi_imbalance/utils/array_util.py @@ -85,24 +85,3 @@ def intersect(arr1, arr2): if contains(arr2, x): result = union(result, np.array([x])) return result - - -def shuffle(*arrs: np.ndarray, - state: Optional[np.random.RandomState] = None) -> Tuple[ - np.ndarray, ...]: - """ - Shuffles rows of many arrays at once. - - Shuffles given arrays using a shuffled matrix of row indices. - The number of rows in the given arrays should be the same. - - :param arrs: - Numpy arrays to shuffle. - :param state: - Optional RandomState used to shuffle. - :return: - A tuple of shuffled copies of given arrays. - """ - indices = np.arange(arrs[0].shape[0]) - sklearn.utils.check_random_state(state).shuffle(indices) - return tuple(arr[indices] for arr in arrs) From eaaae7fa6cb81827221f977593da1d1c41b22de7 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Tue, 17 Jan 2023 22:25:17 +0100 Subject: [PATCH 34/48] Change parameters in SMOM --- examples/resampling/SMOM.ipynb | 20 ++--- multi_imbalance/resampling/smom.py | 136 +++++++++++++++++------------ tests/resampling/test_smom.py | 23 +++++ 3 files changed, 113 insertions(+), 66 deletions(-) create mode 100644 tests/resampling/test_smom.py diff --git a/examples/resampling/SMOM.ipynb b/examples/resampling/SMOM.ipynb index c4b8d27..b328f52 100644 --- a/examples/resampling/SMOM.ipynb +++ b/examples/resampling/SMOM.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 32, + "execution_count": 1, "outputs": [], "source": [ "%reload_ext autoreload\n", @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 2, "outputs": [ { "name": "stdout", @@ -91,20 +91,20 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 3, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'maj': [0, 1], 'int': [], 'min': [4, 2, 3]}\n", - "(386, 7) (386,)\n" + "(546, 7) (546,)\n" ] } ], "source": [ "from multi_imbalance.resampling.smom import SMOM\n", - "clf = 
SMOM(maj_int_min=maj_int_min[dataset_name], c=3, zeta=50, shuffle=True, seed=1234)\n", + "clf = SMOM(maj_int_min=maj_int_min[dataset_name], prop=0.75, shuffle=True, random_state=1234)\n", "print(maj_int_min[dataset_name])\n", "resampled_X, resampled_y = clf.fit_resample(X, y)\n", "print(resampled_X.shape, resampled_y.shape)" @@ -130,20 +130,20 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 4, "outputs": [ { "data": { - "text/plain": "" + "text/plain": "" }, - "execution_count": 35, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": "
", - "image/png": "\n" + "image/png": "\n" }, "metadata": {}, "output_type": "display_data" @@ -183,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "outputs": [], "source": [], "metadata": { diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py index b1ec450..7d2bdde 100644 --- a/multi_imbalance/resampling/smom.py +++ b/multi_imbalance/resampling/smom.py @@ -4,16 +4,15 @@ import numpy as np -from typing import List, Dict, Optional, Sequence +from typing import List, Dict, Optional, Union, Collection import sklearn.utils from sklearn import neighbors -from multi_imbalance.utils import array_util from imblearn.base import BaseSampler def _nbdos(Sc: List[int], k: int, - sNk: Dict[int, List[int]], rTh: float, + sNk: Dict[int, np.ndarray], rTh: float, nTh: int): """ NBDOS clustering algorithm implementation. @@ -85,7 +84,7 @@ def _compute_ss(Fs_i, Fs_d, i, dst): return np.array(Ss, dtype=np.int64) -def _normalized_entropy(classes_counts: Sequence[int]): +def _normalized_entropy(classes_counts: Collection[int]): if len(classes_counts) <= 1: E = 0 else: @@ -93,13 +92,23 @@ def _normalized_entropy(classes_counts: Sequence[int]): E_min = np.log(1. / total) E = sum(count / total * np.log(count / total) for count in classes_counts) / E_min - assert 0 <= E <= 1, f'{E=} not in range [0, 1]' + assert 0 <= E <= 1, f"{E=} not in range [0, 1]" return E +def _compute_number_of_synthetic_instances(Sc, n_to_generate): + N_syn = dict() + div, mod = divmod(n_to_generate, Sc.shape[0]) + for i in Sc: + N_syn[i] = div + (mod > 0) + mod -= 1 + return N_syn + + class SMOM(BaseSampler): """ - SMOM technique implementation for synthetic minority oversampling for multiclass imbalanced problems. + SMOM technique implementation for synthetic minority oversampling for + multiclass imbalanced problems. Reference: Zhu, Tuanfei, Yaping Lin, and Yonghe Liu. 
"Synthetic minority oversampling @@ -108,8 +117,8 @@ class SMOM(BaseSampler): """ def __init__(self, - minority_class: int, - num_synth: int, + minority_class: Optional[int] = None, + prop: float = 1.0, k1: int = 12, k2: int = 8, rTh: float = 5 / 8, @@ -118,24 +127,29 @@ def __init__(self, w2: float = 0.5, r1: float = 1 / 3, r2: float = 0.2, - maj_int_min: Optional[Dict[str, Sequence[int]]] = None, + maj_int_min: Optional[Dict[str, List[int]]] = None, shuffle: bool = False, - metric: str = 'minkowski', + metric: str = "minkowski", p: int = 2, - random_state: Optional[int, np.random.RandomState] = None) -> None: + random_state: Optional[Union[ + int, np.random.RandomState]] = None) -> None: """ :param minority_class: - The minority class under consideration. - :param num_synth: - Number of synthetic instances to be generated. + The minority class under consideration. If none, every minority + class will be resampled. + :param prop: + A float describing the number of instances after oversampling. The + number of instances after oversampling will be prop * size of + largest majority class. :param k1: - Number of nearest neighbors used to generate the synthetic instances. + Number of nearest neighbors used to generate synthetic instances. :params k2, rTh, nTh: The parameters used in clustering algorithm NBDOS. :params w1, w2, r1, r2: The parameters used for calculating the selection weights. :param maj_int_min: - Dict that contains lists of majority, intermediate and minority classes labels. + Dict that contains lists of majority, intermediate and minority + classes labels. :param shuffle: Shuffle resampled data. :param metric: @@ -146,10 +160,10 @@ def __init__(self, Optional seed for random state or a np.random.RandomState instance. 
""" super().__init__() - self._sampling_type = 'over-sampling' + self._sampling_type = "over-sampling" self.maj_int_min = maj_int_min self.c = minority_class - self.zeta = num_synth + self.prop = prop self.k1 = k1 self.k2 = k2 self.k3 = max(k1, k2) @@ -161,7 +175,7 @@ def __init__(self, self.r2 = r2 self.shuffle = shuffle self.random_state = sklearn.utils.check_random_state(random_state) - if metric == 'minkowski': + if metric == "minkowski": self._metric = neighbors.DistanceMetric.get_metric(metric, p=p) else: self._metric = neighbors.metrics.DistanceMetric.get_metric(metric) @@ -225,12 +239,10 @@ def _compute_min_maj(self, y): y_min_classes = {k: v for k, v in cnt.items() if v < M / L} else: cnt = Counter(y) - if 'int' not in self.maj_int_min: - self.maj_int_min['int'] = [] y_maj_classes = {cls: cnt[cls] for cls in - self.maj_int_min['maj'] + self.maj_int_min['int'] + self.maj_int_min["maj"] if cls != self.c} - y_min_classes = {cls: cnt[cls] for cls in self.maj_int_min['min'] + y_min_classes = {cls: cnt[cls] for cls in self.maj_int_min["min"] if cls != self.c} return y_maj_classes, y_min_classes @@ -295,14 +307,6 @@ def _obtain_probability_distribution(self, X, y, Sw, TiC): P[i][j] = Sw[i][j] / sum(Sw[i][k] for k in self.N_c_k1_i[i]) return P - def _compute_number_of_synthetic_instances(self, Sc): - N_syn = dict() - div, mod = divmod(self.zeta, Sc.shape[0]) - for i in Sc: - N_syn[i] = div + (mod > 0) - mod -= 1 - return N_syn - def _generate_synthetic_instances(self, X, Sc, N_syn, TiC, P): SI = [] for i in Sc: @@ -336,7 +340,7 @@ def _setup(self): self.Fs_i = dict() self.Fs_d = dict() - def _fit_resample(self, X, y): + def _fit_resample(self, X: np.ndarray, y: np.ndarray): """ Performs resampling @@ -348,38 +352,58 @@ def _fit_resample(self, X, y): Resampled X along with accordingly modified labels, resampled y """ - # 1 - Sc = np.array([i for i, _ in enumerate(X) if y[i] == self.c]) - Sct = np.array([i for i, _ in enumerate(X) if y[i] != self.c]) - 
y_maj_classes, y_min_classes = self._compute_min_maj(y) - self._setup() + if self.c is None: + target_classes = y_min_classes.keys() + else: + target_classes = [self.c] - self.kdt_c = neighbors.KDTree(X[Sc], metric=self._metric) - self.kdt_ct = neighbors.KDTree(X[Sct], metric=self._metric) + SI_synth = [] + y_synth = [] + target_number_to_generate = max( + int(max(y_maj_classes.values()) * self.prop), 1) + for target_class in target_classes: + self._setup() - for i in Sc: - self._find_nearest_k3_in_sc(X, Sc, i) - self._find_nearest_k3_in_sct(X, Sct, i) - self._find_k2_nearest_in_neighbor(X, i) + Sc = np.array([i for i, _ in enumerate(X) if y[i] == target_class]) + Sct = np.array( + [i for i, _ in enumerate(X) if y[i] != target_class]) + + to_generate = target_number_to_generate - Sc.shape[0] + if to_generate <= 0: + continue + + self.kdt_c = neighbors.KDTree(X[Sc], metric=self._metric) + self.kdt_ct = neighbors.KDTree(X[Sct], metric=self._metric) + + for i in Sc: + self._find_nearest_k3_in_sc(X, Sc, i) + self._find_nearest_k3_in_sct(X, Sct, i) + self._find_k2_nearest_in_neighbor(X, i) + + Sc_cl = self._run_nbdos(Sc) - Sc_cl = self._run_nbdos(Sc) + OiC = Sc[Sc_cl != 0] + TiC = Sc[Sc_cl == 0] - OiC = Sc[Sc_cl != 0] - TiC = Sc[Sc_cl == 0] + Sw = self._compute_selection_weights(X, Sc, TiC, OiC, + y_min_classes, + y_maj_classes) + P = self._obtain_probability_distribution(X, y, Sw, TiC) + N_syn = _compute_number_of_synthetic_instances(Sc, + to_generate) + SI = self._generate_synthetic_instances(X, Sc, N_syn, TiC, P) - Sw = self._compute_selection_weights(X, Sc, TiC, OiC, y_min_classes, - y_maj_classes) - P = self._obtain_probability_distribution(X, y, Sw, TiC) - N_syn = self._compute_number_of_synthetic_instances(Sc) - SI = self._generate_synthetic_instances(X, Sc, N_syn, TiC, P) + SI_synth.append(SI) + y_synth.append(np.full(SI.shape[0], target_class)) - X_resampled = np.concatenate([X, SI], 0) - y_resampled = np.concatenate([y, [self.c] * SI.shape[0]], 0) + 
X_resampled = np.concatenate([X, *SI_synth], 0) + y_resampled = np.concatenate([y, *y_synth], 0) if self.shuffle: - X_resampled, y_resampled = sklearn.utils.shuffle(X_resampled, - y_resampled, - state=self.random_state) + (X_resampled, y_resampled + ) = sklearn.utils.shuffle(X_resampled, + y_resampled, + random_state=self.random_state) return X_resampled, y_resampled diff --git a/tests/resampling/test_smom.py b/tests/resampling/test_smom.py new file mode 100644 index 0000000..ad9c89c --- /dev/null +++ b/tests/resampling/test_smom.py @@ -0,0 +1,23 @@ +from collections import Counter + +import numpy as np + +from multi_imbalance.resampling.smom import SMOM + + +def test_static_smote(): + X = np.vstack( + [ + np.random.normal(0, 1, (100, 2)), + np.random.normal(3, 5, (30, 2)), + np.random.normal(-2, 2, (20, 2)), + ] + ) + + y = np.array([1] * 100 + [2] * 30 + [3] * 20) + smom = SMOM() + X_resampled, y_resampled = smom.fit_resample(X, y) + cnt = Counter(y_resampled) + assert cnt[1] == 100 + assert cnt[2] == 100 + assert cnt[3] == 100 From d896b89cdb42d7babc690925a43773df93b04d31 Mon Sep 17 00:00:00 2001 From: Maciej Falbogowski Date: Tue, 17 Jan 2023 22:36:58 +0100 Subject: [PATCH 35/48] Remove unused imports and variables --- multi_imbalance/resampling/smom.py | 2 +- multi_imbalance/utils/array_util.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/multi_imbalance/resampling/smom.py b/multi_imbalance/resampling/smom.py index 7d2bdde..0a4b22d 100644 --- a/multi_imbalance/resampling/smom.py +++ b/multi_imbalance/resampling/smom.py @@ -226,7 +226,7 @@ def _find_k2_nearest_in_neighbor(self, X, i): dist_, ind_ = neighbors.KDTree(X[nc_nct_union_i], metric=self._metric).query([X[i]], self.k2) - ind, dist = ind_[0], dist_[0] + ind = ind_[0] self.N_k2_i[i] = nc_nct_union_i[ind] self.N_k2_d[i] = nc_nct_union_d[ind] diff --git a/multi_imbalance/utils/array_util.py b/multi_imbalance/utils/array_util.py index 38ce35f..a7b26d1 100644 --- 
a/multi_imbalance/utils/array_util.py +++ b/multi_imbalance/utils/array_util.py @@ -1,6 +1,4 @@ import numpy as np -from typing import Optional, Tuple -import sklearn def setdiff(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: From bc149cfbb56556b26ef056870220062e885c4094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Fri, 3 Mar 2023 18:33:07 +0100 Subject: [PATCH 36/48] add description --- examples/datasets/analysis.ipynb | 2 +- pyproject.toml | 51 ++++++++++++-------------------- tox.ini | 1 + 3 files changed, 21 insertions(+), 33 deletions(-) diff --git a/examples/datasets/analysis.ipynb b/examples/datasets/analysis.ipynb index 0432909..df57d46 100644 --- a/examples/datasets/analysis.ipynb +++ b/examples/datasets/analysis.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this notebook, we will be exploring the functionality that allows to run various classification and resampling methods of different datasets using both Python code and the command line interface (CLI). We will be able to compare the results and efficiency of these methods in order to determine the best approach for our specific use case." + "In this notebook, we will be exploring the functionality that allows running various classification and resampling methods on different datasets using both Python code and the command line interface (CLI). We will be able to compare the results and efficiency of these methods in order to determine the best approach for our specific use case. First, the selected resampling methods, classifiers and metrics are combined into one pipeline. Then, for the selected resampling methods, classifiers and metrics, the available statistical analysis is presented."
] }, { diff --git a/pyproject.toml b/pyproject.toml index 5e4427b..b0e5458 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,5 @@ [build-system] -requires = [ - "setuptools>=42", - "wheel" -] +requires = ["setuptools>=42", "wheel"] build-backend = "setuptools.build_meta" [project] @@ -10,16 +7,16 @@ name = "multi-imbalance" description = "Python package for tackling multiclass imbalance problems." version = "0.1.0" maintainers = [ - {name = "Damian Horna, Kamil Pluciński, Hanna Klimczak, Jacek Grycza, Jan Kozłowski, Maciej Falbogowski, Adam Wojciechowski, Mateusz Woźny", email = "horna.damian@gmail.com"} + { name = "Damian Horna, Kamil Pluciński, Hanna Klimczak, Jacek Grycza, Jan Kozłowski, Maciej Falbogowski, Adam Wojciechowski, Mateusz Woźny", email = "horna.damian@gmail.com" }, ] readme = "README.md" -classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Programming Language :: Python :: 3.9", +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Programming Language :: Python :: 3.9", ] dependencies = [ @@ -30,43 +27,33 @@ dependencies = [ "seaborn~=0.12.1", "matplotlib~=3.6.2", "click~=8.1.3", - "scikit-posthocs~=0.7.0" + "scikit-posthocs~=0.7.0", ] [project.optional-dependencies] -test = [ - "pytest~=7.2.0", - "pytest-cov~=4.0.0", - "coverage~=6.5.0" -] -lint = [ - "flake8~=5.0.4" -] -dev = [ - "tox~=3.27.0" -] +test = ["pytest~=7.2.0", "pytest-cov~=4.0.0", "coverage~=6.5.0"] +lint = ["flake8~=5.0.4"] +dev = ["tox~=3.27.0"] notebooks = [ "ipython~=8.6.0", "ipykernel~=6.17.0", "tqdm~=4.64.1", - "jupyter~=1.0.0" + "jupyter~=1.0.0", ] all = [ "multi-imbalance[test]", "multi-imbalance[lint]", "multi-imbalance[dev]", - 
"multi-imbalance[notebooks]" + "multi-imbalance[notebooks]", ] [project.urls] -homepage="https://github.com/damian-horna/multi-imbalance" -documentation="https://github.com/damian-horna/multi-imbalance/blob/master/README.md" +homepage = "https://github.com/damian-horna/multi-imbalance" +documentation = "https://github.com/damian-horna/multi-imbalance/blob/master/README.md" [tool.pytest.ini_options] -addopts = "--cov=multi_imbalance" -testpaths = [ - "tests", -] +addopts = "--cov=." +testpaths = ["tests"] [tool.setuptools] py-modules = ["multi_imbalance"] diff --git a/tox.ini b/tox.ini index 51f4878..610254d 100644 --- a/tox.ini +++ b/tox.ini @@ -15,6 +15,7 @@ deps = -r{toxinidir}/requirements.txt commands = pytest --cov-report term-missing --basetemp={envtmpdir} + coverage combine [testenv:flake8] basepython = python3.9 From ee9ca2bc627a5b1f10179da1ccdcd4a173b7e9fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Wed, 8 Mar 2023 19:18:35 +0100 Subject: [PATCH 37/48] init --- multi_imbalance/resampling/gmm_sampler.py | 437 ++++++++++++++++++++++ tests/resampling/test_gmm_sampler.py | 91 +++++ 2 files changed, 528 insertions(+) create mode 100644 multi_imbalance/resampling/gmm_sampler.py create mode 100644 tests/resampling/test_gmm_sampler.py diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py new file mode 100644 index 0000000..7931554 --- /dev/null +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -0,0 +1,437 @@ +from collections import OrderedDict, Counter +from copy import deepcopy +from typing import Optional, List, Dict, Tuple, Any, TypeVar + +import numpy as np +from imblearn.over_sampling.base import BaseSampler +from imblearn.utils import Substitution +from imblearn.utils._docstring import _random_state_docstring +from pydantic import validate_arguments +from scipy.spatial.distance import cdist +from sklearn.mixture import GaussianMixture +from sklearn.model_selection import 
train_test_split +from sklearn.neighbors import NearestNeighbors + +GMMS = TypeVar("GMMS", bound="GMMSampler") + + +@Substitution( + random_state=_random_state_docstring, +) +class GMMSampler(BaseSampler): + """ + GGMSampling algorithm that uses creating new examples by sampling from a multivariate normal distribution + (whose parameters are estimated from the input data) and removing troublesome examples from the majority class. + Parameters + ---------- + likelihood_threshold : float, default=0.0 + Minimum likelihood change threshold. A value below this threshold will be equivalent to no change. + k_neighbors : int, default=7 + The number of analyzed nearest neighbors during the analysis. + Used during both undersampling and oversampling. + undersample : bool, default=True + A binary value indicating whether to perform an undersampling operation on majority classes. + min_components : int, default=1 + Minimum number of components of GaussianMixture. + max_components : Optional[int], default=None + Maximum number of components of GaussianMixture. Without upper bound if not specified. + minority_classes : Optional[List[int]], default=None + List containing minority classes given by hand - no auto detection of minority classes will be done. + valid_size : float, default=0.25 + Size of validation set to perform test for components choosing. + filter_new : float, default=-1 + Parameter controlling the behavior after the oversampling operation. + Checks if and how to filter newly created examples: + -1 -> do not filter out + 0 -> filter out by max/mean value of created examples + >0 -> specify your own value e.g. 2.0 + add_after_filtration : bool, default=True + Value specifying whether to regenerate the examples after filtering. + iterations_after_filtration : int, default=50 + This value will potentially avoid an endless loop of deleting and re-generating examples. + The upper limit for the number of repetitions. 
+ covariance_type : "full", "tied", "diag", "spherical", default="full" + String describing the type of covariance parameters to use in GaussianMixture. Must be one of: + - "full" + each component has its own general covariance matrix + - "tied" + all components share the same general covariance matrix + - "diag" + each component has its own diagonal covariance matrix + - "spherical" + each component has its own single variance + strategy : str "average" or "median", default="average" + The strategy of selecting the number of examples considers the target number in each class. + {random_state} + n_init : int, default=10 + The number of initializations to perform in GaussianMixture. The best results are kept. + tol : float, default=1e-3 + The convergence threshold in GaussianMixture. EM iterations will stop when the lower bound + average gain is below this threshold. + max_iter : int, default=100 + The number of EM iterations to perform in GaussianMixture. + Attributes + ---------- + likelihoods : dict + Likelihood of each minority class obtained after fitting the final Gaussian model. + gaussian_mixtures : dict + Dictionary containing all Gaussian models for each minority class separately. + class_sizes : Optional[Counter] + A dictionary containing the counts of each class. + neighborhood : Optional[dict] + Dictionary with information on the nearest points for each example separately. + maj_int_min : OrderedDict + A dictionary containing information on which class can be considered majority, + which minority and which remaining class - a heuristic matching. + size_to_align : Optional[np.ndarray] + ndarray containing information about the quantity considered the gold standard - + it is this size that we will want to generate and remove examples. + cdist_min_count : int + The minimum number of examples found in the data sample on which + distances between points are calculated (by the cdist method). 
+ Examples + -------- + >>> import numpy as np + >>> from algorithms.gmm_sampler import GMMSampler + >>> from sklearn.datasets import make_blobs + >>> from collections import Counter + >>> blobs = [800, 100] + >>> X, y = make_blobs(blobs, centers=[(-4, 0), (0,0)]) + # Make this a binary classification problem + >>> y = np.array(y == 1, dtype=int) + >>> gmm_sampler = GMMSampler() + >>> X_res, y_res = gmm_sampler.fit_resample(X, y) + >>> print('Class distribution before GMMsampling: %s' % Counter(y)) + >>> print(f'Class distribution after GMMsampling: %s' % Counter(y_res)) + Class distribution before GMMsampling: Counter({{0: 800, 1: 100}}) + Class distribution after GMMsampling: Counter({{0: 450, 1: 450}}) + # Without undersampling + >>> gmm_sampler = GMMSampler(undersample=False) + >>> X_res, y_res = gmm_sampler.fit_resample(X, y) + >>> print('Class distribution before GMMsampling: %s' % Counter(y)) + >>> print(f'Class distribution after GMMsampling: %s' % Counter(y_res)) + Class distribution before GMMsampling: Counter({{0: 800, 1: 100}}) + Class distribution after GMMsampling: Counter({{0: 800, 1: 450}}) + """ + + _sampling_type = "over-sampling" + + @validate_arguments + def __init__( + self, + likelihood_threshold: float = 0.0, + k_neighbors: int = 7, + undersample: bool = True, + min_components: int = 1, + max_components: Optional[int] = None, + minority_classes: Optional[List[int]] = None, + valid_size: float = 0.25, + filter_new: float = -1.0, + add_after_filtration: bool = True, + iterations_after_filtration: int = 50, + strategy: str = "average", + covariance_type: str = "full", + n_init: int = 10, + tol: float = 1e-3, + max_iter: int = 100, + random_state: Optional[int] = None, + ): + super().__init__(sampling_strategy="auto") + self.likelihood_threshold: float = likelihood_threshold + self.k_neighbors: int = k_neighbors + self.undersample: bool = undersample + self.min_components: int = min_components + self.max_components: Optional[int] = max_components 
+ self._minority_classes: Optional[List[int]] = minority_classes + self.valid_size: float = valid_size + self.filter_new: float = filter_new + self.add_after_filtration: bool = add_after_filtration + self.iterations_after_filtration: int = iterations_after_filtration + self.n_init: int = n_init + self.tol: float = tol + self.max_iter: int = max_iter + self.random_state: Optional[int] = random_state + np.random.seed(self.random_state) + + assert strategy in ["average", "median"], f"strategy '{strategy}' is invalid." + self.strategy: str = strategy + + assert covariance_type in [ + "full", + "tied", + "diag", + "spherical", + ], f"covariance_type '{covariance_type}' is invalid." + self.covariance_type: str = covariance_type + + self.likelihoods: Dict = dict() + self.gaussian_mixtures: Dict = dict() + self.class_sizes: Optional[Counter] = None + self.neighborhood: Optional[Dict] = None + self.maj_int_min: OrderedDict = OrderedDict({"maj": list(), "int": list(), "min": list()}) + self.size_to_align: Optional[np.ndarray] = None + self.__x_subset: Optional[np.ndarray] = None + self.cdist_min_count: int = 10 + + @property + def minority_classes(self) -> List: + if (self.class_sizes is None) or (self._minority_classes is not None): + return self._minority_classes + return self.maj_int_min["min"] + + def _fit_resample(self, X: Any, y: Any) -> Tuple[np.ndarray, np.ndarray]: + X_resample: np.ndarray + y_resample: np.ndarray + X_resample, y_resample = self._to_numpy(X, y) + + X_resample, y_resample = self._fit(X_resample, y_resample)._resample(X_resample, y_resample) + + indices = np.arange(y_resample.shape[0]) + np.random.shuffle(indices) + return X_resample[indices], y_resample[indices] + + def _fit(self, X: Any, y: Any) -> GMMS: + self.class_sizes: Counter = Counter(y) + self._construct_neighborhood(X, y) + + self._construct_maj_int_min() + self._set_size_to_align() + + self._fit_each_minority_class(X, y) + + return self + + @staticmethod + def _to_numpy(X: Any, y: Any) -> 
Tuple[np.ndarray, np.ndarray]: + try: + return np.array(X).copy(), np.array(y).copy() + except Exception as e: + raise e + + def _fit_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> None: + minority_class: int + for minority_class in self.minority_classes: + self._fit_single_class(X, y, minority_class) + + def _fit_single_class(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> None: + self.__x_subset = X[y == minority_class] + train: np.ndarray + valid: np.ndarray + train, valid = train_test_split(self.__x_subset, test_size=self.valid_size, random_state=self.random_state) + + current_component_count: int = self.min_components + + gaussian_mixture_model: GaussianMixture = self._init_model(current_component_count) + gaussian_mixture_model_temp: Optional[GaussianMixture] = None + gaussian_mixture_model.fit(train) + + likelihood = [float("-inf"), gaussian_mixture_model.score(valid)] + while self._perform_step(current_component_count, likelihood[1] - likelihood[0], train.shape[0]): + if gaussian_mixture_model_temp is not None: + gaussian_mixture_model = deepcopy(gaussian_mixture_model_temp) + + current_component_count += 1 + gaussian_mixture_model_temp = self._init_model(current_component_count) + gaussian_mixture_model_temp.fit(train) + + likelihood[0], likelihood[1] = likelihood[1], gaussian_mixture_model_temp.score(valid) + + gaussian_mixture_model.fit(self.__x_subset) + self.gaussian_mixtures[minority_class] = gaussian_mixture_model + self.likelihoods[minority_class] = gaussian_mixture_model.score(self.__x_subset) + self.__x_subset = None + + def _construct_neighborhood(self, X: np.ndarray, y: np.ndarray) -> None: + neigh_clf: NearestNeighbors = NearestNeighbors(n_neighbors=self.k_neighbors + 1).fit(X) + nearest_neighbor_idxs: np.ndarray = neigh_clf.kneighbors(X, return_distance=False)[:, 1:] + self.neighborhood = dict() + sample_idx: int + neigh_samples: np.ndarray + for sample_idx, neigh_samples in enumerate(nearest_neighbor_idxs): + 
neigh_counts: Counter = Counter(y[neigh_samples]) + self.neighborhood[sample_idx] = self._check_sample_neighborhood(y[sample_idx], neigh_counts) + + def _check_sample_neighborhood(self, sample_class: int, neigh_counts: Counter) -> float: + neighborhood: float = 0.0 + neigh_class: int + count: int + for neigh_class, count in neigh_counts.items(): + class_sizes: List = [ + self.class_sizes[sample_class], + self.class_sizes[neigh_class], + ] + neighborhood += count * (min(class_sizes) / max(class_sizes)) + neighborhood /= self.k_neighbors + if neighborhood > 1: + raise ValueError(f"Neighborhood is bigger than 1: {neighborhood}") + return neighborhood + + def _construct_maj_int_min(self) -> None: + middle_size = self._get_middle_size_based_on_strategy() + self._fill_maj_int_min(middle_size) + + def _get_middle_size_based_on_strategy(self) -> np.ndarray: + middle_size: np.ndarray + if self.strategy == "median": + middle_size = np.median(list(self.class_sizes.values())) + elif self.strategy == "average": + middle_size = np.mean(list(self.class_sizes.values())) + else: + raise ValueError(f'Unrecognized {self.strategy}. 
Only "median" and "average" are allowed.') + return middle_size + + def _fill_maj_int_min(self, middle_size) -> None: + class_label: int + class_size: int + for class_label, class_size in self.class_sizes.items(): + if class_size == middle_size: + class_group = "int" + elif class_size < middle_size: + class_group = "min" + else: + class_group = "maj" + + self.maj_int_min[class_group].append(class_label) + + def _set_size_to_align(self) -> None: + maj_q: List = [self.class_sizes[k] for k in self.maj_int_min["maj"]] + min_q: List = [self.class_sizes[k] for k in self.maj_int_min["min"]] + int_q: List = [self.class_sizes[k] for k in self.maj_int_min["int"]] + + if len(maj_q) == 0 and len(min_q) > 0: + self.size_to_align = np.mean(min_q, dtype=int) + elif len(min_q) == 0 and len(maj_q) > 0: + self.size_to_align = np.mean(maj_q, dtype=int) + return + elif len(maj_q) > 0 and len(min_q) > 0: + self.size_to_align = np.mean((max(min_q), min(maj_q)), dtype=int) + elif len(int_q) > 0: + self.size_to_align = np.mean(int_q, dtype=int) + else: + raise ValueError("Bad input - can not obtain desire size.") + + def _init_model(self, n_components: int) -> GaussianMixture: + return GaussianMixture( + n_components=n_components, + n_init=self.n_init, + covariance_type=self.covariance_type, + tol=self.tol, + max_iter=self.max_iter, + random_state=self.random_state, + ) + + def _perform_step(self, n_components: int, likelihood: float, num_samples: int) -> bool: + likelihood_condition: bool = likelihood >= self.likelihood_threshold + max_components_condition: bool = self.max_components is None or n_components <= self.max_components + num_samples_condition: bool = n_components < num_samples + return likelihood_condition and max_components_condition and num_samples_condition + + def _resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + X, y = self._oversample_each_minority_class(X, y) + if self.undersample and "maj" in self.maj_int_min: + X, y = 
self._undersample_majority_classes(X, y) + return X, y + + def _oversample_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + X_copy: np.ndarray = X.copy() + y_copy: np.ndarray = y.copy() + minority_class: int + for minority_class in self.minority_classes: + self.__x_subset = X_copy[y_copy == minority_class] + X, y = self._oversample(X_copy, y_copy, minority_class) + self.__x_subset = None + return X, y + + def _oversample(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> Tuple[np.ndarray, np.ndarray]: + means: np.ndarray + covariances: np.ndarray + means, covariances = self._get_coefficients(self.gaussian_mixtures[minority_class]) + + probabilities = self._get_probas_for_samples_in_component(X, y, minority_class) + + quantity_to_generate: int = self.size_to_align - self.__x_subset.shape[0] + for component in range(self.gaussian_mixtures[minority_class].n_components): + Nk: np.ndarray = probabilities[component] * quantity_to_generate + x: np.ndarray = self._create_samples(means[component], covariances[component], int(Nk)) + X = np.append(X, x, axis=0) + y = np.append(y, np.full((x.shape[0],), fill_value=minority_class), axis=0) + + return X, y + + def _get_probas_for_samples_in_component(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> np.ndarray: + X_prob: np.ndarray = self.gaussian_mixtures[minority_class].predict_proba(X[y == minority_class]) + ratios: np.ndarray = np.array([v for k, v in self.neighborhood.items() if y[k] == minority_class]) + ratios = ratios[..., np.newaxis] + probabilities: np.ndarray = np.sum((1.0 - ratios) * X_prob, axis=0) + 1e-8 + probabilities = probabilities / np.sum(probabilities, keepdims=True) + return probabilities + + def _get_coefficients(self, gaussian_mixture: GaussianMixture) -> Tuple[np.ndarray, np.ndarray]: + means: np.ndarray = gaussian_mixture.means_ + covariances: np.ndarray = gaussian_mixture.covariances_ + if self.covariance_type == "tied": + covariances = 
np.array([covariances] * gaussian_mixture.n_components) + elif self.covariance_type == "diag": + cov_list: List = [] + for component in range(gaussian_mixture.n_components): + cov_list.append(np.diagflat(covariances[component, :])) + covariances = np.array(cov_list) + elif self.covariance_type == "spherical": + cov_list: List = [] + for component in range(gaussian_mixture.n_components): + var = np.array([covariances[component]] * self.__x_subset.shape[1]) + cov_list.append(np.diagflat(var)) + covariances = np.array(cov_list) + return means, covariances + + def _create_samples(self, mean: np.ndarray, covariance: np.ndarray, target_size: int) -> np.ndarray: + result: np.ndarray = np.empty((0, self.__x_subset.shape[1]), float) + iterations: int = 0 + threshold_dist: float = self.filter_new + while (result.shape[0] != target_size) and (iterations < self.iterations_after_filtration): + iterations += 1 + size: int = max(target_size - result.shape[0], result.shape[1] + 1) + x: np.ndarray = np.random.multivariate_normal(mean, covariance, size=size) + if self.filter_new == -1.0: + result = np.append(result, x, axis=0) + break + elif self.filter_new == 0.0: + mdist: np.ndarray = self._compute_mdist(self.__x_subset, mean, covariance) + threshold_dist: float = float(np.mean(mdist)) + + mdist = self._compute_mdist(x, mean, covariance)[: x.shape[0]] + x = x[mdist < threshold_dist] + x = x[: target_size - result.shape[0]] + result = np.append(result, x, axis=0) + if not self.add_after_filtration: + break + return result + + def _compute_mdist(self, in_data: np.ndarray, mean: np.ndarray, covariance: np.ndarray) -> np.ndarray: + mdist: np.ndarray + try: + data: np.ndarray = in_data + if data.shape[0] < self.cdist_min_count: + data: np.ndarray = np.concatenate((in_data, in_data), axis=0) + mdist = cdist(data, [mean], metric="mahalanobis", VI=np.linalg.inv(covariance))[:, 0] + except Exception as e: + print(f"Can't compute 'cdist' function. 
Distance threshold is set to 2.0") + print(f"For more information, examine exception: {e}") + mdist = np.full_like(in_data, fill_value=2.0)[:, 0] + return mdist + + def _undersample_majority_classes(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + maj_class: int + for maj_class in self.maj_int_min["maj"]: + X, y = self._undersample(X, y, maj_class) + return X, y + + def _undersample(self, X: np.ndarray, y: np.ndarray, class_id: int) -> Tuple[np.ndarray, np.ndarray]: + class_idxs: np.ndarray = np.where(y == class_id)[0] + sorted_neigh = sorted(self.neighborhood.items(), key=lambda item: item[1]) + class_idxs: List = [k for k, v in sorted_neigh if k in class_idxs] + size: int = max(0, int(self.class_sizes[class_id] - self.size_to_align)) + X = np.delete(X, class_idxs[:size], axis=0) + y = np.delete(y, class_idxs[:size], axis=0) + + return X, y diff --git a/tests/resampling/test_gmm_sampler.py b/tests/resampling/test_gmm_sampler.py new file mode 100644 index 0000000..f1d597f --- /dev/null +++ b/tests/resampling/test_gmm_sampler.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest +from collections import Counter + +from multi_imbalance.resampling.gmm_sampler import GMMSampler + +X = np.array( + [ + [0.05837771, 0.57543339], + [0.06153624, 0.99871925], + [0.14308529, 0.00681144], + [0.23401697, 0.21188708], + [0.2418553, 0.02137086], + [0.32480534, 0.81547632], + [0.42478482, 0.31995162], + [0.50726834, 0.72621157], + [0.54580968, 0.58025914], + [0.55748531, 0.71866238], + [0.69208769, 0.63759459], + [0.70797377, 0.16348051], + [0.76410615, 0.70451542], + [0.81680686, 0.50793884], + [0.8490789, 0.53826627], + [0.8847505, 0.96856011], + [0.9287003, 0.97580299], + [0.9584236, 0.10536541], + [0.96983103, 0.87666093], + [0.97352367, 0.78807909], + ] +) + +majority_class = 0 +minority_class = 1 +num_classes = 2 + +y_balanced = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) +y_imb_easy = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 1, 1, 1, 1, 0, 1, 1]) +y_imb_hard = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]) +complete_test_data = [ + (X, y_balanced), + (X, y_imb_easy), + (X, y_imb_hard), +] + + +@pytest.fixture() +def gmm_sampler_mock(): + def _get_parametrized_gmm_sampler(X, y, undersample): + gmm_sampler = GMMSampler(undersample=undersample) + return gmm_sampler + + return _get_parametrized_gmm_sampler + + +def get_goal_quantity(y): + quantities = Counter(y) + return np.mean((quantities[minority_class], quantities[majority_class]), dtype=int) + + +@pytest.mark.parametrize("X, y", complete_test_data) +def test_output_length_with_undersample(X, y, gmm_sampler_mock): + gmm_sampler = gmm_sampler_mock(X, y, True) + resampled_X, resampled_y = gmm_sampler.fit_resample(X, y) + + y_resampled_count = Counter(resampled_y) + for _, quantity in y_resampled_count.items(): + assert quantity == get_goal_quantity(y) + + assert len(resampled_X) == get_goal_quantity(y) * num_classes + assert len(resampled_y) == get_goal_quantity(y) * num_classes + + +@pytest.mark.parametrize("X, y", complete_test_data) +def test_output_length_without_undersample(X, y, gmm_sampler_mock): + gmm_sampler = gmm_sampler_mock(X, y, False) + resampled_X, resampled_y = gmm_sampler.fit_resample(X, y) + + y_count = Counter(y) + y_resampled_count = Counter(resampled_y) + + assert y_resampled_count[minority_class] == get_goal_quantity(y) + assert y_resampled_count[majority_class] == y_count[majority_class] + + +def test_perform_step_condition(gmm_sampler_mock): + gmm_sampler = GMMSampler() + assert gmm_sampler._perform_step(n_components=2, likelihood=1.0, num_samples=3) + assert not gmm_sampler._perform_step(n_components=2, likelihood=-1.0, num_samples=3) + assert not gmm_sampler._perform_step(n_components=2, likelihood=1.0, num_samples=1) + gmm_sampler.max_components = 1 + assert not gmm_sampler._perform_step(n_components=4, likelihood=1.0, num_samples=3) From bb20635c1e3603c11da712bc008269f9e04034c3 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Wed, 8 Mar 2023 19:23:11 +0100 Subject: [PATCH 38/48] update docstrings --- multi_imbalance/resampling/gmm_sampler.py | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index 7931554..1236fc2 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -22,34 +22,45 @@ class GMMSampler(BaseSampler): """ GGMSampling algorithm that uses creating new examples by sampling from a multivariate normal distribution (whose parameters are estimated from the input data) and removing troublesome examples from the majority class. + Parameters ---------- likelihood_threshold : float, default=0.0 Minimum likelihood change threshold. A value below this threshold will be equivalent to no change. + k_neighbors : int, default=7 The number of analyzed nearest neighbors during the analysis. Used during both undersampling and oversampling. + undersample : bool, default=True A binary value indicating whether to perform an undersampling operation on majority classes. + min_components : int, default=1 Minimum number of components of GaussianMixture. + max_components : Optional[int], default=None Maximum number of components of GaussianMixture. Without upper bound if not specified. + minority_classes : Optional[List[int]], default=None List containing minority classes given by hand - no auto detection of minority classes will be done. + valid_size : float, default=0.25 Size of validation set to perform test for components choosing. + filter_new : float, default=-1 Parameter controlling the behavior after the oversampling operation. Checks if and how to filter newly created examples: -1 -> do not filter out 0 -> filter out by max/mean value of created examples >0 -> specify your own value e.g. 
2.0 + add_after_filtration : bool, default=True Value specifying whether to regenerate the examples after filtering. + iterations_after_filtration : int, default=50 This value will potentially avoid an endless loop of deleting and re-generating examples. The upper limit for the number of repetitions. + covariance_type : "full", "tied", "diag", "spherical", default="full" String describing the type of covariance parameters to use in GaussianMixture. Must be one of: - "full" @@ -60,35 +71,48 @@ class GMMSampler(BaseSampler): each component has its own diagonal covariance matrix - "spherical" each component has its own single variance + strategy : str "average" or "median", default="average" The strategy of selecting the number of examples considers the target number in each class. + {random_state} + n_init : int, default=10 The number of initializations to perform in GaussianMixture. The best results are kept. + tol : float, default=1e-3 The convergence threshold in GaussianMixture. EM iterations will stop when the lower bound average gain is below this threshold. + max_iter : int, default=100 The number of EM iterations to perform in GaussianMixture. + Attributes ---------- likelihoods : dict Likelihood of each minority class obtained after fitting the final Gaussian model. + gaussian_mixtures : dict Dictionary containing all Gaussian models for each minority class separately. + class_sizes : Optional[Counter] A dictionary containing the counts of each class. + neighborhood : Optional[dict] Dictionary with information on the nearest points for each example separately. + maj_int_min : OrderedDict A dictionary containing information on which class can be considered majority, which minority and which remaining class - a heuristic matching. + size_to_align : Optional[np.ndarray] ndarray containing information about the quantity considered the gold standard - it is this size that we will want to generate and remove examples. 
+ cdist_min_count : int The minimum number of examples found in the data sample on which distances between points are calculated (by the cdist method). + Examples -------- >>> import numpy as np From 6e70b958bfeed4f000f7e2fd0137dccef7b94156 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Wed, 15 Mar 2023 19:04:36 +0100 Subject: [PATCH 39/48] change typing --- multi_imbalance/resampling/gmm_sampler.py | 148 ++++++++++------------ 1 file changed, 69 insertions(+), 79 deletions(-) diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index 1236fc2..de1ccd3 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -2,6 +2,7 @@ from copy import deepcopy from typing import Optional, List, Dict, Tuple, Any, TypeVar +import logging import numpy as np from imblearn.over_sampling.base import BaseSampler from imblearn.utils import Substitution @@ -14,6 +15,12 @@ GMMS = TypeVar("GMMS", bound="GMMSampler") +logging.basicConfig( + level=logging.INFO, + format="[%(asctime)s] %(levelname)s %(name)s %(message)s", + datefmt="%d.%m.%Y %H:%M:%S", +) + @Substitution( random_state=_random_state_docstring, @@ -161,24 +168,24 @@ def __init__( random_state: Optional[int] = None, ): super().__init__(sampling_strategy="auto") - self.likelihood_threshold: float = likelihood_threshold - self.k_neighbors: int = k_neighbors - self.undersample: bool = undersample - self.min_components: int = min_components - self.max_components: Optional[int] = max_components - self._minority_classes: Optional[List[int]] = minority_classes - self.valid_size: float = valid_size - self.filter_new: float = filter_new - self.add_after_filtration: bool = add_after_filtration - self.iterations_after_filtration: int = iterations_after_filtration - self.n_init: int = n_init - self.tol: float = tol - self.max_iter: int = max_iter - self.random_state: Optional[int] = random_state + self.likelihood_threshold = 
likelihood_threshold + self.k_neighbors = k_neighbors + self.undersample = undersample + self.min_components = min_components + self.max_components = max_components + self._minority_classes = minority_classes + self.valid_size = valid_size + self.filter_new = filter_new + self.add_after_filtration = add_after_filtration + self.iterations_after_filtration = iterations_after_filtration + self.n_init = n_init + self.tol = tol + self.max_iter = max_iter + self.random_state = random_state np.random.seed(self.random_state) assert strategy in ["average", "median"], f"strategy '{strategy}' is invalid." - self.strategy: str = strategy + self.strategy = strategy assert covariance_type in [ "full", @@ -186,26 +193,25 @@ def __init__( "diag", "spherical", ], f"covariance_type '{covariance_type}' is invalid." - self.covariance_type: str = covariance_type - - self.likelihoods: Dict = dict() - self.gaussian_mixtures: Dict = dict() - self.class_sizes: Optional[Counter] = None - self.neighborhood: Optional[Dict] = None - self.maj_int_min: OrderedDict = OrderedDict({"maj": list(), "int": list(), "min": list()}) - self.size_to_align: Optional[np.ndarray] = None + self.covariance_type = covariance_type + + self.likelihoods: Dict[int, float] = dict() + self.gaussian_mixtures: Dict[int, GaussianMixture] = dict() + self.class_sizes: Optional[Counter[int]] = None + self.neighborhood: Optional[Dict[int, float]] = None + self.maj_int_min = OrderedDict({"maj": list(), "int": list(), "min": list()}) + self.size_to_align: Optional[int] = None self.__x_subset: Optional[np.ndarray] = None - self.cdist_min_count: int = 10 + self.cdist_min_count = 10 + self.__logger = logging.getLogger("GMMSampler") @property - def minority_classes(self) -> List: + def minority_classes(self) -> List[int]: if (self.class_sizes is None) or (self._minority_classes is not None): return self._minority_classes return self.maj_int_min["min"] def _fit_resample(self, X: Any, y: Any) -> Tuple[np.ndarray, np.ndarray]: - 
X_resample: np.ndarray - y_resample: np.ndarray X_resample, y_resample = self._to_numpy(X, y) X_resample, y_resample = self._fit(X_resample, y_resample)._resample(X_resample, y_resample) @@ -215,7 +221,7 @@ def _fit_resample(self, X: Any, y: Any) -> Tuple[np.ndarray, np.ndarray]: return X_resample[indices], y_resample[indices] def _fit(self, X: Any, y: Any) -> GMMS: - self.class_sizes: Counter = Counter(y) + self.class_sizes = Counter(y) self._construct_neighborhood(X, y) self._construct_maj_int_min() @@ -227,13 +233,9 @@ def _fit(self, X: Any, y: Any) -> GMMS: @staticmethod def _to_numpy(X: Any, y: Any) -> Tuple[np.ndarray, np.ndarray]: - try: - return np.array(X).copy(), np.array(y).copy() - except Exception as e: - raise e + return np.array(X).copy(), np.array(y).copy() def _fit_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> None: - minority_class: int for minority_class in self.minority_classes: self._fit_single_class(X, y, minority_class) @@ -243,10 +245,10 @@ def _fit_single_class(self, X: np.ndarray, y: np.ndarray, minority_class: int) - valid: np.ndarray train, valid = train_test_split(self.__x_subset, test_size=self.valid_size, random_state=self.random_state) - current_component_count: int = self.min_components + current_component_count = self.min_components - gaussian_mixture_model: GaussianMixture = self._init_model(current_component_count) - gaussian_mixture_model_temp: Optional[GaussianMixture] = None + gaussian_mixture_model = self._init_model(current_component_count) + gaussian_mixture_model_temp = None gaussian_mixture_model.fit(train) likelihood = [float("-inf"), gaussian_mixture_model.score(valid)] @@ -269,16 +271,13 @@ def _construct_neighborhood(self, X: np.ndarray, y: np.ndarray) -> None: neigh_clf: NearestNeighbors = NearestNeighbors(n_neighbors=self.k_neighbors + 1).fit(X) nearest_neighbor_idxs: np.ndarray = neigh_clf.kneighbors(X, return_distance=False)[:, 1:] self.neighborhood = dict() - sample_idx: int neigh_samples: np.ndarray 
for sample_idx, neigh_samples in enumerate(nearest_neighbor_idxs): neigh_counts: Counter = Counter(y[neigh_samples]) self.neighborhood[sample_idx] = self._check_sample_neighborhood(y[sample_idx], neigh_counts) - def _check_sample_neighborhood(self, sample_class: int, neigh_counts: Counter) -> float: - neighborhood: float = 0.0 - neigh_class: int - count: int + def _check_sample_neighborhood(self, sample_class: int, neigh_counts: Counter[int]) -> float: + neighborhood = 0.0 for neigh_class, count in neigh_counts.items(): class_sizes: List = [ self.class_sizes[sample_class], @@ -294,19 +293,16 @@ def _construct_maj_int_min(self) -> None: middle_size = self._get_middle_size_based_on_strategy() self._fill_maj_int_min(middle_size) - def _get_middle_size_based_on_strategy(self) -> np.ndarray: - middle_size: np.ndarray + def _get_middle_size_based_on_strategy(self) -> int: if self.strategy == "median": - middle_size = np.median(list(self.class_sizes.values())) + middle_size = int(np.median(list(self.class_sizes.values()))) elif self.strategy == "average": - middle_size = np.mean(list(self.class_sizes.values())) + middle_size = np.mean(list(self.class_sizes.values()), dtype=int) else: raise ValueError(f'Unrecognized {self.strategy}. 
Only "median" and "average" are allowed.') return middle_size - def _fill_maj_int_min(self, middle_size) -> None: - class_label: int - class_size: int + def _fill_maj_int_min(self, middle_size: int) -> None: for class_label, class_size in self.class_sizes.items(): if class_size == middle_size: class_group = "int" @@ -318,9 +314,9 @@ def _fill_maj_int_min(self, middle_size) -> None: self.maj_int_min[class_group].append(class_label) def _set_size_to_align(self) -> None: - maj_q: List = [self.class_sizes[k] for k in self.maj_int_min["maj"]] - min_q: List = [self.class_sizes[k] for k in self.maj_int_min["min"]] - int_q: List = [self.class_sizes[k] for k in self.maj_int_min["int"]] + maj_q = [self.class_sizes[k] for k in self.maj_int_min["maj"]] + min_q = [self.class_sizes[k] for k in self.maj_int_min["min"]] + int_q = [self.class_sizes[k] for k in self.maj_int_min["int"]] if len(maj_q) == 0 and len(min_q) > 0: self.size_to_align = np.mean(min_q, dtype=int) @@ -345,9 +341,9 @@ def _init_model(self, n_components: int) -> GaussianMixture: ) def _perform_step(self, n_components: int, likelihood: float, num_samples: int) -> bool: - likelihood_condition: bool = likelihood >= self.likelihood_threshold - max_components_condition: bool = self.max_components is None or n_components <= self.max_components - num_samples_condition: bool = n_components < num_samples + likelihood_condition = likelihood >= self.likelihood_threshold + max_components_condition = self.max_components is None or n_components <= self.max_components + num_samples_condition = n_components < num_samples return likelihood_condition and max_components_condition and num_samples_condition def _resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: @@ -357,9 +353,8 @@ def _resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarra return X, y def _oversample_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - X_copy: np.ndarray = 
X.copy() - y_copy: np.ndarray = y.copy() - minority_class: int + X_copy = X.copy() + y_copy = y.copy() for minority_class in self.minority_classes: self.__x_subset = X_copy[y_copy == minority_class] X, y = self._oversample(X_copy, y_copy, minority_class) @@ -367,16 +362,13 @@ def _oversample_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> Tuple return X, y def _oversample(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> Tuple[np.ndarray, np.ndarray]: - means: np.ndarray - covariances: np.ndarray means, covariances = self._get_coefficients(self.gaussian_mixtures[minority_class]) probabilities = self._get_probas_for_samples_in_component(X, y, minority_class) - - quantity_to_generate: int = self.size_to_align - self.__x_subset.shape[0] + quantity_to_generate = self.size_to_align - self.__x_subset.shape[0] for component in range(self.gaussian_mixtures[minority_class].n_components): Nk: np.ndarray = probabilities[component] * quantity_to_generate - x: np.ndarray = self._create_samples(means[component], covariances[component], int(Nk)) + x = self._create_samples(means[component], covariances[component], int(Nk)) X = np.append(X, x, axis=0) y = np.append(y, np.full((x.shape[0],), fill_value=minority_class), axis=0) @@ -384,7 +376,7 @@ def _oversample(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> Tupl def _get_probas_for_samples_in_component(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> np.ndarray: X_prob: np.ndarray = self.gaussian_mixtures[minority_class].predict_proba(X[y == minority_class]) - ratios: np.ndarray = np.array([v for k, v in self.neighborhood.items() if y[k] == minority_class]) + ratios = np.array([v for k, v in self.neighborhood.items() if y[k] == minority_class]) ratios = ratios[..., np.newaxis] probabilities: np.ndarray = np.sum((1.0 - ratios) * X_prob, axis=0) + 1e-8 probabilities = probabilities / np.sum(probabilities, keepdims=True) @@ -409,19 +401,19 @@ def _get_coefficients(self, gaussian_mixture: 
GaussianMixture) -> Tuple[np.ndarr return means, covariances def _create_samples(self, mean: np.ndarray, covariance: np.ndarray, target_size: int) -> np.ndarray: - result: np.ndarray = np.empty((0, self.__x_subset.shape[1]), float) - iterations: int = 0 - threshold_dist: float = self.filter_new + result = np.empty((0, self.__x_subset.shape[1]), float) + iterations = 0 + threshold_dist = self.filter_new while (result.shape[0] != target_size) and (iterations < self.iterations_after_filtration): iterations += 1 - size: int = max(target_size - result.shape[0], result.shape[1] + 1) - x: np.ndarray = np.random.multivariate_normal(mean, covariance, size=size) + size = max(target_size - result.shape[0], result.shape[1] + 1) + x = np.random.multivariate_normal(mean, covariance, size=size) if self.filter_new == -1.0: result = np.append(result, x, axis=0) break elif self.filter_new == 0.0: - mdist: np.ndarray = self._compute_mdist(self.__x_subset, mean, covariance) - threshold_dist: float = float(np.mean(mdist)) + mdist = self._compute_mdist(self.__x_subset, mean, covariance) + threshold_dist = float(np.mean(mdist)) mdist = self._compute_mdist(x, mean, covariance)[: x.shape[0]] x = x[mdist < threshold_dist] @@ -432,29 +424,27 @@ def _create_samples(self, mean: np.ndarray, covariance: np.ndarray, target_size: return result def _compute_mdist(self, in_data: np.ndarray, mean: np.ndarray, covariance: np.ndarray) -> np.ndarray: - mdist: np.ndarray try: - data: np.ndarray = in_data + data = in_data if data.shape[0] < self.cdist_min_count: - data: np.ndarray = np.concatenate((in_data, in_data), axis=0) + data = np.concatenate((in_data, in_data), axis=0) mdist = cdist(data, [mean], metric="mahalanobis", VI=np.linalg.inv(covariance))[:, 0] except Exception as e: - print(f"Can't compute 'cdist' function. Distance threshold is set to 2.0") - print(f"For more information, examine exception: {e}") + self.__logger.error("Can't compute 'cdist' function. 
Distance threshold is set to 2.0") + self.__logger.info(f"For more information, examine exception: {e}") mdist = np.full_like(in_data, fill_value=2.0)[:, 0] return mdist def _undersample_majority_classes(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - maj_class: int for maj_class in self.maj_int_min["maj"]: X, y = self._undersample(X, y, maj_class) return X, y def _undersample(self, X: np.ndarray, y: np.ndarray, class_id: int) -> Tuple[np.ndarray, np.ndarray]: - class_idxs: np.ndarray = np.where(y == class_id)[0] + class_idxs = np.where(y == class_id)[0] sorted_neigh = sorted(self.neighborhood.items(), key=lambda item: item[1]) - class_idxs: List = [k for k, v in sorted_neigh if k in class_idxs] - size: int = max(0, int(self.class_sizes[class_id] - self.size_to_align)) + class_idxs = [k for k, _ in sorted_neigh if k in class_idxs] + size = max(0, int(self.class_sizes[class_id] - self.size_to_align)) X = np.delete(X, class_idxs[:size], axis=0) y = np.delete(y, class_idxs[:size], axis=0) From 263a7225ef8a34218b3cf7a8ccba8723f3142a86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sun, 23 Apr 2023 11:53:00 +0200 Subject: [PATCH 40/48] add more tests --- multi_imbalance/resampling/gmm_sampler.py | 8 +- pyproject.toml | 1 + tests/resampling/test_gmm_sampler.py | 99 ++++++++++++++++++++--- 3 files changed, 95 insertions(+), 13 deletions(-) diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index de1ccd3..8c4ba78 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -279,7 +279,7 @@ def _construct_neighborhood(self, X: np.ndarray, y: np.ndarray) -> None: def _check_sample_neighborhood(self, sample_class: int, neigh_counts: Counter[int]) -> float: neighborhood = 0.0 for neigh_class, count in neigh_counts.items(): - class_sizes: List = [ + class_sizes = [ self.class_sizes[sample_class], self.class_sizes[neigh_class], ] @@ -388,12 
+388,12 @@ def _get_coefficients(self, gaussian_mixture: GaussianMixture) -> Tuple[np.ndarr if self.covariance_type == "tied": covariances = np.array([covariances] * gaussian_mixture.n_components) elif self.covariance_type == "diag": - cov_list: List = [] + cov_list = [] for component in range(gaussian_mixture.n_components): cov_list.append(np.diagflat(covariances[component, :])) covariances = np.array(cov_list) elif self.covariance_type == "spherical": - cov_list: List = [] + cov_list = [] for component in range(gaussian_mixture.n_components): var = np.array([covariances[component]] * self.__x_subset.shape[1]) cov_list.append(np.diagflat(var)) @@ -431,7 +431,7 @@ def _compute_mdist(self, in_data: np.ndarray, mean: np.ndarray, covariance: np.n mdist = cdist(data, [mean], metric="mahalanobis", VI=np.linalg.inv(covariance))[:, 0] except Exception as e: self.__logger.error("Can't compute 'cdist' function. Distance threshold is set to 2.0") - self.__logger.info(f"For more information, examine exception: {e}") + self.__logger.error(f"For more information, examine exception: {e}") mdist = np.full_like(in_data, fill_value=2.0)[:, 0] return mdist diff --git a/pyproject.toml b/pyproject.toml index b0e5458..3e75717 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "matplotlib~=3.6.2", "click~=8.1.3", "scikit-posthocs~=0.7.0", + "pydantic~=1.10.5", ] [project.optional-dependencies] diff --git a/tests/resampling/test_gmm_sampler.py b/tests/resampling/test_gmm_sampler.py index f1d597f..caff78a 100644 --- a/tests/resampling/test_gmm_sampler.py +++ b/tests/resampling/test_gmm_sampler.py @@ -1,6 +1,7 @@ +import logging import numpy as np import pytest -from collections import Counter +from collections import Counter, OrderedDict from multi_imbalance.resampling.gmm_sampler import GMMSampler @@ -45,8 +46,8 @@ @pytest.fixture() def gmm_sampler_mock(): - def _get_parametrized_gmm_sampler(X, y, undersample): - gmm_sampler = 
GMMSampler(undersample=undersample) + def _get_parametrized_gmm_sampler(*args, **kwargs) -> GMMSampler: + gmm_sampler = GMMSampler(*args, **kwargs) return gmm_sampler return _get_parametrized_gmm_sampler @@ -57,13 +58,14 @@ def get_goal_quantity(y): return np.mean((quantities[minority_class], quantities[majority_class]), dtype=int) +@pytest.mark.parametrize("strategy, filter_new", [("median", 1.0), ("average", 0.0)]) @pytest.mark.parametrize("X, y", complete_test_data) -def test_output_length_with_undersample(X, y, gmm_sampler_mock): - gmm_sampler = gmm_sampler_mock(X, y, True) +def test_output_length_with_undersample(X, y, strategy, filter_new, gmm_sampler_mock): + gmm_sampler = gmm_sampler_mock(undersample=True, strategy=strategy, filter_new=filter_new) resampled_X, resampled_y = gmm_sampler.fit_resample(X, y) y_resampled_count = Counter(resampled_y) - for _, quantity in y_resampled_count.items(): + for quantity in y_resampled_count.values(): assert quantity == get_goal_quantity(y) assert len(resampled_X) == get_goal_quantity(y) * num_classes @@ -72,8 +74,8 @@ def test_output_length_with_undersample(X, y, gmm_sampler_mock): @pytest.mark.parametrize("X, y", complete_test_data) def test_output_length_without_undersample(X, y, gmm_sampler_mock): - gmm_sampler = gmm_sampler_mock(X, y, False) - resampled_X, resampled_y = gmm_sampler.fit_resample(X, y) + gmm_sampler = gmm_sampler_mock(undersample=False) + _, resampled_y = gmm_sampler.fit_resample(X, y) y_count = Counter(y) y_resampled_count = Counter(resampled_y) @@ -83,9 +85,88 @@ def test_output_length_without_undersample(X, y, gmm_sampler_mock): def test_perform_step_condition(gmm_sampler_mock): - gmm_sampler = GMMSampler() + gmm_sampler = gmm_sampler_mock() assert gmm_sampler._perform_step(n_components=2, likelihood=1.0, num_samples=3) assert not gmm_sampler._perform_step(n_components=2, likelihood=-1.0, num_samples=3) assert not gmm_sampler._perform_step(n_components=2, likelihood=1.0, num_samples=1) 
gmm_sampler.max_components = 1 assert not gmm_sampler._perform_step(n_components=4, likelihood=1.0, num_samples=3) + + +def test_minority_classes(gmm_sampler_mock): + minority_classes = [0, 1] + gmm_sampler = gmm_sampler_mock(minority_classes=minority_classes) + + assert gmm_sampler.minority_classes == minority_classes + + +@pytest.mark.parametrize( + "maj_int_min, expected_size", + [ + ({"maj": [], "int": [], "min": [1]}, 6), + ({"maj": [1], "int": [], "min": []}, 6), + ({"maj": [], "int": [1], "min": []}, 6), + ], +) +def test_set_size_to_align(gmm_sampler_mock, expected_size, maj_int_min): + gmm_sampler = gmm_sampler_mock() + gmm_sampler.class_sizes = Counter(y_imb_hard) + gmm_sampler.maj_int_min = OrderedDict(maj_int_min) + + gmm_sampler._set_size_to_align() + assert gmm_sampler.size_to_align == expected_size + + +def test_compute_mdist(gmm_sampler_mock, caplog): + caplog.set_level(logging.ERROR) + gmm_sampler = gmm_sampler_mock() + mean = [0, 0] + covariance = np.eye(2) + x = np.random.multivariate_normal(mean, covariance, size=2) + + gmm_sampler._compute_mdist(x, mean, np.ones((2, 2))) + + no_exception_check = 0 + for record in caplog.records: + if record.levelname == "ERROR": + msg = record.message + no_exception_check += ( + msg == "Can't compute 'cdist' function. 
Distance threshold is set to 2.0" + or msg == "For more information, examine exception: Singular matrix" + ) + + assert no_exception_check == 2 + + +@pytest.mark.parametrize( + "strategy, class_count, expected_middle_size", + [("median", [10, 6, 4], 6), ("median", [4, 12, 4], 4), ("average", [4, 4, 12], 6), ("average", [8, 4, 8], 6)], +) +def test_get_middle_size_based_on_strategy(strategy, class_count, expected_middle_size, gmm_sampler_mock): + gmm_sampler = gmm_sampler_mock(strategy=strategy) + + gmm_sampler._fit(X, np.array([*[0] * class_count[0], *[1] * class_count[1], *[2] * class_count[2]])) + middle_size = gmm_sampler._get_middle_size_based_on_strategy() + assert middle_size == expected_middle_size + + +def test_get_middle_size_based_on_strategy_exception(gmm_sampler_mock): + gmm_sampler = gmm_sampler_mock() + gmm_sampler.strategy = "min" + + with pytest.raises(ValueError) as ex: + gmm_sampler._get_middle_size_based_on_strategy() + + assert str(ex.value) == 'Unrecognized min. Only "median" and "average" are allowed.' + + +def test_set_size_to_align_exception(gmm_sampler_mock): + maj_int_min = {"maj": [], "int": [], "min": []} + gmm_sampler = gmm_sampler_mock() + gmm_sampler.class_sizes = Counter(y_imb_hard) + gmm_sampler.maj_int_min = OrderedDict(maj_int_min) + + with pytest.raises(ValueError) as ex: + gmm_sampler._set_size_to_align() + + assert str(ex.value) == "Bad input - can not obtain desire size." 
From 693b8ffee1747648b4e86bc094021c34f78b51fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Sun, 23 Apr 2023 11:58:53 +0200 Subject: [PATCH 41/48] add future annotations --- multi_imbalance/resampling/gmm_sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index 8c4ba78..3421bc3 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -1,3 +1,4 @@ +from __future__ import annotations from collections import OrderedDict, Counter from copy import deepcopy from typing import Optional, List, Dict, Tuple, Any, TypeVar From 91f6b582f1d435fce28ec816f6ad4630fe006a47 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Mon, 24 Apr 2023 17:52:57 +0200 Subject: [PATCH 42/48] add example notebook --- examples/resampling/GMMSampler.ipynb | 257 +++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 examples/resampling/GMMSampler.ipynb diff --git a/examples/resampling/GMMSampler.ipynb b/examples/resampling/GMMSampler.ipynb new file mode 100644 index 0000000..c1c369d --- /dev/null +++ b/examples/resampling/GMMSampler.ipynb @@ -0,0 +1,257 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": true, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Unzip datasets and prepare data:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.49 0.29 0.48 0.5 0.56 0.24 0.35]\n", + " [0.07 0.4 0.48 0.5 0.54 0.35 0.44]\n", + " [0.56 0.4 0.48 0.5 0.49 0.37 0.46]\n", + " [0.59 0.49 0.48 0.5 0.52 0.45 0.36]\n", + " [0.23 0.32 0.48 0.5 0.55 0.25 0.35]]\n", + "[0 0 0 0 0]\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as 
sns\n", + "from sklearn.decomposition import PCA\n", + "\n", + "from multi_imbalance.datasets import load_datasets\n", + "from multi_imbalance.resampling.gmm_sampler import GMMSampler\n", + "from multi_imbalance.utils.data import construct_flat_2pc_df\n", + "\n", + "%matplotlib inline\n", + "sns.set_style(\"darkgrid\")\n", + "\n", + "%matplotlib inline\n", + "sns.set_style(\"darkgrid\")\n", + "\n", + "dataset = load_datasets()[\"new_ecoli\"]\n", + "\n", + "X, y = dataset.data, dataset.target\n", + "print(X[:5])\n", + "print(y[:5])" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({0: 145, 1: 77, 2: 37, 3: 25, 4: 52})" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Counter(dataset.target)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Resample data using GMMSampler algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "clf = GMMSampler(minority_classes=[2, 3])\n", + "resampled_X, resampled_y = clf.fit_resample(X, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare results by plotting data in 2 dimensions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "n = len(Counter(y).keys())\n", + "p = sns.color_palette(\"husl\", n)\n", + "\n", + "pca = PCA(n_components=2)\n", + "pca.fit(X)\n", + "\n", + "fig, axs = plt.subplots(ncols=2, nrows=2)\n", + "fig.set_size_inches(16, 10)\n", + "axs = axs.flatten()\n", + "\n", + "axs[1].set_title(\"Base\")\n", + "sns.countplot(x=y, ax=axs[0], palette=p)\n", + "X = pca.transform(X)\n", + "df = construct_flat_2pc_df(X, y)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[1],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")\n", + "\n", + "\n", + "axs[3].set_title(\"GMMSampler\")\n", + "sns.countplot(x=resampled_y, ax=axs[2], palette=p)\n", + "resampled_X = pca.transform(resampled_X)\n", + "df = construct_flat_2pc_df(resampled_X, resampled_y)\n", + "sns.scatterplot(\n", + " x=\"x1\",\n", + " y=\"x2\",\n", + " hue=\"y\",\n", + " style=\"y\",\n", + " data=df,\n", + " alpha=0.7,\n", + " ax=axs[3],\n", + " legend=\"full\",\n", + " palette=p,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pipeline example" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.95 0.97 0.96 36\n", + " 1 0.75 0.63 0.69 19\n", + " 2 0.50 0.60 0.55 10\n", + " 3 1.00 1.00 1.00 6\n", + " 4 1.00 1.00 1.00 13\n", + "\n", + " accuracy 0.86 84\n", + " macro avg 0.84 0.84 0.84 84\n", + "weighted avg 0.86 0.86 0.86 84\n", + "\n" + ] + } + ], + "source": [ + "from imblearn.pipeline import Pipeline\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.neighbors import KNeighborsClassifier as KNN\n", + "from sklearn.metrics import classification_report\n", + 
"\n", + "dataset = load_datasets()[\"new_ecoli\"]\n", + "\n", + "X, y = dataset.data, dataset.target\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)\n", + "pipeline = Pipeline(\n", + " [(\"scaler\", StandardScaler()), (\"gmm_sampler\", GMMSampler()), (\"knn\", KNN())]\n", + ")\n", + "pipeline.fit(X_train, y_train)\n", + "y_hat = pipeline.predict(X_test)\n", + "print(classification_report(y_test, y_hat))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} From e5b8710597def2639f621dc8daafdf4e2656b30e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Tue, 25 Apr 2023 20:08:31 +0200 Subject: [PATCH 43/48] black + new code --- examples/resampling/GMMSampler.ipynb | 26 +++++++++++------------ multi_imbalance/resampling/gmm_sampler.py | 3 +++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/resampling/GMMSampler.ipynb b/examples/resampling/GMMSampler.ipynb index c1c369d..552daee 100644 --- a/examples/resampling/GMMSampler.ipynb +++ b/examples/resampling/GMMSampler.ipynb @@ -88,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "pycharm": { "name": "#%%\n" @@ -96,7 +96,7 @@ }, "outputs": [], "source": [ - "clf = GMMSampler(minority_classes=[2, 3])\n", + "clf = GMMSampler()\n", "resampled_X, resampled_y = clf.fit_resample(X, y)" ] }, @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "pycharm": { "name": "#%%\n" @@ -122,13 +122,13 @@ "" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, 
"output_type": "execute_result" }, { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] @@ -191,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -201,14 +201,14 @@ " precision recall f1-score support\n", "\n", " 0 0.95 0.97 0.96 36\n", - " 1 0.75 0.63 0.69 19\n", - " 2 0.50 0.60 0.55 10\n", + " 1 0.75 0.47 0.58 19\n", + " 2 0.44 0.70 0.54 10\n", " 3 1.00 1.00 1.00 6\n", " 4 1.00 1.00 1.00 13\n", "\n", - " accuracy 0.86 84\n", - " macro avg 0.84 0.84 0.84 84\n", - "weighted avg 0.86 0.86 0.86 84\n", + " accuracy 0.83 84\n", + " macro avg 0.83 0.83 0.82 84\n", + "weighted avg 0.85 0.83 0.83 84\n", "\n" ] } @@ -224,9 +224,7 @@ "\n", "X, y = dataset.data, dataset.target\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)\n", - "pipeline = Pipeline(\n", - " [(\"scaler\", StandardScaler()), (\"gmm_sampler\", GMMSampler()), (\"knn\", KNN())]\n", - ")\n", + "pipeline = Pipeline([(\"scaler\", StandardScaler()), (\"gmm_sampler\", GMMSampler()), (\"knn\", KNN())])\n", "pipeline.fit(X_train, y_train)\n", "y_hat = pipeline.predict(X_test)\n", "print(classification_report(y_test, y_hat))" diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index 3421bc3..18fa5c0 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -359,6 +359,7 @@ def _oversample_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> Tuple for minority_class in self.minority_classes: self.__x_subset = X_copy[y_copy == minority_class] X, y = self._oversample(X_copy, y_copy, minority_class) + X_copy, y_copy = X, y self.__x_subset = None return X, y @@ -367,9 +368,11 @@ def _oversample(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> Tupl probabilities = self._get_probas_for_samples_in_component(X, y, minority_class) quantity_to_generate = self.size_to_align - self.__x_subset.shape[0] + for component in range(self.gaussian_mixtures[minority_class].n_components): Nk: 
np.ndarray = probabilities[component] * quantity_to_generate x = self._create_samples(means[component], covariances[component], int(Nk)) + X = np.append(X, x, axis=0) y = np.append(y, np.full((x.shape[0],), fill_value=minority_class), axis=0) From 5da701e6ee6a19a5f9d41e9928c96b97fd7f9952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Thu, 27 Apr 2023 21:03:47 +0200 Subject: [PATCH 44/48] fix docs --- multi_imbalance/resampling/gmm_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index 18fa5c0..e9fb65c 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -124,7 +124,7 @@ class GMMSampler(BaseSampler): Examples -------- >>> import numpy as np - >>> from algorithms.gmm_sampler import GMMSampler + >>> from multi_imbalance.resampling.gmm_sampler import GMMSampler >>> from sklearn.datasets import make_blobs >>> from collections import Counter >>> blobs = [800, 100] From aff0cafe8e6b508e544966dd6a30ffff93eac563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20Wo=C5=BAny?= Date: Wed, 3 May 2023 20:27:36 +0200 Subject: [PATCH 45/48] merge oversamples at the end --- multi_imbalance/resampling/gmm_sampler.py | 25 ++++++++++++++--------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/multi_imbalance/resampling/gmm_sampler.py b/multi_imbalance/resampling/gmm_sampler.py index e9fb65c..362dcda 100644 --- a/multi_imbalance/resampling/gmm_sampler.py +++ b/multi_imbalance/resampling/gmm_sampler.py @@ -354,29 +354,34 @@ def _resample(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarra return X, y def _oversample_each_minority_class(self, X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: - X_copy = X.copy() - y_copy = y.copy() + X_oversample = [] + y_oversample = [] + X_oversample.append(X) + y_oversample.append(y) for minority_class in 
self.minority_classes: - self.__x_subset = X_copy[y_copy == minority_class] - X, y = self._oversample(X_copy, y_copy, minority_class) - X_copy, y_copy = X, y + self.__x_subset = X[y == minority_class] + X_subset_oversample, y_subset_oversample = self._oversample(X, y, minority_class) + X_oversample.append(X_subset_oversample) + y_oversample.append(y_subset_oversample) self.__x_subset = None - return X, y + + return np.vstack(X_oversample), np.hstack(y_oversample) def _oversample(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> Tuple[np.ndarray, np.ndarray]: means, covariances = self._get_coefficients(self.gaussian_mixtures[minority_class]) probabilities = self._get_probas_for_samples_in_component(X, y, minority_class) quantity_to_generate = self.size_to_align - self.__x_subset.shape[0] - + X_subset_oversample = [] + y_subset_oversample = [] for component in range(self.gaussian_mixtures[minority_class].n_components): Nk: np.ndarray = probabilities[component] * quantity_to_generate x = self._create_samples(means[component], covariances[component], int(Nk)) - X = np.append(X, x, axis=0) - y = np.append(y, np.full((x.shape[0],), fill_value=minority_class), axis=0) + X_subset_oversample.append(x) + y_subset_oversample.append([minority_class] * x.shape[0]) - return X, y + return np.vstack(X_subset_oversample), np.hstack(y_subset_oversample) def _get_probas_for_samples_in_component(self, X: np.ndarray, y: np.ndarray, minority_class: int) -> np.ndarray: X_prob: np.ndarray = self.gaussian_mixtures[minority_class].predict_proba(X[y == minority_class]) From 957b160a66c013a1988bad46b846ee03ee6743c1 Mon Sep 17 00:00:00 2001 From: Adam Wojciechowski Date: Tue, 30 May 2023 10:56:29 +0200 Subject: [PATCH 46/48] Data difficulty module created, source file added --- .../data_difficulty/data_difficuulty.py | 868 ++++++++++++++++++ 1 file changed, 868 insertions(+) create mode 100644 multi_imbalance/data_difficulty/data_difficuulty.py diff --git 
a/multi_imbalance/data_difficulty/data_difficuulty.py b/multi_imbalance/data_difficulty/data_difficuulty.py new file mode 100644 index 0000000..7ea9c44 --- /dev/null +++ b/multi_imbalance/data_difficulty/data_difficuulty.py @@ -0,0 +1,868 @@ +from sklearn.neighbors import NearestNeighbors +from sklearn.cluster import KMeans +from sklearn.mixture import GaussianMixture +import numpy as np +from abc import ABC, abstractmethod +from matplotlib import pyplot as plt +from numpy.typing import NDArray +from matplotlib.figure import Figure +from typing import Optional, List, Dict, Tuple, TypeVar +from seaborn import heatmap + + +__author__ = 'Adam Wojciechowski' + + +class Summary: + """A class to quickly run all of the implemented algorithms + in the form of a report with outputs in the form of various plots. + """ + def fit(self, + X: NDArray, + y: NDArray, + classes: Optional[List[int]] = None, + k: int = 5, + n_jobs: int = -1) -> None: + """Creates the summary report. + + :param X: The examples + :param y: The labels + :param classes: List of classes to be processed, defaults to None + :param k: Number of neighbors, defaults to 5 + :param n_jobs: Number of threads for distributed KNN action, defaults to -1 + """ + self.X, self.y = X, y + self.classes = classes + self.k, self.n_jobs = k, n_jobs + print('Start computing imbalance ratios...') + imbalance_ratio = ImbalanceRatio() + self.imbalance_ratio_results = imbalance_ratio.fit(self.y) + print('Imbalance ratios: ') + for k, v in self.imbalance_ratio_results.items(): + print(f'class {k} : {v}') + + print('Start data difficulty study...') + print('KNN based method') + knn_data_difficulty = KNNDataDifficulty(classes=self.classes, + k=self.k, + n_jobs=self.n_jobs) + self.knn_data_difficulty_results = knn_data_difficulty.fit_plot(self.X, + self.y) + print('Kernel density function based method') + kernel_data_difficulty = KernelDataDifficulty(classes=self.classes, + k=self.k, + n_jobs=self.n_jobs) + self.kernel_data_difficulty_results = 
kernel_data_difficulty.fit_plot(self.X, + self.y) + print('Continuous data difficulty') + continuous_data_difficulty = ContinuousDataDifficulty(classes=self.classes, + k=self.k, + n_jobs=self.n_jobs) + self.continuous_data_difficulty_results = continuous_data_difficulty.fit_plot(self.X, + self.y) + + print('Class neighboring study...') + border_class_matrix = BorderClassMatrix(classes=self.classes, + k=self.k, + n_jobs=n_jobs) + self.border_class_matrix_results = border_class_matrix.fit_plot(self.X, + self.y) + + print('Class homogeneity study...') + print('KMeans Elbow') + kmeans_elbow = KMeansElbowMethod(classes=self.classes, + cluster_count_selection=list(range(1, 15))) + self.kmeans_elbow_results = kmeans_elbow.fit_plot(X=self.X, + y=self.y) + print('Gaussian mixture Elbow') + gaussian_mixture = GaussianMixtureElbowMethod(classes=self.classes, + cluster_count_selection=list(range(1, 15))) + self.gaussian_mixture_results = gaussian_mixture.fit_plot(X=self.X, + y=self.y) + + +class ImbalanceRatio: + """A class which performs imbalance ratio computations + resulting in imbalance ratio values for each class. + """ + def fit(self, + y: NDArray) -> Dict[int, float]: + """Main class method performing computations. + + :param y: The labels + :return: Imbalance ratio values for each class in y + """ + classes, cardinalities = np.unique(y, return_counts=True) + maj_class_id = np.argmax(cardinalities) + return { + classes[i]: cardinalities[maj_class_id] / cardinalities[i] + for i in range(len(classes)) + } + + +class DataDifficulty(ABC): + """An abstract class serving as parent class to: + + * KNNDataDifficulty + * ContinuousDataDifficulty + * BorderClassMatrix + * KernelDataDifficulty + + """ + def __init__(self, + classes: Optional[List[int]] = None, + k: int = 5, + n_jobs: int = -1) -> None: + """Class constructor. 
+ + :param classes: List of classes to be processed, defaults to None + :param k: No neighbors, defaults to 5 + :param n_jobs: No threads for distributed KNN action, defaults to -1 + :raises TypeError: Raised if classes attribute is of invalid type, + because the validity of this attribute is esential for all of the + child classes + """ + if classes is not None and type(classes) is not list: + raise TypeError('Wrong type of classes parameter! Should be None or list.') + self.classes = classes + self.k = k + self.knn = NearestNeighbors(n_neighbors=self.k, n_jobs=n_jobs) + + @abstractmethod + def fit(self, + X: NDArray, + y: NDArray) -> None: + """Main class method performing the algorithm, to be implemented + inside child classes. + + :param X: The examples + :param y: The labels + """ + pass + + def _induce_classes(self, + y: NDArray) -> NDArray: + """Method for automatic minority class list induction + if none was provided by the user. + + :param y: The labels + :return: An array of minority class labels + """ + distribution = np.unique(y, return_counts=True) + majority_cardinality = max(distribution[1]) + return distribution[0][distribution[1] < majority_cardinality] + + +class KernelFunction(ABC): + """An abstract class serving as parent class to: + + * EpanechnikovKernelFunction + * TriangularKernelFuntion + * UniformKernelFunction + + """ + def __init__(self, kernel_bandwidth: float) -> None: + """Class constructor. + + :param kernel_bandwidth: The bandwidth of the krenel function. + The kernel function will return positive values, if + the argument `u` will be in range + (-kernel_bandwidth / 2; kernel_bandwidth / 2) + """ + super().__init__() + self.kernel_bandwidth = kernel_bandwidth + self.scaling_factor = 2 / self.kernel_bandwidth + + @abstractmethod + def K(self, u: float) -> float: + """Kernel function. 
+ + :param u: The argument of the kernel function K + :return: The numerical value K(u) + """ + pass + + +KernelFunctionSubclass = TypeVar('KernelFunctionSubclass', bound=KernelFunction) + + +class EpanechnikovKernelFunction(KernelFunction): + """Epanechnikov (parabolic) kernel function. + """ + def __init__(self, kernel_bandwidth: float) -> None: + """Class constructor + + :param kernel_bandwidth: The bandwidth of the krenel function. + The kernel function will return positive values, if + the argument `u` will be in range + (-kernel_bandwidth / 2; kernel_bandwidth / 2) + """ + super().__init__(kernel_bandwidth) + + def K(self, u: float) -> float: + """Kernel function. + It is defined as: + K(u) = (3 / 4) * (1 - u ** 2), for |u| <= 1 + K(u) = 0, otherwise. + + :param u: The argument of the kernel function K + :return: The numerical value K(u) + """ + if np.abs(self.scaling_factor * u) <= 1: + return (3 / 4) * (1 - np.square((self.scaling_factor * u))) + else: + return 0. + + +class TriangularKernelFuntion(KernelFunction): + """Triangular (linear) kernel function. + """ + def __init__(self, kernel_bandwidth: float) -> None: + """Class constructor + + :param kernel_bandwidth: The bandwidth of the krenel function. + The kernel function will return positive values, if + the argument `u` will be in range + (-kernel_bandwidth / 2; kernel_bandwidth / 2) + """ + super().__init__(kernel_bandwidth) + + def K(self, u: float) -> float: + """Kernel function. + It is defined as: + K(u) = 1 - |u|, for |u| <= 1 + K(u) = 0, otherwise. + + :param u: The argument of the kernel function K + :return: The numerical value K(u) + """ + if np.abs(self.scaling_factor * u) <= 1: + return 1 - np.abs(self.scaling_factor * u) + else: + return 0. + + +class UniformKernelFunction(KernelFunction): + """Uniform (constant) kernel function. + """ + def __init__(self, kernel_bandwidth: float) -> None: + """Class constructor + + :param kernel_bandwidth: The bandwidth of the krenel function. 
+ The kernel function will return positive values, if + the argument `u` will be in range + (-kernel_bandwidth / 2; kernel_bandwidth / 2) + """ + super().__init__(kernel_bandwidth) + + def K(self, u: float) -> float: + """Kernel function. + K(u) = 1 / 2, for |u| <= 1 + K(u) = 0, otherwise. + + :param u: The argument of the kernel function K + :return: The numerical value K(u) + """ + return 1 / 2 if np.abs(self.scaling_factor * u) <= 1 else 0. + + +class KernelDataDifficulty(DataDifficulty): + """Kernel approach to data difficulty discovery. + """ + def __init__(self, + classes: Optional[List[int]] = None, + k: int = 5, + n_jobs: int = -1, + kernel_type: str = 'epanechnikov') -> None: + """Class constructor. + + :param classes: List of classes to be processed, defaults to None + :param k: No neighbors, defaults to 5 + :param n_jobs: No threads for distributed KNN action, defaults to -1 + :param kernel_type: Type of kernel function to be used, defaults to 'epanechnikov'. + currently available kernel types: + + * 'epanechnikov' + * 'triangular' + * 'uniform' + + """ + super().__init__(classes, k, n_jobs) + self.kernel_type = kernel_type + + def fit(self, + X: NDArray, + y: NDArray, + kernel_bandwidth_override: Optional[float] = None) -> Dict[int, List[float]]: + """Method for running the algorithm. + + :param X: The examples + :param y: The labels + :param kernel_bandwidth_override: An optional float value to be set as kernel bandwidth. + If not set the algorithm uses automatic bandwidth tuning method, defaults to None + :return: Dictionary {class_label : [scores for the examples of a given class]}. 
+ The ordering inside the lists is as if iterated over a minority class + """ + if self.classes is None: + self.classes = self._induce_classes(y) + self.X, self.y = X, y + if kernel_bandwidth_override is None: + self.kernel_bandwidth = self._bandwidth_tune() + else: + self.kernel_bandwidth = kernel_bandwidth_override + self.kernel_object = self._get_kernel_object(self.kernel_type, + self.kernel_bandwidth) + c_scores = {} + for c in self.classes: + c_scores[c] = [] + # x - currently considered from minority class + for x in self.X[self.y==c]: + # list of weighted distances between same (c) class examples + c_weighted_distances = [] + # list of weighted distances between x and any other example + any_weighted_distances = [] + # xx - every other from any class + for xx, y_of_xx in zip(self.X, self.y): + if np.all(x == xx): continue + distance = np.linalg.norm(x - xx) + weighted_distance = self.kernel_object.K(distance) + any_weighted_distances.append(weighted_distance) + if y_of_xx in self.classes: + c_weighted_distances.append(weighted_distance) + c_weighted_sum = np.sum(c_weighted_distances) + any_weighted_sum = np.sum(any_weighted_distances) + # when conditions is met, according to authors + # there is 'not enough info' to compute the score + if any_weighted_sum == 0.: + c_scores[c].append(-1.) + else: + c_scores[c].append(c_weighted_sum / any_weighted_sum) + # round to eliminate numerical stability errors + c_scores[c] = list(np.around(c_scores[c], decimals=10)) + return c_scores + + def fit_label_difficulty(self, + X: NDArray, + y: NDArray, + kernel_bandwidth_override: Optional[float] = None) -> Dict[int, List[str]]: + """Performs self.fit() method, but translates scores to + safety levels (safe, border, rare, ...). + + :param X: The examples + :param y: The labels + :param kernel_bandwidth_override: An optional float value to be set as kernel bandwidth. 
+ If not set the algorithm uses automatic bandwidth tuning method, defaults to None + :return: Dictionary {class_label : [safety levels for the examples of a given class]}. + The ordering inside the lists is as if iterated over a minority class + """ + c_scores = self.fit(X, y, kernel_bandwidth_override) + return { + c: [self._switch_case(score) for score in scores] + for c, scores in c_scores.items() + } + + def fit_plot(self, + X: NDArray, + y: NDArray, + kernel_bandwidth_override: Optional[float] = None) -> List[Figure]: + """Performs self.fit() method, but presents its results + as a histogram. + + :param X: The examples + :param y: The labels + :param kernel_bandwidth_override: An optional float value to be set as kernel bandwidth. + If not set the algorithm uses automatic bandwidth tuning method, defaults to None + :return: List of plots, each for every class specified in classes + """ + c_scores = self.fit(X, y, kernel_bandwidth_override) + plots = [] + for k, v in c_scores.items(): + fig = plt.figure() + plt.title(f'Class {k}') + plt.hist(v) + plt.xlim((-1., 1.)) + plt.xlabel('Bins') + plt.ylabel('Safe scores') + plt.show() + plots.append(fig) + return plots + + def _switch_case(self, + score: float) -> str: + """Helper, switch-case-alike method to map + score to name. + + :param score: safe score + :return: A safe level name + """ + if 1 >= score > .7: + return 'safe' + elif .7 >= score > .3: + return 'border' + elif .3 >= score > .1: + return 'rare' + elif .1 >= score > 0: + return 'outlier' + elif score == 0.: + return 'zero' + elif score == -1: + return 'not enough info' + else: + raise ValueError('Score value out of spec!') + + def _bandwidth_tune(self) -> float: + """A method for automatic kernel bandwidth tuning. 
+ + :return: A numerical value being the kernel's bandwidth + required to compute scaling factor in kernel object + """ + self.knn.fit(self.X, self.y) + kernel_bandwidth, counter_for_avg = 0., 0 + for c in self.classes: + for x in self.X[self.y==c]: + # return all k nearest distances + distances, _ = self.knn.kneighbors([x], + self.k + 1, + return_distance=True) + # only accumulate the kth distance + kernel_bandwidth += distances.squeeze()[self.k] + counter_for_avg += 1 + kernel_bandwidth /= counter_for_avg + return kernel_bandwidth + + + def _get_kernel_object(self, + kernel_type: str, + kernel_bandwidth: float) -> KernelFunctionSubclass: + """Helper, switch-case-alike method to map krenel type (str) + to an actual, appropriate kernel function object. + + :param kernel_type: name of the kernel function as specified in + class constructor + :param kernel_bandwidth: A numerical value being kernel's bandwidth + either computed automatically (self._bandwidth_tune() method) or + acquired with kernel_bandwidth_override attribute in self.fit() method + :return: Kernel Function object respective to kernel_type + """ + if kernel_type == 'epanechnikov': + return EpanechnikovKernelFunction(kernel_bandwidth) + elif kernel_type == 'triangular': + return TriangularKernelFuntion(kernel_bandwidth) + elif kernel_type == 'uniform': + return UniformKernelFunction(kernel_bandwidth) + + +class KNNDataDifficulty(DataDifficulty): + """Class which performs DataDifficulty algorithm, + resulting in a dictionary. (see self._get_names_dict method) + """ + + def __init__(self, + classes: Optional[List[int]] = None, + k: int=5, + n_jobs: int=-1) -> None: + """Class constructor. 
+ + :param classes: List of classes to be processed, defaults to None + :param k: No neighbors, defaults to 5 + :param n_jobs: No threads for distributed KNN action, defaults to -1 + """ + super().__init__(classes, k, n_jobs) + + def fit(self, + X: NDArray, + y: NDArray) -> Dict[int, Dict[str, int]]: + """Method for running the algorithm. + + :param X: The examples + :param y: The labels + :return: Dictionary {class_label : {safe_level_name : count}} + """ + if self.classes is None: + self.classes = self._induce_classes(y) + self.X, self.y = X, y + self.knn.fit(self.X, self.y) + non_c_count = {} + for c in self.classes: + non_c_count[c] = self._get_names_dict() + for x in self.X[self.y==c]: + neighbors = self.knn.kneighbors([x], self.k + 1, return_distance=False) + neighbors = np.squeeze(neighbors)[1:] + neighbors = self.y[neighbors] + count_non_c = sum(neighbors != c) + non_c_count[c][self._switch_case(count_non_c)] += 1 + return non_c_count + + def fit_plot(self, + X: NDArray, + y: NDArray) -> List[Figure]: + """Performs fit method and returns its results as histograms. + + :param X: The examples + :param y: The labels + :return: List of plots, each for every class specified in classes + """ + fit_dict = self.fit(X, y) + plots = [] + for k, v in fit_dict.items(): + fig = plt.figure() + plt.title(f'Class {k}') + plt.xticks(rotation=90) + plt.bar(v.keys(), v.values()) + plt.xlabel('Type') + plt.ylabel('Count') + plt.show() + plots.append(fig) + return plots + + def _get_names_dict(self) -> Dict[str, int]: + """Method for safe level dictionary creation. + + :return: safe level dictionary + """ + return {'safe': 0, 'border': 0, 'rare': 0, 'outlier': 0} + + def _switch_case(self, + count: int) -> str: + """Helper, switch-case-alike method to map + count to name. 
+ + :param count: Non-c neighbor count + :return: A safe level name + """ + if count < 2: + return 'safe' + elif count < 4: + return 'border' + elif count < 5: + return 'rare' + else: + return 'outlier' + + +class ContinuousDataDifficulty(DataDifficulty): + """Class which performs DataDifficulty algorithm, + resulting in a histogram. (see self.fit_plot method) + """ + + def __init__(self, + classes: Optional[List[int]] = None, + k: int = 5, + n_jobs: int = -1) -> None: + """Class constructor. + + :param classes: List of classes to be processed, defaults to None + :param k: No neighbors, defaults to 5 + :param n_jobs: No threads for distributed KNN action, defaults to -1 + """ + super().__init__(classes, k, n_jobs) + + def fit(self, + X: NDArray, + y: NDArray, + similarities: Optional[NDArray] = None) -> Dict[int, List[float]]: + """Method for running the algorithm. + + :param X: The examples + :param y: The labels + :param similarities: Similarity matrix, defaults to None + :raises ValueError: When user passed a non-minority class to + classes inside the constructor + :return: Dictionary {class_label : [scores for the examples of a given class]}. + The ordering inside the lists is as if iterated over a minority class + """ + induced_classes = self._induce_classes(y) + if self.classes is None: + self.classes = induced_classes + elif not set(self.classes).issubset(set(induced_classes)): + error_msg = '''Cannot perform this method on non-minority classes! 
+ (majority class has been passed to classes)''' + raise ValueError(error_msg) + if similarities is None: + all_classes = np.unique(y) + self.similarities = np.zeros(shape=(all_classes.shape[0], + all_classes.shape[0]), + dtype=np.int64) + # self.classes are the classes to be processed + for c in self.classes: + # all_classes is a set of every possible class in the dataset + for cc in all_classes: + if c == cc: continue + # if current cc (any kind of class) is a minority class + if cc in self.classes: + self.similarities[c, cc] = 1 + else: + self.similarities = similarities + self.X, self.y = X, y + self.knn.fit(self.X, self.y) + c_scores = {} + for c in self.classes: + c_scores[c] = [] + for x in self.X[self.y==c]: + neighbors = self.knn.kneighbors([x], self.k + 1, return_distance=False) + neighbors = np.squeeze(neighbors)[1:] + neighbors = self.y[neighbors] + non_c_neighbors = neighbors[neighbors != c] + # if no non-c neighbors, the score is zero + if not np.size(non_c_neighbors): + c_scores[c].append(0.) + # else, the score is calculated according to the formula + else: + safe_score = [self.similarities[c,neighbor] for neighbor in non_c_neighbors] + safe_score = sum(safe_score) + safe_score /= self.k + c_scores[c].append(safe_score) + return c_scores + + def fit_plot(self, + X: NDArray, + y: NDArray, + similarities: Optional[NDArray] = None) -> List[Figure]: + """Performs fit method and returns its results as histograms. 
+ + :param X: The examples + :param y: The labels + :param similarities: Similarity matrix, defaults to None + :return: List of plots, each for every class specified in classes + """ + c_scores = self.fit(X, y, similarities) + plots = [] + for k, v in c_scores.items(): + fig = plt.figure() + plt.title(f'Class {k}') + plt.hist(v) + plt.xlim((0., 1.)) + plt.xlabel('Bins') + plt.ylabel('Safe scores') + plt.show() + plots.append(fig) + return plots + + +class BorderClassMatrix(DataDifficulty): + """Class which performs Border Class Matrix algorithm, + resulting in a border class matrix. + """ + + def __init__(self, + classes: Optional[List[int]] = None, + k: int = 5, + n_jobs: int = -1) -> None: + """Class constructor. + + :param classes: List of classes to be processed, defaults to None + :param k: No neighbors, defaults to 5 + :param n_jobs: No threads for distributed KNN action, defaults to -1 + """ + super().__init__(classes, k, n_jobs) + + def fit(self, + X: NDArray, + y: NDArray) -> NDArray: + """Runs the algorithm. 
+ + :param X: The examples + :param y: The labels + :return: Border class matrix + """ + if self.classes is None: + self.classes = self._induce_classes(y) + self.X, self.y = X, y + all_classes_count = int(np.max(y)) + 1 + neighbor_matrix = np.zeros((all_classes_count, + all_classes_count), + dtype=np.int64) + self.knn.fit(self.X, self.y) + for c in self.classes: + for x in self.X[self.y==c]: + neighbors = self.knn.kneighbors([x], self.k + 1, return_distance=False) + neighbors = np.squeeze(neighbors)[1:] + neighbors = self.y[neighbors] + non_c_neighbors = neighbors[neighbors != c] + if not np.size(non_c_neighbors): continue + for neighboring_class in non_c_neighbors: + neighbor_matrix[c, neighboring_class] += 1 + return neighbor_matrix + + def fit_plot(self, + X: NDArray, + y: NDArray) -> Figure: + matrix = self.fit(X, y) + fig = heatmap(data=matrix, + annot=True, + cmap='jet') + plt.title('Per class crossovers') + plt.show() + return fig + + +class WCSSElbowMethod(ABC): + """An abstract class serving as parent class to: + + * KMeansElbowMethod + * GaussianMixtureElbowMethod. + + This class computes within-cluster sum of squares + for each cluster, created by underlying clustering method, + and produces an elbow plot. Upon the analysis of this plot + user can assess how many clusters a given class comprises of. + """ + def __init__(self, + classes: List[int], + cluster_count_selection: List[int]) -> None: + """Class constructor. + + :param classes: List of classes to be processed, defaults to None + :param cluster_count_selection: List of cluster counts to be put on + x axis of the elbow plot + """ + super().__init__() + self.classes = classes + self.cluster_count_selection = cluster_count_selection + + def _induce_classes(self, + y: NDArray) -> NDArray: + """Method for automatic minority class list induction + if none was provided by the user. 
+ + :param y: The labels + :return: An array of minority class labels + """ + distribution = np.unique(y, return_counts=True) + majority_cardinality = max(distribution[1]) + return distribution[0][distribution[1] < majority_cardinality] + + @abstractmethod + def _cluster(self, + n_clusters: int) -> NDArray: + """Performs clustering, which would in n_clusters clusters, + with some clustering algorithm, to be implemented inside child classes. + + :param n_clusters: No clusters for the clustering method + :return: An array of new labels (not y) artificially created by + clustering algorithm + """ + pass + + def _wcss(self, + X: NDArray, + clustering_induced_labels: NDArray) -> float: + """Computes within-cluster sum of squares for all clusters, + for a given cluster labeling. + + :param X: The examples + :param clustering_induced_labels: cluster-induced labels + :return: WCSS value + """ + meta_sum = 0. + for label in np.unique(clustering_induced_labels): + X_with_label = X[clustering_induced_labels == label] + mean = np.mean(X_with_label, axis=0) + sum_of_squares = 0. + for data_point in X_with_label: + sum_of_squares += np.square(data_point - mean) + meta_sum += sum_of_squares + return meta_sum.sum().item() + + def fit(self, + X: NDArray, + y: NDArray) -> Dict[int, List[Tuple[int, float]]]: + """Runs the algorithm, resulting in a dictionary. 
+ + :param X: The examples + :param y: The labels + :return: A dictionary {class_name : [(cluster_count, wcss for the given cluster_count), ...]} + """ + self.X, self.y = X, y + if self.classes is None: + self.classes = self._induce_classes(y) + wcss_dict = {} + for c in self.classes: + X_c = self.X[self.y == c] + wcss_dict[c] = [] + for cluster_count in self.cluster_count_selection: + clustering_induced_labels = self._cluster(X=X_c, + n_clusters=cluster_count) + # results for a single cluster + wcss_results = self._wcss(X_c, clustering_induced_labels) + # add to the results of all clusters + wcss_dict[c].append((cluster_count, wcss_results)) + return wcss_dict + + def fit_plot(self, + X: NDArray, + y: NDArray) -> List[Figure]: + """ + Performs fit method and returns its results as plots. + :return: List of plots, each for every class specified in classes + """ + wcss_dict = self.fit(X, y) + plots = [] + for k, v in wcss_dict.items(): + fig = plt.figure() + plt.title(f'Class {k}') + x, y = list(zip(*v)) + plt.plot(x, y) + plt.xticks(x) + plt.xlabel('Clusters') + plt.ylabel('WCSS') + plt.show() + plots.append(fig) + return plots + + +class KMeansElbowMethod(WCSSElbowMethod): + """Performs elbow method with KMeans as underlying + clustering algorithm. + """ + def __init__(self, + classes: List[int], + cluster_count_selection: List[int]) -> None: + """Class constructor. + + :param classes: List of classes to be processed, defaults to None + :param cluster_count_selection: List of cluster counts to be put on + x axis of the elbow plot + """ + super().__init__(classes, cluster_count_selection) + + def _cluster(self, + X: NDArray, + n_clusters: int) -> NDArray: + """Performs clustering with KMeans. 
+ + :param X: The examples + :param n_clusters: Desired cluster count + :return: An array of clustering induced labels + """ + kmeans = KMeans(n_clusters=n_clusters) + kmeans.fit(X) + return kmeans.labels_ + + +class GaussianMixtureElbowMethod(WCSSElbowMethod): + """Performs elbow method with GaussianMixture(Expectation/Maximization) + as underlying clustering algorithm. + """ + def __init__(self, + classes: List[int], + cluster_count_selection: List[int]) -> None: + """Class constructor. + + :param classes: List of classes to be processed, defaults to None + :param cluster_count_selection: List of cluster counts to be put on + x axis of the elbow plot + """ + super().__init__(classes, cluster_count_selection) + + def _cluster(self, + X: NDArray, + n_clusters: int) -> NDArray: + """Performs clustering with GaussianMixture(Expectation/Maximization). + + :param X: The examples + :param n_clusters: Desired cluster count + :return: An array of clustering induced labels + """ + gm = GaussianMixture(n_components=n_clusters) + return gm.fit_predict(X) From b3e4301f35421614165769990843f0b6b8603493 Mon Sep 17 00:00:00 2001 From: Adam Wojciechowski Date: Tue, 30 May 2023 11:00:38 +0200 Subject: [PATCH 47/48] correct typo --- .../data_difficulty/{data_difficuulty.py => data_difficulty.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename multi_imbalance/data_difficulty/{data_difficuulty.py => data_difficulty.py} (100%) diff --git a/multi_imbalance/data_difficulty/data_difficuulty.py b/multi_imbalance/data_difficulty/data_difficulty.py similarity index 100% rename from multi_imbalance/data_difficulty/data_difficuulty.py rename to multi_imbalance/data_difficulty/data_difficulty.py From 1d1b04a6a7993de7654885bb4edeb132e1452411 Mon Sep 17 00:00:00 2001 From: Adam Wojciechowski Date: Tue, 30 May 2023 11:14:38 +0200 Subject: [PATCH 48/48] exemplary notebook with instructions on how to use --- examples/data_difficulty/X.npy | Bin 0 -> 3968 bytes 
examples/data_difficulty/X_altered.npy | Bin 0 -> 2048 bytes .../data_difficulty/notebook_example.ipynb | 1142 +++++++++++++++++ examples/data_difficulty/y.npy | Bin 0 -> 1088 bytes examples/data_difficulty/y_altered.npy | Bin 0 -> 608 bytes 5 files changed, 1142 insertions(+) create mode 100644 examples/data_difficulty/X.npy create mode 100644 examples/data_difficulty/X_altered.npy create mode 100644 examples/data_difficulty/notebook_example.ipynb create mode 100644 examples/data_difficulty/y.npy create mode 100644 examples/data_difficulty/y_altered.npy diff --git a/examples/data_difficulty/X.npy b/examples/data_difficulty/X.npy new file mode 100644 index 0000000000000000000000000000000000000000..66c0ceae369e35eea1a82f538e550c445f41fef9 GIT binary patch literal 3968 zcmbVP`8yPf-z8ZhOQ9k{ib#~QwCIy8MZ|ScLKIxwJ1tNBH0PWXHrB6 zS(?FM#xR(XS&V)8_Wld+`R$zF&U2pgoaZDScd)i~;^zzDyQ1#p9po9PzE4^Gz*$3e zEoJqy{(-@P9)52AfnMJKr<;3R4D$Y?2c7r$$NLZ8rn_gSma?vfmU4vh{}ZCSUljHD zb~0e~p(jGu_sl?yX!cBDp9mo%iaI}#wSa|B^+e3(>(Mm4Bjdo1Da;x9>!Zl=MP%v| zniRjWL2@ycmqj0h=F&5_EpIR}HA8ZEf;a%Z!vn*;JLXZAul$n3hJJ8*Yh={Jra@+- z2bYr74kraZCp~H(2f57MIsPUi_*l^^aBDjotn!{jT~#B)G0WGF65meZ#kRuv!$wIM zdPm#8OKlRpNpIP|^%^03reL;9OMu{3Oultaw*fsnm#4+71~8JSxK2%L7~XCz>o@#3 zg3RDIjLZ{ENdGAIEBh23#c#OV?B)p*S6OURUP2M?_6>g(T;&H#aw&@?xf-?+oD&7(@cz&d&=e6ABw?V4!Y0+B zUZ`0&Ods-S)ynbHd%>R zuH3tVz)~?g(oNuD(xV!^JkAfKG!_a*>5QRZ0QJWUu57P@*CVkUj!~AHq6x(4Oiyj`Zs85`Qe+nvg#Dcd$ryUV@{xLw64QEu+KJSGy9=jDTKcXyt=N z0RrwLQQb$FVCpKl;|$Os!FspR&b2JKEL&Q(y}Jiw3A)$yH*j#WDt(sM%Y*zNBU`HX z6x1|to4iy`frr9*Pm^=Fuqa>{#^Q}5ztzqUQtADmwley3DvOJ88j*fov>UJnZS7d~;|dol4ZWHCU^@S%8| z{SdB_j(Hce&c)kCy1*|w>$?G&jCDcQTJ7X1C`g9>_jyBjw(iK1 z@k|3~uIC*N4Vc5Xd7tI)-Ry#RmoGO`T{yT=-AnpW6cb!KYXdr(dqHlOWK>%S1u~8X z$E!v&Kt#yfgl9AZYZJv2wtJA^pC3z1SN$1u=bZJ*r1iknhBomBBW+N8XErhT-Jia% zdHEtsJWQV%S)O`&rYEq_Ceik;cK0xh z@m6$lUQgmPN6Grx9uoMky}x$L2`=cr)H^n!ON5B(ptk?Yhrv%+qWaKHVZz|h`5hMf zC!y-Hi&=%x0A!|S7>V2%fgsk7#PG6O&=LB)+WE;aT%4)adawNpy**n#Q4QHp`Iz=v 
z;2;-_BW0e(8}d*r#_Zd!(pJ3V@?JhEg^DRph9et&NTAjAb@zwI<8VX!>!E(Zew5Mb zxw)H8gEndVx8f5Q&y}XSS|#d=ujBMsCVkOoZOhu*1hsT$_5s? zT^A%0-IK3lxVzqpRu3F(*1oWgcv9 zn{36@#$^K&tvR5R_j;|nQi0Y}vLX3?bHIK)NG5I{08jmULidl(fYHw*MlMBkcy5)k znE!}NIysI}J&qf#ccz3?&td7m z#n;B)urNFC_!$);GJ4Pb{d@Kr2ZVE_7~kfrAtQH9i;Tnoe28U8S(i;CEBdBsYrj7% ztcoj8f6#`4SuJjCtqJHBa}Vf|C*o1>4s#=wXGlF0AQA3O1w+|=;xkJu@UOW~xGq11 z=L(Gt?Jo{spv#k6uh$QNDed$fcQy;{inVPg)A}I&g?(gM5gUnyrZb)XpMfEH&grx* z1K(DtHInCkf$Zh17U>^M5ZHKYu+NYOaxIHB8?^_)ICs=w>J>ks#nLsWG@}jV4y1I3 zDE!%T-KI8^Qw=D-JTN-%OT%XuN8f&6cYtq5=;n8w-7sCfX^qI<2^g1i78P7Wg0yCn ztBF6y@%9Dr9R5usxcx)itY`%d&UBjkDwF8Iw2&T1PvGMHJy|Iid2ArSh4(C#e&jBs z{(XCy3sRm<86`Ldv+|~dYt1AWTz!&q-c%}X%wjn5}B5(V!+kdS^B!o~mS-70UhN$`zUVAU49 z2p_}D3%(6eP~Y_O3{7Jmw!SYiVy~q`a%Q>T*8CnY%hEbGsy+z;+vh5B5*WaD`miXC zF^}O^?`1@|lUR9_mE+a>3)0jsnO$Bri3LV!r4e>32#(IIL5qdW6vRfZ zP=_%cs+^x}J+YsPb-vQA3bO(P{ru$ZohmHY>TXeP9mqnUVQ6C_VHMm%q?hO7j>sX`|foa><(hJB`B$*mOyZrxna zZpu7dmruc8Wy17l$&+BG@b|og*E9(FTBSz_a-sjhMbo|WLm;(Ys3gOciU%(qIwLL0 z0eLLDXQC~;ld6dV4Q_B;0m-y=^Z&L@vD;x;) zSmJ`je|wgvS1n_^J4MV;VjAjRSPIc9Xc)hfwE21l3xDqoI(xjEjoZ{tsVsI6pp3P) zL$L}472|?-+VpqB>zD`u`|~6$o0>V9Gr)!DRTmujdV9gl`jPyO@=3g-Nm4Gm)Pz^- zjhnWQ)SyD}Dgn_DF34(Ksun%mgCc&t8>aS71C5kl816ZT1F}X{A^}{;mHW(JCo_WY zQ|stgZx^9VbhPQwEi-sl;R`1qVF*^xBN>%%zTl^OQcB8Shu~`H(Dt%`eher)xstq> zhuVtIuY{C*hn$o!0Ur!twGErtc#>lWF@}B zer5514S#m<-e{dcPU;B6TJ?n=xhpY<^N_~+l8lj$&yd4pN2cX~M*72>npd*Qmv1F+kz@)V(S5^jFU zsNQZe3_gyx3(DdL;X&$154HF=G|v3&Q&8qWqlu5^JDV}kq859)H&r3AA+_NA)iG?o zyAUCw(FIB;^J{9{C^#dv(dzs{HYV)S$&sS|$=AGBJ)?t$1*?u+yAso30gtFud(Uy`LLT*ro6*?J6x zEgX>kQF|v-n+SDlY${K;bU;R_gNB#*D9}`F8bujA92Yy%@Te;taMd_RY=i+0se9Uk zUHf6n-UubS;x{;S-rCzRhK1{hZ{j0jjz-HZEQqY&Opn zQ>g5LcS*NjYQ12Ao&Ka!@=X>jtJ>HqUTY< zAq9e`|BahBF8R6o-|)EdzixKPH}J~O9_V!`;L1-|)z;BEq@bgANFiL||6~idT9!50 z)B_1$>~!W*Cn58;@T=2#bg-5SyJu#P$9&zhNQM70LG8yj#qJ%Ah#9&@*r5?elu%RX@{Ri$O7wj>Vd?Anr>KP^5e|thO^NxDz}r<7w@5T4N}5kbjEP5DYVUOo?6#8i*kzMX(b z8X;0?x*KY00wxV#)}rvi4{`WTQ7n=sY91V31bwdp-hS?zfOOVOmFnFOaJ-Oq*Ib(m 
z5hl;B?bDnB4QCs^Y*7V7+C1eI`c1-jUi9T{ZG&)rWmJ2&Ne2+qbgB5|W*9Asfy2{OSH%eACjr+g(V;H!=9o;FdcrsQYjyYRXU1?5#PK=e&sq( zE@g3nDpB00AVUS>OQlm~>s?Tl=qV%U`wsn+UriY})(e{HiBsn!hvAU}{i@2udsq$5 z^;*3<2KR`69P93FfjOH^8d1KJ;Ie;1it%g{<}O3Ukz)lhyJh;lFSaarK9SoxCpd&$ z+rCjKxLz=;*eE~Ev!lgK+M6#5ntK|*JBdLn!37R zmDrh=0|=T zf|W57DVcZ?Y?D!rA;E|Xcl2du%gJvcg%z))yq5;5n#@Le2M=2H*G!hAM&Yo^g%Jb! z=ctbMHM{X^Cxl;!SJk@q9Mad=*(^sqHX2SoHAxZ0B;M&qj!fe)d1Z4wLLMFq={SPR zH6I7hJEjsO%`sSZ5_?&2@&k0BFiz^fL(nZOD_yrz2PfKQy|1Ssw4z%O8&J%JPeu;~ z9?r7CP2DTF_bCe;ei+=Qi@!i^u1O-X{0Wf4H*YB04*}t{*G&$o4K@y#F5r%igF#jS zekx)LyoFT~l@7Fl;#(DwW$rK-PPr07uJU1uTzdFk2@kZF_D49|dH{0!a7(uWhZ*w% zibs4V;P&?!7aP_P_(o(KnU@cO&W%m;-(R!flG%*y=CwW~t`$?3)y@H#G%0myRW>Ya zEZ)Ce+6k?Mw#`~rA0W(M?hz5O;2?TPd*D%tQq-y&+1;X;;Bv^;a-SAB{>LugbS*k0 z>d#TWhd zk{2rbGc$uUKJd?He#6jX#m=+?1Vj%@R)0DqPSzm z5Cr|H9=d&)0k1+fGMH-fV4tjE9Qhc5v4y4hZ%RCn#{ z=rsxNOOu`@ZSDbm9!rl^KMee;r%Af=)$q&x1l=-^F?2%Nboge*1UxF#d)qG^oNU=mbw)6Y+k%L|!t)Yrk`QjFls4lW*q-<=z8%ZIA5rl5yx;+8N)y sFaVrBqvkXp2EyO2u}cUV1j>o~^NJlC=wn?Pl&>>D!MupPvHl4C2b_8S-2eap literal 0 HcmV?d00001 diff --git a/examples/data_difficulty/notebook_example.ipynb b/examples/data_difficulty/notebook_example.ipynb new file mode 100644 index 0000000..f8e6dbb --- /dev/null +++ b/examples/data_difficulty/notebook_example.ipynb @@ -0,0 +1,1142 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "23377c4d", + "metadata": {}, + "outputs": [], + "source": [ + "from DataDifficulty import *\n", + "from matplotlib import pyplot as plt\n", + "from pprint import pprint" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6f79cd9d", + "metadata": {}, + "outputs": [], + "source": [ + "X = np.load('X.npy')\n", + "y = np.load('y.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "c3bfc46d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X[:,0][y==0], X[:,1][y==0], c='black', label=0)\n", + "plt.scatter(X[:,0][y==1], X[:,1][y==1], c='brown', label=1)\n", + "plt.scatter(X[:,0][y==2], X[:,1][y==2], c='green', label=2)\n", + "plt.scatter(X[:,0][y==3], X[:,1][y==3], c='blue', label=3)\n", + "plt.title('Multiclass imbalanced data')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d05f7414", + "metadata": {}, + "outputs": [], + "source": [ + "MINORITY_CLASSES = [0, 2]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c81c3cb3", + "metadata": {}, + "source": [ + "For quick, tune-free summary report run this method:\n", + "\n", + "(In general user can omit `classes` parameter, and any method will automatically detect minority classes. Despite of that ability, it is encouraged to explicitly pass minority classes as list of labels, because since the definition of *minority* and *majority* class may vary between any project, thus the automatic minority class detection views the class with maximal cardinality among all the classes as *majority* class, and any class of cardinality smaller than the cardinality of majority class as *minority* class.)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3074d154", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Start computing imbalance ratios...\n", + "Imbalance ratios: \n", + "class 0 : 5.0\n", + "class 1 : 1.0\n", + "class 2 : 5.0\n", + "class 3 : 1.0\n", + "Start data difficulty study...\n", + "KNN based method\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjIAAAHnCAYAAACi17dYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAuSUlEQVR4nO3de1RVdeL+8eeAeECEI5ogKAkq3i9ppqOW5ehoapbfsrtJNtoNJXUyY33Tbl/TnFIrHbuh0sWszJwaS8tLOpWSiklqeVfIa2KAiiDC/v3R8vyGARHwwD4ffL/W2mt5Pnvvw4NnKQ+ffXNYlmUJAADAQD52BwAAAKgoigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDADbREVF6f7777c7BgCDUWQAeNyePXv00EMPqUmTJvL391dwcLB69OihV155RWfOnLE73kXl5eVpwoQJioiIUEBAgLp27aqvv/7a7lgASlDD7gAAqpelS5fq9ttvl9Pp1LBhw9S2bVudPXtW3377rcaPH69t27bpzTfftDtmqe6//34tWrRIY8aMUUxMjObPn68BAwZo9erVuvbaa+2OB+A/OHhoJABP2bdvn9q3b69GjRpp1apVCg8PL7J+9+7dWrp0qR577DFJfxxauuGGGzR//nwb0pbshx9+UNeuXfX3v/9djz/+uCQpNzdXbdu2VWhoqL7//nubEwL4TxxaAuAx06ZN06lTp5SYmFisxEhSs2bN3CWmJCdOnNDjjz+udu3aqXbt2goODlb//v21ZcuWYtu+9tpratOmjWrVqqWQkBB17txZCxYscK8/efKkxowZo6ioKDmdToWGhuovf/mLUlJSSv0eFi1aJF9fXz344IPuMX9/f/31r3/VunXrlJ6eXpa/CgBVhENLADzm888/V5MmTdS9e/cK7b93714tWbJEt99+u6Kjo3X06FG98cYbuv7667V9+3ZFRERIkt566y3Fx8dryJAheuyxx5Sbm6vU1FQlJyfrnnvukSQ9/PDDWrRokUaNGqXWrVsrIyND3377rX7++Wd16tTpghk2b96s5s2bKzg4uMh4ly5dJEk//vijIiMjK/T9AfA8igwAj8jOztbBgwd1yy23VPg92rVrp507d8rH5/9PFt93331q2bKlEhMTNXHiREl/nIfTpk0bffzxxxd8r6VLl2rkyJF6+eWX3WNPPPHERTMcPny4xNmk82OHDh0q8/cDoPJxaAmAR2RnZ0uSgoKCKvweTqfTXWIKCgqUkZGh2rVrq0WLFkUOCdWpU0e//vqrNmzYcMH3qlOnjpKTk8tdPM6cOSOn01ls3N/f370egPegyADwiPOHYk6ePFnh9ygsLNSMGTMUExMjp9OpK664QvXr11dqaqqysrLc202YMEG1a9dWly5dFBMTo7i4OH333XdF3mvatGnaunWrIiMj1aVLFz3zzDPau3fvRTMEBAQoLy+v2Hhubq57PQDvQZEB4BHBwcGKiIjQ1q1bK/weL7zwgsaNG6eePXvqvffe0/Lly/X111+rTZs2KiwsdG/XqlUr7dixQwsXLtS1116rTz75RNdee62efvpp9zZ33HGH9u7dq9dee00RERH6+9//rjZt2ujLL78sNUN4eLgOHz5cbPz82PnzdAB4B4oMAI+56aabtGfPHq1bt65C+y9atEi9evVSYmKi7rrrLvXt21d9+vRRZmZmsW0DAwN15513at68eUpLS9PAgQM1efJk98yJ9EcpefTRR7VkyRLt27dP9erV0+TJk0vNcNVVV2nnzp3uQ2XnJScnu9cD8B4UGQAe88QTTygwMFAjRozQ0aNHi63fs2ePXnnllQvu7+vrq/++tdXHH3+sgwcPFhnLyMgo8rpmzZpq3bq1LMtSfn6+CgoKihyKkqTQ0FBFRESUeNjoPw0ZMkQFBQVFbtqXl5enefPmqWvXrlyxBHgZrloC4DFNmzbVggULdOe
dd6pVq1ZF7uz7/fff6+OPPy712Uo33XSTnnvuOQ0fPlzdu3fXTz/9pPfff19NmjQpsl3fvn3VoEED9ejRQ2FhYfr55581a9YsDRw4UEFBQcrMzFSjRo00ZMgQdejQQbVr19aKFSu0YcOGIlcxlaRr1666/fbblZCQoGPHjqlZs2ZKSkrS/v37lZiY6Im/JgCeZAGAh+3cudMaOXKkFRUVZdWsWdMKCgqyevToYb322mtWbm6ue7vGjRtbsbGx7te5ubnW3/72Nys8PNwKCAiwevToYa1bt866/vrrreuvv9693RtvvGH17NnTqlevnuV0Oq2mTZta48ePt7KysizLsqy8vDxr/PjxVocOHaygoCArMDDQ6tChg/WPf/yjTPnPnDljPf7441aDBg0sp9NpXXPNNdayZcs88ncDwLN4RAEAADAW58gAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABir2t8Qr7CwUIcOHVJQUJAcDofdcQAAQBlYlqWTJ08qIiJCPj4Xnnep9kXm0KFD3FIcAABDpaenq1GjRhdcX+2LTFBQkKQ//iKCg4NtTgMAAMoiOztbkZGR7p/jF1Lti8z5w0nBwcEUGQAADHOx00I42RcAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgrBp2BzBZ1JNL7Y5w2do/daDdEQAAXoAZGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGPZWmTWrl2rQYMGKSIiQg6HQ0uWLHGvy8/P14QJE9SuXTsFBgYqIiJCw4YN06FDh+wLDAAAvIqtReb06dPq0KGDZs+eXWxdTk6OUlJSNHHiRKWkpGjx4sXasWOHbr75ZhuSAgAAb1TDzi/ev39/9e/fv8R1LpdLX3/9dZGxWbNmqUuXLkpLS9OVV15ZFREBAIAXs7XIlFdWVpYcDofq1KlzwW3y8vKUl5fnfp2dnV0FyQAAgB2MOdk3NzdXEyZM0N13363g4OALbjdlyhS5XC73EhkZWYUpAQBAVTKiyOTn5+uOO+6QZVmaM2dOqdsmJCQoKyvLvaSnp1dRSgAAUNW8/tDS+RJz4MABrVq1qtTZGElyOp1yOp1VlA4AANjJq4vM+RKza9curV69WvXq1bM7EgAA8CK2FplTp05p9+7d7tf79u3Tjz/+qLp16yo8PFxDhgxRSkqK/vWvf6mgoEBHjhyRJNWtW1c1a9a0KzYAAPASthaZjRs3qlevXu7X48aNkyTFxsbqmWee0WeffSZJuuqqq4rst3r1at1www1VFRMAAHgpW4vMDTfcIMuyLri+tHUAAABGXLUEAABQEooMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGMvWIrN27VoNGjRIERERcjgcWrJkSZH1lmVp0qRJCg8PV0BAgPr06aNdu3bZExYAAHgdW4vM6dOn1aFDB82ePbvE9dOmTdOrr76q119/XcnJyQoMDFS/fv2Um5tbxUkBAIA
3qmHnF+/fv7/69+9f4jrLsjRz5kw99dRTuuWWWyRJ77zzjsLCwrRkyRLdddddVRkVAAB4Ia89R2bfvn06cuSI+vTp4x5zuVzq2rWr1q1bd8H98vLylJ2dXWQBAADVk9cWmSNHjkiSwsLCioyHhYW515VkypQpcrlc7iUyMrJScwIAAPt4bZGpqISEBGVlZbmX9PR0uyMBAIBK4rVFpkGDBpKko0ePFhk/evSoe11JnE6ngoODiywAAKB68toiEx0drQYNGmjlypXusezsbCUnJ6tbt242JgMAAN7C1quWTp06pd27d7tf79u3Tz/++KPq1q2rK6+8UmPGjNH//d//KSYmRtHR0Zo4caIiIiI0ePBg+0IDAACvYWuR2bhxo3r16uV+PW7cOElSbGys5s+fryeeeEKnT5/Wgw8+qMzMTF177bVatmyZ/P397YoMAAC8iMOyLMvuEJUpOztbLpdLWVlZHj9fJurJpR59P5Td/qkD7Y4AAKhEZf357bXnyAAAAFwMRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACM5dVFpqCgQBMnTlR0dLQCAgLUtGlTPf/887Isy+5oAADAC9SwO0BpXnzxRc2ZM0dJSUlq06aNNm7cqOHDh8vlcik+Pt7ueAAAwGZeXWS+//573XLLLRo4cKAkKSoqSh988IF++OEHm5MBAABv4NWHlrp3766VK1dq586dkqQtW7bo22+/Vf/+/S+4T15enrKzs4ssAACgevLqGZknn3xS2dnZatmypXx9fVVQUKDJkyfr3nvvveA+U6ZM0bPPPluFKQEAgF28ekbmo48+0vvvv68FCxYoJSVFSUlJeumll5SUlHTBfRISEpSVleVe0tPTqzAxAACoSl49IzN+/Hg9+eSTuuuuuyRJ7dq104EDBzRlyhTFxsaWuI/T6ZTT6azKmAAAwCZePSOTk5MjH5+iEX19fVVYWGhTIgAA4E28ekZm0KBBmjx5sq688kq1adNGmzdv1vTp0/XAAw/YHQ0AAHgBry4yr732miZOnKhHH31Ux44dU0REhB566CFNmjTJ7mgAAMALeHWRCQoK0syZMzVz5ky7owAAAC/k1efIAAAAlIYiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgrAoVmSZNmigjI6PYeGZmppo0aXLJoQAAAMqiQkVm//79KigoKDael5engwcPXnIoAACAsijXnX0/++wz95+XL18ul8vlfl1QUKCVK1cqKirKY+EAAABKU64iM3jwYEmSw+FQbGxskXV+fn6KiorSyy+/7LFwAAAApSlXkSksLJQkRUdHa8OGDbriiisqJRQAAEBZVOihkfv27fN0DgAAgHKr8NOvV65cqZUrV+rYsWPumZrz5s6de8nBAAAALqZCRebZZ5/Vc889p86dOys8PFwOh8PTuQAAAC6qQkXm9ddf1/z583Xfffd5Og8AAECZVeg+MmfPnlX37t09nQUAAKBcKlRkRowYoQULFng6CwAAQLlU6NBSbm6u3nzzTa1YsULt27eXn59fkfXTp0/3SDgAAIDSVKjIpKam6qqrrpIkbd26tcg6TvwFAABVpUJFZvXq1Z7OAQAAUG4VOkcGAADAG1RoRqZXr16lHkJatWpVhQMBAACUVYWKzPnzY87Lz8/Xjz/+qK1btxZ7mCQAAEBlqVCRmTFjRonjzzz
zjE6dOnVJgQAAAMrKo+fIDB06lOcsAQCAKuPRIrNu3Tr5+/t78i0BAAAuqEKHlm699dYiry3L0uHDh7Vx40ZNnDjRI8EAAAAupkJFxuVyFXnt4+OjFi1a6LnnnlPfvn09EgwAAOBiKlRk5s2b5+kcAAAA5VahInPepk2b9PPPP0uS2rRpo44dO3okFAAAQFlUqMgcO3ZMd911l7755hvVqVNHkpSZmalevXpp4cKFql+/viczAgAAlKhCVy2NHj1aJ0+e1LZt23TixAmdOHFCW7duVXZ2tuLj4z2dEQAAoEQVmpFZtmyZVqxYoVatWrnHWrdurdmzZ3OyLwAAqDIVmpEpLCyUn59fsXE/Pz8VFhZecigAAICyqFCR+fOf/6zHHntMhw4dco8dPHhQY8eOVe/evT0WDgAAoDQVKjKzZs1Sdna2oqKi1LRpUzVt2lTR0dHKzs7Wa6+95umMAAAAJarQOTKRkZFKSUnRihUr9Msvv0iSWrVqpT59+ng0HAAAQGnKNSOzatUqtW7dWtnZ2XI4HPrLX/6i0aNHa/To0brmmmvUpk0b/fvf/66srAAAAEWUq8jMnDlTI0eOVHBwcLF1LpdLDz30kKZPn+6xcAAAAKUpV5HZsmWLbrzxxguu79u3rzZt2nTJoQAAAMqiXEXm6NGjJV52fV6NGjX022+/XXIoAACAsihXkWnYsKG2bt16wfWpqakKDw+/5FAAAABlUa4iM2DAAE2cOFG5ubnF1p05c0ZPP/20brrpJo+Fk/64P83QoUNVr149BQQEqF27dtq4caNHvwYAADBTuS6/fuqpp7R48WI1b95co0aNUosWLSRJv/zyi2bPnq2CggL97//+r8fC/f777+rRo4d69eqlL7/8UvXr19euXbsUEhLisa8BAADMVa4iExYWpu+//16PPPKIEhISZFmWJMnhcKhfv36aPXu2wsLCPBbuxRdfVGRkpObNm+cei46O9tj7AwAAs5X7hniNGzfWF198od9//127d++WZVmKiYmplFmSzz77TP369dPtt9+uNWvWqGHDhnr00Uc1cuTIC+6Tl5envLw89+vs7GyP5wIAAN6hQo8okKSQkBBdc8016tKlS6Ud6tm7d6/mzJmjmJgYLV++XI888oji4+OVlJR0wX2mTJkil8vlXiIjIyslGwAAsJ/DOn98yAvVrFlTnTt31vfff+8ei4+P14YNG7Ru3boS9ylpRiYyMlJZWVkl3sjvUkQ9udSj74ey2z91oN0RAACVKDs7Wy6X66I/vys8I1MVwsPD1bp16yJjrVq1Ulpa2gX3cTqdCg4OLrIAAIDqyauLTI8ePbRjx44iYzt37lTjxo1tSgQAALyJVxeZsWPHav369XrhhRe0e/duLViwQG+++abi4uLsjgYAALyAVxeZa665Rp9++qk++OADtW3bVs8//7xmzpype++91+5oAADAC5T78uuqdtNNN3n8bsEAAKB68OoZGQAAgNJQZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYRhWZqVOnyuFwaMyYMXZHAQAAXsCYIrNhwwa98cYbat++vd1RAACAlzCiyJw6dUr33nuv3nrrLYWEhNgdBwAAeAkjikxcXJwGDhyoPn36XHTbvLw8ZWdnF1kAAED1VMPuABezcOFCpaSkaMOGDWXafsqUKXr22WcrORUAAPAGXj0jk56ersc
ee0zvv/++/P39y7RPQkKCsrKy3Et6enolpwQAAHbx6hmZTZs26dixY+rUqZN7rKCgQGvXrtWsWbOUl5cnX1/fIvs4nU45nc6qjgoAAGzg1UWmd+/e+umnn4qMDR8+XC1bttSECROKlRgAAHB58eoiExQUpLZt2xYZCwwMVL169YqNAwCAy49XnyMDAABQGq+ekSnJN998Y3cEAADgJZiRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFg17A4AeKOoJ5faHeGytX/qwEp9fz5b+1T2Z4vLEzMyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxvLqIjNlyhRdc801CgoKUmhoqAYPHqwdO3bYHQsAAHgJry4ya9asUVxcnNavX6+vv/5a+fn56tu3r06fPm13NAAA4AVq2B2gNMuWLSvyev78+QoNDdWmTZvUs2dPm1IBAABv4dUzMv8tKytLklS3bl2bkwAAAG/g1TMy/6mwsFBjxoxRjx491LZt2wtul5eXp7y8PPfr7OzsqogHAABsYMyMTFxcnLZu3aqFCxeWut2UKVPkcrncS2RkZBUlBAAAVc2IIjNq1Cj961//0urVq9WoUaNSt01ISFBWVpZ7SU9Pr6KUAACgqnn1oSXLsjR69Gh9+umn+uabbxQdHX3RfZxOp5xOZxWkAwAAdvPqIhMXF6cFCxbon//8p4KCgnTkyBFJksvlUkBAgM3pAACA3bz60NKcOXOUlZWlG264QeHh4e7lww8/tDsaAADwAl49I2NZlt0RAACAF/PqGRkAAIDSUGQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwVg27AwAAcKminlxqd4TL1v6pA239+szIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMJYRRWb27NmKioqSv7+/unbtqh9++MHuSAAAwAt4fZH58MMPNW7cOD399NNKSUlRhw4d1K9fPx07dszuaAAAwGZeX2SmT5+ukSNHavjw4WrdurVef/111apVS3PnzrU7GgAAsJlXF5mzZ89q06ZN6tOnj3vMx8dHffr00bp162xMBgAAvEENuwOU5vjx4yooKFBYWFiR8bCwMP3yyy8l7pOXl6e8vDz366ysLElSdna2x/MV5uV4/D1RNpXxef4nPlv78NlWX5X52fK52qeyPtfz72tZVqnbeXWRqYgpU6bo2WefLTYeGRlpQxpUFtdMuxOgsvDZVl98ttVTZX+uJ0+elMvluuB6ry4yV1xxhXx9fXX06NEi40ePHlWDBg1K3CchIUHjxo1zvy4sLNSJEydUr149ORyOSs1rkuzsbEVGRio9PV3BwcF2x4EH8dlWT3yu1Refbcksy9LJkycVERFR6nZeXWRq1qypq6++WitXrtTgwYMl/VFMVq5cqVGjRpW4j9PplNPpLDJWp06dSk5qruDgYP7hVFN8ttUTn2v1xWdbXGkzMed5dZGRpHHjxik2NladO3dWly5dNHPmTJ0+fVr
Dhw+3OxoAALCZ1xeZO++8U7/99psmTZqkI0eO6KqrrtKyZcuKnQAMAAAuP15fZCRp1KhRFzyUhIpxOp16+umnix2Gg/n4bKsnPtfqi8/20jisi13XBAAA4KW8+oZ4AAAApaHIAAAAY1FkAACAsSgyAADAWBQZAACqUH5+vnr37q1du3bZHaVaoMhcZjIzM/X2228rISFBJ06ckCSlpKTo4MGDNicDgMuDn5+fUlNT7Y5RbVBkLiOpqalq3ry5XnzxRb300kvKzMyUJC1evFgJCQn2hkOF5efnq0aNGtq6davdUVAJ9uzZo6eeekp33323jh07Jkn68ssvtW3bNpuT4VIMHTpUiYmJdseoFoy4IR48Y9y4cbr//vs1bdo0BQUFuccHDBige+65x8ZkuBR+fn668sorVVBQYHcUeNiaNWvUv39/9ejRQ2vXrtXkyZMVGhqqLVu2KDExUYsWLbI7Iiro3Llzmjt3rlasWKGrr75agYGBRdZPnz7dpmTm4YZ4lxGXy6WUlBQ1bdpUQUFB2rJli5o0aaIDBw6oRYsWys3NtTsiKigxMVGLFy/Wu+++q7p169odBx7SrVs33X777Ro3blyRf7M//PCDbr31Vv366692R0QF9erV64LrHA6HVq1aVYVpzMaMzGXE6XQqOzu72PjOnTtVv359GxLBU2bNmqXdu3crIiJCjRs3LvbbXUpKik3JcCl++uknLViwoNh4aGiojh8/bkMieMrq1avtjlBtUGQuIzfffLOee+45ffTRR5L+aP1paWmaMGGCbrvtNpvT4VIMHjzY7gioBHXq1NHhw4cVHR1dZHzz5s1q2LChTangSbt379aePXvUs2dPBQQEyLIsORwOu2MZhUNLl5GsrCwNGTJEGzdu1MmTJxUREaEjR46oW7du+uKLL4r9Fg/AXo8//riSk5P18ccfq3nz5kpJSdHRo0c1bNgwDRs2TE8//bTdEVFBGRkZuuOOO7R69Wo5HA7t2rVLTZo00QMPPKCQkBC9/PLLdkc0BkXmMvTdd99py5YtOnXqlDp16qQ+ffrYHQkekJmZqUWLFmnPnj0aP3686tatq5SUFIWFhfHbu6HOnj2ruLg4zZ8/XwUFBapRo4YKCgp0zz33aP78+fL19bU7Iipo2LBhOnbsmN5++221atXKff7T8uXLNW7cOK5KKweKTDVXt25d7dy5U1dccYUeeOABvfLKK0WuWEL1kJqaqj59+sjlcmn//v3asWOHmjRpoqeeekppaWl655137I6IcrIsS+np6apfv76OHz+un376SadOnVLHjh0VExNjdzxcogYNGmj58uXq0KFDkRO59+7dq/bt2+vUqVN2RzQG95Gp5s6ePes+wTcpKYkrk6qp85fW79q1S/7+/u7xAQMGaO3atTYmQ0VZlqVmzZrp119/VWRkpAYMGKA77riDElNNnD59WrVq1So2fuLECTmdThsSmYuTfau5bt26afDgwbr66qtlWZbi4+MVEBBQ4rZz586t4nTwlA0bNuiNN94oNt6wYUMdOXLEhkS4VD4+PoqJiVFGRgblpRq67rrr9M477+j555+X9MfFF4WFhZo2bVqpl2ajOIpMNffee+9pxowZ2rNnjxwOh7KyspiVqYa4tL56mjp1qsaPH685c+aobdu2dseBB02bNk29e/fWxo0bdfbsWT3xxBPatm2bTpw4oe+++87ueEbhHJnLSHR0tDZu3Kh69erZHQUeNmLECGVkZOijjz5S3bp1lZqaKl9fXw0ePFg9e/bUzJkz7Y6ICggJCVFOTo7OnTunmjVrFptNPf+8NJgpKytLs2bNKnLxRVxcnMLDw+2OZhSKDFANcGl99ZSUlFTq+tjY2CpKAngvisxl5vTp01qzZo3S0tJ09uzZIuvi4+NtSgVP+fbbb5Wamsql9YAXSk1NVdu2beXj43PRp1+3b9++ilKZjyJzGdm8ebMGDBignJwcnT59WnXr1tXx48dVq1YthYaGau/
evXZHBHABubm5xX75CA4OtikNKsLHx0dHjhxRaGiofHx85HA4VNKPYIfDwUNgy4GTfS8jY8eO1aBBg/T666/L5XJp/fr18vPz09ChQ/XYY4/ZHQ/l9Oqrr5Z5W2bbzHT69GlNmDBBH330kTIyMoqt54edWfbt2+c++X7fvn02p6k+mJG5jNSpU0fJyclq0aKF6tSpo3Xr1qlVq1ZKTk5WbGysfvnlF7sjohz++/k7v/32m3JyclSnTh1Jf9zpl9k2s8XFxWn16tV6/vnndd9992n27Nk6ePCg3njjDU2dOlX33nuv3REB2zEjcxnx8/OTj88f90AMDQ1VWlqaWrVqJZfLpfT0dJvTobz+8ze6BQsW6B//+IcSExPVokULSdKOHTs0cuRIPfTQQ3ZFxCX6/PPP9c477+iGG27Q8OHDdd1116lZs2Zq3Lix3n//fYqMYT777LMyb3vzzTdXYpLqhRmZy0jfvn11//3365577tHIkSOVmpqq+Ph4vfvuu/r999+VnJxsd0RUUNOmTbVo0SJ17NixyPimTZs0ZMgQprENVbt2bW3fvl1XXnmlGjVqpMWLF6tLly7at2+f2rVrx23sDXP+F8mL4RyZ8uERBZeRF154wX1/gsmTJyskJESPPPKIjh8/XuJdYWGOw4cP69y5c8XGCwoKdPToURsSwROaNGniLqEtW7bURx99JOmPmZrzhxBhjsLCwjItlJjyYUbmMnLmzBlZluV+vsf+/fv16aefqnXr1urXr5/N6XApBg0apIMHD+rtt99Wp06dJP0xG/Pggw+qYcOG5ZrShveYMWOGfH19FR8frxUrVmjQoEGyLEv5+fmaPn06J+kb7J133tGdd95Z7LlKZ8+e1cKFCzVs2DCbkpmHInMZ6du3r2699VY9/PDDyszMVMuWLeXn56fjx49r+vTpeuSRR+yOiAr67bffFBsbq2XLlsnPz0+SdO7cOfXr10/z589XaGiozQlRXvn5+brxxhv1+uuvu5+1dODAAW3atEnNmjXjPiOG8/X11eHDh4v928zIyFBoaCizMuXAyb6XkZSUFM2YMUOStGjRIoWFhWnz5s365JNPNGnSJIqMoSzL0pkzZ/TJJ5/o119/1c8//yzpj0MRzZs3tzkdKsrPz6/YTdMaN26sxo0b25QInmRZlhwOR7HxX3/9VS6Xy4ZE5qLIXEZycnIUFBQkSfrqq6906623ysfHR3/605904MABm9OhoizLUrNmzbRt2zbFxMTwpORqZOjQoUpMTNTUqVPtjgIP6dixoxwOhxwOh3r37q0aNf7/j+GCggLt27dPN954o40JzUORuYw0a9ZMS5Ys0f/8z/9o+fLlGjt2rCTp2LFj3CHUYD4+PoqJiVFGRgYlppo5d+6c5s6dqxUrVujqq68u9sys6dOn25QMFTV48GBJ0o8//qh+/fqpdu3a7nU1a9ZUVFSUbrvtNpvSmYlzZC4jixYt0j333KOCggL17t1bX331lSRpypQpWrt2rb788kubE6KiPv/8c02bNk1z5sxR27Zt7Y4DD+nVq9cF1zkcDq1ataoK08CTkpKSdOedd8rf39/uKMajyFxmjhw5osOHD6tDhw7uexr88MMPCg4OVsuWLW1Oh4oKCQlRTk6Ozp07p5o1ayogIKDI+hMnTtiUDAAqF0UGqAaSkpJKXR8bG1tFSQCUxfmHRl4IVy2VHefIANUARQUwy+LFi4sUmfz8fG3evFlJSUl69tlnbUxmHmZkgGqioKBAS5YscV9+3aZNG918883y9fW1ORmAslqwYIE+/PBD/fOf/7Q7ijEoMkA1sHv3bg0YMEAHDx4s8tDIyMhILV26VE2bNrU5IYCy2Lt3r9q3b89ztMqBZy0B1UB8fLyaNm2q9PR0paSkKCUlRWlpaYqOjlZ8fLzd8QCUwZkzZ/Tqq6+qYcOGdkcxCjMyQDUQGBio9evXq127dkXGt2zZoh49evDbHeBlQkJCipwjY1mWTp48qVq
1aum9997TzTffbGM6s3CyL1ANOJ1OnTx5stj4qVOnVLNmTRsSASjNjBkzihQZHx8f1a9fX127dlVISIiNyczDoSWgGrjpppv04IMPKjk5WZZlybIsrV+/Xg8//DC/2QFe6P7779ctt9yi48eP69///rfWrFmj7du3u+/vhbLj0BJQDWRmZio2Nlaff/65++nX+fn5uuWWWzR//nweQgd4mY0bN+rGG2+Uv7+/unTpIknasGGDzpw5o6+++kqdOnWyOaE5KDJANbJ7925t375dktS6dWs1a9bM5kQASnLdddepWbNmeuutt9wPjjx37pxGjBihvXv3au3atTYnNAdFBqgmEhMTNWPGDO3atUuSFBMTozFjxmjEiBE2JwPw3wICArR58+Zij4bZvn27OnfurJycHJuSmYeTfYFqYNKkSZo+fbpGjx6tbt26SZLWrVunsWPHKi0tTc8995zNCQH8p+DgYKWlpRUrMunp6QoKCrIplZmYkQGqgfr16+vVV1/V3XffXWT8gw8+0OjRo3X8+HGbkgEoSXx8vD799FO99NJL6t69uyTpu+++0/jx43Xbbbdp5syZ9gY0CDMyQDWQn5+vzp07Fxu/+uqrde7cORsSASjNSy+9JIfDoWHDhrn/jfr5+emRRx7R1KlTbU5nFmZkgGpg9OjR8vPz0/Tp04uMP/744zpz5oxmz55tUzIApcnJydGePXskSU2bNlWtWrVsTmQeZmQAQ40bN879Z4fDobfffltfffWV/vSnP0mSkpOTlZaWpmHDhtkVEcBF1KpVq9gduVE+zMgAhurVq1eZtnM4HFq1alUlpwEAe1BkAACAsbgXMgAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIALCVw+EodXnmmWfsjgjAi3EfGQC2Onz4sPvPH374oSZNmqQdO3a4x2rXrm1HLACGYEYGgK0aNGjgXlwulxwOhxo0aKCgoCA1b95cy5YtK7L9kiVLFBgYqJMnT2r//v1yOBxauHChunfvLn9/f7Vt21Zr1qwpss/WrVvVv39/1a5dW2FhYbrvvvt4/hRQTVBkAHilwMBA3XXXXZo3b16R8Xnz5mnIkCFFnhA8fvx4/e1vf9PmzZvVrVs3DRo0SBkZGZKkzMxM/fnPf1bHjh21ceNGLVu2TEePHtUdd9xRpd8PgMpBkQHgtUaMGKHly5e7Dz8dO3ZMX3zxhR544IEi240aNUq33XabWrVqpTlz5sjlcikxMVGSNGvWLHXs2FEvvPCCWrZsqY4dO2ru3LlavXq1du7cWeXfEwDPosgA8FpdunRRmzZtlJSUJEl677331LhxY/Xs2bPIdt26dXP/uUaNGurcubN+/vlnSdKWLVu0evVq1a5d2720bNlSktwP6wNgLk72BeDVRowYodmzZ+vJJ5/UvHnzNHz4cDkcjjLvf+rUKQ0aNEgvvvhisXXh4eGejArABszIAPBqQ4cO1YEDB/Tqq69q+/btio2NLbbN+vXr3X8+d+6cNm3apFatWkmSOnXqpG3btikqKkrNmjUrsgQGBlbZ9wGgclBkAHi1kJAQ3XrrrRo/frz69u2rRo0aFdtm9uzZ+vTTT/XLL78oLi5Ov//+u/s8mri4OJ04cUJ33323NmzYoD179mj58uUaPny4CgoKqvrbAeBhFBkAXu+vf/2rzp49W+wk3/OmTp2qqVOnqkOHDvr222/12Wef6YorrpAkRURE6LvvvlNBQYH69u2rdu3aacyYMapTp458fPgvEDCdw7Isy+4QAFCad999V2PHjtWhQ4dUs2ZN9/j+/fsVHR2tzZs366qrrrIvIADbcLIvAK+Vk5Ojw4cPa+rUqXrooYeKlBgAkDi0BMCLTZs2TS1btlSDBg2UkJBgdxwAXohDSwAAwFjMyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY/0/5wNCD/spiX8AAAAASUVORK5CYII=", 
+ "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel density function based method\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Continuous data difficulty\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Class neighboring study...\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Class homogeneity study...\n", + "KMeans Elbow\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gaussian mixture Elbow\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "summary = Summary()\n", + "summary.fit(X=X, y=y, classes=MINORITY_CLASSES)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a275e3c1", + "metadata": {}, + "source": [ + "Now, let's break down these figures..." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "90b7ef87", + "metadata": {}, + "source": [ + "The report starts with imbalance ratios. It is the most common and basic metric to gain some insight into your imbalanced dataset. We define it as $IR=\\frac{C_{maj}}{C_{min}}$, which is the ratio of the cardinality of the largest class in the dataset to the cardinality of a given class. You can see imbalance ratios for each class using our library. Let's see how to use this method separately from the summary:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "32bd8e3b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: 5.0, 1: 1.0, 2: 5.0, 3: 1.0}\n" + ] + } + ], + "source": [ + "ir_object = ImbalanceRatio()\n", + "ir_results = ir_object.fit(y)\n", + "pprint(ir_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f027626b", + "metadata": {}, + "source": [ + "We get a dictionary in the form of {class label : imbalance ratio}. As you can see, we call the `.fit()` method to run the computation. Running `.fit()` is the way to use any of the `DataDifficulty` methods, and most of them (apart from imbalance ratio, since we usually care about seeing the results quite accurately, which is hard to do on a plot) also come equipped with a `.fit_plot()` method, which dresses up the results of `.fit()` with some nice plots." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d622e2e6", + "metadata": {}, + "source": [ + "Next, we want to study the data difficulty. The standard algorithm tells us what level of difficulty each minority class data point has. 
You can learn about this method here: \n", + "\n", + "Doe, J., & Smith, J. (2021). Title of the article. Title of the Journal, Volume(Issue), Page(s). https://example.com/article\n", + "\n", + "\n", + "It can be run like any of our other algorithms:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1d043af2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: {'border': 3, 'outlier': 1, 'rare': 3, 'safe': 13},\n", + " 2: {'border': 4, 'outlier': 2, 'rare': 2, 'safe': 12}}\n" + ] + } + ], + "source": [ + "knn_dd_object = KNNDataDifficulty(classes=MINORITY_CLASSES)\n", + "knn_dd_results = knn_dd_object.fit(X=X, y=y)\n", + "pprint(knn_dd_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c25dbe77", + "metadata": {}, + "source": [ + "Thus we get intuitively structured output, in the form of {class label : {each difficulty level : count(examples of that difficulty level in that minority class)}}. Each `DataDifficulty` function has comprehensive documentation with clear type hinting. If you are unsure what kind of object is returned, please see the docs.\n", + "\n", + "As mentioned, we can easily yield the output in the form of an appropriate figure:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e09ab5be", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAHnCAYAAACi17dYAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjYuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/P9b71AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAt8UlEQVR4nO3dfXyOdcPH8e+5mROzzZSNZWxseZiHSLpQ4uIiRG5JT7J0k7rHQkl73VF0F7kLFdEDUZJKuHQJ5aFchDBZnp9Z2GTaZsbMdtx/9HLe167ZbHNux/mbz/v1Ol+vnb/jOM591/nSvvsdv/M4HJZlWQIAADCQl90BAAAASooiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDwDZhYWF64okn7I4BwGAUGQBud+jQIQ0ZMkT16tVTpUqV5O/vr3bt2untt9/WhQsX7I5XqC1btmjo0KGKioqSr6+v6tSpo379+mn//v12RwNwFRXsDgCgfFm2bJkefPBBOZ1ODRgwQE2aNNGlS5e0fv16jRo1Srt27dIHH3xgd8wCvfHGG9qwYYMefPBBNWvWTElJSZo2bZpatmypTZs2qUmTJnZHBPAvHNw0EoC7HDlyRM2aNVPt2rW1Zs0a1apVK8/2gwcPatmyZXr22Wcl/XlqqUOHDpozZ44Naa/up59+UqtWrVSxYkXX2IEDB9S0aVP17dtX8+bNszEdgH/HqSUAbjNp0iRlZGRo1qxZ+UqMJEVERLhKzNWcPXtWzz//vJo2baqqVavK399f3bp1044dO/Lt++677yoqKkpVqlRRYGCgWrVqpfnz57u2nzt3TsOHD1dYWJicTqeCgoL0t7/9TfHx8YX+DG3bts1TYiQpMjJSUVFR2rNnz7X+EwAoY5xaAuA233zzjerVq6e2bduW6PjDhw9ryZIlevDBBxUeHq7k5GS9//77uueee7R7926FhIRIkj788EPFxsaqb9++evbZZ3Xx4kUlJCRo8+bNevTRRyVJTz/9tBYuXKihQ4eqcePGSklJ0fr167Vnzx61bNmyWLksy1JycrKioqJK9HMBKD0UGQBukZ6erhMnTuj+++8v8Ws0bdpU+/fvl5fX/08WP/7442rYsKFmzZqlMWPGSPpzHU5UVJS++uqrAl9r2bJlGjx4sN566y3X2AsvvFCiXJ999plOnDih8ePHl+h4AKWHU0sA3CI9PV2S5OfnV+LXcDqdrhKTk5OjlJQUVa1aVQ0aNMhzSqhatWr67bfftGXLlgJfq1q1atq8ebNOnjxZ4jyStHfvXsXExKhNmzaKjo6+rtcC4H4UGQBu4e/vL+nPtSkllZubqylTpigyMlJOp1M333yzatSooYSEBKWlpbn2Gz16tKpWrarWrVsrMjJSMTEx2rBhQ57XmjRpknbu3KnQ0FC1bt1ar7zyig4fPlysPElJSerRo4cCAgK0cOFCeXt7l/hnA1A6KDIA3MLf318hISHauXNniV/j9ddf18iRI9W+fXvNmzdPK1eu1Pfff6+oqCjl5ua69mvUqJH27dunBQsW6K677tLXX3+tu+66Sy+//LJrn379+unw4cN69913FRISov/93/9VVFSUli9fXqQsaWlp6tatm1JTU7VixQrX+hwAnoWPXwNwmyFDhuiDDz7QTz/9pDZt2lxz/3//+PVtt92m6tWra82aNXn2q127tiIiIvTDDz9c9XUuXbqkPn36aMWKFcrIyFClSpXy7XP69Gm1bNlSYWFhWr9+faG5Ll68qC5dumjbtm1atWpVkX4WAPZgRgaA27zwwgvy9fXVoEGDlJycnG/7oUOH9Pbbbxd4vLe3t/79b6uvvvpKJ06cyDOWkpKS53nFihXVuHFjWZal7O
xs5eTk5DkVJUlBQUEKCQlRVlZWoT9DTk6OHnroIW3cuFFfffUVJQbwcHxqCYDb1K9fX/Pnz9dDDz2kRo0a5bmy708//aSvvvqq0Hsr3XfffRo/frwGDhyotm3b6tdff9Vnn32mevXq5dmvS5cuqlmzptq1a6fg4GDt2bNH06ZNU48ePeTn56fU1FTVrl1bffv2VfPmzVW1alWtWrVKW7ZsyfMppqt57rnntHTpUvXs2VNnz57NdwG8/v37l/i/D4BSYAGAm+3fv98aPHiwFRYWZlWsWNHy8/Oz2rVrZ7377rvWxYsXXfvVrVvXio6Odj2/ePGi9dxzz1m1atWyKleubLVr187auHGjdc8991j33HOPa7/333/fat++vXXTTTdZTqfTql+/vjVq1CgrLS3NsizLysrKskaNGmU1b97c8vPzs3x9fa3mzZtb77333jWz33PPPZakAh8APAtrZAAAgLFYIwMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYKxyf0G83NxcnTx5Un5+fnI4HHbHAQAARWBZls6dO6eQkBB5eRU871Lui8zJkycVGhpqdwwAAFACiYmJql27doHby32R8fPzk/Tnfwh/f3+b0wAAgKJIT09XaGio6/d4Qcp9kblyOsnf358iAwCAYa61LITFvgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLFuLzLp169SzZ0+FhITI4XBoyZIlrm3Z2dkaPXq0mjZtKl9fX4WEhGjAgAE6efKkfYEBAIBHsbXInD9/Xs2bN9f06dPzbcvMzFR8fLzGjBmj+Ph4LVq0SPv27VOvXr1sSAoAADyRw7Isy+4Q0p83hVq8eLF69+5d4D5btmxR69atdezYMdWpU6dIr5uenq6AgAClpaVx00gAAAxR1N/fRq2RSUtLk8PhULVq1eyOAgAAPEAFuwMU1cWLFzV69Gg98sgjhTazrKwsZWVluZ6np6eXRTwAAGADI4pMdna2+vXrJ8uyNGPGjEL3nTBhgsaNG1cmucJeXFYm3wf5HZ3Yw+4IAAAP4PGnlq6UmGPHjun777+/5jqXuLg4paWluR6JiYlllBQAAJQ1j56RuVJiDhw4oLVr1+qmm2665jFOp1NOp7MM0gEAALvZWmQyMjJ08OBB1/MjR47ol19+UfXq1VWrVi317dtX8fHx+sc//qGcnBwlJSVJkqpXr66KFSvaFRsAAHgIW4vM1q1b1bFjR9fzkSNHSpKio6P1yiuvaOnSpZKk2267Lc9xa9euVYcOHcoqJgAA8FC2FpkOHTqosMvYeMglbgAAgIfy+MW+AAAABaHIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjGVrkVm3bp169uypkJAQORwOLVmyJM92y7I0duxY1apVS5UrV1bnzp114MABe8ICAACPY2uROX/+vJo3b67p06dfdfukSZP0zjvvaObMmdq8ebN8fX3VtWtXXbx4sYyTAgAAT1TBzm/erVs3devW7arbLMvS1KlT9dJLL+n++++XJH3yyScKDg7WkiVL9PDDD5dlVAAA4IE8do3MkSNHlJSUpM6dO7vGAgICdOedd2rjxo0FHpeVlaX09PQ8DwAAUD55bJ
FJSkqSJAUHB+cZDw4Odm27mgkTJiggIMD1CA0NLdWcAADAPh5bZEoqLi5OaWlprkdiYqLdkQAAQCnx2CJTs2ZNSVJycnKe8eTkZNe2q3E6nfL398/zAAAA5ZPHFpnw8HDVrFlTq1evdo2lp6dr8+bNatOmjY3JAACAp7D1U0sZGRk6ePCg6/mRI0f0yy+/qHr16qpTp46GDx+u//mf/1FkZKTCw8M1ZswYhYSEqHfv3vaFBgAAHsPWIrN161Z17NjR9XzkyJGSpOjoaM2ZM0cvvPCCzp8/r6eeekqpqam66667tGLFClWqVMmuyAAAwIM4LMuy7A5RmtLT0xUQEKC0tDS3r5cJe3GZW18PRXd0Yg+7IwAASlFRf3977BoZAACAa6HIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjOXRRSYnJ0djxoxReHi4KleurPr16+vVV1+VZVl2RwMAAB6ggt0BCvPGG29oxowZmjt3rqKiorR161YNHDhQAQEBio2NtTseAACwmUcXmZ9++kn333+/evToIUkKCwvT559/rp9//tnmZAAAwBN49Kmltm3bavXq1dq/f78kaceOHVq/fr26detW4DFZWVlKT0/P8wAAAOWTR8/IvPjii0pPT1fDhg3l7e2tnJwcvfbaa3rssccKPGbChAkaN25cGaYEAAB28egZmS+//FKfffaZ5s+fr/j4eM2dO1dvvvmm5s6dW+AxcXFxSktLcz0SExPLMDEAAChLHj0jM2rUKL344ot6+OGHJUlNmzbVsWPHNGHCBEVHR1/1GKfTKafTWZYxAQCATTx6RiYzM1NeXnkjent7Kzc316ZEAADAk3j0jEzPnj312muvqU6dOoqKitL27ds1efJkPfnkk3ZHAwAAHsCji8y7776rMWPG6L/+6790+vRphYSEaMiQIRo7dqzd0QAAgAfw6CLj5+enqVOnaurUqXZHAQAAHsij18gAAAAUhiIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYq0RFpl69ekpJSck3npqaqnr16l13KAAAgKIoUZE5evSocnJy8o1nZWXpxIkT1x0KAACgKCoUZ+elS5e6vl65cqUCAgJcz3NycrR69WqFhYW5LRwAAEBhilVkevfuLUlyOByKjo7Os83Hx0dhYWF666233BYOAACgMMUqMrm5uZKk8PBwbdmyRTfffHOphAIAACiKYhWZK44cOeLuHAAAAMVWoiIjSatXr9bq1at1+vRp10zNFbNnz77uYAAAANdSoiIzbtw4jR8/Xq1atVKtWrXkcDjcnQsAAOCaSlRkZs6cqTlz5ujxxx93dx4AAIAiK9F1ZC5duqS2bdu6OwsAAECxlKjIDBo0SPPnz3d3FgAAgGIp0amlixcv6oMPPtCqVavUrFkz+fj45Nk+efJkt4QDAAAoTImKTEJCgm677TZJ0s6dO/NsY+EvAAAoKyUqMmvXrnV3DgAAgGIr0RoZAAAAT1CiGZmOHTsWegppzZo1JQ4EAA
BQVCUqMlfWx1yRnZ2tX375RTt37sx3M0kAAIDSUqIiM2XKlKuOv/LKK8rIyLiuQAAAAEXl1jUy/fv35z5LAACgzLi1yGzcuFGVKlVy50sCAAAUqESnlvr06ZPnuWVZOnXqlLZu3aoxY8a4JRgAAMC1lKjIBAQE5Hnu5eWlBg0aaPz48erSpYtbggEAAFxLiYrMxx9/7O4cBTpx4oRGjx6t5cuXKzMzUxEREfr444/VqlWrMssAAAA8U4mKzBXbtm3Tnj17JElRUVFq0aKFW0Jd8ccff6hdu3bq2LGjli9frho1aujAgQMKDAx06/cBAABmKlGROX36tB5++GH98MMPqlatmiQpNTVVHTt21IIFC1SjRg23hHvjjTcUGhqaZwYoPDzcLa8NAADMV6JPLQ0bNkznzp3Trl27dPbsWZ09e1Y7d+5Uenq6YmNj3RZu6dKlatWqlR588EEFBQWpRYsW+vDDD932+gAAwGwlKjIrVqzQe++9p0aNGrnGGjdurOnTp2v58uVuC3f48GHNmDFDkZGRWrlypZ555hnFxsZq7ty5BR6TlZWl9PT0PA8AAFA+lejUUm5urnx8fPKN+/j4KDc397pD/ev3adWqlV5//XVJUosWLbRz507NnDmzwFshTJgwQePGjXNbBgAA4LlKNCPz17/+Vc8++6xOnjzpGjtx4oRGjBihTp06uS1crVq11Lhx4zxjjRo10vHjxws8Ji4uTmlpaa5HYmKi2/IAAADPUqIZmWnTpqlXr14KCwtTaGioJCkxMVFNmjTRvHnz3BauXbt22rdvX56x/fv3q27dugUe43Q65XQ63ZYBAAB4rhIVmdDQUMXHx2vVqlXau3evpD9nSjp37uzWcCNGjFDbtm31+uuvq1+/fvr555/1wQcf6IMPPnDr9wEAAGYq1qmlNWvWqHHjxkpPT5fD4dDf/vY3DRs2TMOGDdMdd9yhqKgo/fOf/3RbuDvuuEOLFy/W559/riZNmujVV1/V1KlT9dhjj7ntewAAAHMVa0Zm6tSpGjx4sPz9/fNtCwgI0JAhQzR58mTdfffdbgt433336b777nPb6wEAgPKjWDMyO3bs0L333lvg9i5dumjbtm3XHQoAAKAoilVkkpOTr/qx6ysqVKig33///bpDAQAAFEWxiswtt9yinTt3Frg9ISFBtWrVuu5QAAAARVGsItO9e3eNGTNGFy9ezLftwoULevnll1nPAgAAykyxFvu+9NJLWrRokW699VYNHTpUDRo0kCTt3btX06dPV05Ojv77v/+7VIICAAD8u2IVmeDgYP3000965plnFBcXJ8uyJEkOh0Ndu3bV9OnTFRwcXCpBAQAA/l2xL4hXt25dffvtt/rjjz908OBBWZalyMhIBQYGlkY+AACAApXoyr6SFBgYqDvuuMOdWQAAAIqlRDeNBAAA8AQUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxjCoyEydOlMPh0PDhw+2OAgAAPIAxRWbLli16//331axZM7ujAAAAD2FEkcnIyNBjjz2mDz/8UIGBgXbHAQAAHsKIIhMTE6MePXqoc+fO19w3KytL6enpeR4AAKB8qmB3gGtZsGCB4uPjtWXLliLtP2HCBI0bN66UU6G8C3txmd0RblhHJ/awOwIAg3j0jExiYqKeffZZffbZZ6pUqVKRjomLi1NaWprrkZiYWMopAQCAXTx6Rmbbtm06ffq0WrZs6R
rLycnRunXrNG3aNGVlZcnb2zvPMU6nU06ns6yjAgAAG3h0kenUqZN+/fXXPGMDBw5Uw4YNNXr06HwlBgAA3Fg8usj4+fmpSZMmecZ8fX1100035RsHAAA3Ho9eIwMAAFAYj56RuZoffvjB7ggAAMBDMCMDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYy6OLzIQJE3THHXfIz89PQUFB6t27t/bt22d3LAAA4CE8usj8+OOPiomJ0aZNm/T9998rOztbXbp00fnz5+2OBgAAPEAFuwMUZsWKFXmez5kzR0FBQdq2bZvat29vUyoAAOApPLrI/Lu0tDRJUvXq1QvcJysrS1lZWa7n6enppZ4LAADYw5gik5ubq+HDh6tdu3Zq0qRJgftNmDBB48aNK8NkAEwS9uIyuyPcsI5O7FFqr837ap/SfF+LwqPXyPyrmJgY7dy5UwsWLCh0v7i4OKWlpbkeiYmJZZQQAACUNSNmZIYOHap//OMfWrdunWrXrl3ovk6nU06ns4ySAQAAO3l0kbEsS8OGDdPixYv1ww8/KDw83O5IAADAg3h0kYmJidH8+fP197//XX5+fkpKSpIkBQQEqHLlyjanAwAAdvPoNTIzZsxQWlqaOnTooFq1arkeX3zxhd3RAACAB/DoGRnLsuyOAAAAPJhHz8gAAAAUhiIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLIoMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADAWRQYAABiLIgMAAIxFkQEAAMaiyAAAAGNRZAAAgLEoMgAAwFgUGQAAYCyKDAAAMBZFBgAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkAACAsSgyAADAWBQZAABgLCOKzPTp0xUWFqZKlSrpzjvv1M8//2x3JAAA4AE8vsh88cUXGjlypF5++WXFx8erefPm6tq1q06fPm13NAAAYDOPLzKTJ0/W4MGDNXDgQDVu3FgzZ85UlSpVNHv2bLujAQAAm3l0kbl06ZK2bdumzp07u8a8vLzUuXNnbdy40cZkAADAE1SwO0Bhzpw5o5ycHAUHB+cZDw4O1t69e696TFZWlrKyslzP09LSJEnp6eluz5eblen210TRlMb7+a94b+3De1t+leZ7y/tqn9J6X6+8rmVZhe7n0UWmJCZMmKBx48blGw8NDbUhDUpLwFS7E6C08N6WX7y35VNpv6/nzp1TQEBAgds9usjcfPPN8vb2VnJycp7x5ORk1axZ86rHxMXFaeTIka7nubm5Onv2rG666SY5HI5SzWuS9PR0hYaGKjExUf7+/nbHgRvx3pZPvK/lF+/t1VmWpXPnzikkJKTQ/Ty6yFSsWFG33367Vq9erd69e0v6s5isXr1aQ4cOveoxTqdTTqczz1i1atVKOam5/P39+YdTTvHelk+8r+UX721+hc3EXOHRRUaSRo4cqejoaLVq1UqtW7fW1KlTdf78eQ0cONDuaAAAwGYeX2Qeeu
gh/f777xo7dqySkpJ02223acWKFfkWAAMAgBuPxxcZSRo6dGiBp5JQMk6nUy+//HK+03AwH+9t+cT7Wn7x3l4fh3WtzzUBAAB4KI++IB4AAEBhKDIAAMBYFBkAAGAsigwAADAWRQYAgDKUnZ2tTp066cCBA3ZHKRcoMjeY1NRUffTRR4qLi9PZs2clSfHx8Tpx4oTNyQDgxuDj46OEhAS7Y5QbFJkbSEJCgm699Va98cYbevPNN5WamipJWrRokeLi4uwNhxLLzs5WhQoVtHPnTrujoBQcOnRIL730kh555BGdPn1akrR8+XLt2rXL5mS4Hv3799esWbPsjlEuGHFBPLjHyJEj9cQTT2jSpEny8/NzjXfv3l2PPvqojclwPXx8fFSnTh3l5OTYHQVu9uOPP6pbt25q166d1q1bp9dee01BQUHasWOHZs2apYULF9odESV0+fJlzZ49W6tWrdLtt98uX1/fPNsnT55sUzLzcEG8G0hAQIDi4+NVv359+fn5aceOHapXr56OHTumBg0a6OLFi3ZHRAnNmjVLixYt0qeffqrq1avbHQdu0qZNGz344IMaOXJknn+zP//8s/r06aPffvvN7ogooY4dOxa4zeFwaM2aNWWYxmzMyNxAnE6n0tPT843v379fNWrUsCER3GXatGk6ePCgQkJCVLdu3Xx/3cXHx9uUDNfj119/1fz58/ONBwUF6cyZMzYkgrusXbvW7gjlBkXmBtKrVy+NHz9eX375paQ/W//x48c1evRoPfDAAzanw/Xo3bu33RFQCqpVq6ZTp04pPDw8z/j27dt1yy232JQK7nTw4EEdOnRI7du3V+XKlWVZlhwOh92xjMKppRtIWlqa+vbtq61bt+rcuXMKCQlRUlKS2rRpo2+//TbfX/EA7PX8889r8+bN+uqrr3TrrbcqPj5eycnJGjBggAYMGKCXX37Z7ogooZSUFPXr109r166Vw+HQgQMHVK9ePT355JMKDAzUW2+9ZXdEY1BkbkAbNmzQjh07lJGRoZYtW6pz5852R4IbpKamauHChTp06JBGjRql6tWrKz4+XsHBwfz1bqhLly4pJiZGc+bMUU5OjipUqKCcnBw9+uijmjNnjry9ve2OiBIaMGCATp8+rY8++kiNGjVyrX9auXKlRo4cyafSioEiU85Vr15d+/fv180336wnn3xSb7/9dp5PLKF8SEhIUOfOnRUQEKCjR49q3759qlevnl566SUdP35cn3zyid0RUUyWZSkxMVE1atTQmTNn9OuvvyojI0MtWrRQZGSk3fFwnWrWrKmVK1eqefPmeRZyHz58WM2aNVNGRobdEY3BdWTKuUuXLrkW+M6dO5dPJpVTVz5af+DAAVWqVMk13r17d61bt87GZCgpy7IUERGh3377TaGhoerevbv69etHiSknzp8/rypVquQbP3v2rJxOpw2JzMVi33KuTZs26t27t26//XZZlqXY2FhVrlz5qvvOnj27jNPBXbZs2aL3338/3/gtt9yipKQkGxLhenl5eSkyMlIpKSmUl3Lo7rvv1ieffKJXX31V0p8fvsjNzdWkSZMK/Wg28qPIlHPz5s3TlClTdOjQITkcDqWlpTErUw7x0fryaeLEiRo1apRmzJihJk2a2B0HbjRp0iR16tRJW7du1aVLl/TCCy9o165dOnv2rDZs2GB3PKOwRuYGEh4erq1bt+qmm26yOwrcbNCgQUpJSdGXX36p6tWrKyEhQd7e3urdu7fat2+vqVOn2h0RJRAYGKjMzExdvnxZFStWzDebeuV+aTBTWlqapk2blufDFzExMapVq5bd0YxCkQHKAT5aXz7NnTu30O3R0dFllATwXBSZG8z58+f1448/6vjx47p06VKebbGxsTalgrusX79eCQkJfLQe8EAJCQlq0qSJvLy8rnn362bNmpVRKvNRZG4g27dvV/fu3ZWZmanz58+revXqOnPmjKpUqaKgoCAdPnzY7ogACnDx4sV8f3
z4+/vblAYl4eXlpaSkJAUFBcnLy0sOh0NX+xXscDi4CWwxsNj3BjJixAj17NlTM2fOVEBAgDZt2iQfHx/1799fzz77rN3xUEzvvPNOkfdlts1M58+f1+jRo/Xll18qJSUl33Z+2ZnlyJEjrsX3R44csTlN+cGMzA2kWrVq2rx5sxo0aKBq1app48aNatSokTZv3qzo6Gjt3bvX7ogohn+//87vv/+uzMxMVatWTdKfV/plts1sMTExWrt2rV599VU9/vjjmj59uk6cOKH3339fEydO1GOPPWZ3RMB2zMjcQHx8fOTl9ec1EIOCgnT8+HE1atRIAQEBSkxMtDkdiutf/6KbP3++3nvvPc2aNUsNGjSQJO3bt0+DBw/WkCFD7IqI6/TNN9/ok08+UYcOHTRw4EDdfffdioiIUN26dfXZZ59RZAyzdOnSIu/bq1evUkxSvjAjcwPp0qWLnnjiCT366KMaPHiwEhISFBsbq08//VR//PGHNm/ebHdElFD9+vW1cOFCtWjRIs/4tm3b1LdvX6axDVW1alXt3r1bderUUe3atbVo0SK1bt1aR44cUdOmTbmMvWGu/CF5LayRKR5uUXADef31113XJ3jttdcUGBioZ555RmfOnLnqVWFhjlOnTuny5cv5xnNycpScnGxDIrhDvXr1XCW0YcOG+vLLLyX9OVNz5RQizJGbm1ukByWmeJiRuYFcuHBBlmW57u9x9OhRLV68WI0bN1bXrl1tTofr0bNnT504cUIfffSRWrZsKenP2ZinnnpKt9xyS7GmtOE5pkyZIm9vb8XGxmrVqlXq2bOnLMtSdna2Jk+ezCJ9g33yySd66KGH8t1X6dKlS1qwYIEGDBhgUzLzUGRuIF26dFGfPn309NNPKzU1VQ0bNpSPj4/OnDmjyZMn65lnnrE7Ikro999/V3R0tFasWCEfHx9J0uXLl9W1a1fNmTNHQUFBNidEcWVnZ+vee+/VzJkzXfdaOnbsmLZt26aIiAiuM2I4b29vnTp1Kt+/zZSUFAUFBTErUwws9r2BxMfHa8qUKZKkhQsXKjg4WNu3b9fXX3+tsWPHUmQMZVmWLly4oK+//lq//fab9uzZI+nPUxG33nqrzelQUj4+Pvkumla3bl3VrVvXpkRwJ8uy5HA48o3/9ttvCggIsCGRuSgyN5DMzEz5+flJkr777jv16dNHXl5e+stf/qJjx47ZnA4lZVmWIiIitGvXLkVGRnKn5HKkf//+mjVrliZOnGh3FLhJixYt5HA45HA41KlTJ1Wo8P+/hnNycnTkyBHde++9NiY0D0XmBhIREaElS5boP/7jP7Ry5UqNGDFCknT69GmuEGowLy8vRUZGKiUlhRJTzly+fFmzZ8/WqlWrdPvtt+e7Z9bkyZNtSoaS6t27tyTpl19+UdeuXVW1alXXtooVKyosLEwPPPCATenMxBqZG8jChQv16KOPKicnR506ddJ3330nSZowYYLWrVun5cuX25wQJfXNN99o0qRJmjFjhpo0aWJ3HLhJx44dC9zmcDi0Zs2aMkwDd5o7d64eeughVapUye4oxqPI3GCSkpJ06tQpNW/e3HVNg59//ln+/v5q2LChzelQUoGBgcrMzNTly5dVsWJFVa5cOc/2s2fP2pQMAEoXRQYoB+bOnVvo9ujo6DJKAqAortw0siB8aqnoWCMDlAMUFcAsixYtylNksrOztX37ds2dO1fjxo2zMZl5mJEByomcnBwtWbLE9fHrqKgo9erVS97e3jYnA1BU8+fP1xdffKG///3vdkcxBkUGKAcOHjyo7t2768SJE3luGhkaGqply5apfv36NicEUBSHDx9Ws2bNuI9WMXCvJaAciI2NVf369ZWYmKj4+HjFx8fr+PHjCg8PV2xsrN3xABTBhQsX9M477+iWW26xO4pRmJEBygFfX19t2rRJTZs2zTO+Y8cOtWvXjr/uAA8TGBiYZ42MZVk6d+6cqlSponnz5qlXr142pjMLi3
2BcsDpdOrcuXP5xjMyMlSxYkUbEgEozJQpU/IUGS8vL9WoUUN33nmnAgMDbUxmHk4tAeXAfffdp6eeekqbN2+WZVmyLEubNm3S008/zV92gAd64okndP/99+vMmTP65z//qR9//FG7d+92Xd8LRcepJaAcSE1NVXR0tL755hvX3a+zs7N1//33a86cOdyEDvAwW7du1b333qtKlSqpdevWkqQtW7bowoUL+u6779SyZUubE5qDIgOUIwcPHtTu3bslSY0bN1ZERITNiQBczd13362IiAh9+OGHrhtHXr58WYMGDdLhw4e1bt06mxOagyIDlBOzZs3SlClTdODAAUlSZGSkhg8frkGDBtmcDMC/q1y5srZv357v1jC7d+9Wq1atlJmZaVMy87DYFygHxo4dq8mTJ2vYsGFq06aNJGnjxo0aMWKEjh8/rvHjx9ucEMC/8vf31/Hjx/MVmcTERPn5+dmUykzMyADlQI0aNfTOO+/okUceyTP++eefa9iwYTpz5oxNyQBcTWxsrBYvXqw333xTbdu2lSRt2LBBo0aN0gMPPKCpU6faG9AgzMgA5UB2drZatWqVb/z222/X5cuXbUgEoDBvvvmmHA6HBgwY4Po36uPjo2eeeUYTJ060OZ1ZmJEByoFhw4bJx8dHkydPzjP+/PPP68KFC5o+fbpNyQAUJjMzU4cOHZIk1a9fX1WqVLE5kXmYkQEMNXLkSNfXDodDH330kb777jv95S9/kSRt3rxZx48f14ABA+yKCOAaqlSpku+K3CgeZmQAQ3Xs2LFI+zkcDq1Zs6aU0wCAPSgyAADAWFwLGQAAGIsiAwAAjEWRAQAAxqLIAAAAY1FkANjK4XAU+njllVfsjgjAg3EdGQC2OnXqlOvrL774QmPHjtW+fftcY1WrVrUjFgBDMCMDwFY1a9Z0PQICAuRwOFSzZk35+fnp1ltv1YoVK/Lsv2TJEvn6+urcuXM6evSoHA6HFixYoLZt26pSpUpq0qSJfvzxxzzH7Ny5U926dVPVqlUVHBysxx9/nPtPAeUERQaAR/L19dXDDz+sjz/+OM/4xx9/rL59++a5Q/CoUaP03HPPafv27WrTpo169uyplJQUSVJqaqr++te/qkWLFtq6datWrFih5ORk9evXr0x/HgClgyIDwGMNGjRIK1eudJ1+On36tL799ls9+eSTefYbOnSoHnjgATVq1EgzZsxQQECAZs2aJUmaNm2aWrRooddff10NGzZUixYtNHv2bK1du1b79+8v858JgHtRZAB4rNatWysqKkpz586VJM2bN09169ZV+/bt8+zXpk0b19cVKlRQq1attGfPHknSjh07tHbtWlWtWtX1aNiwoSS5btYHwFws9gXg0QYNGqTp06frxRdf1Mcff6yBAwfK4XAU+fiMjAz17NlTb7zxRr5ttWrVcmdUADZgRgaAR+vfv7+OHTumd955R7t371Z0dHS+fTZt2uT6+vLly9q2bZsaNWokSWrZsqV27dqlsLAwRURE5Hn4+vqW2c8BoHRQZAB4tMDAQPXp00ejRo1Sly5dVLt27Xz7TJ8+XYsXL9bevXsVExOjP/74w7WOJiYmRmfPntUjjzyiLVu26NChQ1q5cqUGDhyonJycsv5xALgZRQaAx/vP//xPXbp0Kd8i3ysmTpyoiRMnqnnz5lq/fr2WLl2qm2++WZIUEhKiDRs2KCcnR126dFHTpk01fPhwVatWTV5e/C8QMJ3DsizL7hAAUJhPP/1UI0aM0MmTJ1WxYkXX+NGjRxUeHq7t27frtttusy8gANuw2BeAx8rMzNSpU6c0ceJEDRkyJE+JAQCJU0sAPNikSZPUsGFD1axZU3FxcXbHAeCBOLUEAACMxYwMAAAwFkUGAAAYiyIDAACMRZEBAADGosgAAABjUWQAAICxKDIAAMBYFBkAAGAsigwAADDW/wGMRzfi2kZHMgAAAABJRU5ErkJggg==", + "text/plain": [ 
+ "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "knn_dd_object.fit_plot(X=X, y=y);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "32c50354", + "metadata": {}, + "source": [ + "Next is data difficulty study using some kernel functions. The KNN methods fix the number of neighbors and work on any distance between those neighbors. Kernel methods work on limited distances (kernel bandwidth), but the number of neighbors is unlimited, thus these two methods can yield different results.\n", + "\n", + "Doe, J., & Smith, J. (2021). Title of the article. Title of the Journal, Volume(Issue), Page(s). https://example.com/article\n", + "\n", + "\n", + "Let's run it:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "229d0947", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: [-1.0,\n", + " -1.0,\n", + " -1.0,\n", + " 1.0,\n", + " -1.0,\n", + " 1.0,\n", + " 1.0,\n", + " 1.0,\n", + " -1.0,\n", + " -1.0,\n", + " 1.0,\n", + " 1.0,\n", + " 1.0,\n", + " -1.0,\n", + " 0.152557441,\n", + " 1.0,\n", + " 1.0,\n", + " 1.0,\n", + " -1.0,\n", + " 1.0],\n", + " 2: [1.0,\n", + " 1.0,\n", + " -1.0,\n", + " -1.0,\n", + " 1.0,\n", + " 1.0,\n", + " 1.0,\n", + " 1.0,\n", + " -1.0,\n", + " 0.1548038477,\n", + " 1.0,\n", + " 1.0,\n", + " -1.0,\n", + " 0.0,\n", + " -1.0,\n", + " 1.0,\n", + " -1.0,\n", + " -1.0,\n", + " 1.0,\n", + " 1.0]}\n" + ] + } + ], + "source": [ + "kernel_dd_object = KernelDataDifficulty(classes=MINORITY_CLASSES)\n", + "kernel_dd_results = kernel_dd_object.fit(X, y)\n", + "pprint(kernel_dd_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ff82dbac", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "kernel_dd_object.fit_plot(X, y);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ef4647f8", + "metadata": {}, + "source": [ + "This way we get exact values. The `-1`s model the situation, in which there is not enough information about an example (no neighbors within kernel bandwidth). With each method which gives results which are not aggregated as counts per class, but present as many values as class cardinality, you can associate specific examples with specific scores by simply iterating through the class:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e19310aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(array([-3.10873789, -1.79098605]), -1.0),\n", + " (array([-2.09539738, -1.55185262]), -1.0),\n", + " (array([-1.88315385, -0.44047762]), -1.0),\n", + " (array([-2.10445912, -0.82603466]), 1.0),\n", + " (array([-2.27784105, -1.35958317]), -1.0),\n", + " (array([-2.25140702, -0.59726922]), 1.0),\n", + " (array([-2.1997694 , -0.84379434]), 1.0),\n", + " (array([-1.17917902, -0.17073185]), 1.0),\n", + " (array([-1.12564811, -1.95863255]), -1.0),\n", + " (array([-1.67149983, -1.04075384]), -1.0),\n", + " (array([-2.18413609, -0.79040705]), 1.0),\n", + " (array([-1.74509052, -1.30933388]), 1.0),\n", + " (array([-2.29027883, -1.02594977]), 1.0),\n", + " (array([-1.11579742, -0.52673203]), -1.0),\n", + " (array([-0.53536383, 0.41987972]), 0.152557441),\n", + " (array([-1.32662679, 0.20863229]), 1.0),\n", + " (array([-1.92655868, -1.32777362]), 1.0),\n", + " (array([-2.38803229, -0.89640488]), 1.0),\n", + " (array([-2.03370024, -0.19446392]), -1.0),\n", + " (array([-1.36988334, 0.03679337]), 1.0)]\n" + ] + } + ], + "source": [ + "pprint(list(zip(X[y==0], kernel_dd_results[0])))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "036f8cc7", + "metadata": {}, + 
"source": [ + "To get results comparable to KNN method (safety levels instead of safety scores) run this code:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "782e0471", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: ['not enough info',\n", + " 'not enough info',\n", + " 'not enough info',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'safe',\n", + " 'safe',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'not enough info',\n", + " 'safe',\n", + " 'safe',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'rare',\n", + " 'safe',\n", + " 'safe',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'safe'],\n", + " 2: ['safe',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'not enough info',\n", + " 'safe',\n", + " 'safe',\n", + " 'safe',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'rare',\n", + " 'safe',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'zero',\n", + " 'not enough info',\n", + " 'safe',\n", + " 'not enough info',\n", + " 'not enough info',\n", + " 'safe',\n", + " 'safe']}\n" + ] + } + ], + "source": [ + "pprint(kernel_dd_object.fit_label_difficulty(X, y))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9e238886", + "metadata": {}, + "source": [ + "`KernelDifficulty` is parametric. We use kernel bandwidth auto tuning (as specified in the article), which automatically sets kernel bandwidth, consequently yielding results comparable to according `KNNDifficulty` method run. User may override this procedure and set custom kernel bandwidth to any value they wish to have. 
Here is a use case, in which we set kernel bandwidth to `1.`:" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "84f60bfe", + "metadata": {}, + "outputs": [], + "source": [ + "custom_kernel_bandwidth_dd_object = KernelDataDifficulty(classes=MINORITY_CLASSES)\n", + "custom_kernel_bandwidth_dd_results = custom_kernel_bandwidth_dd_object.fit(X=X, \n", + " y=y, \n", + " kernel_bandwidth_override=1.)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1526e01e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Are the results different? : False\n", + "Auto tuned kernel bandwidth: 0.5235553242694769\n", + "Unitary kernel bandwidth: 1.0\n" + ] + } + ], + "source": [ + "print('Are the results equal?: ', kernel_dd_results == custom_kernel_bandwidth_dd_results)\n", + "print('Auto tuned kernel bandwidth: ', kernel_dd_object.kernel_bandwidth)\n", + "print('Unitary kernel bandwidth: ', custom_kernel_bandwidth_dd_object.kernel_bandwidth)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3b67de06", + "metadata": {}, + "source": [ + "User can also set different kernel functions. The default kernel function is *Epanechnikov* (quadratic). 
If you wish to model distances in a different manner, we also provide *triangular* (linear) and *uniform* (constant) kernel functions:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Are the results different?: False\n", + "(', '\n", + " ', '\n", + " '')\n" + ] + } + ], + "source": [ + "triangular_dd_object = KernelDataDifficulty(classes=MINORITY_CLASSES, kernel_type='triangular')\n", + "triangular_dd_results = triangular_dd_object.fit(X, y)\n", + "uniform_dd_object = KernelDataDifficulty(classes=MINORITY_CLASSES, kernel_type='uniform')\n", + "uniform_dd_results = uniform_dd_object.fit(X, y)\n", + "print('Are the results equal?: ', kernel_dd_results == triangular_dd_results == uniform_dd_results)\n", + "pprint(f'{kernel_dd_object.kernel_object}, {triangular_dd_object.kernel_object}, {uniform_dd_object.kernel_object}')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "882ef9a5", + "metadata": {}, + "source": [ + "Now let's take a look at a similar method, with more accurate, numerical output. For reference see \n", + "Doe, J., & Smith, J. (2021). Title of the article. Title of the Journal, Volume(Issue), Page(s). https://example.com/article\n", + " This method is particularly interesting if you want to weigh cross-class relations." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "94ebc797", + "metadata": {}, + "outputs": [], + "source": [ + "cdd_object = ContinuousDataDifficulty(classes=MINORITY_CLASSES)\n", + "cdd_results = cdd_object.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "605b5800", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cdd_object.fit_plot(X, y);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "932a03ab", + "metadata": {}, + "source": [ + "These figures are histograms of scores returned by `.fit()`. As per the source article, the similarity matrix by default consists of `1`s where two minority classes cross over, and `0`s elsewhere. In our case the minority classes are the 0th and 2nd classes:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "451badc6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array([[0, 0, 1, 0],\n", + " [0, 0, 0, 0],\n", + " [1, 0, 0, 0],\n", + " [0, 0, 0, 0]])\n" + ] + } + ], + "source": [ + "pprint(cdd_object.similarities)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f0a8aaab", + "metadata": {}, + "source": [ + "Users can alter this matrix any way they see fit:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "3e24ad30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array([[ 0, 42, 1, 0],\n", + " [ 0, 0, 0, 0],\n", + " [ 1, 0, 0, 0],\n", + " [ 0, 0, 0, 0]])\n", + "Are the results different?: False\n" + ] + } + ], + "source": [ + "custom_sim_cdd_object = ContinuousDataDifficulty(classes=MINORITY_CLASSES)\n", + "from copy import deepcopy\n", + "custom_similarity_matrix = deepcopy(cdd_object.similarities)\n", + "custom_similarity_matrix[0,1] = 42\n", + "custom_sim_cdd_results = custom_sim_cdd_object.fit(X=X, y=y, similarities=custom_similarity_matrix)\n", + "pprint(custom_sim_cdd_object.similarities)\n", + "print('Are the results equal?: ', cdd_results == custom_sim_cdd_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d2141494", + "metadata": {}, + "source": [ + "Users can also change the `k` parameter of the underlying KNN to change the number of neighbors, and effectively 
broaden the area of neighborhood. Default `k` value is set to `5` in all of the `DataDifficulty` methods (see the articles to learn why). Here is how to change `k` (to `2` for example):" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "94e99f72", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Are the results equal?: False\n" + ] + } + ], + "source": [ + "small_neighborhood_cdd_object = ContinuousDataDifficulty(classes=MINORITY_CLASSES, k=2)\n", + "small_neighborhood_cdd_results = small_neighborhood_cdd_object.fit(X, y)\n", + "print('Are the results equal?: ', cdd_results == small_neighborhood_cdd_results)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f03ad9a2", + "metadata": {}, + "source": [ + "Now let's examine which class pairs cross over. For each data point in each minority class we count neighbors, which aren't sampled from the same class. Here is how to do it:" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "eda1ebee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array([[ 0, 9, 14, 4],\n", + " [ 0, 0, 0, 0],\n", + " [14, 1, 0, 15],\n", + " [ 0, 0, 0, 0]])\n" + ] + } + ], + "source": [ + "bcm_object = BorderClassMatrix(classes=MINORITY_CLASSES)\n", + "bcm_results = bcm_object.fit(X, y)\n", + "pprint(bcm_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "cc5d2ab4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bcm_object.fit_plot(X, y);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "57c36c69", + "metadata": {}, + "source": [ + "We can see that for example some data point from 2nd class has one neighbor from 1st class. We can easily spot that data point (the brown one below *HERE* sign):" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "75a77f78", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X[:,0][y==0], X[:,1][y==0], c='black', label=0)\n", + "plt.scatter(X[:,0][y==1], X[:,1][y==1], c='brown', label=1)\n", + "plt.scatter(X[:,0][y==2], X[:,1][y==2], c='green', label=2)\n", + "plt.scatter(X[:,0][y==3], X[:,1][y==3], c='blue', label=3)\n", + "guilty_one = X[123]\n", + "plt.annotate(\"HERE\", guilty_one, textcoords=\"offset points\", \n", + " xytext=(0,5), ha='center', color='red', fontsize=14, \n", + " fontweight='bold', zorder=10)\n", + "plt.title('Multiclass imbalanced data')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "20680652", + "metadata": {}, + "source": [ + "Lastly we may want to study per class heterogeneity of class topology. We can use elbow method in which we plot *within-cluster sum of squares* for different cluster numbers, and we try to spot 'an elbow'. This is how we do it:" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "id": "bb558982", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0: [(1, 14.67433759914556),\n", + " (2, 6.735191559662801),\n", + " (3, 4.501045361129617),\n", + " (4, 2.9559490881411032),\n", + " (5, 2.056692415453553),\n", + " (6, 1.423381724352783),\n", + " (7, 0.9135289469974753),\n", + " (8, 0.6509614470348347),\n", + " (9, 0.39022761236569303),\n", + " (10, 0.3068780393881096),\n", + " (11, 0.2415013633602744),\n", + " (12, 0.17205108240314598),\n", + " (13, 0.13045760397570533),\n", + " (14, 0.08733458980264484)],\n", + " 2: [(1, 13.360902221513912),\n", + " (2, 6.103153728096089),\n", + " (3, 4.002256029006579),\n", + " (4, 2.7871823772018507),\n", + " (5, 1.9987013967907636),\n", + " (6, 1.4796849273289463),\n", + " (7, 1.0188670579543735),\n", + " (8, 0.7630348029639596),\n", + " (9, 0.5547676903419655),\n", + " (10, 0.40975833799885175),\n", + " (11, 0.2953471437139862),\n", + " 
(12, 0.19485111300561064),\n", + " (13, 0.12014881580463331),\n", + " (14, 0.05857295979315155)]}\n" + ] + } + ], + "source": [ + "kmeans_elbow_object = KMeansElbowMethod(classes=MINORITY_CLASSES, \n", + " cluster_count_selection=list(range(1, 15)))\n", + "kmeans_elbow_results = kmeans_elbow_object.fit(X, y)\n", + "pprint(kmeans_elbow_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "f531f015", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "kmeans_elbow_object.fit_plot(X, y);" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "39471910", + "metadata": {}, + "source": [ + "We can see a smooth, descending series on the plot, which usually indicates that a class topology is homogeneous. Let's try to alter the topology of our dataset. Let's forget which class is majority and which is minority, and let's create a dataset in which one class is homogeneous and the second class consists of 3 easily separable clusters:" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "id": "ac2910ce", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "X = np.load('X_altered.npy')\n", + "y = np.load('y_altered.npy')\n", + "plt.scatter(X[:,0][y==0], X[:,1][y==0], c='magenta', label=0)\n", + "plt.scatter(X[:,0][y==1], X[:,1][y==1], c='orange', label=1)\n", + "plt.title('Altered data')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "08e25e03", + "metadata": {}, + "source": [ + "Now let's rerun our method. Here is how:" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "id": "d5c3f447", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "kmeans_elbow_object = KMeansElbowMethod(classes=[0, 1], \n", + " cluster_count_selection=list(range(1, 15)))\n", + "kmeans_elbow_results = kmeans_elbow_object.fit_plot(X, y)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "91b577da", + "metadata": {}, + "source": [ + "We can see the elbow point right at 3 clusters in class 0th.\n", + "We can also change the search space `cluster_count_selection` and use different underlying clustering method, such as `GaussianMixture`:" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "id": "6395f178", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ge_object = GaussianMixtureElbowMethod(classes=[0, 1], \n", + " cluster_count_selection=list(range(1, 20)))\n", + "ge_results = ge_object.fit_plot(X, y)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "vscode": { + "interpreter": { + "hash": "d86f111616862d19b4a31c0f57fe36c2d6f8663b0bf9b53456006ede2cf812e8" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/data_difficulty/y.npy b/examples/data_difficulty/y.npy new file mode 100644 index 0000000000000000000000000000000000000000..fb1c00df72a29a0d21ec82d129bdcc4df0d5add1 GIT binary patch literal 1088 zcmbV`u}T9`6ht?s^DDM{5w;LXCDwL|jfIsgW<^0v*bO!zUy&d79p%Uvti14;d*{x~ zeTUooo4bcu^U>_eYM7qJa@m%v*JbJ2^1L3mWB>lR9#;eWb^kUEm7iYv%~0pX#qzv6 z>)L($?>e0~-!-dnQXj~J_sJcjdNZo!$8Sbr_L;38RO6l-6#Vf^Jvv@;nPH!MZ<(7K zvq1Oi(Ye>7Cg&>q<<$>TX!VKksq?Fl4ggBz8{q%| literal 0 HcmV?d00001