From 6d4dec5080369f80133e74096d7d038a5295a666 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 28 Nov 2023 20:19:58 -0600 Subject: [PATCH] Add XGBoost --- adv_models/XGBoost.ipynb | 546 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 546 insertions(+) create mode 100644 adv_models/XGBoost.ipynb diff --git a/adv_models/XGBoost.ipynb b/adv_models/XGBoost.ipynb new file mode 100644 index 0000000..1dc00ec --- /dev/null +++ b/adv_models/XGBoost.ipynb @@ -0,0 +1,546 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e888feb4", + "metadata": {}, + "source": [ + "- After feature selection step 2 also build (Based on the models compatibility with the data i.e classification or regression problem) (also write a 3 lines on what you understand about these models you are free to use online resources but please cite them.)\n", + "\n", + "- Perform visualization of your model performances, insights etc using apt metrics and charts (ex: MSE bar plots of all the models to identify best models" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3ab83994", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5a08bc9c", + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = [\"TP2\", \"H1\", \"DV_pressure\", \"Reservoirs\", \"Oil_temperature\", \"Motor_current\", \"Oil_level\", 'status']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0091382c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0timestampTP2TP3H1DV_pressureReservoirsOil_temperatureMotor_currentCOMPDV_eletricTowersMPGLPSPressure_switchOil_levelCaudal_impulsesstatus
05625642020-04-18 00:00:01-0.0188.2488.238-0.0248.24849.450.041.00.01.01.00.01.01.01.01
15625652020-04-18 00:00:13-0.0188.2488.238-0.0248.24849.450.041.00.01.01.00.01.01.01.01
25625662020-04-18 00:00:24-0.0188.2488.238-0.0248.24849.450.041.00.01.01.00.01.01.01.01
35625672020-04-18 00:00:36-0.0188.2488.238-0.0248.24849.450.040.00.00.00.00.00.00.00.01
45625682020-04-18 00:00:49-0.0188.2488.238-0.0248.24849.450.041.00.01.01.00.01.01.01.01
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 timestamp TP2 TP3 H1 DV_pressure \\\n", + "0 562564 2020-04-18 00:00:01 -0.018 8.248 8.238 -0.024 \n", + "1 562565 2020-04-18 00:00:13 -0.018 8.248 8.238 -0.024 \n", + "2 562566 2020-04-18 00:00:24 -0.018 8.248 8.238 -0.024 \n", + "3 562567 2020-04-18 00:00:36 -0.018 8.248 8.238 -0.024 \n", + "4 562568 2020-04-18 00:00:49 -0.018 8.248 8.238 -0.024 \n", + "\n", + " Reservoirs Oil_temperature Motor_current COMP DV_eletric Towers MPG \\\n", + "0 8.248 49.45 0.04 1.0 0.0 1.0 1.0 \n", + "1 8.248 49.45 0.04 1.0 0.0 1.0 1.0 \n", + "2 8.248 49.45 0.04 1.0 0.0 1.0 1.0 \n", + "3 8.248 49.45 0.04 0.0 0.0 0.0 0.0 \n", + "4 8.248 49.45 0.04 1.0 0.0 1.0 1.0 \n", + "\n", + " LPS Pressure_switch Oil_level Caudal_impulses status \n", + "0 0.0 1.0 1.0 1.0 1 \n", + "1 0.0 1.0 1.0 1.0 1 \n", + "2 0.0 1.0 1.0 1.0 1 \n", + "3 0.0 0.0 0.0 0.0 1 \n", + "4 0.0 1.0 1.0 1.0 1 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"../data/Group_14_Clean_Data.csv\")\n", + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "73b2dde5", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0timestampTP2TP3H1DV_pressureReservoirsOil_temperatureMotor_currentCOMPDV_eletricTowersMPGLPSPressure_switchOil_levelCaudal_impulsesstatus
267719066862020-06-07 10:22:058.0867.894-0.0061.9287.89476.9005.52250.01.00.00.00.01.01.01.01
319391234922020-02-17 10:55:15-0.0109.6989.684-0.0189.70058.3503.68751.00.01.01.00.01.01.01.00
63665689302020-04-18 17:40:398.6108.764-0.0061.8988.76674.6255.58750.01.01.00.00.01.01.01.01
550835055012020-04-09 20:08:30-0.0148.8168.806-0.0248.81653.6750.04251.00.01.01.00.01.01.01.00
35415661052020-04-18 09:53:569.0748.932-0.0102.0328.93476.0255.73000.01.00.00.00.01.01.01.01
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 timestamp TP2 TP3 H1 DV_pressure \\\n", + "26771 906686 2020-06-07 10:22:05 8.086 7.894 -0.006 1.928 \n", + "31939 123492 2020-02-17 10:55:15 -0.010 9.698 9.684 -0.018 \n", + "6366 568930 2020-04-18 17:40:39 8.610 8.764 -0.006 1.898 \n", + "55083 505501 2020-04-09 20:08:30 -0.014 8.816 8.806 -0.024 \n", + "3541 566105 2020-04-18 09:53:56 9.074 8.932 -0.010 2.032 \n", + "\n", + " Reservoirs Oil_temperature Motor_current COMP DV_eletric Towers \\\n", + "26771 7.894 76.900 5.5225 0.0 1.0 0.0 \n", + "31939 9.700 58.350 3.6875 1.0 0.0 1.0 \n", + "6366 8.766 74.625 5.5875 0.0 1.0 1.0 \n", + "55083 8.816 53.675 0.0425 1.0 0.0 1.0 \n", + "3541 8.934 76.025 5.7300 0.0 1.0 0.0 \n", + "\n", + " MPG LPS Pressure_switch Oil_level Caudal_impulses status \n", + "26771 0.0 0.0 1.0 1.0 1.0 1 \n", + "31939 1.0 0.0 1.0 1.0 1.0 0 \n", + "6366 0.0 0.0 1.0 1.0 1.0 1 \n", + "55083 1.0 0.0 1.0 1.0 1.0 0 \n", + "3541 0.0 0.0 1.0 1.0 1.0 1 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The features selected in Question 2 are applied later, when X is built\n", + "# Shuffle the data\n", + "data = data.sample(frac = 1)\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c38a6d5f", + "metadata": {}, + "source": [ + "# XGBOOST\n", + "\n", + "- XGBoost, a Gradient Boosted decision tree implementation, excels in Kaggle Competitions. \n", + "- It sequentially creates decision trees, assigning weights to variables that are adjusted based on prediction errors. \n", + "- This ensemble method, with optimizations like the Approximate Greedy Algorithm and Cache-Aware Access, proves effective for regression, classification, ranking, and user-defined prediction problems.\n", + "\n", + "reference:
\n", + "https://xgboost.readthedocs.io/en/stable/python/python_intro.html
\n", + "https://www.geeksforgeeks.org/xgboost/
\n", + "https://www.geeksforgeeks.org/ml-xgboost-extreme-gradient-boosting/
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f5d4ec6c", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#!pip install xgboost\n", + "import xgboost as xgb" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c8744021", + "metadata": {}, + "outputs": [], + "source": [ + "X = data[selected_features].drop(columns=['status'], errors='ignore')  # keep the target out of X: training on 'status' leaks the label\n", + "y = data['status']" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "dfc8122f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 5960\n", + " 1 1.00 1.00 1.00 5929\n", + "\n", + " accuracy 1.00 11889\n", + " macro avg 1.00 1.00 1.00 11889\n", + "weighted avg 1.00 1.00 1.00 11889\n", + "\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Fitting XGBoost to the training data\n", + "my_model = xgb.XGBClassifier()\n", + "my_model.fit(X_train, y_train)\n", + "\n", + "# Predicting the Test set results\n", + "y_pred = my_model.predict(X_test)\n", + "\n", + "# Making the Confusion Matrix\n", + "cm = confusion_matrix(y_test, y_pred)\n", + "\n", + "print(classification_report(y_test, y_pred))\n", + "\n", + "disp = ConfusionMatrixDisplay(cm, display_labels=['0', '1'])\n", + "disp.plot(cmap='Blues', values_format='d')\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aac04a8d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}