![giskard_logo.png](https://raw.githubusercontent.com/Giskard-AI/giskard/main/readme/Logo_full_darkgreen.png)

# About Giskard

Open-Source CI/CD platform for ML teams. Deliver ML products, better & faster. 

* Collaborate faster with feedback from business stakeholders.
* Deploy automated tests to eliminate regressions, errors & biases.

🏡 [Website](https://giskard.ai/)

📗 [Documentation](https://docs.giskard.ai/)

# Telco customer churn data


In this notebook we explore how to predict customer churn, a critical capability for telecommunication companies that want to retain their customers effectively.

## Installing `giskard` and `lightgbm`

In [None]:
!pip install giskard lightgbm

## Connect the external worker in daemon mode

In [None]:
!giskard worker start -d

## 1. Data Reading

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import lightgbm as lbt

# Single fixed seed, reused by the train/test split and by every estimator
# below that accepts a `random_state`, so that runs are reproducible.
random_seed = 123

In [None]:
# Location of the Telco customer-churn CSV in the Giskard examples repository
dataset_url = "https://raw.githubusercontent.com/Giskard-AI/examples/main/datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Tip: while exploring, `df_telco.info()` summarises the frame and
# `df_telco[column].unique()` lists the distinct values of a column.

# Load and clean the data in one pass:
#   1. read the CSV into a data frame
#   2. convert TotalCharges to numbers (unparseable entries become NaN)
#   3. discard every row that contains a missing value
#   4. discard the customerID column
#   5. strip the " (automatic)" suffix from the payment method names
df_telco = (
    pd.read_csv(dataset_url)
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna()
    .drop(columns='customerID')
    .assign(PaymentMethod=lambda frame: frame['PaymentMethod'].str.replace(' (automatic)', '', regex=False))
)

## 2. Initialising feature names

In [None]:
# Declare the type of each column in the dataset(example: category, numeric, text)
column_types = {'gender': "category",
 'SeniorCitizen': "category", 
 'Partner': "category", 
 'Dependents': "category", 
 'tenure': "numeric",
 'PhoneService': "category", 
 'MultipleLines': "category", 
 'InternetService': "category", 
 'OnlineSecurity': "category",
 'OnlineBackup': "category", 
 'DeviceProtection': "category", 
 'TechSupport': "category", 
 'StreamingTV': "category",
 'StreamingMovies': "category", 
 'Contract': "category", 
 'PaperlessBilling': "category", 
 'PaymentMethod': "category",
 'MonthlyCharges': "numeric", 
 'TotalCharges': "numeric", 
 'Churn': "category"}

# feature_types is used to declare the features the model is trained on
feature_types = {i:column_types[i] for i in column_types if i!='Churn'}

## 3. Setting up Feature Engineering

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection


# Split the model features by their declared type.
columns_to_scale = [column for column, kind in feature_types.items() if kind == "numeric"]
columns_to_encode = [column for column, kind in feature_types.items() if kind == "category"]

# Numeric features are standardised; categorical features are one-hot encoded
# with the first level of each feature dropped and unknown categories ignored.
numeric_step = ('num', StandardScaler(), columns_to_scale)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), columns_to_encode)

# Column-wise preprocessing shared by all the model pipelines
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## 4. Data splitting

In [None]:
# select independent variables
X = df_telco.drop(columns='Churn')

# select dependent variables
y = df_telco.loc[:, 'Churn']

# split the data in training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=random_seed, shuffle=True)
# Prepare data to upload on Giskard
train_data = pd.concat([X_train, Y_train], axis=1)
test_data = pd.concat([X_test, Y_test ], axis=1)

## 5. Pipelines and Models Evaluation

In [None]:
# Candidate classifiers, all with default hyperparameters (seeded where the
# estimator supports it). Each entry holds the estimator under "model" and a
# placeholder "accuracy" that is filled in by the loop below.
models = {
    'dummy_classifier': {"model": DummyClassifier(random_state=random_seed, strategy='most_frequent'), "accuracy": 0},
    'k_nearest_neighbors': {"model": KNeighborsClassifier(), "accuracy": 0},
    'logistic_regression': {"model": LogisticRegression(random_state=random_seed, max_iter=150), "accuracy": 0},
    'random_forest': {"model": RandomForestClassifier(random_state=random_seed), "accuracy": 0},
    'gradient_boosting': {"model": GradientBoostingClassifier(random_state=random_seed), "accuracy": 0},
    'LGBM': {"model": lbt.LGBMClassifier(random_state=random_seed), "accuracy": 0},
}

# Not read in this cell; kept so that any later cell relying on it still works.
scoring = 'accuracy'

# Wrap every classifier in a preprocessing pipeline, train it and measure its
# accuracy on the held-out test set.
# NOTE: all pipelines wrap the same `preprocessor` object, so it is refit on
# every iteration (always on the same training data).
for name, entry in models.items():
    # replace the bare estimator with the full pipeline so that the object
    # stored in `models` accepts the raw (unprocessed) data frame
    entry['model'] = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', entry['model'])])

    # fit the model with the training data
    entry['model'].fit(X_train, Y_train)
    # make predictions with the testing data (once — the result is reused below)
    predictions = entry['model'].predict(X_test)
    # calculate and store the accuracy
    accuracy = accuracy_score(Y_test, predictions)
    entry['accuracy'] = accuracy
    # print classifier accuracy
    print('Classifier: {}, Accuracy: {}'.format(name, accuracy))

# Upload the models in Giskard 🚀🚀🚀

## Initiate a project

In [None]:
from giskard import GiskardClient

# Address of the Giskard server. The default below targets a local
# installation (see https://docs.giskard.ai/start/guides/installation).
url = "http://localhost:19000"
# To upload to the hosted Giskard instance instead, use:
#url = "http://app.giskard.ai"

# API token generated from your Giskard instance
token = "YOUR GENERATED TOKEN"
client = GiskardClient(url, token)

# General form: client.create_project("project_key", "PROJECT_NAME", "DESCRIPTION")
# The arguments are free to choose, but the project key must be unique and lower case.
project_key = "churn_analysis_with_tfs"
project_name = "Telco Kaggle Churn Analysis"
project_description = "Project to predict if a customer quits"
churn_analysis_with_tfs = client.create_project(project_key, project_name, project_description)

# If a project with this key already exists, fetch it instead of creating it:
#churn_analysis_with_tfs = client.get_project("churn_analysis_with_tfs")


## Upload a specific model and a dataset (see [documentation](https://docs.giskard.ai/start/guides/upload-your-model))

In [None]:
# Upload the baseline (dummy) model together with the held-out test set.
baseline_model = models['dummy_classifier']['model']

churn_analysis_with_tfs.upload_model_and_df(
    # Function taking a pandas dataframe and returning class probabilities
    # (for a regression model it would return the predictions instead)
    prediction_function=baseline_model.predict_proba,
    # "classification" or "regression"
    model_type='classification',
    # The dataset used to inspect the model
    df=test_data,
    # Column names of df mapped to their type (category, numeric, text)
    column_types=column_types,
    # Column of df holding the ground truth
    target='Churn',
    # Names of the features consumed by prediction_function
    feature_names=list(feature_types.keys()),
    # Labels in the same order as the columns returned by predict_proba.
    # They are read from the fitted pipeline (`classes_`) rather than
    # hard-coded, so the order cannot drift from the model's output.
    classification_labels=baseline_model.classes_.tolist(),
    # Display names in Giskard
    model_name='dummy_classifier',
    dataset_name='test_data',
)

## Upload more models

In [None]:
# Upload every remaining model; the dummy classifier was already uploaded
# together with the test dataset.
for name, entry in models.items():
    if name == 'dummy_classifier':
        continue

    fitted_pipeline = entry['model']
    churn_analysis_with_tfs.upload_model(
        # Function taking a pandas dataframe and returning class probabilities
        # (for a regression model it would return the predictions instead)
        prediction_function=fitted_pipeline.predict_proba,
        # "classification" or "regression"
        model_type='classification',
        # Names of the features consumed by prediction_function
        feature_names=list(feature_types.keys()),
        # Display name of the model in Giskard
        name=name,
        # Optional: name of the target column
        target="Churn",
        # Labels in the same order as the columns returned by predict_proba,
        # read from the fitted pipeline instead of being hard-coded
        classification_labels=fitted_pipeline.classes_.tolist(),
    )

## Upload more datasets

In [None]:
# Upload the training split as an additional dataset of the project.
churn_analysis_with_tfs.upload_df(
    name="train_data",          # display name of the dataset in Giskard
    df=train_data,              # the data frame to upload
    target="Churn",             # omit when the data frame has no target column
    column_types=column_types,  # type of every column of df
)