![giskard_logo.png](https://raw.githubusercontent.com/Giskard-AI/giskard/main/readme/Logo_full_darkgreen.png)

## Installing `giskard` and other packages

In [None]:
!pip install giskard transformers tweepy datasets torch

## Connect the external worker in daemon mode

In [None]:
!giskard worker start -d

# What is sentiment analysis? 🤔
Sentiment analysis is the technique of determining the sentiment expressed in a given text. The most common sentiment classes are 'Positive', 'Neutral' and 'Negative'.

# Why is Sentiment Analysis important? 🔖
Sentiment Analysis helps you understand your customers better.
Imagine you have released a product and want to monitor how it is performing based on reviews, feedback, or even Twitter posts about it. A correct understanding of user feedback and sentiment will help you improve your product and its reach.

## Let's build a sentiment analysis model 👷‍♀️

In [None]:
# Read the data 
import re

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split

import tweepy


# Location of the Twitter US airline sentiment CSV in the Giskard examples repository
url = 'https://raw.githubusercontent.com/Giskard-AI/examples/main/datasets/twitter_us_airline_sentiment_analysis.csv'

# Download the CSV and parse it into a DataFrame
data = pd.read_csv(url)
# Preview the first rows to see what the data looks like
data.head()

In [None]:
# Preprocess text (username and link placeholders)
# Replace the Username with @user and the URL in the tweet with http for better comprehension of data for the model
def preprocess(text):
    """Normalise a tweet before it is handed to the tokenizer.

    Runs of whitespace are collapsed to single spaces, every token starting
    with ``http`` becomes the placeholder ``http``, every ``@``-prefixed token
    becomes ``@user``, and the result is lower-cased.

    Args:
        text: The raw tweet as a string.

    Returns:
        The normalised, lower-cased string.
    """
    collapsed = " ".join(text.split())
    without_links = re.sub(r'http\S+', 'http', collapsed)
    without_handles = re.sub(r'@\S+', '@user', without_links)
    return without_handles.lower()

In [None]:
# Seed NumPy's global random number generator
np.random.seed(112)

# Load the pretrained tokenizer and sequence-classification model published under `model_name`
model_name = "Souvikcmsa/SentimentAnalysisDistillBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [None]:
# Define the evaluation metrics 

def compute_metrics(eval_pred):
    """Score predictions with accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predicted class is
            taken as the argmax over axis 1 of ``predictions``
            (presumably per-class scores of shape (n_samples, n_classes) —
            confirm against the Trainer).

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=predicted)
    # The three remaining metrics share the same macro-averaged call signature.
    macro = {
        metric_name: scorer(y_true=labels, y_pred=predicted, average='macro')
        for metric_name, scorer in (
            ("recall", recall_score),
            ("precision", precision_score),
            ("f1", f1_score),
        )
    }

    return {
        "accuracy": accuracy,
        "precision": macro["precision"],
        "recall": macro["recall"],
        "f1": macro["f1"],
    }

### Let's train the model on our data

In [None]:
# Turn off gradient tracking for every parameter of the base model.
# NOTE(review): presumably this leaves only the classification head trainable — confirm.
for param in model.base_model.parameters():
 param.requires_grad = False

# ----- 1. Preprocess data -----#
# Normalise every tweet with `preprocess` and keep the results as a plain list

X = list(data["text"].apply(preprocess))
# Integer id assigned to each sentiment label
classification_labels_mapping = {'negative': 0,'neutral': 1, 'positive':2}
y = list(data['airline_sentiment'].map(classification_labels_mapping)) # Converting target labels to numeric just for training
# Label names in the order they appear in the mapping
labels = list(classification_labels_mapping.keys())
# Hold out 20% of the samples for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# Tokenize both splits with padding enabled and truncation at max_length=256
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=256)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=256)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Torch dataset over tokenizer output, with optional labels.

    Args:
        encodings: Mapping from field name to an indexable sequence of
            per-sample values; it must contain the key ``"input_ids"``,
            which determines the dataset length.
        labels: Optional indexable sequence holding one label per sample.
            ``None`` (the default) or an empty sequence produces items
            without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, adding ``"labels"`` when labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for None/emptiness explicitly rather than by truthiness:
        # bool() of a multi-element NumPy array or tensor raises an error.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized training and validation splits, with their labels, in torch datasets
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)


In [None]:
# Define a new Trainer from all the objects constructed so far


# Training configuration handed to the Trainer below
training_args = TrainingArguments(
 output_dir='output',
 learning_rate=2e-5,
 per_device_train_batch_size=16,
 per_device_eval_batch_size=16,
 num_train_epochs=5,
 weight_decay=0.01,
 save_strategy="epoch", 
)

# The Trainer receives the model, both datasets, the tokenizer and the metric function
trainer = Trainer(
 model=model,
 args=training_args,
 train_dataset=train_dataset,
 eval_dataset=val_dataset,
 tokenizer=tokenizer,
 compute_metrics=compute_metrics,
)
# Train the pretrained model on the training dataset
trainer.train()

In [None]:
# Evaluate on the validation dataset given to the Trainer; metrics come from `compute_metrics`
trainer.evaluate()


## 80% accuracy, not bad! But are the evaluation metrics enough to say the model is good? 💭


* What if the model gives a different prediction when you change a word from "He" to "She"? 🤔

* What if words such as "ain't", which are more common in African American English, are introduced into the statement: will the prediction change? 🤔

* Do performance metrics such as accuracy or F1 score change when you replace the user names in your texts with more feminine ones?

* Is your model gender or race biased? 🤔🤔

These are really important questions that need definite answers, but because of a lack of testing tools they are often neglected, which leads to serious issues in the future 😟



## 🌟 Don't worry! [Giskard](https://github.com/Giskard-AI/giskard) is here to help! We are an open-source Quality Assurance and CI/CD platform that empowers Data Scientists to inspect and test their models while they are building them, enabling them to ship to production with confidence!


### Here is a sneak peek of our test suites 😍

![giskard_test_suites.png](https://raw.githubusercontent.com/Giskard-AI/examples/main/images/giskard_tests.gif)

# Can't wait to explore?! Let's start with [Installing Giskard](https://docs.giskard.ai/start/) 🚀

## All we need is a simple function that returns the prediction probabilities 😌

In [None]:
def predict(test_dataset):
    """Return class probabilities for the texts in ``test_dataset``.

    Args:
        test_dataset: A pandas DataFrame that is squeezed along ``axis=1``
            before use (presumably a frame with a single text column —
            confirm against the caller).

    Returns:
        A NumPy array holding the softmax, over the last dimension, of the
        model's raw predictions.
    """
    # Collapse the frame to one Series of texts and normalise each entry.
    texts = test_dataset.squeeze(axis=1).apply(preprocess)
    encodings = tokenizer(list(texts), padding=True, truncation=True, max_length=256)

    # A Trainer built from the model alone is used only to run inference
    # over the unlabeled torch dataset.
    inference_trainer = Trainer(model)
    raw_scores, _label_ids, _metrics = inference_trainer.predict(Dataset(encodings))

    # Convert raw scores to probabilities and hand them back as NumPy.
    probabilities = torch.from_numpy(raw_scores).softmax(dim=-1)
    return probabilities.detach().cpu().numpy()

In [None]:
# A quick check that `predict` works before uploading to Giskard
feature_names = ['text']
# First five rows, restricted to the feature column(s)
test_df = data[feature_names][:5]
predict(test_df)

## Let the adventure begin 🤩

In [None]:
from giskard import GiskardClient

# NOTE: this rebinds `url`, which earlier in the notebook held the dataset CSV location.
url = "http://localhost:19000" # Use this address if Giskard is installed locally (for installation, see: https://docs.giskard.ai/start/guides/installation)
token = "YOUR GENERATED TOKEN" # Placeholder: generate your API token in the Admin tab of the Giskard application (for installation, see: https://docs.giskard.ai/start/guides/installation)

# Client used for all subsequent Giskard calls
client = GiskardClient(url, token)

# your_project = client.create_project("project_key", "PROJECT_NAME", "DESCRIPTION")
# Choose the arguments you want, but "project_key" should be unique and in lower case
sentiment_analysis = client.create_project("sentiment_analysis", "Sentimental Analysis for Twitter Data", "Sentimental Analysis for Twitter Data")

# If you have already created a project with the key "sentiment_analysis", use this instead:
#sentiment_analysis = client.get_project("sentiment_analysis")

In [None]:
# Column name -> Giskard column type for the dataset uploaded with the model
column_types={ 
 'airline_sentiment': "category",
 "text": "text"
 }

In [None]:
# Upload the prediction function together with a 100-row random sample of the labelled data
sentiment_analysis.upload_model_and_df(
 prediction_function=predict, # Python function that takes a pandas DataFrame as input and returns probabilities (classification) or predictions (regression)
 model_type='classification', # "classification" for a classification model OR "regression" for a regression model
 df=data[['text','airline_sentiment']].sample(100), # The dataset you want to use to inspect your model
 column_types=column_types, # A dictionary with the column names of df as keys and their types (category, numeric, text) as values
 target='airline_sentiment', # The column name in df corresponding to the actual target variable (ground truth)
 feature_names=['text'], # List of the feature names used by prediction_function
 model_name='sentiment_analysis', # Name of the model
 dataset_name='twitter_airlinedata', # Name of the dataset
 classification_labels=labels # List of the classification labels of your prediction
)

## Hurray 🥳 Your dataset and model are now uploaded to Giskard and are available at http://localhost:19000

## Want to download real-time data? Let's start by connecting to your [Twitter account using the API](https://developer.twitter.com/en/docs/tutorials/step-by-step-guide-to-making-your-first-request-to-the-twitter-api-v2)

In [None]:
# Add Twitter API key and secret (placeholders: replace with your own credentials)
consumer_key = "XXX"
consumer_secret = "XXX"

# Handling authentication with Twitter
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)

# Create a wrapper for the Twitter API.
# `wait_on_rate_limit_notify` exists only in Tweepy 3.x; Tweepy 4 removed the
# parameter and rejects it with a TypeError, so fall back to the call without it.
try:
    api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
except TypeError:
    api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# Helper function for handling pagination in our search and handle rate limits
def limit_handled(cursor):
    """Yield every item from ``cursor``, sleeping whenever the rate limit is hit.

    Args:
        cursor: An iterator over search results, such as the object returned
            by ``tweepy.Cursor(...).items(n)``.

    Yields:
        Each item produced by ``cursor`` until it is exhausted.
    """
    import time  # local import: the notebook's import cell does not import `time`

    # Tweepy 3.x raises `RateLimitError`; Tweepy 4 renamed it `TooManyRequests`.
    # Resolve the class once up front: naming a missing attribute directly in an
    # `except` clause would raise AttributeError as soon as any exception
    # (including the normal StopIteration) reached that clause.
    rate_limit_error = getattr(tweepy, "TooManyRequests", None) or getattr(tweepy, "RateLimitError")

    while True:
        try:
            item = next(cursor)
        except StopIteration:
            return
        except rate_limit_error:
            print('Reached rate limite. Sleeping for >15 minutes')
            time.sleep(15 * 61)
            continue
        # Yield outside the `try` so exceptions raised by the consumer are not
        # mistaken for errors coming from the cursor.
        yield item

# Define the term we will be using for searching tweets
query = '#virginairlines'
# Exclude retweets from the results
query = query + ' -filter:retweets'

# Define how many tweets to get from the Twitter API
count = 1000

# Tweepy 4 renamed `API.search` to `API.search_tweets`; use whichever method
# the installed version provides.
search_method = getattr(api, "search_tweets", None) or api.search

# Let's search for tweets using Tweepy
search = limit_handled(
    tweepy.Cursor(
        search_method,
        q=query,
        tweet_mode='extended',
        lang='en',
        result_type="recent",
    ).items(count)
)

In [None]:
# Let's retrieve the important data from the search
tweets = []
for tweet in search:
    try:
        record = {'id': tweet.id, 'user': tweet.user.name, 'date': tweet.created_at, 'text': tweet.full_text}
    except AttributeError:
        # Best effort: skip results that lack one of the expected fields.
        # Only AttributeError is caught, so interrupts and unrelated errors
        # are no longer silently swallowed.
        continue
    tweets.append(record)


In [None]:
# Load the data in a dataframe
# Do not truncate column contents, and allow very wide output, when displaying
pd.set_option('max_colwidth', None)
pd.set_option('display.width', 3000) 
# One row per collected tweet
prod_data = pd.DataFrame(tweets)

prod_data.head()

## Let's upload this data to Giskard!

In [None]:
# Upload the collected tweets. `column_types` is a {column name: type} mapping,
# the same format used for the model upload, not a list of names.
# Only the `text` column is sent so that every uploaded column has a declared type.
# NOTE(review): assumes upload_df validates columns like upload_model_and_df does;
# confirm against the Giskard client docs.
sentiment_analysis.upload_df(
    df=prod_data[['text']], # The dataset you want to upload
    column_types={'text': 'text'}, # Column name -> type for every column of df (there is no target here)
    name="production_data" # Name of the dataset
)

## Happy Exploration 🕵️‍♂️ 🚀