{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "Y1CS0mREa6da", "pycharm": { "name": "#%% md\n" } }, "source": [ "![giskard_logo.png](https://raw.githubusercontent.com/Giskard-AI/giskard/main/readme/Logo_full_darkgreen.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Installing `giskard` and other packages" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install giskard transformers tweepy datasets torch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Connect the external worker in daemon mode" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!giskard worker start -d" ] }, { "cell_type": "markdown", "metadata": { "id": "QVlWMnxFMslK", "pycharm": { "name": "#%% md\n" } }, "source": [ "# What is sentiment analysis? 🤔\n", "Sentiment Analysis is the technique of determining the sentiments involved in the given text. The most prominent sentiments involved are 'Positive', 'Neutral' or 'Negative'. \n", "\n", "# Why is Sentiment Analysis important? 🔖\n", "Sentiment Analysis helps you understand your customers better.\n", "Imagine you have released a product and want to monitor the performance of your product based on the reviews and feedbacks or even the twitter posts about the product. The right understanding of user feedback and sentiment will help you improve your product and it's reach.\n", "\n", "## Lets build a sentiment analysis model 👷‍♀️" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "XA0apbYgxWgg", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Read the data \n", "import re\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import torch\n", "from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer\n", "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score\n", "from sklearn.model_selection import train_test_split\n", "\n", "import tweepy\n", "\n", "\n", "url = 'https://raw.githubusercontent.com/Giskard-AI/examples/main/datasets/twitter_us_airline_sentiment_analysis.csv'\n", "\n", "data = pd.read_csv(url)\n", "# Lets see how our data looks like\n", "data.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "JsRR0NbFq_y7", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Preprocess text (username and link placeholders)\n", "# Replace the Username with @user and the URL in the tweet with http for better comprehension of data for the model\n", "def preprocess(text):\n", " text = \" \".join(text.split())\n", " text = re.sub(r'http\\S+', 'http', text) \n", " text = re.sub(r'@\\S+', '@user', text)\n", " text = text.lower()\n", " return text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5oseyfAwQTdr", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "np.random.seed(112)\n", "\n", "# Define pretrained tokenizer and model\n", "model_name = \"Souvikcmsa/SentimentAnalysisDistillBERT\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", "model = AutoModelForSequenceClassification.from_pretrained(model_name)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xXl-wgTteGf1", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Define the evaluation metrics \n", "\n", "def compute_metrics(eval_pred):\n", " pred, labels = eval_pred\n", " pred = np.argmax(pred, axis=1)\n", "\n", " accuracy = accuracy_score(y_true=labels, y_pred=pred)\n", " recall = recall_score(y_true=labels, y_pred=pred, average='macro')\n", " precision = precision_score(y_true=labels, y_pred=pred, average='macro')\n", " f1 = f1_score(y_true=labels, y_pred=pred, average='macro')\n", "\n", " return {\"accuracy\": accuracy, \"precision\": precision, \"recall\": recall, \"f1\": f1}" ] }, { "cell_type": "markdown", "metadata": { "id": "d4dCBYAatnRe", "pycharm": { "name": "#%% md\n" } }, "source": [ "### Lets train the model on our data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "K0lrMmqvfGNq", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "for param in model.base_model.parameters():\n", " param.requires_grad = False\n", "\n", "# ----- 1. Preprocess data -----#\n", "# Preprocess data\n", "\n", "X = list(data[\"text\"].apply(preprocess))\n", "classification_labels_mapping = {'negative': 0,'neutral': 1, 'positive':2}\n", "y = list(data['airline_sentiment'].map(classification_labels_mapping)) # Converting target labels to numeric just for training\n", "labels = list(classification_labels_mapping.keys())\n", "X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)\n", "X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=256)\n", "X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=256)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pHp4kLGVgZxJ", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Create torch dataset\n", "class Dataset(torch.utils.data.Dataset):\n", " def __init__(self, encodings, labels=None):\n", " self.encodings = encodings\n", " self.labels = labels\n", "\n", " def __getitem__(self, idx):\n", " item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n", " if self.labels:\n", " item[\"labels\"] = torch.tensor(self.labels[idx])\n", " return item\n", "\n", " def __len__(self):\n", " return len(self.encodings[\"input_ids\"])\n", "\n", "train_dataset = Dataset(X_train_tokenized, y_train)\n", "val_dataset = Dataset(X_val_tokenized, y_val)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "eO67ZdIBekE5", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Define a new Trainer with all the objects we constructed so far\n", "\n", "\n", "training_args = TrainingArguments(\n", " output_dir='output',\n", " learning_rate=2e-5,\n", " per_device_train_batch_size=16,\n", " per_device_eval_batch_size=16,\n", " num_train_epochs=5,\n", " weight_decay=0.01,\n", " save_strategy=\"epoch\", \n", ")\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " eval_dataset=val_dataset,\n", " tokenizer=tokenizer,\n", " compute_metrics=compute_metrics,\n", ")\n", "# Train pre-trained model\n", "trainer.train()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "xV9BpzNYhtmJ", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "trainer.evaluate()\n" ] }, { "cell_type": "markdown", "metadata": { "id": "EbO2Z6_ou6RP", "pycharm": { "name": "#%% md\n" } }, "source": [ "## 80% Acurracy, not bad! But is the evaluation metrics enough to say the model is good? 💭\n", "\n", "\n", "* What if the model is showing different prediction when you change the words from \"He\" to \"She\" ? 🤔\n", "\n", "* What if the words like \"ain't\" which is more used by African American person is introduced in the statement, will the prediction change ? 🤔\n", "\n", "* Is the performance metrics like accuracy, F1 score changing when you change the name of the user to more feminine in your texts?\n", "\n", "* Is your model gender or race biased? 🤔🤔\n", "\n", "These are really important questions which we need to have definite answers but due to lack of tools for testing, it is often neglected leading to serious issues in future 😟\n", "\n" ] }, { "cell_type": "markdown", "metadata": { "id": "g-Oo1G8wD0aq", "pycharm": { "name": "#%% md\n" } }, "source": [ "## 🌟 Don't worry! [Giskard](https://github.com/Giskard-AI/giskard) is here to help! We are an open source Quality Assurance and CI/CD platform who empowers Data Scientists to inspect and test their model while they are creating it, enabling them to publish it on productions with confidence!\n" ] }, { "cell_type": "markdown", "metadata": { "id": "cItO7aQiRtTr", "pycharm": { "name": "#%% md\n" } }, "source": [ "### Here is a sneak peek of our test suites 😍" ] }, { "cell_type": "markdown", "metadata": { "id": "8fxoU7zfO-j2", "pycharm": { "name": "#%% md\n" } }, "source": [ "![giskard_test_suites.png](https://raw.githubusercontent.com/Giskard-AI/examples/main/images/giskard_tests.gif)" ] }, { "cell_type": "markdown", "metadata": { "id": "wF91vdtrR2w2", "pycharm": { "name": "#%% md\n" } }, "source": [ "# Cant wait to explore?! Lets start with [Installing Giskard](https://docs.giskard.ai/start/) 🚀" ] }, { "cell_type": "markdown", "metadata": { "id": "f8sbXBdhS9ur", "pycharm": { "name": "#%% md\n" } }, "source": [ "## All we need is a simple function that returns the prediction probabilities 😌" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_Xa0R90auV0w", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "def predict(test_dataset):\n", " test_dataset= test_dataset.squeeze(axis=1)\n", " X_test = list(test_dataset.apply(preprocess))\n", " X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=256)\n", "\n", " # Create torch dataset\n", " test_dataset = Dataset(X_test_tokenized)\n", "\n", " # Define test trainer\n", " test_trainer = Trainer(model)\n", "\n", " # Make prediction\n", " raw_pred, _, _ = test_trainer.predict(test_dataset)\n", " predictions = torch.nn.functional.softmax(torch.from_numpy(raw_pred), dim=-1)\n", " predictions = predictions.cpu().detach().numpy()\n", "\n", " return predictions" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8q2P-PT3Os1S", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# A quick test to verify the predict is working before uploading on Giskard\n", "feature_names = ['text']\n", "test_df = data[feature_names][:5]\n", "predict(test_df)" ] }, { "cell_type": "markdown", "metadata": { "id": "iFvZwm4yTjkg", "pycharm": { "name": "#%% md\n" } }, "source": [ "## Let the adventure begin 🤩" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "gw-VIs42VgcZ", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "from giskard import GiskardClient\n", "\n", "url = \"http://localhost:19000\" # If Giskard is installed locally (for installation, see: https://docs.giskard.ai/start/guides/installation)\n", "token = \"YOUR GENERATED TOKEN\" # you can generate your API token in the Admin tab of the Giskard application (for installation, see: https://docs.giskard.ai/start/guides/installation)\n", "\n", "client = GiskardClient(url, token)\n", "\n", "# your_project = client.create_project(\"project_key\", \"PROJECT_NAME\", \"DESCRIPTION\")\n", "# Choose the arguments you want. But \"project_key\" should be unique and in lower case\n", "sentiment_analysis = client.create_project(\"sentiment_analysis\", \"Sentimental Analysis for Twitter Data\", \"Sentimental Analysis for Twitter Data\")\n", "\n", "# If you've already created a project with the key \"sentiment_analysis\" use\n", "#sentiment_analysis = client.get_project(\"sentiment_analysis\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "IHhTzzEeWbIj", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "column_types={ \n", " 'airline_sentiment': \"category\",\n", " \"text\": \"text\"\n", " }" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "FVxeLEaMWPzL", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "sentiment_analysis.upload_model_and_df(\n", " prediction_function=predict, # Python function which takes pandas dataframe as input and returns probabilities for classification model OR returns predictions for regression model\n", " model_type='classification', # \"classification\" for classification model OR \"regression\" for regression model\n", " df=data[['text','airline_sentiment']].sample(100), # The dataset you want to use to inspect your model\n", " column_types=column_types, # # A dictionary with columns names of df as key and types(category, numeric, text) of columns as values\n", " target='airline_sentiment', # The column name in df corresponding to the actual target variable (ground truth).\n", " feature_names=['text'], # List of the feature names of prediction_function\n", " model_name='sentiment_analysis', # Name of the model\n", " dataset_name='twitter_airlinedata', # Name of the dataset\n", " classification_labels=labels # List of the classification labels of your prediction\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "IgxV_myEUFSS", "pycharm": { "name": "#%% md\n" } }, "source": [ "## Hurray 🥳 Your Dataset and model is now uploaded on Giskard and is available at http://localhost:19000" ] }, { "cell_type": "markdown", "metadata": { "id": "1UwDVqooUlLC", "pycharm": { "name": "#%% md\n" } }, "source": [ "## Want to download real-time data? Let's start with connecting to your [twitter account using API](https://developer.twitter.com/en/docs/tutorials/step-by-step-guide-to-making-your-first-request-to-the-twitter-api-v2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ROSYwDIMRQvp", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Add Twitter API key and secret\n", "consumer_key = \"XXX\"\n", "consumer_secret = \"XXX\"\n", "\n", "# Handling authentication with Twitter\n", "auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)\n", "\n", "# Create a wrapper for the Twitter API\n", "api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3iEyy_V3RdPv", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Helper function for handling pagination in our search and handle rate limits\n", "def limit_handled(cursor):\n", " while True:\n", " try:\n", " yield cursor.next()\n", " except tweepy.RateLimitError:\n", " print('Reached rate limite. Sleeping for >15 minutes')\n", " time.sleep(15 * 61)\n", " except StopIteration:\n", " break\n", "\n", "# Define the term we will be using for searching tweets\n", "query = '#virginairlines'\n", "query = query + ' -filter:retweets'\n", "\n", "# Define how many tweets to get from the Twitter API \n", "count = 1000\n", "\n", "# Let's search for tweets using Tweepy \n", "search = limit_handled(tweepy.Cursor(api.search,\n", " q=query,\n", " tweet_mode='extended',\n", " lang='en',\n", " result_type=\"recent\").items(count))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "DuHe4FPORiDQ", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Let's retrive the important data from the search\n", "tweets = []\n", "for tweet in search:\n", " try:\n", " tweets.append({'id':tweet.id, 'user': tweet.user.name,'date':tweet.created_at,'text': tweet.full_text})\n", " \n", " except:\n", " pass\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "aQai4wxHRlBQ", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Load the data in a dataframe\n", "pd.set_option('max_colwidth', None)\n", "pd.set_option('display.width', 3000) \n", "prod_data = pd.DataFrame(tweets)\n", "\n", "prod_data.head()" ] }, { "cell_type": "markdown", "metadata": { "id": "lqt-1Gg1ZXoN", "pycharm": { "name": "#%% md\n" } }, "source": [ "## Lets upload this data on Giskard! " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_cdbPHdZ52KG", "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "sentiment_analysis.upload_df(\n", " df=prod_data, # The dataset you want to upload\n", " column_types=['text'], # All the column types without the target\n", " name=\"production_data\" # Name of the dataset\n", ")" ] }, { "cell_type": "markdown", "metadata": { "id": "EZgaQ3NLaYBL", "pycharm": { "name": "#%% md\n" } }, "source": [ "## Happy Exploration 🕵️‍♂️ 🚀" ] } ], "metadata": { "accelerator": "GPU", "colab": { "collapsed_sections": [], "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.15" } }, "nbformat": 4, "nbformat_minor": 4 }