etl pipeline, machine learning, visualization

LN5user · Jun 11, 2021 · 0db1b50 · 0db1b50
commit 0db1b50
Show file tree

Hide file tree

Showing 9 changed files with 608 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+.DS_Store
+*.csv
+*.pkl
+__pycache__
diff --git a/README.md b/README.md
@@ -0,0 +1,41 @@
+# Disaster Response Pipeline Project
+### Table of Contents
+
+
+1. [Project Motivation](#overview)
+2. [Installation](#installation)
+3. [Instructions](#instructions)
+4. [File Descriptions](#files)
+5. [Discussion](#discussion)
+6. [Licensing, Authors, and Acknowledgements](#licensing)
+
+### Project Overview<a name="overview"></a>
+## Installation <a name="installation"></a>
+The code was tested using Python version 3.9. 
+For other necessary libraries please use requirements.txt
+```bash
+pip install -r requirements.txt
+```
+
+## Instructions <a name="instructions"></a>
+1. Run the following commands in the project's root directory to set up your database and model.
+
+ - To run ETL pipeline that cleans data and stores in database
+ `python data/process_data.py data/disaster_messages.csv data/disaster_categories.csv data/DisasterResponse.db`
+ - To run ML pipeline that trains classifier and saves
+ 1. If you wish to tune the hyperparameters with GridSearchCV:
+ `python models/train_classifier.py data/DisasterResponse.db models/classifier.pkl True`
+ 2. Otherwise, the model is trained with the already-optimized parameters:
+ `python models/train_classifier.py data/DisasterResponse.db models/classifier.pkl False`
+
+2. Run the following command in the app's directory to run your web app.
+ `python run.py`
+
+3. Go to http://0.0.0.0:3001/ (or http://localhost:3001/ if your browser does not accept 0.0.0.0)
+
+## File Descriptions <a name="files"></a>
+## Discussion <a name="discussion"></a>
+## Licensing, Authors, and Acknowledgements <a name="licensing"></a>
+
+
+Must give credit to Airbnb. You can find the Licensing for the data and more useful information at Airbnb [here](https://insideairbnb.com/get-the-data.html) or at the Kaggle [here](https://www.kaggle.com/airbnb/seattle).
diff --git a/app/run.py b/app/run.py
@@ -0,0 +1,72 @@
+import json
+from string import punctuation
+import plotly
+from flask import Flask
+from flask import render_template, request, jsonify
+from plotly.graph_objs import Bar
+import joblib
+import pandas as pd
+from sqlalchemy import create_engine
+from wrangling_script.wrangle_data import return_figures
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+app = Flask(__name__)
+
+def tokenize(text):
+ custom_punctuation = punctuation.replace("'", "")
+ text = text.translate(str.maketrans('', '', custom_punctuation))
+ tokens = word_tokenize(text)
+ lemmatizer = WordNetLemmatizer()
+
+ clean_tokens = []
+ for tok in tokens:
+ clean_tok = lemmatizer.lemmatize(tok).lower().strip()
+ clean_tokens.append(clean_tok)
+
+ return clean_tokens
+
+
+# load model
+model = joblib.load("../models/classifier.pkl")
+# load data
+engine = create_engine('sqlite:https:///../data/DisasterResponse.db')
+df = pd.read_sql_table('DisasterResponse', engine)
+
+# index webpage displays cool visuals and receives user input text for model
+@app.route('/')
+@app.route('/index')
+def index():
+
+ graphs = return_figures(df)
+ # encode plotly graphs in JSON
+ ids = ["graph-{}".format(i) for i, _ in enumerate(graphs)]
+ graphJSON = json.dumps(graphs, cls=plotly.utils.PlotlyJSONEncoder)
+
+ # render web page with plotly graphs
+ return render_template('master.html', ids=ids, graphJSON=graphJSON)
+
+
+# web page that handles user query and displays model results
+@app.route('/go')
+def go():
+ # save user input in query
+ query = request.args.get('query', '') 
+
+ # use model to predict classification for query
+ classification_labels = model.predict([query])[0]
+ classification_results = dict(zip(df.columns[4:], classification_labels))
+
+ # This will render the go.html Please see that file. 
+ return render_template(
+ 'go.html',
+ query=query,
+ classification_result=classification_results
+ )
+
+
+def main():
+ app.run(host='0.0.0.0', port=3001, debug=True)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/app/templates/go.html b/app/templates/go.html
@@ -0,0 +1,24 @@
+{% extends "master.html" %}
+{% block title %}Results{% endblock %}
+
+{% block message %}
+ <hr />
+ <h4 class="text-center">MESSAGE</h4>
+ <p class="text-center"><i>{{query}}</i></p>
+{% endblock %}
+
+{% block content %}
+ <h1 class="text-center">Result</h1>
+ <ul class="list-group">
+ {% for category, classification in classification_result.items() %}
+ {% if classification == 1 %}
+ <li class="list-group-item list-group-item-success text-center">{{category.replace('_', ' ').title()}}</li>
+ {% else %}
+ <li class="list-group-item list-group-item-dark text-center">{{category.replace('_', ' ').title()}}</li>
+ {% endif %}
+ {% endfor %}
+
+ </ul>
+
+{% endblock %}
diff --git a/app/templates/master.html b/app/templates/master.html
@@ -0,0 +1,76 @@
+<!doctype html>
+<html lang="en">
+<head>
+ <meta charset="utf-8">
+ <meta http-equiv="X-UA-Compatible" content="IE=edge">
+ <meta name="viewport" content="width=device-width, initial-scale=1">
+
+ <title>Disasters</title>
+
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap-theme.min.css" integrity="sha384-rHyoN1iRsVXV4nD0JutlnGaslCJuC7uwjduW9SVrLvRYooPp2bWYgmgJQIXwl/Sp" crossorigin="anonymous">
+ <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.3.1/jquery.min.js"></script>
+ <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
+</head>
+
+<body>
+
+<nav class="navbar navbar-inverse navbar-fixed-top">
+ <div class="container">
+ <div class="navbar-header">
+ <a class="navbar-brand" href="/">Disaster Response Project</a>
+ </div>
+ <div id="navbar" class="collapse navbar-collapse">
+ <ul class="nav navbar-nav">
+ <li><a href="https://www.udacity.com/">Made with Udacity</a></li>
+ <li><a href="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/">Contact</a></li>
+ </ul>
+ </div>
+ </div>
+</nav>
+
+
+<div class="jumbotron">
+ <div class="container">
+ <h1 class="text-center">Disaster Response Project</h1>
+ <p class="text-center">Analyzing message data for disaster response</p>
+ <hr />
+
+ <div class="row">
+ <div class="col-lg-12 form-group-lg">
+ <form action="/go" method="get">
+ <input type="text" class="form-control form-control-lg" name="query" placeholder="Enter a message to classify">
+ <div class="col-lg-offset-5">
+ <button type="submit" class="btn btn-lg btn-success">Classify Message</button>
+ </div>
+ </form>
+ </div>
+ </div>
+
+ {% block message %}
+ {% endblock %}
+ </div>
+</div>
+
+<div class="container mt-3 text-center">
+ {% block content %}
+ <div class="page-header">
+ <h1 class="text-center">Overview of Training Dataset</h1>
+ </div>
+ {% endblock %}
+
+ {% for id in ids %}
+ <div id="{{id}}"></div>
+ {% endfor %}
+</div>
+
+<script type="text/javascript">
+ const graphs = {{graphJSON | safe}};
+ const ids = {{ids | safe}};
+ for(let i in graphs) {
+ Plotly.plot(ids[i], graphs[i].data, graphs[i].layout);
+ }
+</script>
+
+</body>
+</html>
diff --git a/app/wrangling_script/wrangle_data.py b/app/wrangling_script/wrangle_data.py
@@ -0,0 +1,102 @@
+
+import plotly.graph_objects as go
+
+
+
+def return_figures(df):
+ """
+ Prepare and visualize data
+ INPUT
+ df: pandas dataframe
+
+ OUTPUT
+ None
+ """
+
+ # extract data needed for visuals
+ genre_counts = df.groupby('genre').count()['message']
+ genre_names = list(genre_counts.index)
+ request_counts = df[df['request'] == 1].groupby('genre').count()['message']
+ offer_counts = df[df['offer'] == 1].groupby('genre').count()['message']
+
+
+ graph_one =[
+ go.Bar(
+ x=genre_names,
+ y=genre_counts,
+ name = 'Total'
+
+ ),
+ go.Bar(
+ x=genre_names,
+ y= request_counts,
+ name = 'Request'
+ ),
+ go.Bar(
+ x=genre_names,
+ y= offer_counts,
+ name = 'Offer'
+ )
+ ]
+
+
+
+
+ layout_one = dict(title='Distribution of Message Genres and Help Type',
+ xaxis=dict(title="Count"),
+ yaxis=dict(title="Genre"),
+ height=500,
+ width=1400,
+ autosize=False
+ )
+
+ # count number of occurrences 1 for each label
+ count_one_occurence = {}
+ for col_name in df.columns[4:]:
+ cnt = df[df[col_name] == 1].shape[0]
+ col_name = col_name.replace('_', ' ')
+ count_one_occurence[col_name] = cnt
+ count_one_occurence = dict(
+ sorted(count_one_occurence.items(), key=lambda item: item[1], reverse=True))
+ graph_two = []
+
+ graph_two.append(
+ go.Bar(
+ x=list(count_one_occurence.keys()),
+ y=list(count_one_occurence.values()),
+
+ )
+ )
+
+ layout_two = dict(title='Distribution of Disaster Types',
+ yaxis=dict(title="Count"),
+ xaxis=dict(title="Labels"),
+ height=500,
+ width=1400,
+ autosize=False
+ )
+
+ graph_three = []
+ graph_three .append(
+ go.Pie(
+ labels=list(count_one_occurence.keys()),
+ values=list(count_one_occurence.values()),
+
+ )
+ )
+
+ layout_three = dict(title='Distribution of Disaster Types in Percent ',
+ yaxis=dict(title="Count"),
+ xaxis=dict(title="Labels"),
+ height=900,
+ width=1200,
+ autosize=False
+ )
+
+ # append all charts
+ figures = []
+ figures.append(dict(data=graph_one, layout=layout_one))
+ figures.append(dict(data=graph_two, layout=layout_two))
+ figures.append(dict(data=graph_three, layout=layout_three))
+ #figures.append(dict(data=graph_four, layout=layout_four))
+ return figures
diff --git a/data/DisasterResponse.db b/data/DisasterResponse.db