From b42961c5f8286a82e57cbf2f333870949fa39c93 Mon Sep 17 00:00:00 2001 From: Andrew Yale Date: Mon, 30 Oct 2017 16:33:53 -0400 Subject: [PATCH] change to python 3 and add hashtag parsing --- DiscursiveEDA.ipynb | 190 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 173 insertions(+), 17 deletions(-) diff --git a/DiscursiveEDA.ipynb b/DiscursiveEDA.ipynb index 656ed31..33b284e 100644 --- a/DiscursiveEDA.ipynb +++ b/DiscursiveEDA.ipynb @@ -2,36 +2,37 @@ "cells": [ { "cell_type": "code", - "execution_count": 29, + "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# imports\n", - "\n", - "import urllib2\n", - "import json" + "from urllib.request import urlopen\n", + "import json\n", + "import re\n", + "from collections import defaultdict" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 9, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ "# get an s3 file from Discursive. This file is a sample of ~500 tweets. \n", "\n", - "response = urllib2.urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n", + "response = urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n", "html = response.read()\n", "resp = json.loads(html)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -40,19 +41,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "[{u'loc': None, u'description': u\"I Fuck Up... Just don't forget you Fuck Up Too.\", u'friends_count': 979, u'created': u'2017-01-10 18:14:08', u'text': u\"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us\\u2026 \", u'hashtags': u'[\"MSM\"]', u'original_name': u'Linda Suhler, Ph.D.', u'original_id': 347627434, u'id_str': u'818883641640177665', u'user_created': u'2012-12-29 17:54:08', u'followers': 3098, u'retweet_count': 0, u'retweet': u'Y', u'name': u'VFL2013'}, {u'loc': u'Lagos, Nigeria', u'description': u'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: peretzomasi51@gmail.com', u'friends_count': 128, u'created': u'2017-01-10 18:14:09', u'text': u'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', u'hashtags': u'[]', u'original_name': None, u'original_id': None, u'id_str': u'818883644039294976', u'user_created': u'2016-09-25 18:53:21', u'followers': 154, u'retweet_count': 0, u'retweet': u'N', u'name': u'PeretzOmasi'}, {u'loc': u'Schloss von Kastenfarbe', u'description': u'Extra Bl\\xf6d\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: \\u201eNicht alles was Ich teile ist gleich meine Meinung\\u201c http://www.faz.net/-gqz-7h2fh', u'friends_count': 496, u'created': u'2017-01-10 18:14:09', u'text': u'RT @dushanwegner: F\\xfcr eine #SPD-Abgeordnete? Hahaha. Witzig. \\U0001f602\\U0001f602\\U0001f602 https://t.co/YBoSx9YR5f', u'hashtags': u'[\"SPD\"]', u'original_name': u'Dushan Wegner', u'original_id': 14784064, u'id_str': u'818883645096202241', u'user_created': u'2012-03-11 21:22:57', u'followers': 145, u'retweet_count': 0, u'retweet': u'Y', u'name': u'kricketkrackel'}, {u'loc': None, u'description': None, u'friends_count': 249, u'created': u'2017-01-10 18:14:09', u'text': u'RT @pg80808: @qatarairways \\u2708\\ufe0f Did you know your ads support stories like this on Breitbart? 10 THINGS MILO HATES ABOUT ISLAM.\\u2026 ', u'hashtags': u'[]', u'original_name': u'ColoradoVsBreitbart', u'original_id': 814872255406620672, u'id_str': u'818883645188358146', u'user_created': u'2014-12-12 22:55:33', u'followers': 76, u'retweet_count': 0, u'retweet': u'Y', u'name': u'awordonplays'}, {u'loc': None, u'description': u'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', u'friends_count': 3361, u'created': u'2017-01-10 18:14:10', u'text': u\"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", u'hashtags': u'[\"racist\", \"fool\", \"MTV\", \"Madison\"]', u'original_name': None, u'original_id': None, u'id_str': u'818883647382192128', u'user_created': u'2011-08-02 13:45:16', u'followers': 3166, u'retweet_count': 0, u'retweet': u'N', u'name': u'Bonacker69'}]\n" + "[{'description': \"I Fuck Up... Just don't forget you Fuck Up Too.\", 'friends_count': 979, 'text': \"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us… \", 'hashtags': '[\"MSM\"]', 'original_id': 347627434, 'user_created': '2012-12-29 17:54:08', 'retweet': 'Y', 'loc': None, 'name': 'VFL2013', 'created': '2017-01-10 18:14:08', 'original_name': 'Linda Suhler, Ph.D.', 'followers': 3098, 'id_str': '818883641640177665', 'retweet_count': 0}, {'description': 'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: peretzomasi51@gmail.com', 'friends_count': 128, 'text': 'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', 'hashtags': '[]', 'original_id': None, 'user_created': '2016-09-25 18:53:21', 'retweet': 'N', 'loc': 'Lagos, Nigeria', 'name': 'PeretzOmasi', 'created': '2017-01-10 18:14:09', 'original_name': None, 'followers': 154, 'id_str': '818883644039294976', 'retweet_count': 0}, {'description': 'Extra Blöd\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: „Nicht alles was Ich teile ist gleich meine Meinung“ http://www.faz.net/-gqz-7h2fh', 'friends_count': 496, 'text': 'RT @dushanwegner: Für eine #SPD-Abgeordnete? Hahaha. Witzig. 😂😂😂 https://t.co/YBoSx9YR5f', 'hashtags': '[\"SPD\"]', 'original_id': 14784064, 'user_created': '2012-03-11 21:22:57', 'retweet': 'Y', 'loc': 'Schloss von Kastenfarbe', 'name': 'kricketkrackel', 'created': '2017-01-10 18:14:09', 'original_name': 'Dushan Wegner', 'followers': 145, 'id_str': '818883645096202241', 'retweet_count': 0}, {'description': None, 'friends_count': 249, 'text': 'RT @pg80808: @qatarairways ✈️ Did you know your ads support stories like this on Breitbart? 10 THINGS MILO HATES ABOUT ISLAM.… ', 'hashtags': '[]', 'original_id': 814872255406620672, 'user_created': '2014-12-12 22:55:33', 'retweet': 'Y', 'loc': None, 'name': 'awordonplays', 'created': '2017-01-10 18:14:09', 'original_name': 'ColoradoVsBreitbart', 'followers': 76, 'id_str': '818883645188358146', 'retweet_count': 0}, {'description': 'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', 'friends_count': 3361, 'text': \"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", 'hashtags': '[\"racist\", \"fool\", \"MTV\", \"Madison\"]', 'original_id': None, 'user_created': '2011-08-02 13:45:16', 'retweet': 'N', 'loc': None, 'name': 'Bonacker69', 'created': '2017-01-10 18:14:10', 'original_name': None, 'followers': 3166, 'id_str': '818883647382192128', 'retweet_count': 0}]\n" ] } ], "source": [ "# have a look at the data\n", "\n", - "print resp[:5]" + "print(resp[:5])" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -74,27 +75,182 @@ "# start exploring the available fields\n", "\n", "for x in resp[:5]:\n", - " print x['text']" + " print(x['text'])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# get all of the hashtags and counts\n", + "hashtags = defaultdict(int)\n", + "for tweet in resp:\n", + " # take out hashtags\n", + " hts = [ht for ht in tweet['text'].split() if ht.startswith('#')]\n", + " # add them to the dictionary\n", + " for ht in hts:\n", + " hashtags[ht] += 1\n" ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hashtag\t\t\t\tCount\n", + "#MSM \t\t33\n", + "#MAGA \t\t11\n", + "#StopSessions \t\t8\n", + "#FakeNews \t\t8\n", + "#ConfirmSessions!\t\t7\n", + "#SupportSessions\t\t7\n", + "#AmericaFirst \t\t7\n", + "#Breitbart \t\t6\n", + "#MakeAmericaGreatAgain’\t\t6\n", + "#SPD-Abgeordnete?\t\t5\n", + "#tcot \t\t5\n", + "#tichy \t\t4\n", + "#xing \t\t4\n", + "#Br… \t\t4\n", + "#Sessions \t\t4\n", + "#NewYork \t\t4\n", + "#NYC \t\t4\n", + "#Real… \t\t3\n", + "#TCOT \t\t3\n", + "#PJNET \t\t3\n", + "#pjnet \t\t3\n", + "#p2 \t\t3\n", + "#MakeAmericaGreatAgain'\t\t3\n", + "#CloserNation \t\t2\n", + "#worldnews \t\t2\n", + "#Obamacare \t\t2\n", + "#PlannedParenthood\t\t2\n", + "#PJNET🇺🇸… \t\t2\n", + "#Obama \t\t2\n", + "#Jerusalem \t\t2\n", + "#PresidentElectTrump\t\t2\n", + "#jerusalem \t\t2\n", + "#TimScott: \t\t2\n", + "#JeffSessions \t\t2\n", + "#AttorneyGeneral\t\t2\n", + "#Racist \t\t2\n", + "#breitbart \t\t2\n", + "#LiberalHate \t\t2\n", + "#EverythingIsRacial\t\t2\n", + "#racist \t\t1\n", + "#fool \t\t1\n", + "#MTV's \t\t1\n", + "#Madison \t\t1\n", + "#Libtards \t\t1\n", + "#Liberals \t\t1\n", + "#Racism \t\t1\n", + "#Racist… \t\t1\n", + "#AI \t\t1\n", + "#IA \t\t1\n", + "#machinelearning\t\t1\n", + "#AfD: \t\t1\n", + "#TRUMP \t\t1\n", + "#VETS \t\t1\n", + "#OATHKEEPERS \t\t1\n", + "#CONSTITUTION \t\t1\n", + "#2A \t\t1\n", + "#1A \t\t1\n", + "#ORPUW \t\t1\n", + "#NRA \t\t1\n", + "#TLOT \t\t1\n", + "#INFOWARS \t\t1\n", + "#DrudgeReport… \t\t1\n", + "#Resist \t\t1\n", + "#preach \t\t1\n", + "#Bannon \t\t1\n", + "#Germany \t\t1\n", + "#France \t\t1\n", + "#ConfirmationHearings\t\t1\n", + "#JeffSessions' \t\t1\n", + "#TheResistance \t\t1\n", + "#ResistTrump \t\t1\n", + "#Fascism \t\t1\n", + "#NYT \t\t1\n", + "#NYT=FakeNews \t\t1\n", + "#NowPlaying \t\t1\n", + "#woodpile \t\t1\n", + "#p… \t\t1\n", + "#populism\" \t\t1\n", + "#media \t\t1\n", + "#SPD-Abgeordnete\t\t1\n", + "#squirrel \t\t1\n", + "#beta \t\t1\n", + "#bitches \t\t1\n", + "#cuck… \t\t1\n", + "#liberal \t\t1\n", + "#leftists \t\t1\n", + "#bannon \t\t1\n", + "#manafort \t\t1\n", + "#kushner \t\t1\n", + "#putin \t\t1\n", + "#PublicDiplomacy\t\t1\n", + "#dortmund \t\t1\n", + "#TuesdayMotivation\t\t1\n", + "#Fakenews \t\t1\n", + "#verfassungsschutz\t\t1\n", + "#APT28 \t\t1\n", + "#ACA… \t\t1\n", + "#DumpKelloggs \t\t1\n", + "#RIPDNC \t\t1\n", + "#MerylStreep \t\t1\n", + "#Awesome...coming\t\t1\n", + "#TGDN \t\t1\n", + "#CCOT \t\t1\n", + "#diversity \t\t1\n", + "#StopFundingHate\t\t1\n" + ] + } + ], + "source": [ + "# print out the results of hashtags\n", + "print(\"Hashtag\\t\\t\\t\\tCount\")\n", + "for ht in sorted(hashtags, key=hashtags.get, reverse=True):\n", + " print(\"{:16s}\\t\\t{}\".format(ht, hashtags[ht]))\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python [Root]", + "display_name": "Python 3", "language": "python", - "name": "Python [Root]" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.6.3" } }, "nbformat": 4,