-
Notifications
You must be signed in to change notification settings - Fork 127
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
change to python 3 and add hashtag parsing
- Loading branch information
Showing
1 changed file
with
173 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,36 +2,37 @@ | |
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 29, | ||
"execution_count": 17, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# imports\n", | ||
"\n", | ||
"import urllib2\n", | ||
"import json" | ||
"from urllib.request import urlopen\n", | ||
"import json\n", | ||
"import re\n", | ||
"from collections import defaultdict" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 30, | ||
"execution_count": 9, | ||
"metadata": { | ||
"collapsed": true | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# get an s3 file from Discursive. This file is a sample of ~500 tweets. \n", | ||
"\n", | ||
"response = urllib2.urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n", | ||
"response = urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n", | ||
"html = response.read()\n", | ||
"resp = json.loads(html)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 34, | ||
"execution_count": 11, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
|
@@ -40,19 +41,19 @@ | |
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[{u'loc': None, u'description': u\"I Fuck Up... Just don't forget you Fuck Up Too.\", u'friends_count': 979, u'created': u'2017-01-10 18:14:08', u'text': u\"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us\\u2026 \", u'hashtags': u'[\"MSM\"]', u'original_name': u'Linda Suhler, Ph.D.', u'original_id': 347627434, u'id_str': u'818883641640177665', u'user_created': u'2012-12-29 17:54:08', u'followers': 3098, u'retweet_count': 0, u'retweet': u'Y', u'name': u'VFL2013'}, {u'loc': u'Lagos, Nigeria', u'description': u'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: [email protected]', u'friends_count': 128, u'created': u'2017-01-10 18:14:09', u'text': u'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', u'hashtags': u'[]', u'original_name': None, u'original_id': None, u'id_str': u'818883644039294976', u'user_created': u'2016-09-25 18:53:21', u'followers': 154, u'retweet_count': 0, u'retweet': u'N', u'name': u'PeretzOmasi'}, {u'loc': u'Schloss von Kastenfarbe', u'description': u'Extra Bl\\xf6d\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: \\u201eNicht alles was Ich teile ist gleich meine Meinung\\u201c https://www.faz.net/-gqz-7h2fh', u'friends_count': 496, u'created': u'2017-01-10 18:14:09', u'text': u'RT @dushanwegner: F\\xfcr eine #SPD-Abgeordnete? Hahaha. Witzig. \\U0001f602\\U0001f602\\U0001f602 https://t.co/YBoSx9YR5f', u'hashtags': u'[\"SPD\"]', u'original_name': u'Dushan Wegner', u'original_id': 14784064, u'id_str': u'818883645096202241', u'user_created': u'2012-03-11 21:22:57', u'followers': 145, u'retweet_count': 0, u'retweet': u'Y', u'name': u'kricketkrackel'}, {u'loc': None, u'description': None, u'friends_count': 249, u'created': u'2017-01-10 18:14:09', u'text': u'RT @pg80808: @qatarairways \\u2708\\ufe0f Did you know your ads support stories like this on Breitbart? 10 THINGS MILO HATES ABOUT ISLAM.\\u2026 ', u'hashtags': u'[]', u'original_name': u'ColoradoVsBreitbart', u'original_id': 814872255406620672, u'id_str': u'818883645188358146', u'user_created': u'2014-12-12 22:55:33', u'followers': 76, u'retweet_count': 0, u'retweet': u'Y', u'name': u'awordonplays'}, {u'loc': None, u'description': u'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', u'friends_count': 3361, u'created': u'2017-01-10 18:14:10', u'text': u\"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", u'hashtags': u'[\"racist\", \"fool\", \"MTV\", \"Madison\"]', u'original_name': None, u'original_id': None, u'id_str': u'818883647382192128', u'user_created': u'2011-08-02 13:45:16', u'followers': 3166, u'retweet_count': 0, u'retweet': u'N', u'name': u'Bonacker69'}]\n" | ||
"[{'description': \"I Fuck Up... Just don't forget you Fuck Up Too.\", 'friends_count': 979, 'text': \"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us… \", 'hashtags': '[\"MSM\"]', 'original_id': 347627434, 'user_created': '2012-12-29 17:54:08', 'retweet': 'Y', 'loc': None, 'name': 'VFL2013', 'created': '2017-01-10 18:14:08', 'original_name': 'Linda Suhler, Ph.D.', 'followers': 3098, 'id_str': '818883641640177665', 'retweet_count': 0}, {'description': 'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: [email protected]', 'friends_count': 128, 'text': 'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', 'hashtags': '[]', 'original_id': None, 'user_created': '2016-09-25 18:53:21', 'retweet': 'N', 'loc': 'Lagos, Nigeria', 'name': 'PeretzOmasi', 'created': '2017-01-10 18:14:09', 'original_name': None, 'followers': 154, 'id_str': '818883644039294976', 'retweet_count': 0}, {'description': 'Extra Blöd\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: „Nicht alles was Ich teile ist gleich meine Meinung“ https://www.faz.net/-gqz-7h2fh', 'friends_count': 496, 'text': 'RT @dushanwegner: Für eine #SPD-Abgeordnete? Hahaha. Witzig. 😂😂😂 https://t.co/YBoSx9YR5f', 'hashtags': '[\"SPD\"]', 'original_id': 14784064, 'user_created': '2012-03-11 21:22:57', 'retweet': 'Y', 'loc': 'Schloss von Kastenfarbe', 'name': 'kricketkrackel', 'created': '2017-01-10 18:14:09', 'original_name': 'Dushan Wegner', 'followers': 145, 'id_str': '818883645096202241', 'retweet_count': 0}, {'description': None, 'friends_count': 249, 'text': 'RT @pg80808: @qatarairways ✈️ Did you know your ads support stories like this on Breitbart? 10 THINGS MILO HATES ABOUT ISLAM.… ', 'hashtags': '[]', 'original_id': 814872255406620672, 'user_created': '2014-12-12 22:55:33', 'retweet': 'Y', 'loc': None, 'name': 'awordonplays', 'created': '2017-01-10 18:14:09', 'original_name': 'ColoradoVsBreitbart', 'followers': 76, 'id_str': '818883645188358146', 'retweet_count': 0}, {'description': 'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', 'friends_count': 3361, 'text': \"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", 'hashtags': '[\"racist\", \"fool\", \"MTV\", \"Madison\"]', 'original_id': None, 'user_created': '2011-08-02 13:45:16', 'retweet': 'N', 'loc': None, 'name': 'Bonacker69', 'created': '2017-01-10 18:14:10', 'original_name': None, 'followers': 3166, 'id_str': '818883647382192128', 'retweet_count': 0}]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# have a look at the data\n", | ||
"\n", | ||
"print resp[:5]" | ||
"print(resp[:5])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 33, | ||
"execution_count": 12, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
|
@@ -74,27 +75,182 @@ | |
"# start exploring the available fields\n", | ||
"\n", | ||
"for x in resp[:5]:\n", | ||
" print x['text']" | ||
" print(x['text'])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 25, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# get all of the hashtags and counts\n", | ||
"hashtags = defaultdict(int)\n", | ||
"for tweet in resp:\n", | ||
" # take out hashtags\n", | ||
" hts = [ht for ht in tweet['text'].split() if ht.startswith('#')]\n", | ||
" # add them to the dictionary\n", | ||
" for ht in hts:\n", | ||
" hashtags[ht] += 1\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 44, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Hashtag\t\t\t\tCount\n", | ||
"#MSM \t\t33\n", | ||
"#MAGA \t\t11\n", | ||
"#StopSessions \t\t8\n", | ||
"#FakeNews \t\t8\n", | ||
"#ConfirmSessions!\t\t7\n", | ||
"#SupportSessions\t\t7\n", | ||
"#AmericaFirst \t\t7\n", | ||
"#Breitbart \t\t6\n", | ||
"#MakeAmericaGreatAgain’\t\t6\n", | ||
"#SPD-Abgeordnete?\t\t5\n", | ||
"#tcot \t\t5\n", | ||
"#tichy \t\t4\n", | ||
"#xing \t\t4\n", | ||
"#Br… \t\t4\n", | ||
"#Sessions \t\t4\n", | ||
"#NewYork \t\t4\n", | ||
"#NYC \t\t4\n", | ||
"#Real… \t\t3\n", | ||
"#TCOT \t\t3\n", | ||
"#PJNET \t\t3\n", | ||
"#pjnet \t\t3\n", | ||
"#p2 \t\t3\n", | ||
"#MakeAmericaGreatAgain'\t\t3\n", | ||
"#CloserNation \t\t2\n", | ||
"#worldnews \t\t2\n", | ||
"#Obamacare \t\t2\n", | ||
"#PlannedParenthood\t\t2\n", | ||
"#PJNET🇺🇸… \t\t2\n", | ||
"#Obama \t\t2\n", | ||
"#Jerusalem \t\t2\n", | ||
"#PresidentElectTrump\t\t2\n", | ||
"#jerusalem \t\t2\n", | ||
"#TimScott: \t\t2\n", | ||
"#JeffSessions \t\t2\n", | ||
"#AttorneyGeneral\t\t2\n", | ||
"#Racist \t\t2\n", | ||
"#breitbart \t\t2\n", | ||
"#LiberalHate \t\t2\n", | ||
"#EverythingIsRacial\t\t2\n", | ||
"#racist \t\t1\n", | ||
"#fool \t\t1\n", | ||
"#MTV's \t\t1\n", | ||
"#Madison \t\t1\n", | ||
"#Libtards \t\t1\n", | ||
"#Liberals \t\t1\n", | ||
"#Racism \t\t1\n", | ||
"#Racist… \t\t1\n", | ||
"#AI \t\t1\n", | ||
"#IA \t\t1\n", | ||
"#machinelearning\t\t1\n", | ||
"#AfD: \t\t1\n", | ||
"#TRUMP \t\t1\n", | ||
"#VETS \t\t1\n", | ||
"#OATHKEEPERS \t\t1\n", | ||
"#CONSTITUTION \t\t1\n", | ||
"#2A \t\t1\n", | ||
"#1A \t\t1\n", | ||
"#ORPUW \t\t1\n", | ||
"#NRA \t\t1\n", | ||
"#TLOT \t\t1\n", | ||
"#INFOWARS \t\t1\n", | ||
"#DrudgeReport… \t\t1\n", | ||
"#Resist \t\t1\n", | ||
"#preach \t\t1\n", | ||
"#Bannon \t\t1\n", | ||
"#Germany \t\t1\n", | ||
"#France \t\t1\n", | ||
"#ConfirmationHearings\t\t1\n", | ||
"#JeffSessions' \t\t1\n", | ||
"#TheResistance \t\t1\n", | ||
"#ResistTrump \t\t1\n", | ||
"#Fascism \t\t1\n", | ||
"#NYT \t\t1\n", | ||
"#NYT=FakeNews \t\t1\n", | ||
"#NowPlaying \t\t1\n", | ||
"#woodpile \t\t1\n", | ||
"#p… \t\t1\n", | ||
"#populism\" \t\t1\n", | ||
"#media \t\t1\n", | ||
"#SPD-Abgeordnete\t\t1\n", | ||
"#squirrel \t\t1\n", | ||
"#beta \t\t1\n", | ||
"#bitches \t\t1\n", | ||
"#cuck… \t\t1\n", | ||
"#liberal \t\t1\n", | ||
"#leftists \t\t1\n", | ||
"#bannon \t\t1\n", | ||
"#manafort \t\t1\n", | ||
"#kushner \t\t1\n", | ||
"#putin \t\t1\n", | ||
"#PublicDiplomacy\t\t1\n", | ||
"#dortmund \t\t1\n", | ||
"#TuesdayMotivation\t\t1\n", | ||
"#Fakenews \t\t1\n", | ||
"#verfassungsschutz\t\t1\n", | ||
"#APT28 \t\t1\n", | ||
"#ACA… \t\t1\n", | ||
"#DumpKelloggs \t\t1\n", | ||
"#RIPDNC \t\t1\n", | ||
"#MerylStreep \t\t1\n", | ||
"#Awesome...coming\t\t1\n", | ||
"#TGDN \t\t1\n", | ||
"#CCOT \t\t1\n", | ||
"#diversity \t\t1\n", | ||
"#StopFundingHate\t\t1\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# print out the results of hashtags\n", | ||
"print(\"Hashtag\\t\\t\\t\\tCount\")\n", | ||
"for ht in sorted(hashtags, key=hashtags.get, reverse=True):\n", | ||
" print(\"{:16s}\\t\\t{}\".format(ht, hashtags[ht]))\n", | ||
" \n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python [Root]", | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "Python [Root]" | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 2 | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython2", | ||
"version": "2.7.12" | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
|