Skip to content

Commit

Permalink
change to python 3 and add hashtag parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
yknot committed Oct 30, 2017
1 parent 3ec8219 commit b42961c
Showing 1 changed file with 173 additions and 17 deletions.
190 changes: 173 additions & 17 deletions DiscursiveEDA.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,37 @@
"cells": [
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# imports\n",
"\n",
"import urllib2\n",
"import json"
"from urllib.request import urlopen\n",
"import json\n",
"import re\n",
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 9,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"# get an s3 file from Discursive. This file is a sample of ~500 tweets. \n",
"\n",
"response = urllib2.urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n",
"response = urlopen('https://s3-us-west-2.amazonaws.com/discursive/2017/1/10/18/tweets-25.json')\n",
"html = response.read()\n",
"resp = json.loads(html)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 11,
"metadata": {
"collapsed": false
},
Expand All @@ -40,19 +41,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[{u'loc': None, u'description': u\"I Fuck Up... Just don't forget you Fuck Up Too.\", u'friends_count': 979, u'created': u'2017-01-10 18:14:08', u'text': u\"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us\\u2026 \", u'hashtags': u'[\"MSM\"]', u'original_name': u'Linda Suhler, Ph.D.', u'original_id': 347627434, u'id_str': u'818883641640177665', u'user_created': u'2012-12-29 17:54:08', u'followers': 3098, u'retweet_count': 0, u'retweet': u'Y', u'name': u'VFL2013'}, {u'loc': u'Lagos, Nigeria', u'description': u'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: [email protected]', u'friends_count': 128, u'created': u'2017-01-10 18:14:09', u'text': u'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', u'hashtags': u'[]', u'original_name': None, u'original_id': None, u'id_str': u'818883644039294976', u'user_created': u'2016-09-25 18:53:21', u'followers': 154, u'retweet_count': 0, u'retweet': u'N', u'name': u'PeretzOmasi'}, {u'loc': u'Schloss von Kastenfarbe', u'description': u'Extra Bl\\xf6d\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: \\u201eNicht alles was Ich teile ist gleich meine Meinung\\u201c https://www.faz.net/-gqz-7h2fh', u'friends_count': 496, u'created': u'2017-01-10 18:14:09', u'text': u'RT @dushanwegner: F\\xfcr eine #SPD-Abgeordnete? Hahaha. Witzig. \\U0001f602\\U0001f602\\U0001f602 https://t.co/YBoSx9YR5f', u'hashtags': u'[\"SPD\"]', u'original_name': u'Dushan Wegner', u'original_id': 14784064, u'id_str': u'818883645096202241', u'user_created': u'2012-03-11 21:22:57', u'followers': 145, u'retweet_count': 0, u'retweet': u'Y', u'name': u'kricketkrackel'}, {u'loc': None, u'description': None, u'friends_count': 249, u'created': u'2017-01-10 18:14:09', u'text': u'RT @pg80808: @qatarairways \\u2708\\ufe0f Did you know your ads support stories like this on Breitbart? 10 THINGS MILO HATES ABOUT ISLAM.\\u2026 ', u'hashtags': u'[]', u'original_name': u'ColoradoVsBreitbart', u'original_id': 814872255406620672, u'id_str': u'818883645188358146', u'user_created': u'2014-12-12 22:55:33', u'followers': 76, u'retweet_count': 0, u'retweet': u'Y', u'name': u'awordonplays'}, {u'loc': None, u'description': u'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', u'friends_count': 3361, u'created': u'2017-01-10 18:14:10', u'text': u\"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", u'hashtags': u'[\"racist\", \"fool\", \"MTV\", \"Madison\"]', u'original_name': None, u'original_id': None, u'id_str': u'818883647382192128', u'user_created': u'2011-08-02 13:45:16', u'followers': 3166, u'retweet_count': 0, u'retweet': u'N', u'name': u'Bonacker69'}]\n"
"[{'description': \"I Fuck Up... Just don't forget you Fuck Up Too.\", 'friends_count': 979, 'text': \"RT @LindaSuhler: Can we hear from #MSM here?\\n@MTV's @Ira Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us… \", 'hashtags': '[\"MSM\"]', 'original_id': 347627434, 'user_created': '2012-12-29 17:54:08', 'retweet': 'Y', 'loc': None, 'name': 'VFL2013', 'created': '2017-01-10 18:14:08', 'original_name': 'Linda Suhler, Ph.D.', 'followers': 3098, 'id_str': '818883641640177665', 'retweet_count': 0}, {'description': 'Publisher | Editor | Online Promoter | Web Designer | Blogger | Social Media Manager.. Contact: [email protected]', 'friends_count': 128, 'text': 'Breitbart to launch populist business news site https://t.co/DeLYRbFZDs', 'hashtags': '[]', 'original_id': None, 'user_created': '2016-09-25 18:53:21', 'retweet': 'N', 'loc': 'Lagos, Nigeria', 'name': 'PeretzOmasi', 'created': '2017-01-10 18:14:09', 'original_name': None, 'followers': 154, 'id_str': '818883644039294976', 'retweet_count': 0}, {'description': 'Extra Blöd\\n\\nIrgendwann wird auch Dich niemand verstehen wollen Ach ja: „Nicht alles was Ich teile ist gleich meine Meinung“ https://www.faz.net/-gqz-7h2fh', 'friends_count': 496, 'text': 'RT @dushanwegner: Für eine #SPD-Abgeordnete? Hahaha. Witzig. 😂😂😂 https://t.co/YBoSx9YR5f', 'hashtags': '[\"SPD\"]', 'original_id': 14784064, 'user_created': '2012-03-11 21:22:57', 'retweet': 'Y', 'loc': 'Schloss von Kastenfarbe', 'name': 'kricketkrackel', 'created': '2017-01-10 18:14:09', 'original_name': 'Dushan Wegner', 'followers': 145, 'id_str': '818883645096202241', 'retweet_count': 0}, {'description': None, 'friends_count': 249, 'text': 'RT @pg80808: @qatarairways ✈️ Did you know your ads support stories like this on Breitbart? 10 THINGS MILO HATES ABOUT ISLAM.… ', 'hashtags': '[]', 'original_id': 814872255406620672, 'user_created': '2014-12-12 22:55:33', 'retweet': 'Y', 'loc': None, 'name': 'awordonplays', 'created': '2017-01-10 18:14:09', 'original_name': 'ColoradoVsBreitbart', 'followers': 76, 'id_str': '818883645188358146', 'retweet_count': 0}, {'description': 'Conservative; luv my doxie...KEEP THE CHANGE!... ONE NATION UNDER GOD!', 'friends_count': 3361, 'text': \"#racist #fool #MTV's Ira #Madison III Calls Jeff Sessions' Granddaughter 'Prop' Stolen from Toys R Us - Breitbart https://t.co/z8GR7iN59Y\", 'hashtags': '[\"racist\", \"fool\", \"MTV\", \"Madison\"]', 'original_id': None, 'user_created': '2011-08-02 13:45:16', 'retweet': 'N', 'loc': None, 'name': 'Bonacker69', 'created': '2017-01-10 18:14:10', 'original_name': None, 'followers': 3166, 'id_str': '818883647382192128', 'retweet_count': 0}]\n"
]
}
],
"source": [
"# have a look at the data\n",
"\n",
"print resp[:5]"
"print(resp[:5])"
]
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 12,
"metadata": {
"collapsed": false
},
Expand All @@ -74,27 +75,182 @@
"# start exploring the available fields\n",
"\n",
"for x in resp[:5]:\n",
" print x['text']"
" print(x['text'])"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# get all of the hashtags and counts\n",
"hashtags = defaultdict(int)\n",
"for tweet in resp:\n",
" # take out hashtags\n",
" hts = [ht for ht in tweet['text'].split() if ht.startswith('#')]\n",
" # add them to the dictionary\n",
" for ht in hts:\n",
" hashtags[ht] += 1\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hashtag\t\t\t\tCount\n",
"#MSM \t\t33\n",
"#MAGA \t\t11\n",
"#StopSessions \t\t8\n",
"#FakeNews \t\t8\n",
"#ConfirmSessions!\t\t7\n",
"#SupportSessions\t\t7\n",
"#AmericaFirst \t\t7\n",
"#Breitbart \t\t6\n",
"#MakeAmericaGreatAgain’\t\t6\n",
"#SPD-Abgeordnete?\t\t5\n",
"#tcot \t\t5\n",
"#tichy \t\t4\n",
"#xing \t\t4\n",
"#Br… \t\t4\n",
"#Sessions \t\t4\n",
"#NewYork \t\t4\n",
"#NYC \t\t4\n",
"#Real… \t\t3\n",
"#TCOT \t\t3\n",
"#PJNET \t\t3\n",
"#pjnet \t\t3\n",
"#p2 \t\t3\n",
"#MakeAmericaGreatAgain'\t\t3\n",
"#CloserNation \t\t2\n",
"#worldnews \t\t2\n",
"#Obamacare \t\t2\n",
"#PlannedParenthood\t\t2\n",
"#PJNET🇺🇸… \t\t2\n",
"#Obama \t\t2\n",
"#Jerusalem \t\t2\n",
"#PresidentElectTrump\t\t2\n",
"#jerusalem \t\t2\n",
"#TimScott: \t\t2\n",
"#JeffSessions \t\t2\n",
"#AttorneyGeneral\t\t2\n",
"#Racist \t\t2\n",
"#breitbart \t\t2\n",
"#LiberalHate \t\t2\n",
"#EverythingIsRacial\t\t2\n",
"#racist \t\t1\n",
"#fool \t\t1\n",
"#MTV's \t\t1\n",
"#Madison \t\t1\n",
"#Libtards \t\t1\n",
"#Liberals \t\t1\n",
"#Racism \t\t1\n",
"#Racist… \t\t1\n",
"#AI \t\t1\n",
"#IA \t\t1\n",
"#machinelearning\t\t1\n",
"#AfD: \t\t1\n",
"#TRUMP \t\t1\n",
"#VETS \t\t1\n",
"#OATHKEEPERS \t\t1\n",
"#CONSTITUTION \t\t1\n",
"#2A \t\t1\n",
"#1A \t\t1\n",
"#ORPUW \t\t1\n",
"#NRA \t\t1\n",
"#TLOT \t\t1\n",
"#INFOWARS \t\t1\n",
"#DrudgeReport… \t\t1\n",
"#Resist \t\t1\n",
"#preach \t\t1\n",
"#Bannon \t\t1\n",
"#Germany \t\t1\n",
"#France \t\t1\n",
"#ConfirmationHearings\t\t1\n",
"#JeffSessions' \t\t1\n",
"#TheResistance \t\t1\n",
"#ResistTrump \t\t1\n",
"#Fascism \t\t1\n",
"#NYT \t\t1\n",
"#NYT=FakeNews \t\t1\n",
"#NowPlaying \t\t1\n",
"#woodpile \t\t1\n",
"#p… \t\t1\n",
"#populism\" \t\t1\n",
"#media \t\t1\n",
"#SPD-Abgeordnete\t\t1\n",
"#squirrel \t\t1\n",
"#beta \t\t1\n",
"#bitches \t\t1\n",
"#cuck… \t\t1\n",
"#liberal \t\t1\n",
"#leftists \t\t1\n",
"#bannon \t\t1\n",
"#manafort \t\t1\n",
"#kushner \t\t1\n",
"#putin \t\t1\n",
"#PublicDiplomacy\t\t1\n",
"#dortmund \t\t1\n",
"#TuesdayMotivation\t\t1\n",
"#Fakenews \t\t1\n",
"#verfassungsschutz\t\t1\n",
"#APT28 \t\t1\n",
"#ACA… \t\t1\n",
"#DumpKelloggs \t\t1\n",
"#RIPDNC \t\t1\n",
"#MerylStreep \t\t1\n",
"#Awesome...coming\t\t1\n",
"#TGDN \t\t1\n",
"#CCOT \t\t1\n",
"#diversity \t\t1\n",
"#StopFundingHate\t\t1\n"
]
}
],
"source": [
"# print out the results of hashtags\n",
"print(\"Hashtag\\t\\t\\t\\tCount\")\n",
"for ht in sorted(hashtags, key=hashtags.get, reverse=True):\n",
" print(\"{:16s}\\t\\t{}\".format(ht, hashtags[ht]))\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [Root]",
"display_name": "Python 3",
"language": "python",
"name": "Python [Root]"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
Expand Down

0 comments on commit b42961c

Please sign in to comment.