Merge pull request #17 from rautey/Getting-words-from-wiki

Getting words from wiki
Data4Democracy · Aug 2, 2019 · 649e833 · 649e833
2 parents e27c74e + ba3b508
commit 649e833
Show file tree

Hide file tree

Showing 3 changed files with 126 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
 *.pyc
 __pycache__
+*.ipynb
+*.xlsx
+Python/code/bad_words_from_wiki.txt
+Python/code/.DS_Store
diff --git a/Python/code/twitter_label.py b/Python/code/twitter_label.py
@@ -0,0 +1,56 @@
+
+import pandas
+import numpy as np
+import re
+
+def _label_hits(tweets, term):
+    """Add 1 to ``tweets['label']`` for every row whose column 1 contains *term*.
+
+    Matching is a plain, case-sensitive substring test.  Rows whose column 1
+    is not a string (pandas yields float NaN for empty cells) never match,
+    and a non-string or empty *term* is skipped entirely: '' is a substring
+    of every string and would otherwise flag every row.
+    """
+    if not isinstance(term, str) or not term:
+        return
+    tweets['label'] = tweets['label'] + [
+        1 if isinstance(body, str) and term in body else 0
+        for body in tweets[1]
+    ]
+
+
+# Every CSV is read without a header row, so columns are addressed by
+# position: the searched text is column 1 of the tweets file and the terms
+# come from column 0 of each word list.
+tweets = pandas.read_csv('~/Documents/final_tweets_NLP+CSS_2016.csv', header=None)
+tweets['label'] = 0
+
+# list1: one term per cell.
+badwords = pandas.read_csv('~/Documents/list1.csv', header=None)
+for term in badwords[0]:
+    print(term)
+    _label_hits(tweets, term)
+
+# list2: the term is the first double-quoted string inside each cell.
+badwords = pandas.read_csv('~/Documents/list2.csv', header=None)
+for cell in badwords[0]:
+    text = re.findall('\"(.*?)\"', cell) if isinstance(cell, str) else []
+    print(text)
+    # A cell without any quoted string contributes no term.
+    if text:
+        _label_hits(tweets, text[0])
+
+# list3 and list4: one term per cell, same layout as list1.
+for list_path in ('~/Documents/list3.csv', '~/Documents/list4.csv'):
+    badwords = pandas.read_csv(list_path, header=None)
+    for term in badwords[0]:
+        print(term)
+        _label_hits(tweets, term)
+
+# list5: the term is the first comma-separated field of each cell.
+badwords = pandas.read_csv('~/Documents/list5.csv', header=None)
+for cell in badwords[0]:
+    text = cell.split(',') if isinstance(cell, str) else []
+    print(text)
+    if text:
+        _label_hits(tweets, text[0])
+
+# Collapse the per-term hit count into a binary flag.
+tweets['label'] = np.where(tweets['label'] >= 1, 1, 0)
+
+
+## sanity check: number of rows flagged by at least one term
+print(tweets['label'].sum())
diff --git a/Python/code/wiki_badwords.py b/Python/code/wiki_badwords.py
@@ -0,0 +1,66 @@
+import wikipediaapi
+import re
+
+
+
+def process_word(word):
+    """Clean one raw ``<dt>`` entry into a comma-separated string of terms.
+
+    Steps, in order:
+      1. drop italic/bold tags, non-breaking spaces and double quotes;
+      2. if the entry contains a ``<span`` tag, keep only the first piece of
+         text found between a ``>`` and the following ``<``;
+      3. remove every ``(...)`` group, then cut the entry at any ``(`` that
+         is left without a closing ``)``;
+      4. turn the alternative-spelling delimiters ``/``, `` or `` and
+         `` also spelled `` into commas;
+      5. drop every non-ASCII character.
+
+    The result is neither stripped nor lower-cased.
+
+    Args:
+        word: raw inner HTML of a ``<dt>`` element.
+
+    Returns:
+        The cleaned string; it may be empty.
+    """
+    # Remove formatting tags and stray characters.
+    for token in ('<i>', '</i>', '<b>', '</b>', '\xa0', '"'):
+        word = word.replace(token, '')
+
+    # Unwrap a span.  The test is for the tag itself because ordinary words
+    # such as "Hispanic" contain the letters "span"; if nothing sits between
+    # a '>' and a '<' the entry is left as it is instead of raising.
+    if '<span' in word:
+        inner = re.search(r'>(.*?)<', word)
+        if inner:
+            word = inner.group(1)
+
+    # Remove words in parentheses: all balanced groups first, then anything
+    # following an opening parenthesis that is never closed.
+    word = re.sub(r'\([^)]*\)', '', word)
+    word = word.partition('(')[0]
+
+    # Replace different delimiters with a comma.
+    for delimiter in ('/', ' or ', ' also spelled '):
+        word = word.replace(delimiter, ',')
+
+    # Keep ASCII only (characters whose UTF-8 encoding is a single byte).
+    return ''.join(c for c in word if ord(c) < 128)
+
+
+def process_extract(page_text):
+    """Return the cleaned terms found in the ``<dt>`` elements of *page_text*.
+
+    Each ``<dt>...</dt>`` body is passed through ``process_word``, split on
+    commas, stripped and lower-cased.  Pieces that are empty after stripping
+    are dropped.  Order of appearance is kept and duplicates are not removed.
+
+    Args:
+        page_text: HTML text of a page.
+
+    Returns:
+        A list of lower-case strings, empty when no ``<dt>`` element matches.
+    """
+    bad_words = []
+    # The pattern has no DOTALL flag, so a <dt> body spanning several lines
+    # is not matched.
+    for raw_term in re.findall(r'<dt>(.*?)</dt>', page_text):
+        for piece in process_word(raw_term).split(','):
+            # Strip before testing for emptiness so that whitespace-only
+            # pieces do not end up in the result as ''.
+            cleaned = piece.strip().lower()
+            if cleaned:
+                bad_words.append(cleaned)
+    return bad_words
+
+
+
+
+
+# Wikipedia client for the English edition, configured to return HTML
+# extracts so that the <dt> markup searched by process_extract is present.
+wiki_html = wikipediaapi.Wikipedia(language='en', extract_format=wikipediaapi.ExtractFormat.HTML)
+
+# Ethnic slurs: look the page up, then pull the terms out of its text.
+page_ethnic = wiki_html.page("List_of_ethnic_slurs")
+ethnic_bad_words = process_extract(page_ethnic.text)
+
+# Religious slurs: same procedure on the second list page.
+page_religous = wiki_html.page("List_of_religious_slurs")
+religious_bad_words = process_extract(page_religous.text)
+
+
+
+
+
+