Skip to content

Commit

Permalink
Merge pull request #17 from rautey/Getting-words-from-wiki
Browse files Browse the repository at this point in the history
Getting words from wiki
  • Loading branch information
jss367 committed Aug 2, 2019
2 parents e27c74e + ba3b508 commit 649e833
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,6 @@
*.pyc
__pycache__
*.ipynb
*.xlsx
Python/code/bad_words_from_wiki.txt
Python/code/.DS_Store
56 changes: 56 additions & 0 deletions Python/code/twitter_label.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

import pandas
import numpy as np
import re

# Tweets to label: column 1 holds the tweet text; 'label' accumulates one
# count per bad-word hit and is binarized at the very end.
tweets = pandas.read_csv('~/Documents/final_tweets_NLP+CSS_2016.csv', header = None)
tweets['label'] = 0


def _term_from_row(raw, style):
    """Return the search term encoded in one bad-word CSV cell.

    style: 'plain'  -- the cell is the term itself
           'quoted' -- the term is the first double-quoted substring
           'comma'  -- the term is the text before the first comma
    Prints what was extracted, matching the original script's output.
    """
    if style == 'quoted':
        found = re.findall(r'"(.*?)"', raw)
        print(found)
        return found[0]
    if style == 'comma':
        parts = raw.split(',')
        print(parts)
        return parts[0]
    print(raw)
    return raw


# Each list file pairs with the parsing style its rows need; processed in
# the original order (1..5) so the printed log is unchanged.  The final
# label is a sum, so per-list order does not affect the result anyway.
_BADWORD_LISTS = [
    ('~/Documents/list1.csv', 'plain'),
    ('~/Documents/list2.csv', 'quoted'),
    ('~/Documents/list3.csv', 'plain'),
    ('~/Documents/list4.csv', 'plain'),
    ('~/Documents/list5.csv', 'comma'),
]

for path, style in _BADWORD_LISTS:
    badwords = pandas.read_csv(path, header = None)
    for raw in badwords[0]:
        term = _term_from_row(raw, style)
        # Substring match against every tweet text: add 1 where present.
        tweets['label'] = tweets['label'] + [1 if term in ele else 0 for ele in tweets[1]]


# Collapse the accumulated hit counts into a single 0/1 label.
tweets['label'] = np.where(tweets['label'] >= 1, 1, 0)


## sanity check (value is computed but not printed, as in the original)
tweets['label'].sum()
66 changes: 66 additions & 0 deletions Python/code/wiki_badwords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import wikipediaapi
import re



def process_word(word):
    """Normalize one raw ``<dt>`` entry from the wiki HTML.

    Steps: strip simple formatting tags and stray characters, unwrap a
    surviving ``<span>`` payload, drop parenthesized asides, normalize
    alternate-spelling delimiters to commas (for later splitting by the
    caller), and remove all non-ASCII characters.

    Returns the cleaned string, possibly containing commas.
    """
    # Strip simple formatting tags plus non-breaking spaces and quotes.
    for tag in ('<i>', '</i>', '<b>', '</b>', '\xa0', '"'):
        word = word.replace(tag, '')
    # If a <span> element survives, keep only the text between the first
    # '>' and the next '<'.  (IndexError if no such pair exists -- the
    # wiki markup this targets always has one.)
    if 'span' in word:
        word = re.findall(r'>(.*?)<', word)[0]
    # Remove a parenthesized aside such as "word (origin)".
    if '(' in word and ')' in word:
        word = word[:word.index('(')] + word[word.index(')')+1:]
    if '(' in word:  # unmatched '(' -- drop everything from it onward
        word = word[:word.index('(')]
    # Different delimiters all mean "alternative spelling": normalize
    # them to commas so the caller can split on one character.
    for delim in ('/', ' or ', ' also spelled '):
        word = word.replace(delim, ',')
    # Keep only single-byte (ASCII) characters; join is linear, unlike
    # the original's repeated string concatenation.
    word = ''.join(c for c in word if len(c.encode(encoding='utf_8')) == 1)
    return word


def process_extract(page_text):
    """Extract and clean the ``<dt>`` word entries from a wiki page's HTML.

    Returns a flat list of lowercase, whitespace-stripped words.
    """
    # Each slur entry sits inside a <dt>...</dt> element.
    entries = re.findall(r'<dt>(.*?)</dt>', page_text)
    # Clean each entry.  (The original wrote back via extract.index(word),
    # which finds the *first* equal element and so mis-updates duplicated
    # entries; a comprehension is both correct and simpler.)
    cleaned = [process_word(entry) for entry in entries]

    # process_word joins alternative spellings with commas; split them
    # back out into individual candidate words.
    bad_words = []
    for entry in cleaned:
        bad_words.extend(entry.split(','))

    # Drop empty strings first (as the original did), then normalize.
    return [w.strip().lower() for w in bad_words if w != '']





# HTML extracts (rather than plain text) are required so the <dt> markup
# survives for process_extract to parse.
wiki_html = wikipediaapi.Wikipedia(language='en',
                                   extract_format=wikipediaapi.ExtractFormat.HTML)

page_ethnic = wiki_html.page("List_of_ethnic_slurs")
ethnic_bad_words = process_extract(page_ethnic.text)

page_religous = wiki_html.page("List_of_religious_slurs")
religious_bad_words = process_extract(page_religous.text)






0 comments on commit 649e833

Please sign in to comment.