-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
213 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,194 @@ | ||
import os | ||
import re | ||
import nltk | ||
import numpy as np | ||
from sklearn import feature_extraction | ||
from tqdm import tqdm | ||
|
||
# Module-level lemmatizer shared by normalize_word(); created once at import
# time. The lemmatizer class is required here because normalize_word() calls
# its .lemmatize() method.
_wnl = nltk.WordNetLemmatizer()
|
||
|
||
def normalize_word(w):
    """Return the lower-cased lemma of the single word ``w``.

    The word is lemmatized with the module-level ``_wnl`` object first and
    the result is then lower-cased.
    """
    lemma = _wnl.lemmatize(w)
    return lemma.lower()
|
||
|
||
def get_tokenized_lemmas(s):
    """Tokenize the string ``s`` and return its normalized tokens.

    Args:
        s: Text to split into tokens.

    Returns:
        A list with ``normalize_word`` applied to every token, in the order
        the tokenizer produced them.
    """
    return [normalize_word(t) for t in nltk.word_tokenize(s)]
|
||
|
||
def clean(s):
    """Strip ``s`` down to lower-cased word characters separated by spaces.

    Every run of ``\\w`` characters is kept; everything between runs
    (punctuation, whitespace, symbols) collapses to a single space.

    Args:
        s: Raw text.

    Returns:
        The cleaned, lower-cased string; ``""`` when ``s`` has no word
        characters.
    """
    # The keyword is ``flags`` (plural); ``flag=`` raises TypeError.
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
|
||
|
||
def remove_stopwords(l):
    """Return the items of ``l`` that are not scikit-learn English stop words.

    Order and duplicates of the surviving items are preserved.
    """
    stop_words = feature_extraction.text.ENGLISH_STOP_WORDS
    kept = []
    for token in l:
        if token not in stop_words:
            kept.append(token)
    return kept
|
||
|
||
def generate_or_load_feats(feat_fn, headlines, bodies, feature_file):
    """Return features for the given data, using ``feature_file`` as a cache.

    When ``feature_file`` does not exist yet, ``feat_fn(headlines, bodies)``
    is evaluated and its result is written with ``np.save``. In every case
    the returned value is what ``np.load(feature_file)`` reads back.

    Args:
        feat_fn: Callable taking ``(headlines, bodies)`` and returning the
            feature rows.
        headlines: Passed through to ``feat_fn``.
        bodies: Passed through to ``feat_fn``.
        feature_file: Path of the cache file.

    Returns:
        The array loaded from ``feature_file``.
    """
    # NOTE(review): np.save appends ".npy" to a name that lacks it, so
    # feature_file presumably has to end in ".npy" for the existence check
    # and the load below to hit the same file -- confirm against callers.
    already_cached = os.path.isfile(feature_file)
    if not already_cached:
        computed = feat_fn(headlines, bodies)
        np.save(feature_file, computed)

    # Always read back from disk so both paths return the same kind of object.
    return np.load(feature_file)
|
||
|
||
def word_overlap_features(headlines, bodies):
    """Compute the token-set overlap ratio for each headline/body pair.

    Both texts are cleaned with ``clean`` and tokenized with
    ``get_tokenized_lemmas``. The feature is
    ``|headline ∩ body| / |headline ∪ body|`` over the two token sets.

    Args:
        headlines: Iterable of headline strings.
        bodies: Iterable of body strings, paired with ``headlines`` by
            position.

    Returns:
        A list with one single-element list ``[overlap]`` per pair. The
        overlap is ``0.0`` when both token sets are empty.
    """
    X = []
    for headline, body in tqdm(zip(headlines, bodies)):
        headline_tokens = set(get_tokenized_lemmas(clean(headline)))
        body_tokens = set(get_tokenized_lemmas(clean(body)))
        union = headline_tokens | body_tokens
        if union:
            overlap = len(headline_tokens & body_tokens) / float(len(union))
        else:
            # Two empty texts share nothing; avoid dividing by zero.
            overlap = 0.0
        X.append([overlap])
    return X
|
||
|
||
def reguting_features(headlines, bodies):
    """Flag which refuting words occur as tokens in each headline.

    Each headline is cleaned with ``clean`` and tokenized with
    ``get_tokenized_lemmas``; a word counts as present only when it equals
    one of the resulting tokens (not when it is merely a substring of the
    headline text).

    Args:
        headlines: Iterable of headline strings.
        bodies: Iterable of body strings, paired with ``headlines`` by
            position. Bodies are only used for pairing; they do not
            contribute to the features.

    Returns:
        A list with one row per pair. Each row holds a 0/1 entry for every
        word in the refuting-word list, in list order.
    """
    # 'not' appears twice; the list is kept verbatim so that the number and
    # order of feature columns stay the same.
    _refuting_words = [
        'fake',
        'not',
        'deny',
        'denies',
        'not',
        'despite',
        'debunk',
        'pranks',
        'false',
        'nope',
        'fraud',
        'bogus',
        'doubt',
        'doubts'
    ]
    X = []
    for headline, _body in tqdm(zip(headlines, bodies)):
        headline_tokens = set(get_tokenized_lemmas(clean(headline)))
        featurez = [1 if word in headline_tokens else 0 for word in _refuting_words]
        X.append(featurez)
    return X
|
||
|
||
def polarity_feature(headlines, bodies):
    """Compute refuting-word parity for each headline and each body.

    For a text, the parity is the number of tokens (after ``clean`` and
    ``get_tokenized_lemmas``) found in the refuting-word list, modulo 2.

    Args:
        headlines: Iterable of headline strings.
        bodies: Iterable of body strings, paired with ``headlines`` by
            position.

    Returns:
        ``np.array`` built from one ``[headline_parity, body_parity]`` row
        per pair.
    """
    _refuting_words = [
        'fake',
        'not',
        'deny',
        'denies',
        'not',
        'despite',
        'debunk',
        'pranks',
        'false',
        'nope',
        'fraud',
        'bogus',
        'doubt',
        'doubts'
    ]

    def calculate_polarity(text):
        # Count tokens that are refuting words; only the parity is kept.
        refuting_hits = 0
        for token in get_tokenized_lemmas(text):
            if token in _refuting_words:
                refuting_hits += 1
        return refuting_hits % 2

    rows = []
    for _, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        headline_text = clean(headline)
        body_text = clean(body)
        rows.append([
            calculate_polarity(headline_text),
            calculate_polarity(body_text),
        ])
    return np.array(rows)
|
||
|
||
def ngrams(input, n):
    """Return every run of ``n`` consecutive words in ``input``.

    Words are obtained by splitting ``input`` on single spaces.

    Args:
        input: Space-separated text.
        n: Number of words per n-gram.

    Returns:
        A list of word lists, each of length ``n``, in order of appearance.
        Empty when ``input`` has fewer than ``n`` words.
    """
    # The parameter is named ``input`` to mirror chargrams(); the previous
    # spelling left the name read by the body unbound on every call.
    tokens = input.split(' ')
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]
|
||
|
||
def chargrams(input, n):
    """Return every length-``n`` slice of ``input``, left to right.

    Args:
        input: Sequence to slide over (a string gives character n-grams).
        n: Window length.

    Returns:
        A list of slices ``input[i:i + n]``; empty when ``input`` is shorter
        than ``n``.
    """
    window_count = len(input) - n + 1
    return [input[start:start + n] for start in range(window_count)]
|
||
|
||
def append_chargrams(features, text_headline, text_body, size):
    """Append character-gram hit counts for a headline/body pair.

    Stop words are removed from the headline, character grams of length
    ``size`` are taken from the remaining text, and each gram is looked up
    in the body.

    Args:
        features: List to extend; it is mutated in place.
        text_headline: Headline text.
        text_body: Body text.
        size: Character-gram length.

    Returns:
        ``features`` with two counts appended: grams found anywhere in the
        body, then grams found in the first 255 characters of the body.
    """
    headline_text = " ".join(remove_stopwords(text_headline.split()))
    # NOTE(review): ' '.join on a string slice puts a space between its
    # characters ("ab" -> "a b"). Kept as-is; confirm this is intended.
    grams = [' '.join(x) for x in chargrams(headline_text, size)]
    body_start = text_body[:255]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in body_start:
            grams_early_hits += 1
    features.append(grams_hits)
    # Report the early-hit count that the loop computes, rather than a
    # counter that is never incremented.
    features.append(grams_early_hits)
    return features
|
||
|
||
def append_ngrams(features, text_headline, text_body, size):
    """Append word n-gram hit counts for a headline/body pair.

    Word n-grams of length ``size`` are built from the headline with
    ``ngrams`` and each one is looked up in the body.

    Args:
        features: List to extend; it is mutated in place.
        text_headline: Headline text.
        text_body: Body text.
        size: Number of words per n-gram.

    Returns:
        ``features`` with two counts appended: grams found anywhere in the
        body, then grams found in the first 255 characters of the body.
    """
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    body_start = text_body[:255]
    grams_hits_count = 0
    body_grams_hit = 0
    for gram in grams:
        # Look the gram up in the body: every gram trivially occurs in the
        # headline it was cut from, so testing the headline carries no signal.
        if gram in text_body:
            grams_hits_count += 1
        if gram in body_start:
            body_grams_hit += 1
    features.append(grams_hits_count)
    features.append(body_grams_hit)
    return features
|
||
|
||
def hand_features(headlines, bodies):
    """Build hand-crafted count features for each headline/body pair.

    Each row is the concatenation of:
      * ``binary_co_occurance``: headline token counts,
      * ``binary_occurence_stops``: the same idea with stop words removed,
      * ``count_grams``: character-gram and word n-gram hit counts.

    Args:
        headlines: Iterable of headline strings.
        bodies: Iterable of body strings, paired with ``headlines`` by
            position.

    Returns:
        A list with one feature list per pair.
    """
    def binary_co_occurance(headline, body):
        # Returns [number of headline tokens,
        #          number of headline tokens found in the first 255
        #          characters of the cleaned body].
        # NOTE(review): the first count is unconditional (it does not check
        # the body); confirm that a total-token count is what is wanted here.
        body_start = clean(body)[:255]
        token_count = 0
        body_token_count = 0
        for headline_token in clean(headline).split(" "):
            token_count += 1
            if headline_token in body_start:
                body_token_count += 1
        return [token_count, body_token_count]

    def binary_occurence_stops(headline, body):
        # Counts non-stop-word headline tokens found in the cleaned body.
        # Both counters are incremented together, so the two returned values
        # are always equal.
        cleaned_body = clean(body)
        token_count = 0
        body_token_count = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in cleaned_body:
                token_count += 1
                body_token_count += 1
        return [token_count, body_token_count]

    def count_grams(headline, body):
        # Character-gram counts first, then word n-gram counts; the order of
        # the calls fixes the order of the feature columns.
        clean_body = clean(body)
        clean_headline = clean(headline)
        featurez = []
        featurez = append_chargrams(featurez, clean_headline, clean_body, 2)
        featurez = append_chargrams(featurez, clean_headline, clean_body, 8)
        featurez = append_chargrams(featurez, clean_headline, clean_body, 4)
        featurez = append_chargrams(featurez, clean_headline, clean_body, 16)
        featurez = append_ngrams(featurez, clean_headline, clean_body, 2)
        # NOTE(review): this call previously had no size argument and could
        # not run. 5 is assumed (next size after the 2/3/4 used here) --
        # confirm the intended n-gram length.
        featurez = append_ngrams(featurez, clean_headline, clean_body, 5)
        featurez = append_ngrams(featurez, clean_headline, clean_body, 3)
        featurez = append_ngrams(featurez, clean_headline, clean_body, 4)
        return featurez

    X = []
    for headline, body in tqdm(zip(headlines, bodies)):
        X.append(binary_co_occurance(headline, body)
                 + binary_occurence_stops(headline, body)
                 + count_grams(headline, body))
    return X
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import sys | ||
import numpy as np | ||
|
||
from sklearn.ensemble import GradientBoostingClassifier | ||
from logic.feature_modeling import reguting_features,polarity_feature,generate_or_load_feats,hand_features | ||
from logic.feature_modeling import word_overlap_features | ||
from util_files.datasets import Datasets | ||
from util_files.generate_splits import kfold_split,get_stances_4_folds | ||
from util_files.score import report_score, LABELS, submit_score | ||
from util_files.sys_driver import parameter_parser, versioning | ||
|
||
def generate_features(stances, dataset, name ): | ||
|
||
h, b, y = [],[],[] | ||
|
||
for stance in stances: | ||
y.append(LABELS.index(stance['Stance'])) | ||
h.append(stance['Headline']) | ||
b.append(dataset.articles[stance['Body id']]) |