Commit

feature_modeling ):

Injiri committed Mar 7, 2019
1 parent 730c738 commit ee96aa3
Showing 2 changed files with 213 additions and 0 deletions.
194 changes: 194 additions & 0 deletions logic/feature_modeling.py
@@ -0,0 +1,194 @@
import os
import re
import nltk
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm

_wnl = nltk.WordNetLemmatizer()


def normalize_word(w):
return _wnl.lemmatize(w).lower()


def get_tokenized_lemmas(s):
    return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
return " ".join(re.findall(r'\w+', s, flag=re.UNICODE)).lower()


def remove_stopwords(l):
return [w for w in l if w not in feature_extraction.text.ENGLISH_STOP_WORDS]
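
# Illustrative pipeline on a toy string (values assume NLTK's default
# WordNet lemmatizer and sklearn's English stop-word list):
#   clean("Robots WON'T take over!")            -> "robots won t take over"
#   get_tokenized_lemmas("robots take over")    -> ['robot', 'take', 'over']
#   remove_stopwords(['robot', 'take', 'over']) -> ['robot']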


def generate_or_load_feats(feat_fn, headlines, bodies, feature_file):
    # Compute the features once and cache them to disk; later calls reload
    # the saved array instead of recomputing.
    if not os.path.isfile(feature_file):
        feats = feat_fn(headlines, bodies)
        np.save(feature_file, feats)

    return np.load(feature_file)
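
# Hedged usage sketch ("features/overlap.train.npy" is a hypothetical path
# and its directory must already exist; the ".npy" suffix matters because
# np.save appends it while np.load takes the path verbatim):
#   X_overlap = generate_or_load_feats(word_overlap_features, headlines,
#                                      bodies, "features/overlap.train.npy")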


def word_overlap_features(headlines, bodies):
    # Single feature: Jaccard overlap between the lemmatized headline and
    # body token sets.
    X = []
    for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
        clean_headline = get_tokenized_lemmas(clean(headline))
        clean_body = get_tokenized_lemmas(clean(body))
        featurez = [
            len(set(clean_headline).intersection(clean_body)) /
            float(len(set(clean_headline).union(clean_body)))
        ]
        X.append(featurez)
    return X
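
# Toy value: headline lemmas {'soldier', 'return'} vs. body lemmas
# {'soldier', 'home'} share 1 of 3 distinct tokens, so the feature is 1/3.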


def refuting_features(headlines, bodies):
    _refuting_words = [
        'fake',
        'not',
        'deny',
        'denies',
        'despite',
        'debunk',
        'pranks',
        'false',
        'nope',
        'fraud',
        'bogus',
        'doubt',
        'doubts'
    ]
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
        clean_headline = get_tokenized_lemmas(clean_headline)
featurez = [1 if word in clean_headline else 0 for word in _refuting_words]
X.append(featurez)
return X
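
# Illustrative row: for the headline "that story is fake" only the 'fake'
# indicator fires, giving [1, 0, 0, ..., 0] over the refuting vocabulary.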


def polarity_feature(headlines, bodies):
_refuting_words = [
'fake',
'not',
'deny',
'denies',
'despite',
'debunk',
'pranks',
'false',
'nope',
'fraud',
'bogus',
'doubt',
'doubts'
]

    def calculate_polarity(text):
        # 1 if the text contains an odd number of refuting words, else 0.
        tokens = get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2

X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_body = clean(body)
featurez = []
featurez.append(calculate_polarity(clean_headline))
featurez.append(calculate_polarity(clean_body))
X.append(featurez)
return np.array(X)
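
# Illustrative values: "fake claim" has one refuting word, so its polarity
# is 1; "fake and bogus claim" has two, so the mod-2 parity flips back to 0.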


def ngrams(input, n):
input = input.split(' ')
output = []
for i in range(len(input) - n + 1):
output.append(input[i:i + n])
return output
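
# Illustrative call: ngrams("a b c", 2) -> [['a', 'b'], ['b', 'c']];
# append_ngrams later joins each token list back into a string.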


def chargrams(input, n):
output = []
for i in range(len(input) - n + 1):
output.append(input[i:i + n])
return output
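
# Illustrative call: chargrams("abcd", 2) -> ['ab', 'bc', 'cd'] (string slices).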


def append_chargrams(features, text_headline, text_body, size):
    # Character n-grams of the stopword-filtered headline, counted in the
    # full body and in its first 255 characters.
    grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
    grams_hits = 0
    grams_early_hits = 0
    for gram in grams:
        if gram in text_body:
            grams_hits += 1
        if gram in text_body[:255]:
            grams_early_hits += 1
    features.append(grams_hits)
    features.append(grams_early_hits)
    return features


def append_ngrams(features, text_headline, text_body, size):
    # Word n-grams of the headline, counted in the full body and in its
    # first 255 characters.
    grams = [' '.join(x) for x in ngrams(text_headline, size)]
    grams_hits_count = 0
    body_grams_hit = 0
    for gram in grams:
        if gram in text_body:
            grams_hits_count += 1
        if gram in text_body[:255]:
            body_grams_hit += 1
    features.append(grams_hits_count)
    features.append(body_grams_hit)
    return features


def hand_features(headlines, bodies):
    def binary_co_occurance(headline, body):
        # Count headline tokens that appear anywhere in the body, and those
        # that appear within the first 255 characters of the body.
        token_count = 0
        body_token_count = 0
        for headline_token in clean(headline).split(" "):
            if headline_token in clean(body):
                token_count += 1
            if headline_token in clean(body)[:255]:
                body_token_count += 1
        return [token_count, body_token_count]

    def binary_occurence_stops(headline, body):
        # Same count with headline stopwords removed; the second counter
        # mirrors the first by construction.
        token_count = 0
        body_token_count = 0
        for headline_token in remove_stopwords(clean(headline).split(" ")):
            if headline_token in clean(body):
                token_count += 1
                body_token_count += 1
        return [token_count, body_token_count]

    def count_grams(headline, body):
        clean_body = clean(body)
        clean_headline = clean(headline)
        featurez = []
        featurez = append_chargrams(featurez, clean_headline, clean_body, 2)
        featurez = append_chargrams(featurez, clean_headline, clean_body, 8)
        featurez = append_chargrams(featurez, clean_headline, clean_body, 4)
        featurez = append_chargrams(featurez, clean_headline, clean_body, 16)
        featurez = append_ngrams(featurez, clean_headline, clean_body, 2)
        featurez = append_ngrams(featurez, clean_headline, clean_body, 3)
        featurez = append_ngrams(featurez, clean_headline, clean_body, 4)
        return featurez

X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
X.append(binary_co_occurance(headline, body) + binary_occurence_stops(headline, body)
+ count_grams(headline, body))
return X
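
# With the counters above, each headline/body pair yields 18 hand features:
# 2 + 2 binary co-occurrence counts, 2 features for each of the four
# chargram sizes, and 2 for each of the three ngram sizes.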






19 changes: 19 additions & 0 deletions logic/n_kfold.py
@@ -0,0 +1,19 @@
import sys
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from logic.feature_modeling import refuting_features, polarity_feature, generate_or_load_feats, hand_features
from logic.feature_modeling import word_overlap_features
from util_files.datasets import Datasets
from util_files.generate_splits import kfold_split,get_stances_4_folds
from util_files.score import report_score, LABELS, submit_score
from util_files.sys_driver import parameter_parser, versioning

def generate_features(stances, dataset, name):

    h, b, y = [], [], []

    for stance in stances:
        y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        b.append(dataset.articles[stance['Body id']])
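
    # Hypothetical continuation (the cache paths and the "features/"
    # directory are assumptions, not shown in the module above): stack the
    # cached feature blocks into one matrix per split.
    X_overlap = generate_or_load_feats(word_overlap_features, h, b, "features/overlap." + name + ".npy")
    X_refuting = generate_or_load_feats(refuting_features, h, b, "features/refuting." + name + ".npy")
    X_polarity = generate_or_load_feats(polarity_feature, h, b, "features/polarity." + name + ".npy")
    X_hand = generate_or_load_feats(hand_features, h, b, "features/hand." + name + ".npy")
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y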
