Skip to content

Commit

Permalink
f-eng refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
Injiri committed May 30, 2019
1 parent 90da7ef commit 9ad923f
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 29 deletions.
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 20 additions & 23 deletions logic/feature_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def get_tokenized_lemmas(s):


def clean(s):
    """Lowercase *s* and reduce it to space-separated word tokens.

    Extracts all ``\\w+`` runs (letters, digits, underscore) via a regex,
    discarding punctuation, then joins them with single spaces and
    lowercases the result, e.g. ``"Hello, World!" -> "hello world"``.

    :param s: raw text (headline or body) to normalize
    :return: cleaned, lowercased string of space-separated tokens
    """
    # BUG FIX: re.findall's keyword is `flags`, not `flag` -- the original
    # raised TypeError on every call. re.UNICODE is already the default for
    # str patterns in Python 3; kept only for explicitness.
    return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


Expand Down Expand Up @@ -62,12 +63,13 @@ def refuting_features(headlines, bodies):
'doubts'

]
# for each word, enumerate its features.
X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clea_headline = get_tokenized_lemmas(clean_headline)
featurez = [1 if word in clean_headline else 0 for word in _refuting_words]
X.append(featurez)
clean_headline = get_tokenized_lemmas(clean_headline)
features = [1 if word in clean_headline else 0 for word in _refuting_words]
X.append(features)
return X


Expand Down Expand Up @@ -104,7 +106,7 @@ def calculate_polarity(text):
return np.array(X)


def ngrams(imput, n):
def n_grams(input, n):
input = input.split(' ')
output = []
for i in range(len(input) - n + 1):
Expand All @@ -119,7 +121,7 @@ def chargrams(input, n):
return output


def append_chargrams(features, text_headline, text_body, size):
def append_char_grams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in chargrams(" ".join(remove_stopwords(text_headline.split())), size)]
grams_hits = 0
grams_early_hits = 0
Expand All @@ -134,8 +136,8 @@ def append_chargrams(features, text_headline, text_body, size):
return features


def append_ngrams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in ngrams(text_headline, size)]
def append_n_grams(features, text_headline, text_body, size):
grams = [' '.join(x) for x in n_grams(text_headline, size)]
grams_hits_count = 0
body_grams_hit = 0
for gram in grams:
Expand All @@ -148,7 +150,7 @@ def append_ngrams(features, text_headline, text_body, size):
return features


def hand_features(headlines, bodies):
def handle_features(headlines, bodies):
def binary_co_occurance(headline, body):
token_count = 0
body_token_count = 0
Expand All @@ -170,24 +172,19 @@ def binary_occurence_stops(headline, body):
def count_grams(headline, body):
clean_body = clean(body)
clean_headline = clean(headline)
featurez = []
featurez = append_chargrams(featurez, clean_headline, clean_body, 2)
featurz = append_chargrams(featurez, clean_headline, clean_body, 8)
featurez = append_chargrams(featurez, clean_headline, clean_body, 4)
featurez = append_chargrams(featurez, clean_headline, clean_body, 16)
featurez = append_ngrams(featurez, clean_headline, clean_body, 2)
featurez = append_ngrams(featurez, clean_headline, clean_body);
featurez = append_ngrams(featurez, clean_headline, clean_body, 3)
featurez = append_ngrams(featurz, clean_headline, clean_body, 4)
return featurez
features = []
features = append_char_grams(features, clean_headline, clean_body, 2)
features = append_char_grams(features, clean_headline, clean_body, 8)
features = append_char_grams(features, clean_headline, clean_body, 4)
features = append_char_grams(features, clean_headline, clean_body, 16)
features = append_n_grams(features, clean_headline, clean_body, 2)
features = append_n_grams(features, clean_headline, clean_body);
features = append_n_grams(features, clean_headline, clean_body, 3)
features = append_n_grams(features, clean_headline, clean_body, 4)
return features

X = []
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
X.append(binary_co_occurance(headline, body) + binary_occurence_stops(headline, body)
+ count_grams(headline, body))
return X





12 changes: 6 additions & 6 deletions logic/n_kfold.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from logic.feature_modeling import refuting_features,polarity_feature,generate_or_load_feats,hand_features
from logic.feature_modeling import refuting_features,polarity_feature,generate_or_load_feats,handle_features
from logic.feature_modeling import word_overlap_features
from util_files.datasets import Datasets
from util_files.generate_splits import kfold_split,get_stances_4_folds
Expand Down Expand Up @@ -35,7 +35,7 @@ def generate_features(stances, dataset, name ):
fold_stances, hold_out_stances = get_stances_4_folds(my_dataset,folds,hold_out)

demo_dateset = Datasets("Demo /test")
X_demo, Y_demo = generate_features(demo_dateset.stances, demo_dateset, "demo")
X_demo, Y_test = generate_features(demo_dateset.stances, demo_dateset, "demo")

Xs = dict()
Ys = dict()
Expand All @@ -55,14 +55,14 @@ def generate_features(stances, dataset, name ):
X_train = np.vstack(tuple([Xs[i] for i in ids]))
Y_train = np.hstack(tuple([Ys[i] for i in ids]))

x_demo = Xs[fold]
Y_demo = Ys[fold]
X_test = Xs[fold]
Y_test = Ys[fold]

classifier = GradientBoostingClassifier(n_estimaters=200, random_state=14128, verbose=True)
classifier.fit(X_train, Y_train)

predicted_result = [LABELS[int(n)] for n in classifier.predict(X_demo)]
actual_result = [LABELS[int(n)] for n in Y_demo ]
actual_result = [LABELS[int(n)] for n in Y_test]

fold_score, _ = submit_score(actual_result, predicted_result)
max_fold_score, _ =submit_score(actual_result, actual_result);
Expand All @@ -77,6 +77,6 @@ def generate_features(stances, dataset, name ):

#report the final best_score
predicted = [LABELS[int (n)] for n in best_fold.predict(X_holdout)]
actual_result = [LABELS(int (n)) for n in Y_demo]
actual_result = [LABELS(int (n)) for n in Y_test]

report_score(actual_result, predicted)

0 comments on commit 9ad923f

Please sign in to comment.