Skip to content

Commit

Permalink
created flask web app for Api interfacing
Browse files Browse the repository at this point in the history
  • Loading branch information
Injiri committed Jun 5, 2019
1 parent 69f7ba5 commit 8609579
Show file tree
Hide file tree
Showing 335 changed files with 54,377 additions and 161 deletions.
3 changes: 3 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

279 changes: 186 additions & 93 deletions .idea/workspace.xml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions fnc-1-baseline
Submodule fnc-1-baseline added at 7fb74c
Binary file modified logic/__pycache__/feature_modeling.cpython-36.pyc
Binary file not shown.
Binary file added logic/__pycache__/generate_splits.cpython-36.pyc
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
10 changes: 6 additions & 4 deletions logic/feature_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ def normalize_word(w):


def get_tokenized_lemmas(s):
return [normalize_word(t) for t in nltk.word_torkenize(s)]
return [normalize_word(t) for t in nltk.word_tokenize(s)]


def clean(s):
# at this point we can run a regex via re.findall to drop the parts of the text that aren't useful
return " ".join(re.findall(r'\w+', s, flag=re.UNICODE)).lower()
return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()


def remove_stopwords(l):
Expand All @@ -38,11 +38,13 @@ def word_overlap_features(headlines, bodies):
for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
clean_headline = clean(headline)
clean_body = clean(body)
clean_headline = get_tokenized_lemmas(clean_body)
clean_headline = get_tokenized_lemmas(clean_headline)
clean_body = get_tokenized_lemmas(clean_body)
features = [
len(set(clean_headline).intersection(clean_body) / float(len(set(clean_headline).union(clean_body))))
len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))
]
X.append(features)
return X


def refuting_features(headlines, bodies):
Expand Down
12 changes: 6 additions & 6 deletions util_files/generate_splits.py → logic/generate_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
training = 0.8


def gen_holdout_split(datasets, training, base_dir="splits_data"):
def gen_holdout_split(datasets, training, base_dir="/splits_data"):
r = random.Random()
r.seed(1489215)

Expand All @@ -16,10 +16,10 @@ def gen_holdout_split(datasets, training, base_dir="splits_data"):
hold_out_ids = article_ids[int(training * len(article_ids)):]

# write split bodyids to file for later use
with open(base_dir + "/" + training_ids.txt, "w+") as f:
with open(base_dir + "/" + "training_ids.txt", "w+") as f:
f.write("\n".join([str(id) for id in training_ids]))

with open(base_dir + "/" + hold_out_ids.txt, "w+") as f:
with open(base_dir + "/" + "hold_out_ids.txt", "w+") as f:
f.write("\n".join([str(id) for id in hold_out_ids]))


Expand All @@ -36,7 +36,7 @@ def kfold_split(datasets, training=0.8, n_folds=10, base_dir="splits_data"):
and os.path.exists(base_dir + "/" + "hold_out_ids.txt")):
gen_holdout_split(datasets, training, base_dir)

training_ids = read_ids("traing_ids.txt", base_dir)
training_ids = read_ids("training_ids.txt", base_dir)
holdout_ids = read_ids("hold_out_ids.txt", base_dir)

folds_array = []
Expand All @@ -51,12 +51,12 @@ def get_stances_4_folds(datasets, folds, hold_out):
stances_folds = defaultdict(list)
stances_hold_out = []
for stance in datasets.stances:
if stance['Body id'] in hold_out:
if stance['Body ID'] in hold_out:
stances_hold_out.append(stance)
else:
fold_id = 0
for fold in folds:
if stance["Body id"] in fold:
if stance["Body ID"] in fold:
stances_folds[fold_id].append(stance)
fold_id += 1
return stances_folds, stances_hold_out
65 changes: 32 additions & 33 deletions logic/n_kfold.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,55 @@
import sys
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from logic.feature_modeling import refuting_features,polarity_feature,generate_or_load_feats,handle_features
from logic.feature_modeling import refuting_features, polarity_feature, generate_or_load_feats, handle_features
from logic.feature_modeling import word_overlap_features
from util_files.datasets import Datasets
from util_files.generate_splits import kfold_split,get_stances_4_folds
from util_files.score import report_score, LABELS, submit_score
from util_files.datasets import datasets
from logic.generate_splits import kfold_split, get_stances_4_folds
from util_files.score import report_score, LABELS, submit_score
from util_files.sys_driver import parameter_parser, versioning

def generate_features(stances, dataset, name ):

h, b, y = [],[],[]
def generate_features(stances, dataset, name):
h, b, y = [], [], []

for stance in stances:
y.append(LABELS.index(stance['Stance']))
h.append(stance['Headline'])
b.append(dataset.articles[stance['Body id']])
b.append(dataset.articles[stance['Body ID']])

X_overlap = generate_or_load_feats(word_overlap_features, h, b, "sysFeatures/overlap." + name + ".npy")
X_refuting = generate_or_load_feats(refuting_features, h, b, "sysFeatures/refuting." + name + ".npy")
X_polarity = generate_or_load_feats(polarity_feature, h, b, "sysFeatures/polarity." + name + ".npy")
X_hand = generate_or_load_feats(handle_features, h, b, "sysFeatures/hand." + name + ".npy")

x_overlap = generate_or_load_feats(word_overlap_features, h, b, "feaures/refuting."+name+".npy")
x_refuting = generate_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
x_polarity = generate_or_load_feats()
X = np.c_[X_hand, X_polarity, X_overlap, X_refuting]
return X, y

X = np.c_
return X,y

#Generate folds
# Generate folds
if __name__ == "__main__":
versioning()
#load the traing dataset
# load the training dataset
parameter_parser()
my_dataset = Datasets()
folds,hold_out = kfold_split(my_dataset,n_folds=10)
fold_stances, hold_out_stances = get_stances_4_folds(my_dataset,folds,hold_out)
#load dataset
my_dataset = datasets()
folds, hold_out = kfold_split(my_dataset, n_folds=10)
fold_stances, hold_out_stances = get_stances_4_folds(my_dataset, folds, hold_out)

demo_dataset = Datasets("Demo /test")
X_demo, Y_test = generate_features(demo_dataset.stances, demo_dataset, "demo")
demo_dataset = datasets("data")
X_demo, Y_test = generate_features(demo_dataset.stances, demo_dataset, "data")

Xs = dict()
Ys = dict()

#populate all features
X_holdout,y_holdout = generate_features(hold_out_stances, my_dataset,"holder")
# populate all features
X_holdout, y_holdout = generate_features(hold_out_stances, my_dataset, "holder")
for fold in fold_stances:
Xs[fold],Ys[fold] = generate_features(fold_stances[fold], demo_dataset, str(fold))
Xs[fold], Ys[fold] = generate_features(fold_stances[fold], demo_dataset, str(fold))

best_score = 0
best_fold = None


#perform classification for each fold
# perform classification for each fold
for fold in fold_stances:
ids = list(range(len(folds)))
del ids[fold]
Expand All @@ -67,18 +67,17 @@ def generate_features(stances, dataset, name ):
actual_result = [LABELS[int(n)] for n in Y_test]

fold_score, _ = submit_score(actual_result, predicted_result)
max_fold_score, _ =submit_score(actual_result, actual_result);
max_fold_score, _ = submit_score(actual_result, actual_result);

weigthted_score = fold_score/max_fold_score
weigthted_score = fold_score / max_fold_score

print(str(fold) + "is Fold score initialy was" + str(weigthted_score))
if weigthted_score > best_score:
best_score = weigthted_score
best_score = classifier

# report the final best_score
predicted = [LABELS[int(n)] for n in best_fold.predict(X_holdout)]
actual_result = [LABELS(int(n)) for n in Y_test]

#report the final best_score
predicted = [LABELS[int (n)] for n in best_fold.predict(X_holdout)]
actual_result = [LABELS(int (n)) for n in Y_test]

report_score(actual_result, predicted)
report_score(actual_result, predicted)
File renamed without changes.
File renamed without changes.
File renamed without changes.
Binary file added logic/sysFeatures/overlap.data.npy
Binary file not shown.
Binary file added logic/sysFeatures/polarity.data.npy
Binary file not shown.
Binary file added logic/sysFeatures/refuting.data.npy
Binary file not shown.
Binary file modified util_files/__pycache__/datasets.cpython-36.pyc
Binary file not shown.
Binary file not shown.
Binary file modified util_files/__pycache__/score.cpython-36.pyc
Binary file not shown.
12 changes: 6 additions & 6 deletions util_files/datasets.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from csv import DictReader


class Datasets():
def _init_(self, path="data"):
class datasets():
def __init__(self, path="./data"):
self.path = path

print("start reading dataset")
Expand All @@ -14,11 +14,11 @@ def _init_(self, path="data"):
# build a dict of all articles, keyed by integer body ID
self.articles = dict()

for t in self.stances:
t['Body id'] = int(t['Body id'])

for stance in self.stances:
stance['Body ID'] = int(stance['Body ID'])
# make each body ID an integer value
for article in articles:
self.articles[int([article["Body id"]])] = article['articleBody']
self.articles[int(article["Body ID"])] = article['articleBody']

print("Total stances" + str(len(self.stances)))
print("Total Articles:" + str(len(self.articles)))
Expand Down
2 changes: 1 addition & 1 deletion util_files/score.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
LABELS = ['agree', 'disgree', 'discuss', 'unrelated']
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
LABELS_RELATED = ['unrelated', 'realted']
RELATED = LABELS[0:3]

Expand Down
10 changes: 10 additions & 0 deletions venv/bin/flask
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/home/cymoh/PycharmProjects/blog_stance_verifier/venv/bin/python
# -*- coding: utf-8 -*-
import re
import sys

from flask.cli import main

if __name__ == '__main__':
sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
sys.exit(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pip
39 changes: 39 additions & 0 deletions venv/lib/python3.6/site-packages/Click-7.0.dist-info/LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
Copyright © 2014 by the Pallets team.

Some rights reserved.

Redistribution and use in source and binary forms of the software as
well as documentation, with or without modification, are permitted
provided that the following conditions are met:

- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

- Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE AND DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE AND DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.

----

Click uses parts of optparse written by Gregory P. Ward and maintained
by the Python Software Foundation. This is limited to code in parser.py.

Copyright © 2001-2006 Gregory P. Ward. All rights reserved.
Copyright © 2002-2006 Python Software Foundation. All rights reserved.
Loading

0 comments on commit 8609579

Please sign in to comment.