Created Flask web app for API interfacing

Injiri · Jun 5, 2019 · 8609579 · 8609579
1 parent 69f7ba5
commit 8609579
Show file tree

Hide file tree

Showing 335 changed files with 54,377 additions and 161 deletions.
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/fnc-1-baseline b/fnc-1-baseline
diff --git a/logic/__pycache__/feature_modeling.cpython-36.pyc b/logic/__pycache__/feature_modeling.cpython-36.pyc
diff --git a/logic/__pycache__/generate_splits.cpython-36.pyc b/logic/__pycache__/generate_splits.cpython-36.pyc
diff --git a/data/test_bodies.csv → logic/data/test_bodies.csv b/data/test_bodies.csv → logic/data/test_bodies.csv
diff --git a/data/test_stances.csv → logic/data/test_stances.csv b/data/test_stances.csv → logic/data/test_stances.csv
diff --git a/data/test_stances_unlabeled.csv → logic/data/test_stances_unlabeled.csv b/data/test_stances_unlabeled.csv → logic/data/test_stances_unlabeled.csv
diff --git a/data/train_bodies.csv → logic/data/train_bodies.csv b/data/train_bodies.csv → logic/data/train_bodies.csv
diff --git a/data/train_stances.csv → logic/data/train_stances.csv b/data/train_stances.csv → logic/data/train_stances.csv
diff --git a/data/train_stances.random.csv → logic/data/train_stances.random.csv b/data/train_stances.random.csv → logic/data/train_stances.random.csv
diff --git a/logic/feature_modeling.py b/logic/feature_modeling.py
@@ -13,12 +13,12 @@ def normalize_word(w):
 
 
 def get_tokenized_lemmas(s):
- return [normalize_word(t) for t in nltk.word_torkenize(s)]
+ return [normalize_word(t) for t in nltk.word_tokenize(s)]
 
 
 def clean(s):
  # at this point we can use a regex (via re.findall) to strip out the parts of the text that aren't useful
- return " ".join(re.findall(r'\w+', s, flag=re.UNICODE)).lower()
+ return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
 
 
 def remove_stopwords(l):
@@ -38,11 +38,13 @@ def word_overlap_features(headlines, bodies):
  for i, (headline, body) in tqdm(enumerate(zip(headlines, bodies))):
  clean_headline = clean(headline)
  clean_body = clean(body)
- clean_headline = get_tokenized_lemmas(clean_body)
+ clean_headline = get_tokenized_lemmas(clean_headline)
+ clean_body = get_tokenized_lemmas(clean_body)
  features = [
- len(set(clean_headline).intersection(clean_body) / float(len(set(clean_headline).union(clean_body))))
+ len(set(clean_headline).intersection(clean_body)) / float(len(set(clean_headline).union(clean_body)))
  ]
  X.append(features)
+ return X
 
 
 def refuting_features(headlines, bodies):

diff --git a/util_files/generate_splits.py → logic/generate_splits.py b/util_files/generate_splits.py → logic/generate_splits.py
@@ -5,7 +5,7 @@
 training = 0.8
 
 
-def gen_holdout_split(datasets, training, base_dir="splits_data"):
+def gen_holdout_split(datasets, training, base_dir="/splits_data"):
  r = random.Random()
  r.seed(1489215)
 
@@ -16,10 +16,10 @@ def gen_holdout_split(datasets, training, base_dir="splits_data"):
  hold_out_ids = article_ids[int(training * len(article_ids)):]
 
  # write split bodyids to file for later use
- with open(base_dir + "/" + training_ids.txt, "w+") as f:
+ with open(base_dir + "/" + "training_ids.txt", "w+") as f:
  f.write("\n".join([str(id) for id in training_ids]))
 
- with open(base_dir + "/" + hold_out_ids.txt, "w+") as f:
+ with open(base_dir + "/" + "hold_out_ids.txt", "w+") as f:
  f.write("\n".join([str(id) for id in hold_out_ids]))
 
 
@@ -36,7 +36,7 @@ def kfold_split(datasets, training=0.8, n_folds=10, base_dir="splits_data"):
  and os.path.exists(base_dir + "/" + "hold_out_ids.txt")):
  gen_holdout_split(datasets, training, base_dir)
 
- training_ids = read_ids("traing_ids.txt", base_dir)
+ training_ids = read_ids("training_ids.txt", base_dir)
  holdout_ids = read_ids("hold_out_ids.txt", base_dir)
 
  folds_array = []
@@ -51,12 +51,12 @@ def get_stances_4_folds(datasets, folds, hold_out):
  stances_folds = defaultdict(list)
  stances_hold_out = []
  for stance in datasets.stances:
- if stance['Body id'] in hold_out:
+ if stance['Body ID'] in hold_out:
  stances_hold_out.append(stance)
  else:
  fold_id = 0
  for fold in folds:
- if stance["Body id"] in fold:
+ if stance["Body ID"] in fold:
  stances_folds[fold_id].append(stance)
  fold_id += 1
  return stances_folds, stances_hold_out
diff --git a/logic/n_kfold.py b/logic/n_kfold.py
@@ -1,55 +1,55 @@
-import sys
 import numpy as np
-
 from sklearn.ensemble import GradientBoostingClassifier
-from logic.feature_modeling import refuting_features,polarity_feature,generate_or_load_feats,handle_features
+from logic.feature_modeling import refuting_features, polarity_feature, generate_or_load_feats, handle_features
 from logic.feature_modeling import word_overlap_features
-from util_files.datasets import Datasets
-from util_files.generate_splits import kfold_split,get_stances_4_folds
-from  util_files.score import  report_score, LABELS, submit_score
+from util_files.datasets import datasets
+from logic.generate_splits import kfold_split, get_stances_4_folds
+from util_files.score import report_score, LABELS, submit_score
 from util_files.sys_driver import parameter_parser, versioning
 
-def generate_features(stances, dataset, name ):
 
- h, b, y = [],[],[]
+def generate_features(stances, dataset, name):
+ h, b, y = [], [], []
 
  for stance in stances:
  y.append(LABELS.index(stance['Stance']))
  h.append(stance['Headline'])
- b.append(dataset.articles[stance['Body id']])
+ b.append(dataset.articles[stance['Body ID']])
+
+ X_overlap = generate_or_load_feats(word_overlap_features, h, b, "sysFeatures/overlap." + name + ".npy")
+ X_refuting = generate_or_load_feats(refuting_features, h, b, "sysFeatures/refuting." + name + ".npy")
+ X_polarity = generate_or_load_feats(polarity_feature, h, b, "sysFeatures/polarity." + name + ".npy")
+ X_hand = generate_or_load_feats(handle_features, h, b, "sysFeatures/hand." + name + ".npy")
 
- x_overlap = generate_or_load_feats(word_overlap_features, h, b, "feaures/refuting."+name+".npy")
- x_refuting = generate_or_load_feats(refuting_features, h, b, "features/refuting."+name+".npy")
- x_polarity = generate_or_load_feats()
+ X = np.c_[X_hand, X_polarity, X_overlap, X_refuting]
+ return X, y
 
- X = np.c_
- return X,y
 
-#Generate folds
+# Generate folds
 if __name__ == "__main__":
  versioning()
- #load the traing dataset
+ # load the traing dataset
  parameter_parser()
- my_dataset = Datasets()
- folds,hold_out = kfold_split(my_dataset,n_folds=10)
- fold_stances, hold_out_stances = get_stances_4_folds(my_dataset,folds,hold_out)
+ #load dataset
+ my_dataset = datasets()
+ folds, hold_out = kfold_split(my_dataset, n_folds=10)
+ fold_stances, hold_out_stances = get_stances_4_folds(my_dataset, folds, hold_out)
 
- demo_dataset = Datasets("Demo /test")
- X_demo, Y_test = generate_features(demo_dataset.stances, demo_dataset, "demo")
+ demo_dataset = datasets("data")
+ X_demo, Y_test = generate_features(demo_dataset.stances, demo_dataset, "data")
 
  Xs = dict()
  Ys = dict()
 
- #populate all features
- X_holdout,y_holdout = generate_features(hold_out_stances, my_dataset,"holder")
+ # populate all features
+ X_holdout, y_holdout = generate_features(hold_out_stances, my_dataset, "holder")
  for fold in fold_stances:
- Xs[fold],Ys[fold] = generate_features(fold_stances[fold], demo_dataset, str(fold))
+ Xs[fold], Ys[fold] = generate_features(fold_stances[fold], demo_dataset, str(fold))
 
  best_score = 0
  best_fold = None
 
-
-#perform classification for each fold
+ # perform classification for each fold
  for fold in fold_stances:
  ids = list(range(len(folds)))
  del ids[fold]
@@ -67,18 +67,17 @@ def generate_features(stances, dataset, name ):
  actual_result = [LABELS[int(n)] for n in Y_test]
 
  fold_score, _ = submit_score(actual_result, predicted_result)
- max_fold_score, _ =submit_score(actual_result, actual_result);
+ max_fold_score, _ = submit_score(actual_result, actual_result);
 
- weigthted_score = fold_score/max_fold_score
+ weigthted_score = fold_score / max_fold_score
 
  print(str(fold) + "is Fold score initialy was" + str(weigthted_score))
  if weigthted_score > best_score:
  best_score = weigthted_score
  best_score = classifier
 
+ # report the final best_score
+ predicted = [LABELS[int(n)] for n in best_fold.predict(X_holdout)]
+ actual_result = [LABELS(int(n)) for n in Y_test]
 
- #report the final best_score
- predicted = [LABELS[int (n)] for n in best_fold.predict(X_holdout)]
- actual_result = [LABELS(int (n)) for n in Y_test]
-
- report_score(actual_result, predicted)
+ report_score(actual_result, predicted)
diff --git a/splits_data/dummy.txt → logic/splits_data/dummy.txt b/splits_data/dummy.txt → logic/splits_data/dummy.txt
diff --git a/splits_data/hold_out_ids.txt → logic/splits_data/hold_out_ids.txt b/splits_data/hold_out_ids.txt → logic/splits_data/hold_out_ids.txt
diff --git a/splits_data/training_ids.txt → logic/splits_data/training_ids.txt b/splits_data/training_ids.txt → logic/splits_data/training_ids.txt
diff --git a/logic/sysFeatures/overlap.data.npy b/logic/sysFeatures/overlap.data.npy
diff --git a/logic/sysFeatures/polarity.data.npy b/logic/sysFeatures/polarity.data.npy
diff --git a/logic/sysFeatures/refuting.data.npy b/logic/sysFeatures/refuting.data.npy
diff --git a/util_files/__pycache__/datasets.cpython-36.pyc b/util_files/__pycache__/datasets.cpython-36.pyc
diff --git a/util_files/__pycache__/generate_splits.cpython-36.pyc b/util_files/__pycache__/generate_splits.cpython-36.pyc
diff --git a/util_files/__pycache__/score.cpython-36.pyc b/util_files/__pycache__/score.cpython-36.pyc
diff --git a/util_files/datasets.py b/util_files/datasets.py
@@ -1,8 +1,8 @@
 from csv import DictReader
 
 
-class Datasets():
- def _init_(self, path="data"):
+class datasets():
+ def __init__(self, path="./data"):
  self.path = path
 
  print("start reading dataset")
@@ -14,11 +14,11 @@ def _init_(self, path="data"):
  # make a dict of all articles, keyed by integer body ID
  self.articles = dict()
 
- for t in self.stances:
- t['Body id'] = int(t['Body id'])
-
+ for stance in self.stances:
+ stance['Body ID'] = int(stance['Body ID'])
+# the loop above converts each stance's Body ID to an integer value
  for article in articles:
- self.articles[int([article["Body id"]])] = article['articleBody']
+ self.articles[int(article["Body ID"])] = article['articleBody']
 
  print("Total stances" + str(len(self.stances)))
  print("Total Articles:" + str(len(self.articles)))

diff --git a/util_files/score.py b/util_files/score.py
@@ -1,4 +1,4 @@
-LABELS = ['agree', 'disgree', 'discuss', 'unrelated']
+LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
 LABELS_RELATED = ['unrelated', 'realted']
 RELATED = LABELS[0:3]
 

diff --git a/venv/bin/flask b/venv/bin/flask
@@ -0,0 +1,10 @@
+#!/home/cymoh/PycharmProjects/blog_stance_verifier/venv/bin/python
+# -*- coding: utf-8 -*-
+import re
+import sys
+
+from flask.cli import main
+
+if __name__ == '__main__':
+ sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
+ sys.exit(main())
diff --git a/venv/lib/python3.6/site-packages/Click-7.0.dist-info/INSTALLER b/venv/lib/python3.6/site-packages/Click-7.0.dist-info/INSTALLER
@@ -0,0 +1 @@
+pip
diff --git a/venv/lib/python3.6/site-packages/Click-7.0.dist-info/LICENSE.txt b/venv/lib/python3.6/site-packages/Click-7.0.dist-info/LICENSE.txt
@@ -0,0 +1,39 @@
+Copyright © 2014 by the Pallets team.
+
+Some rights reserved.
+
+Redistribution and use in source and binary forms of the software as
+well as documentation, with or without modification, are permitted
+provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+- Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE AND DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
+BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE AND DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+----
+
+Click uses parts of optparse written by Gregory P. Ward and maintained
+by the Python Software Foundation. This is limited to code in parser.py.
+
+Copyright © 2001-2006 Gregory P. Ward. All rights reserved.
+Copyright © 2002-2006 Python Software Foundation. All rights reserved.