Skip to content

Commit

Permalink
Update imdb_test.py
Browse files Browse the repository at this point in the history
  • Loading branch information
vyraun committed Jun 10, 2018
1 parent ca1319a commit 4d91910
Showing 1 changed file with 55 additions and 13 deletions.
68 changes: 55 additions & 13 deletions imdb_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@
import numpy as np
from keras.datasets import imdb

# Load the Keras IMDB dataset and report how many training examples were read.
#
# NOTE(review): this span reads like the removed and added lines of a diff
# shown together: the dataset is loaded twice (once capped with
# num_words=top_words, once uncapped) and X, y are assigned three times.
# Confirm which variant is intended before running it as-is.
top_words = 5000
# test_split is not referenced anywhere in the lines shown.
test_split = 0.30
# First load: vocabulary capped via num_words=top_words.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
#top_words = 5000
#test_split = 0.30
# Second load: no num_words cap; rebinds X_test, y_test and introduces X, y.
(X, y), (X_test, y_test) = imdb.load_data() # num_words=top_words

# Rebinds X, y to the capped training split from the first load, replacing the
# uncapped X, y loaded just above; X_test, y_test still come from the uncapped
# load, so train and test would use different vocabularies.
X, y = np.array(X_train), np.array(y_train)
print ("total examples %s" % len(y))
print("Reading Done")

# Re-wraps the current X, y as arrays (they already are arrays after the
# assignment above).
X, y = np.array(X), np.array(y)
print ("Total Training Data Points %s" % len(y))
#print ("X Shape = {}, y Shape = {}".format(X.shape, y.shape))
#print ("Sample X = {0}".format(X[0]))
#print ("Sample y = {0}".format(y[0]))

# Progress message for the embedding-loading step that follows.
print("Reading the Word Vectors")

with open("glove.6B.200d.txt", "rb") as lines:
w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
Expand All @@ -16,6 +23,8 @@
# Load the reduced embeddings into rw2v. Each non-blank line is split on
# whitespace: the first field is the token, the remaining fields are the
# vector components. The file is opened in binary mode, so on Python 3 the
# dictionary keys are bytes tokens (unchanged from the original behaviour).
with open("pca_embedding_30.txt", "rb") as lines:
    rw2v = {}
    for line in lines:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing with IndexError on fields[0].
            continue
        # Build the vector from a real list: on Python 3 `map` returns a lazy
        # iterator and np.array(map(...)) would produce a 0-d object array
        # rather than a numeric vector.
        rw2v[fields[0]] = np.array([float(value) for value in fields[1:]])

print("Reading Done")

from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter, defaultdict
Expand All @@ -38,7 +47,7 @@ def transform(self, X):
])

class TfidfEmbeddingVectorizer(object):
    """Vectorizer backed by a word -> embedding-vector dictionary."""

    def __init__(self, word2vec):
        """Store the embedding dictionary and record its vector length.

        word2vec: non-empty mapping from a word to its embedding vector.
        """
        self.word2vec = word2vec
        # No per-word weights yet; presumably filled in when the vectorizer
        # is fitted (the fitting code is outside the lines shown).
        self.word2weight = None
        # Measure an embedding vector, not a key: iterating the dict itself
        # yields words, so len() would return the length of the first word.
        self.dim = len(next(iter(word2vec.values())))
Expand Down Expand Up @@ -67,25 +76,58 @@ def transform(self, X):
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Turn the training data into two feature matrices: one built from the
# reduced embeddings (rw2v) and one from the full embeddings (w2v).
print("Transforming the Training Data")
print("An Input Vector Sample = {}".format(X[0]))
# NOTE(review): the message says "Transformed", but y has not been
# label-encoded at this point in the script.
print("A Transformed Label Sample = {}".format(y[0]))

# Work on a copy so X itself is left untouched for the second vectorizer.
rX = X.copy()
rvec = TfidfEmbeddingVectorizer(rw2v)
rvec.fit(rX, y)
rX = rvec.transform(rX)
print("The Reduced Embedding Matrix Shape:")
print(rX.shape)

# Same steps with the non-reduced embeddings.
wX = X.copy()
wvec = TfidfEmbeddingVectorizer(w2v)
wvec.fit(wX, y)
wX = wvec.transform(wX)
print("The Non-Reduced Embedding Matrix Shape:")
print(wX.shape)

# Encode the labels with scikit-learn's LabelEncoder and replace y with the
# encoded values; the classes the encoder learned are printed below.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)

# Show one transformed feature row, one encoded label, and the label classes.
print("A Transformed (Reduced) Input Vector Sample = {}".format(rX[0]))
print("A Transformed Label Sample = {}".format(y[0]))
print("The Label Classes = {}".format(le.classes_))

# Train one LinearSVC per feature matrix. Each score below is computed on the
# same data the model was fitted on, so it is a training-set score; X_test and
# y_test are not used in the lines shown.
print("Starting the Model Training for Reduced Data")
rclf = LinearSVC(random_state=0)
rclf.fit(rX, y)
print("Training set score: %f" % rclf.score(rX, y))

print("Starting the Model Training for Non-Reduced Data")
wclf = LinearSVC(random_state=0)
wclf.fit(wX, y)
print("Training set score: %f" % wclf.score(wX, y))

# The triple-quoted block below is a bare string literal, so none of the code
# inside it runs. It holds a disabled pipeline / cross-validation experiment.
# NOTE(review): it imports sklearn.cross_validation, which newer scikit-learn
# releases no longer ship (cross_val_score lives in sklearn.model_selection),
# and it contains two consecutive assignments to unsorted_scores -- confirm
# which one is wanted before re-enabling.
"""
LinearSVM_rw2v_tfidf = Pipeline([
("word2vec vectorizer", TfidfEmbeddingVectorizer(rw2v)),
("extra trees", LinearSVC(random_state=0))])
LinearSVM_w2v_tfidf = Pipeline([
("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
("extra trees", LinearSVC(random_state=0))])

all_models = [
("LinearSVM_rw2v_tfidf", LinearSVM_rw2v_tfidf),
("LinearSVM_w2v_tfidf", LinearSVM_w2v_tfidf)
]


import tabulate
from tabulate import tabulate

from sklearn.cross_validation import cross_val_score
unsorted_scores = [(name, cross_val_score(model, X, y, cv=5).mean()) for name, model in all_models]
unsorted_scores = [(name, cross_val_score(model, X, y, cv=None).mean()) for name, model in all_models]
scores = sorted(unsorted_scores, key=lambda x: -x[1])


print (tabulate(scores, floatfmt=".4f", headers=("model", 'score')))
"""

0 comments on commit 4d91910

Please sign in to comment.