my first rnn try
orbxball committed May 21, 2017
1 parent 095c720 commit 609140b
Showing 1 changed file with 206 additions and 0 deletions.
hw5/train_rnn.py (new file)
@@ -0,0 +1,206 @@
import sys, os
import argparse
import numpy as np
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import load_model
from keras import backend as K
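
# What this script does: multi-label text tagging with a single-layer LSTM.
# Each text can carry several tags at once; the network emits one sigmoid
# probability per known tag, and predictions are cut at the global `threshold`.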

def read_data(file):
    print('Reading training data...')
    tags, texts, categories = [], [], []
    with open(file) as f:
        for line in f:
            buf = line.split('"')
            if len(buf) < 3: continue

            tags_tmp = buf[1].split(' ')
            for category in tags_tmp:
                categories.append(category)
            tags.append(tags_tmp)
            text = '"'.join(buf[2:])
            texts.append(text)

    # Cap the vocabulary so every word index stays below the Embedding
    # input_dim; return the fitted tokenizer so the test set can reuse
    # the same word-to-index mapping.
    tokenizer = Tokenizer(num_words=max_vocab)
    tokenizer.fit_on_texts(texts)
    index_seq = tokenizer.texts_to_sequences(texts)

    return tags, pad_sequences(index_seq), sorted(set(categories)), tokenizer
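
# read_data() splits each line on double quotes, so a training line is assumed
# to look roughly like this (hypothetical sample, not from the real dataset):
#
#   7,"SPECULATIVE-FICTION NOVEL","The story, set in a "quiet" town, ..."
#
# buf[1] is then the space-separated tag list, and '"'.join(buf[2:]) stitches
# the text back together even when it contains quote characters of its own.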


def read_test(file, tokenizer):
    print('Reading test data...')
    texts = []
    with open(file) as f:
        for line in f:
            text = ','.join(line.split(',')[1:])
            if 'text' == text.strip(): continue  # skip the CSV header row
            texts.append(text)

    # Reuse the tokenizer fitted on the training texts; fitting a fresh one
    # here would assign different indices and make the embedding meaningless.
    index_seq = tokenizer.texts_to_sequences(texts)
    return pad_sequences(index_seq)


def validate(X, Y, valid_size):
    permu = np.random.permutation(X.shape[0])
    x_valid = X[permu[:valid_size], :]
    y_valid = Y[permu[:valid_size], :]
    x_train = X[permu[valid_size:], :]
    y_train = Y[permu[valid_size:], :]
    return (x_train, y_train), (x_valid, y_valid)


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision: a metric for
    multi-label classification of how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    """Recall metric.

    Only computes a batch-wise average of recall: a metric for
    multi-label classification of how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def fbeta_score(y_true, y_pred, beta=1):
    """Computes the F score.

    The F score is the weighted harmonic mean of precision and recall.
    Here it is only computed as a batch-wise average, not globally.

    This is useful for multi-label classification, where input samples can
    be classified as sets of labels. By only using precision, a model would
    achieve a perfect score by simply assigning every class to every input.
    To avoid this, a metric should also penalize missed class assignments
    (recall). The F-beta score (ranging from 0.0 to 1.0) computes this as a
    weighted mean of the proportion of correct class assignments vs. the
    proportion of incorrect class assignments.

    With beta = 1, this is equivalent to an F-measure. With beta < 1,
    assigning correct classes becomes more important, and with beta > 1 the
    metric is instead weighted towards penalizing incorrect class
    assignments.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')

    # If there are no true positives, fix the F score at 0 like sklearn.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0

    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    bb = beta ** 2
    fbeta_score = (1 + bb) * (p * r) / (bb * p + r + K.epsilon())
    return fbeta_score
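
# Continuing the worked example above with beta = 1 (so bb = 1):
#   fbeta = (1 + 1) * (2/3 * 2/3) / (1 * 2/3 + 2/3) = (8/9) / (4/3) = 2/3,
# i.e. the plain F1 harmonic mean of precision and recall.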


def fmeasure(y_true, y_pred):
    """Computes the f-measure, the harmonic mean of precision and recall.

    Here it is only computed as a batch-wise average, not globally.
    """
    return fbeta_score(y_true, y_pred, beta=1)


def build_model(class_size, x_train, y_train, x_valid=None, y_valid=None, embedding_size=128):
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_vocab, embedding_size))
    model.add(LSTM(embedding_size, dropout=0.2, recurrent_dropout=0.2))
    # One sigmoid unit per tag: the labels are independent, so this is
    # multi-label classification rather than softmax multi-class.
    model.add(Dense(class_size, activation='sigmoid'))

    # Try using different optimizers and different optimizer configs.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[fmeasure])

    print('Train...')
    if is_valid:
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=training_epoch,
                  validation_data=(x_valid, y_valid))
    else:
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=training_epoch)

    return model
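
# Shape sketch for one batch (B = batch_size, T = padded sequence length):
#   input (B, T) word indices -> Embedding (B, T, 128) -> LSTM last state
#   (B, 128) -> Dense sigmoid (B, class_size): one independent probability
#   per tag, which is why the loss is binary_crossentropy rather than
#   categorical_crossentropy.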


def main():
    if os.path.exists(model_name) and os.path.exists(categories_name):
        print('Loading model...')
        model = load_model(model_name, custom_objects={'fmeasure': fmeasure})
        categories = np.load(categories_name)
        # Rebuild the tokenizer from the training file so the test-time word
        # indices line up with the embedding the saved model was trained on.
        _, _, _, tokenizer = read_data(train)
    else:
        tags, sequences, categories, tokenizer = read_data(train)
        print(len(tags), sequences.shape, len(categories))

        # Multi-hot encode the tag list of every sample.
        categorical_tags = np.zeros((len(tags), len(categories)))
        for i, tag in enumerate(tags):
            for item in tag:
                categorical_tags[i][categories.index(item)] = 1

        if is_valid:
            (x_train, y_train), (x_valid, y_valid) = validate(sequences, categorical_tags, valid_size)
            print(x_train.shape, y_train.shape)
            print(x_valid.shape, y_valid.shape)
            model = build_model(len(categories), x_train, y_train,
                                x_valid=x_valid, y_valid=y_valid, embedding_size=128)
        else:
            x_train, y_train = sequences, categorical_tags
            model = build_model(len(categories), x_train, y_train,
                                embedding_size=256)

        print('Saving model...')
        model.save(model_name)
        np.save(categories_name, categories)

    x_test = read_test(test, tokenizer)
    pred = model.predict(x_test)
    # Binarize the per-tag probabilities at the fixed decision threshold.
    pred[pred >= threshold] = 1
    pred[pred < threshold] = 0
    print(pred.shape)
    print(categories)
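
    # --output is parsed but never written in this version. A minimal writer
    # sketch, assuming a Kaggle-style '"id","tags"' CSV format (the exact
    # expected format is an assumption, not taken from the assignment spec):
    with open(output, 'w') as out:
        out.write('"id","tags"\n')
        for i, row in enumerate(pred):
            labels = ' '.join(categories[j] for j in np.where(row == 1)[0])
            out.write('"{}","{}"\n'.format(i, labels))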


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Homework 5: RNN')
    parser.add_argument('--train', metavar='<train data path>',
                        type=str)
    parser.add_argument('--test', metavar='<test data path>',
                        type=str, required=True)
    parser.add_argument('--output', metavar='<output path>',
                        type=str, required=True)
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    train = args.train
    test = args.test
    output = args.output
    is_valid = args.valid
    valid_size = 250
    training_epoch = 100
    batch_size = 32
    max_vocab = 60000
    threshold = 0.4
    model_name = 'rnn_model.h5'
    categories_name = 'rnn_categories.npy'

    main()
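
# Example invocation (hypothetical file paths):
#   python train_rnn.py --train train_data.csv --test test_data.csv \
#       --output pred.csv --valid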
