-
Notifications
You must be signed in to change notification settings - Fork 10
/
train_rnn.py
206 lines (172 loc) · 6.83 KB
/
train_rnn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
import sys, os
import argparse
import numpy as np
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.models import load_model
from keras import backend as K
def read_data(file):
    """Read the labelled training file.

    Each usable line looks like: <id>,"tag1 tag2 ...","free text ...".
    Anything before the first double quote is ignored; everything after
    the second quote is the text (inner quotes are re-joined).

    Parameters
    ----------
    file : str
        Path to the training data file.

    Returns
    -------
    (tags, padded, categories)
        tags       -- list of tag-lists, one per kept line
        padded     -- 2-D int array of padded word-index sequences
        categories -- sorted list of distinct tag names seen
    """
    print('Reading training data...')
    tags, texts, categories = [], [], []
    with open(file) as f:
        for line in f:  # iterate lazily instead of f.readlines()
            buf = line.split('"')
            if len(buf) < 3:
                continue  # header or malformed line: no quoted tag field
            # Filter empty tokens from doubled/leading spaces, which would
            # otherwise register '' as a category of its own.
            tags_tmp = [t for t in buf[1].split(' ') if t]
            categories.extend(tags_tmp)
            tags.append(tags_tmp)
            # Re-join in case the text itself contained double quotes.
            texts.append('"'.join(buf[2:]))
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    index_seq = tokenizer.texts_to_sequences(texts)
    return tags, pad_sequences(index_seq), sorted(set(categories))
def read_test(file):
    """Read the unlabelled test csv and return padded word-index sequences.

    The first comma-separated field (the id) is dropped; the header row,
    whose remaining content is exactly 'text', is skipped.

    NOTE(review): a fresh Tokenizer is fit on the *test* texts, so word
    indices here do not share the training vocabulary that the model was
    trained on — confirm this is intentional.
    """
    print('Reading test data...')
    texts = []
    with open(file) as f:
        for line in f.readlines():
            # Keep everything after the first comma (the text column).
            _, _, remainder = line.partition(',')
            if remainder.strip() == 'text':
                continue  # header row
            texts.append(remainder)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return pad_sequences(tokenizer.texts_to_sequences(texts))
def validate(X, Y, valid_size):
    """Randomly split (X, Y) into a training part and a held-out part.

    The first `valid_size` rows of a random permutation become the
    validation set; the remainder become the training set.

    Returns ((x_train, y_train), (x_valid, y_valid)).
    """
    order = np.random.permutation(X.shape[0])
    valid_idx, train_idx = order[:valid_size], order[valid_size:]
    return ((X[train_idx, :], Y[train_idx, :]),
            (X[valid_idx, :], Y[valid_idx, :]))
def precision(y_true, y_pred):
    """Batch-wise precision for multi-label output.

    Fraction of predicted-positive labels that are actually positive.
    K.epsilon() keeps the division safe when nothing is predicted.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return tp / (predicted_pos + K.epsilon())
def recall(y_true, y_pred):
    """Batch-wise recall for multi-label output.

    Fraction of actual-positive labels that were predicted positive.
    K.epsilon() keeps the division safe when there are no positives.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return tp / (actual_pos + K.epsilon())
def fbeta_score(y_true, y_pred, beta=1):
    """Batch-wise F-beta score, the weighted harmonic mean of precision
    and recall.

    With beta=1 this is the ordinary F-measure; beta < 1 weights
    precision more heavily, beta > 1 weights recall more heavily.
    Computed per batch, not globally, so it penalizes a model that
    trivially predicts every label for every sample.

    Raises
    ------
    ValueError
        If beta is negative.
    """
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # Like sklearn: with no true positives the F score is pinned at 0.
    # NOTE(review): comparing a backend tensor to 0 with `==` may not
    # yield a usable Python bool under a symbolic/graph backend — confirm.
    if K.sum(K.round(K.clip(y_true, 0, 1))) == 0:
        return 0
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    beta_sq = beta ** 2
    return (1 + beta_sq) * p * r / (beta_sq * p + r + K.epsilon())
def fmeasure(y_true, y_pred):
    """F1 score: the harmonic mean of precision and recall, batch-wise.

    Thin wrapper around fbeta_score with beta=1, passed to Keras as a
    training metric. The leftover debug print of y_true was removed:
    it only dumped a symbolic tensor repr into the log at graph-build
    time and contributed nothing to the metric.
    """
    return fbeta_score(y_true, y_pred, beta=1)
def build_model(class_size, x_train, y_train, x_valid=None, y_valid=None, embedding_size=128):
    """Build, compile and fit the LSTM tagger.

    Parameters
    ----------
    class_size : int
        Number of output labels. Sigmoid units (not softmax) because
        this is a multi-label problem: each tag is an independent
        binary decision, matched by the binary_crossentropy loss.
    x_train, y_train : arrays
        Padded index sequences and the multi-hot label matrix.
    x_valid, y_valid : arrays, optional
        Held-out split; only used when the module-level `is_valid`
        flag is set.
    embedding_size : int
        Dimension of both the embedding vectors and the LSTM state.

    Reads module-level globals set in the __main__ block: max_vocab,
    is_valid, batch_size, training_epoch.

    Returns the trained Keras model.
    """
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_vocab, embedding_size))
    model.add(LSTM(embedding_size, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(class_size, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=[fmeasure])
    print('Train...')
    # The two fit() branches differed only in validation_data; fold them
    # into one call — None is Keras' own "no validation" default.
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=training_epoch,
              validation_data=(x_valid, y_valid) if is_valid else None)
    return model
def main():
    """Train (or reload) the RNN tagger, then predict on the test set.

    If a saved model and category list exist on disk they are reused;
    otherwise the model is trained from `train` and both artifacts are
    saved for the next run.

    NOTE(review): predictions are thresholded but never written to the
    --output path; only shapes and categories are printed — confirm
    whether the save-to-output step is missing.
    """
    if os.path.exists(model_name) and os.path.exists(categories_name):
        # Reuse a previously trained model together with its label order.
        print('Loading model...')
        model = load_model(model_name, custom_objects={'fmeasure': fmeasure})
        categories = np.load(categories_name)
    else:
        tags, sequences, categories = read_data(train)
        print(len(tags), sequences.shape, len(categories))
        # Multi-hot encode each tag list against the sorted category list.
        categorical_tags = np.zeros((len(tags), len(categories)))
        for row, tag_list in enumerate(tags):
            for name in tag_list:
                categorical_tags[row][categories.index(name)] = 1
        if is_valid:
            split = validate(sequences, categorical_tags, valid_size)
            (x_train, y_train), (x_valid, y_valid) = split
            print(x_train.shape, y_train.shape)
            print(x_valid.shape, y_valid.shape)
            model = build_model(len(categories), x_train, y_train,
                                x_valid=x_valid, y_valid=y_valid,
                                embedding_size=128)
        else:
            model = build_model(len(categories), sequences, categorical_tags,
                                embedding_size=256)
        print('Saving model...')
        model.save(model_name)
        np.save(categories_name, categories)
    x_test = read_test(test)
    pred = model.predict(x_test)
    # Binarize the sigmoid outputs at the global threshold.
    pred[pred >= threshold] = 1
    pred[pred < threshold] = 0
    print(pred.shape)
    print(categories)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Homework 5: RNN')
    parser.add_argument('--train', metavar='<#train data path>', type=str)
    parser.add_argument('--test', metavar='<#test data path>', type=str,
                        required=True)
    parser.add_argument('--output', metavar='<#output path>', type=str,
                        required=True)
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    # Module-level names read by the functions above.
    train, test, output = args.train, args.test, args.output
    is_valid = args.valid

    # Hyper-parameters and on-disk artifact names.
    valid_size = 250
    training_epoch = 100
    batch_size = 32
    max_vocab = 60000
    threshold = 0.4
    model_name = 'rnn_model.h5'
    categories_name = 'rnn_categories.npy'

    main()