-
Notifications
You must be signed in to change notification settings - Fork 0
/
sampler.py
executable file
·121 lines (95 loc) · 4.63 KB
/
sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: GPL-3.0-only
# Copyright (C) 2020 Gregory Norton
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 3.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <https://www.gnu.org/licenses/>.
# Randomly generates 750 stratified train/validation splits of the dataset and
# saves the indices of the best-performing split.
# Intended to be used in tandem with data_builder.py.
from sklearn.model_selection import cross_val_score, cross_val_predict
#from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import attributes
#def TRAIN_PATH(): return 'data/attributes/train.csv'
#def TEST_PATH(): return 'data/attributes/test.csv'
def FULL_PATH():
    """Return the relative path of the CSV file that main() opens and parses."""
    return 'data/attributes/flags.csv'
def NUM_FOLDS():
    """Return the number of folds used for the stratified cross-validation in classify()."""
    return 10
#def DATASET_VERSION(): return 4 # reached 79%
def DATASET_VERSION():
    """Return the dataset version number.

    NOTE(review): nothing in this file's visible code reads this value;
    presumably it is consumed by a companion script -- confirm.
    """
    return 8
def format_data(data):
    """Split parsed flag records into parallel feature and label lists.

    Args:
        data: Iterable of records, each exposing an ``attributes()`` method
            and a ``religion`` attribute. One-shot iterators and generators
            are accepted as well as lists.

    Returns:
        A 2-tuple ``(features, labels)`` where ``features[i]`` is the result
        of ``attributes()`` on the i-th record and ``labels[i]`` is that
        record's ``religion``.
    """
    # Materialise once: the records are walked twice below, and a generator
    # would be exhausted by the first pass, silently yielding no labels.
    records = list(data)
    features = [record.attributes() for record in records]
    labels = [record.religion for record in records]
    return (features, labels)
def classify(x_train, y_train, x_valid, y_valid):
    """Cross-validate a random forest on the training data, then score it on the validation data.

    Args:
        x_train: Indexable sequence of training feature rows.
        y_train: Indexable sequence of training labels, parallel to x_train.
        x_valid: Validation feature rows passed to ``model.predict``.
        y_valid: Indexable sequence of validation labels, parallel to x_valid.

    Returns:
        A 3-tuple ``(model, scores, performance)``:
        ``model`` is the estimator as left by the fit on the last fold,
        ``scores`` is the list of per-fold ``model.score`` results, and
        ``performance`` is the fraction of validation predictions (wrapped in
        ``attributes.Religion``) that compare equal to the matching label.
    """
    # Alternative estimators kept for experimentation:
    # model = DecisionTreeClassifier()
    # model = MLPClassifier(solver='lbfgs', alpha=2.3, hidden_layer_sizes=(5, 2), random_state=0)
    # model = SVC()
    # model = NearestCentroid()
    # model = GaussianNB()
    model = RandomForestClassifier()
    folds = StratifiedKFold(n_splits=NUM_FOLDS())  # shuffle=True is left disabled

    def gather(rows, positions):
        # Pick the rows at the given positions, preserving their order.
        return [rows[k] for k in positions]

    scores = []
    for fit_idx, holdout_idx in folds.split(x_train, y_train):
        # The same estimator object is refit on every fold.
        model.fit(gather(x_train, fit_idx), gather(y_train, fit_idx))
        fold_score = model.score(gather(x_train, holdout_idx), gather(y_train, holdout_idx))
        scores.append(fold_score)

    # Convert raw predictions to Religion values so they compare against the labels.
    pred_valid = [attributes.Religion(raw) for raw in model.predict(x_valid)]
    correct = sum(1 for pos, truth in enumerate(y_valid) if truth == pred_valid[pos])
    performance = correct / len(pred_valid)
    return (model, scores, performance)
def _write_indices(path, indices):
    """Write each index in `indices` to `path` as a decimal integer, one per line."""
    with open(path, 'w') as f:
        f.writelines('%d\n' % index for index in indices)


def main(args):
    """Search random stratified splits and save the indices of the best one.

    Loads the data from FULL_PATH(), draws 750 random stratified
    train/validation splits, evaluates each with classify(), and writes the
    row indices of the split with the highest validation performance to
    ``sampler_test/train_indices.txt`` and ``sampler_test/valid_indices.txt``
    (one index per line).

    Args:
        args: Command-line argument list; currently unused.

    Returns:
        0, used as the process exit status.
    """
    import os  # local import: only needed to prepare the output directory

    with open(FULL_PATH(), 'r') as f:
        # Format while the file is still open, in case parse() reads lazily.
        x_full, y_full = format_data(attributes.FlagAttributes.parse(f))

    # Make sure the output directory exists *before* the long search, so a
    # missing directory cannot throw away the result at the very end.
    os.makedirs('sampler_test', exist_ok=True)

    best_valid_perf = 0
    best_dataset = None
    for i in range(0, 750):
        print('\r%d' % i, end='')
        skf = StratifiedKFold(n_splits=4, shuffle=True)
        # Only the first of the four shuffled folds is used, giving one random
        # split with roughly three quarters for training and one for validation.
        train_index, valid_index = next(skf.split(x_full, y_full))
        x_train = [x_full[j] for j in train_index]
        y_train = [y_full[j] for j in train_index]
        x_valid = [x_full[j] for j in valid_index]
        y_valid = [y_full[j] for j in valid_index]
        _model, _scores, performance = classify(x_train, y_train, x_valid, y_valid)
        # Always keep the first split so there is something to save even if
        # no split ever scores above zero.
        if best_dataset is None or best_valid_perf < performance:
            print('\nnew best performance', performance)
            best_valid_perf = performance
            best_dataset = {'train': train_index, 'valid': valid_index}
    print('')

    _write_indices('sampler_test/train_indices.txt', best_dataset['train'])
    _write_indices('sampler_test/valid_indices.txt', best_dataset['valid'])
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    import sys
    raise SystemExit(main(sys.argv))