-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
102 lines (80 loc) · 3.82 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
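DESCRIPTION: main script. Loads a csv dataset, splits and scales it, imputes its missing values, and trains a
classification model to calculate predictions on the test partition.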
AUTHOR: Pablo Ferri
DATE: 20/08/2023
"""
# MODULES IMPORT
from os.path import join
from pandas import read_csv
from Classification.classifunc import initialize_model, train_model_calculate_predictions
from Imputation.imputfunc import impute_missings
from Preparation.prepfunc import split_data, scale_data
# SETTINGS
# Input data location
# directory that contains your csv data file (edit to match your setup)
data_directory = './data/'
# name of your csv data file (edit to match your setup)
data_filename = 'data_original.csv'
# field separator used inside your csv data file (edit to match your setup)
delimiter = ';'

# Feature selection
# names of the numerical features to be imputed (edit with your actual feature names);
# the same list is also passed to the scaling step as the columns to scale
features2impute = [
    'FEATURE_1',
    'FEATURE_2',
    'FEATURE_3',
    'FEATURE_4',
    'FEATURE_5',
]
# names of every feature considered for the classification task: the features to be imputed plus those
# which are already prepared and therefore need neither scaling nor imputation (edit with your actual names)
feature_identifiers = [
    'FEATURE_1',
    'FEATURE_2',
    'FEATURE_3',
    'FEATURE_4',
    'FEATURE_5',
    'FEATURE_6',
    'FEATURE_7',
    'FEATURE_8',
]

# Method selection
# scaling method, one of: 'robust', 'minmax'
scaling_method = 'minmax'
# imputation method, one of: 'missing_mask', 'mean', 'translation_encoding', 'bayesian_regression', 'knn', 'gain'
# according to the original notes, 'missing_mask', 'mean', 'translation_encoding', 'bayesian_regression' and 'knn'
# are intended to be combined with 'robust' scaling, whereas 'translation_encoding' and 'gain' require 'minmax'
# NOTE(review): 'translation_encoding' is listed in both groups - confirm which scaling method it actually needs
imputation_method = 'translation_encoding'
# classification model, one of: 'k_nearest_neighbors', 'logistic_regression', 'random_forest',
# 'gradient_boosting', 'multilayer_perceptron'
model_identifier = 'multilayer_perceptron'

# Classification target
# name of the label column (edit with your actual label identifier)
label_identifier = 'OUTCOME'
# number of classes of the label (edit with your actual number of classes)
number_classes = 2
# EXECUTION
# Runs the whole pipeline only when the file is executed as a script: loading, preparation, imputation and
# classification, in that order. Every step is configured through the module-level settings.
if __name__ == '__main__':
    # DATA LOADING
    # Path of the csv file, built from the directory and filename settings
    # NOTE: despite its name, this path is only absolute when data_directory itself is absolute
    absolute_filepath = join(data_directory, data_filename)
    # Reading of the csv file
    data = read_csv(
        filepath_or_buffer=absolute_filepath,
        delimiter=delimiter,
        encoding='latin-1',
        engine='python',
    )

    # DATA PREPARATION
    # Splitting of the loaded data into partitions
    data_split = split_data(data)
    # The unsplit data is not needed anymore
    del data
    # Scaling of the columns to be imputed with the selected scaling method
    data_split_scaled = scale_data(
        data_split=data_split,
        scaling_method=scaling_method,
        columns2scale=features2impute,
    )
    # The unscaled partitions are not needed anymore
    del data_split

    # DATA IMPUTATION
    # Imputation of the missing values in the scaled partitions with the selected imputation method
    data_split_imputed = impute_missings(
        data_split=data_split_scaled,
        imputation_method=imputation_method,
        scaling_method=scaling_method,
        columns2impute=features2impute,
        feature_identifiers=feature_identifiers,
    )

    # CLASSIFICATION
    # Partition extraction
    # Only the 'train' and 'test' partitions stored under 'train_test' are used here, to keep the example readable
    data_train = data_split_imputed['train_test']['train']
    data_test = data_split_imputed['train_test']['test']
    # Initialization of the selected classifier
    model = initialize_model(
        model_identifier=model_identifier,
        scaling_method=scaling_method,
        imputation_method=imputation_method,
        feature_identifiers=feature_identifiers,
        number_classes=number_classes,
    )
    # Training on the 'train' partition and predictions calculation on the 'test' partition
    labels_predictions = train_model_calculate_predictions(
        model=model,
        data_train=data_train,
        data_eval=data_test,
        feature_identifiers=feature_identifiers,
        label_identifier=label_identifier,
        scaling_method=scaling_method,
        imputation_method=imputation_method,
    )