Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial Neural Net - DO NOT MERGE #349

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
ab5b130
Add ctg dataset specifically for building neural network
Jul 17, 2017
f3d0e58
Add a function to load ctg data
Jul 17, 2017
aca1bdd
For testing, print roc score
Jul 17, 2017
f8fc7df
Add get_algorithm_neural_network
Jul 17, 2017
63122de
Add neural_network_classifier
Jul 17, 2017
922755b
Run an example to test neural network
Jul 17, 2017
2067e50
* using pandas DataFrame.as_matrix() in train_test_split to convert d…
Jul 18, 2017
23cd59a
Remove `issubclass(type(model), sklearn.base.BaseEstimator)`
Jul 19, 2017
4a8c3ea
Remove print(roc)
Jul 19, 2017
c16d7b5
Create an estimator for KerasClassifier
Jul 19, 2017
d6f477e
Add feature scaling, adjusted _create_trained_supervised_model
Jul 19, 2017
e907623
Add TestNeuralNetworkClassificaton
Jul 19, 2017
d18dd46
Neural network example using ctg data
Jul 19, 2017
f055c6a
Neural network example using diabetes data
Jul 19, 2017
5befb94
Merge remote-tracking branch 'origin/sc_neuralnet' into sc_neuralnet
Jul 19, 2017
16011f5
Add `compute_confusion_matrix()`, add metrics choices for multiclass
Jul 20, 2017
e2cb790
Add print/plot confusionMat; check if is binary classification
Jul 21, 2017
7e8356e
Calculate number of output neurons in neural network
Jul 21, 2017
adb398c
Add calculated output dimension to neural network
Jul 21, 2017
86af2dd
Add confusion matrix tests
Jul 21, 2017
bdb4368
Add annotations for confusion matrix
Jul 21, 2017
181644f
Add annotations for `get_algorithm_neural_network()`
Jul 21, 2017
ba7c39d
Adjusted transformations for target variable
Jul 21, 2017
de78588
Add a function to load the dermatology dataset
Jul 21, 2017
9b1fdcf
Add dermatology data for multi class classification
Jul 21, 2017
0875b3a
Add annotations for print/plot confusion matrix
Jul 21, 2017
b90eebe
Change default scoring_metric to accuracy
Jul 21, 2017
835c5e6
Add a check on binary classification
Jul 21, 2017
2266fbe
Add `calculate_classification_metrics()`
Jul 21, 2017
347f6ff
Add annotations
Jul 21, 2017
8d5cc23
Some minor adjustments
Jul 21, 2017
516da02
Add tests for multi class classification
Jul 21, 2017
6ceaba7
Deleted ctg related examples
Jul 21, 2017
753d08d
Deleted ctg dataset
Jul 21, 2017
e4fd448
Remove ctg data related function
Jul 21, 2017
1d16316
A binary classification example with neural network
Jul 21, 2017
d26ef88
Add a multiclass example with neural network
Jul 21, 2017
6cfff51
Merge branch 'master' into sc_neuralnet
Sep 20, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add feature scaling, adjusted _create_trained_supervised_model
  • Loading branch information
Shufang Ci committed Jul 19, 2017
commit d6f477e3688eba2a23e7c7cd83b6c311efd5a773
163 changes: 54 additions & 109 deletions healthcareai/advanced_supvervised_model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class AdvancedSupervisedModelTrainer(object):
metrics.
"""

def __init__(self, dataframe, model_type, predicted_column, grain_column=None, verbose=False):
def __init__(self, dataframe, model_type, predicted_column, grain_column=None, data_scaling=False, verbose=False):
"""
Creates an instance of AdvancedSupervisedModelTrainer.

Expand All @@ -52,11 +52,13 @@ def __init__(self, dataframe, model_type, predicted_column, grain_column=None, v
self.predicted_column = predicted_column
self.grain_column = grain_column
self.verbose = verbose
self.columns = None
self.x_train = None
self.X_test = None
self.y_train = None
self.y_test = None
self.pipeline = None
self.data_scaling = data_scaling

self._console_log(
'Shape and top 5 rows of original dataframe:\n{}\n{}'.format(self.dataframe.shape, self.dataframe.head()))
Expand Down Expand Up @@ -118,9 +120,23 @@ def train_test_split(self, random_seed=None):
y = np.squeeze(self.dataframe[[self.predicted_column]])
X = self.dataframe.drop([self.predicted_column], axis=1)

# Save off a copy of the column names before converting to a numpy array
self.columns = X.columns.values

self.x_train, self.X_test, self.y_train, self.y_test = sklearn.model_selection.train_test_split(
X, y, test_size=.20, random_state=random_seed)

# Scale the numeric feature columns; enable data_scaling when training a neural network
if self.data_scaling:
names = list(self.columns)
is_numeric = np.vectorize(lambda x: np.issubdtype(x, np.number))
numeric_col_bool = list(is_numeric(X.dtypes))
numeric_col = [i for (i, v) in zip(names, numeric_col_bool) if v]
self.feature_scaling(numeric_col)

self.x_train = np.array(self.x_train)
self.X_test = np.array(self.X_test)

self._console_log('\nShape of X_train: {}\ny_train: {}\nX_test: {}\ny_test: {}'.format(
self.x_train.shape,
self.y_train.shape,
Expand Down Expand Up @@ -219,8 +235,7 @@ def metrics(self, trained_sklearn_estimator):
elif self.model_type is 'regression':
performance_metrics = hcai_model_evaluation.calculate_regression_metrics(trained_sklearn_estimator,
self.X_test, self.y_test)
# Shufang testing:
print(performance_metrics)

return performance_metrics

def logistic_regression(self,
Expand Down Expand Up @@ -350,7 +365,7 @@ def random_forest_classifier(self,
"""
self.validate_classification('Random Forest Classifier')
if hyperparameter_grid is None:
max_features = hcai_helpers.calculate_random_forest_mtry_hyperparameter(len(self.X_test.columns),
max_features = hcai_helpers.calculate_random_forest_mtry_hyperparameter(len(self.columns),
self.model_type)
hyperparameter_grid = {'n_estimators': [100, 200, 300], 'max_features': max_features}
number_iteration_samples = 5
Expand Down Expand Up @@ -389,7 +404,7 @@ def random_forest_regressor(self,
"""
self.validate_regression('Random Forest Regressor')
if hyperparameter_grid is None:
max_features = hcai_helpers.calculate_random_forest_mtry_hyperparameter(len(self.X_test.columns),
max_features = hcai_helpers.calculate_random_forest_mtry_hyperparameter(len(self.columns),
self.model_type)
hyperparameter_grid = {'n_estimators': [10, 50, 200], 'max_features': max_features}
number_iteration_samples = 5
Expand All @@ -405,21 +420,32 @@ def random_forest_regressor(self,

return trained_supervised_model

def create_nn(self, activation='relu', neurons=22, optimizer='adam', scoring_metric='mae'):
algorithm = Sequential()
algorithm.add(Dense(neurons, input_dim=22, activation=activation))
algorithm.add(Dense(neurons, activation=activation))
algorithm.add(Dense(1, activation='sigmoid'))
algorithm.compile(loss='binary_crossentropy',
def create_nn(self,
neurons_num=None,
activation='relu',
optimizer='adam',
scoring_metric='accuracy'):

# Calculate number of neurons
input_dim = self.x_train.shape[1]
if neurons_num is None:
neurons_num = (input_dim + 2) // 2

# Create a neural network architecture
neuralnet = Sequential()
neuralnet.add(Dense(input_dim, input_dim=input_dim, activation=activation))
neuralnet.add(Dense(neurons_num, activation=activation))
neuralnet.add(Dense(2, activation='softmax'))
neuralnet.compile(loss='sparse_categorical_crossentropy',
# Alternatively, one-hot encode the target and use categorical_crossentropy
# Binary classification: sigmoid output with binary_crossentropy
# Multiclass classification: softmax output with sparse_categorical_crossentropy
optimizer=optimizer,
metrics=[scoring_metric])
return algorithm
return neuralnet

def neural_network_classifier(self,
scoring_metric='roc_auc',
scoring_metric='accuracy',
hyperparameter_grid=None,
randomized_search=True,
number_iteration_samples=3):
Expand All @@ -432,33 +458,28 @@ def neural_network_classifier(self,
"""
self.validate_classification('Neural Network Classifier')

# Create a neural network architecture
#TODO customize input/output dims
algorithm = KerasClassifier(build_fn=self.create_nn, verbose=0)
# Create a wrapper for sklearn
neuralnet = KerasClassifier(build_fn=self.create_nn, verbose=0)

#TODO change hyperparameter grid
if hyperparameter_grid is None:
# activation = ['relu']
# # neurons = calculate_nn_number_of_neurons_hyperparameter(
# # X_train.shape[1], 'classification')
# neurons = [22]
# batch_size = [3]
# epochs = [100]
# optimizer = ['adam']

hyperparameter_grid = dict(activation=['relu'],
neurons=[10],
batch_size=[5],
epochs=[100],
optimizer=['adam'])

algorithm = get_algorithm_neural_network(algorithm,
activation = ['relu']
batch_size = [5]
epochs = [20]
optimizer = ['adam']

hyperparameter_grid = dict(activation=activation,
batch_size=batch_size,
epochs=epochs,
optimizer=optimizer)

algorithm = get_algorithm_neural_network(neuralnet,
scoring_metric,
hyperparameter_grid,
randomized_search,
number_iteration_samples=1)

trained_supervised_model = self._create_trained_supervised_model_nn(algorithm)
trained_supervised_model = self._create_trained_supervised_model(algorithm)

return trained_supervised_model

Expand Down Expand Up @@ -501,83 +522,7 @@ def _create_trained_supervised_model(self, algorithm, include_factor_model=True)
feature_model=factor_model,
fit_pipeline=self.pipeline,
model_type=self.model_type,
column_names=self.X_test.columns.values,
grain_column=self.grain_column,
prediction_column=self.predicted_column,
test_set_predictions=test_set_predictions,
test_set_class_labels=test_set_class_labels,
test_set_actual=self.y_test,
metric_by_name=self.metrics(algorithm),
training_time=time.time() - t0)

return trained_supervised_model

def _create_trained_supervised_model_nn(self, algorithm, include_factor_model=True):
"""
Trains an algorithm, prepares metrics, builds and returns a TrainedSupervisedModel

Args:
algorithm (sklearn.base.BaseEstimator): The scikit learn algorithm, ready to fit.
include_factor_model (bool): Trains a model for top factors. Defaults to True

Returns:
TrainedSupervisedModel: a TrainedSupervisedModel
"""
# Get time before model training
t0 = time.time()

## testing
'''
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(self.x_train)
self.x_train = scaler.transform(self.x_train)
self.X_test = scaler.transform(self.X_test)
'''
column_names = self.X_test.columns.values
self.x_train = np.array(self.x_train)
self.X_test = np.array(self.X_test)
self.y_train = np.array(self.y_train)
self.y_test = np.array(self.y_test)
print(self.x_train[:10])

print('start training...')
result = algorithm.fit(self.x_train, self.y_train)
print('\nBest trained hyper-parameter set is:\n')
print(result.best_params_)
print('\nBest training accuracy is:\n')
print(result.best_score_)
print('\nHyper-parameter tuning has completed. \n')

# Build prediction sets for ROC/PR curve generation. Note this does increase the size of the TSM because the
# test set is saved inside the object as well as the calculated thresholds.
# See https://github.com/HealthCatalyst/healthcareai-py/issues/264 for a discussion on pros/cons
# PEP 8
test_set_predictions = None
test_set_class_labels = None
if self.is_classification:
# Save both the probabilities and labels
test_set_predictions = algorithm.predict_proba(self.X_test)
test_set_class_labels = algorithm.predict(self.X_test)
print('test_set_class_labels:', test_set_class_labels)
print('Classification report:')
print(classification_report(self.y_test, test_set_class_labels))
print('Confusion matrix:')
print(confusion_matrix(self.y_test, test_set_class_labels))
print('predict_proba ends.')
elif self.is_regression:
test_set_predictions = algorithm.predict(self.X_test)

if include_factor_model:
factor_model = hcai_factors.prepare_fit_model_for_factors(self.model_type, self.x_train, self.y_train)
else:
factor_model = None

trained_supervised_model = hcai_tsm.TrainedSupervisedModel(
model=algorithm,
feature_model=factor_model,
fit_pipeline=self.pipeline,
model_type=self.model_type,
column_names=column_names,
column_names=self.columns,
grain_column=self.grain_column,
prediction_column=self.predicted_column,
test_set_predictions=test_set_predictions,
Expand Down