# convergence_experiment.py
import gpflow
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from shgp.data.metadata_reinit import ReinitMetaDataset
from shgp.data.metadata_convergence import PimaConvergenceMetaDataset
from shgp.inducing.initialisation_methods import h_reinitialise_PGPR, k_means, uniform_subsample
from shgp.utilities.train_pgpr import train_pgpr
from shgp.utilities.train_svgp import train_svgp

np.random.seed(42)
tf.random.set_seed(42)
"""
A comparison of PGPR with two different inducing point initialisation procedures. Here we investigate
the convergence of the ELBO using Adam vs BFGS on both k_means and hgv. This allows us to analyse
whether the inducing points of hgv are initialised closer than k_means to the gradient-optimised result.
To evaluate other inducing point selection methods, change the model definitions below.
"""


def run_convergence_experiment(X, Y, M, opt_iters):
    ##############################################
    # SVGP with Full Inducing Points ('Optimal') #
    ##############################################
    print("Training non-sparse model...")
    svgp, elbo_svgp = train_svgp(X, Y, M=len(X), train_iters=250, init_method=uniform_subsample)
    optimal_kernel = svgp.kernel
    optimal = np.full(opt_iters + 1, elbo_svgp)
    print("Non-sparse result:")
    print("optimal = {}".format(elbo_svgp))

    ############################################
    # Z for Different Reinitialisation Methods #
    ############################################
    # Get the initial inducing points for HGV and k-means.
    print("Selecting inducing points...")
    pgpr_hgv, _ = train_pgpr(
        X, Y,
        10, 500, 10,
        M=M, init_method=h_reinitialise_PGPR, reinit_metadata=ReinitMetaDataset()
    )
    hgv_Z = pgpr_hgv.inducing_variable.Z.variables[0].numpy()
    kmeans_Z = k_means(X, M)
    print("Inducing points selected.")

    ####################
    # Convergence test #
    ####################
    results_hgv_bfgs, results_hgv_adam, results_kmeans_bfgs, results_kmeans_adam = \
        test_convergence((X, Y), opt_iters, hgv_Z, kmeans_Z, optimal_kernel)
    results = list(zip(results_hgv_bfgs, results_hgv_adam, results_kmeans_bfgs, results_kmeans_adam))

    print("Sparse results:")
    print("results_hgv_bfgs = {}".format(results_hgv_bfgs))
    print("results_hgv_adam = {}".format(results_hgv_adam))
    print("results_kmeans_bfgs = {}".format(results_kmeans_bfgs))
    print("results_kmeans_adam = {}".format(results_kmeans_adam))

    return results, optimal
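
# Note: np.array(results), as built in the __main__ block below, has shape (opt_iters + 1, 4); its
# columns are ordered (HGV BFGS, HGV Adam, k-means BFGS, k-means Adam), matching plot_results.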


# Compare the ELBO convergence of BFGS and Adam when starting from the HGV and k-means inducing points.
def test_convergence(data, opt_iters, hgv_Z, kmeans_Z, optimal_kernel):
    print("Training HGV BFGS...")
    bfgs_opt = gpflow.optimizers.Scipy()
    results_hgv_bfgs = train_convergence_svgp(data, opt_iters, optimal_kernel, hgv_Z, bfgs_opt)
    print("HGV BFGS trained: ELBO = {}".format(results_hgv_bfgs[-1]))

    print("Training HGV Adam...")
    adam_opt = tf.optimizers.Adam(beta_1=0.5, beta_2=0.5)
    results_hgv_adam = train_convergence_svgp(data, opt_iters, optimal_kernel, hgv_Z, adam_opt)
    print("HGV Adam trained: ELBO = {}".format(results_hgv_adam[-1]))

    print("Training K-means BFGS...")
    bfgs_opt = gpflow.optimizers.Scipy()
    results_kmeans_bfgs = train_convergence_svgp(data, opt_iters, optimal_kernel, kmeans_Z, bfgs_opt)
    print("K-means BFGS trained: ELBO = {}".format(results_kmeans_bfgs[-1]))

    print("Training K-means Adam...")
    adam_opt = tf.optimizers.Adam(beta_1=0.5, beta_2=0.5)
    results_kmeans_adam = train_convergence_svgp(data, opt_iters, optimal_kernel, kmeans_Z, adam_opt)
    print("K-means Adam trained: ELBO = {}".format(results_kmeans_adam[-1]))

    return results_hgv_bfgs, results_hgv_adam, results_kmeans_bfgs, results_kmeans_adam


def train_convergence_svgp(data, opt_iters, kernel, Z, opt):
    """
    Train an SVGP model for 'opt_iters' iterations, recording the ELBO after each iteration.
    If we error, keep retrying until success - this is due to a spurious Cholesky error.
    """
    model = gpflow.models.SVGP(
        kernel=kernel,
        likelihood=gpflow.likelihoods.Bernoulli(),
        inducing_variable=Z
    )
    gpflow.set_trainable(model.kernel, False)  # fixed hyperparameters
    gpflow.set_trainable(model.inducing_variable, True)  # optimised Z
    results = [model.elbo(data).numpy()]

    # Try to run the full optimisation cycle.
    try:
        if isinstance(opt, gpflow.optimizers.Scipy):
            # Optimise the variational parameters and Z with BFGS, recording the ELBO after each step.
            for _ in range(opt_iters):
                opt.minimize(model.training_loss_closure(data), variables=model.trainable_variables, options=dict(maxiter=1))
                results.append(model.elbo(data).numpy())
        else:
            # Optimise the variational parameters and Z with Adam, recording the ELBO after each step.
            for step in range(opt_iters):
                opt.lr.assign(decayed_learning_rate(step, opt_iters))
                opt.minimize(model.training_loss_closure(data), var_list=model.trainable_variables)
                results.append(model.elbo(data).numpy())
    # If we fail due to a (spurious) Cholesky error, restart from scratch and discard the partial trace.
    except tf.errors.InvalidArgumentError as error:
        msg = error.message
        if "Cholesky" not in msg and "invertible" not in msg:
            raise error
        else:
            print("Cholesky error caught, retrying...")
            return train_convergence_svgp(data, opt_iters, kernel, Z, opt)
    return results


def decayed_learning_rate(step, decay_steps, initial_learning_rate=0.5, alpha=0.05):
    step = min(step, decay_steps)
    cosine_decay = 0.5 * (1 + np.cos(np.pi * step / decay_steps))
    decayed = (1 - alpha) * cosine_decay + alpha
    return initial_learning_rate * decayed
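
# For reference, this schedule has the same form as tf.keras.optimizers.schedules.CosineDecay with
# alpha=0.05: the learning rate starts at initial_learning_rate (0.5, since the cosine term equals 1
# at step 0) and anneals to alpha * initial_learning_rate (0.025) once step reaches decay_steps.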


def plot_results(name, opt_iters, results, optimal):
    plt.rcParams["figure.figsize"] = (12, 6)
    plt.tick_params(labelright=True)

    # Axis labels
    plt.title('Convergence of Inducing Point Methods - {} Dataset'.format(name))
    plt.ylabel('ELBO')
    plt.xlabel('Iterations')
    # Axis limits
    plt.xlim(0, opt_iters)
    plt.xticks([0, 50, 100, 150])
    plt.yscale('symlog')

    # Plot each result series
    iter_array = np.arange(opt_iters + 1)
    plt.plot(iter_array, optimal, color='k', linestyle='dashed', label='Optimal', zorder=101)
    plt.plot(iter_array, results[:, 0], label='HGV GO - BFGS', zorder=100)
    plt.plot(iter_array, results[:, 1], label='HGV GO - Adam', zorder=98)
    plt.plot(iter_array, results[:, 2], label='K-means GO - BFGS', zorder=99)
    plt.plot(iter_array, results[:, 3], label='K-means GO - Adam', zorder=97)

    plt.legend(loc='lower right')
    plt.show()


if __name__ == '__main__':
    # Load data
    dataset = PimaConvergenceMetaDataset()  # to test another dataset, just change this definition
    X, Y = dataset.load_data()

    # Get convergence results
    results, optimal = run_convergence_experiment(X, Y, dataset.M, dataset.opt_iters)

    # Plot results
    plot_results(dataset.name, dataset.opt_iters, np.array(results), optimal)