From 81e3e72c647c33149ef05aa3aab0580ee3b70ba9 Mon Sep 17 00:00:00 2001 From: bnb32 Date: Tue, 9 Apr 2024 09:41:22 -0600 Subject: [PATCH 1/6] tensorboard logging --- sup3r/models/abstract.py | 70 +++++++++++++++++++------ sup3r/models/base.py | 15 +++++- sup3r/models/conditional_moments.py | 11 +++- tests/training/test_train_gan_lr_era.py | 2 +- 4 files changed, 77 insertions(+), 21 deletions(-) diff --git a/sup3r/models/abstract.py b/sup3r/models/abstract.py index e54182de0..a69b23c6b 100644 --- a/sup3r/models/abstract.py +++ b/sup3r/models/abstract.py @@ -484,6 +484,8 @@ def __init__(self): self._gen = None self._means = None self._stdevs = None + self._tb_writer = None + self._timing_details = {} def load_network(self, model, name): """Load a CustomNetwork object from hidden layers config, .json file @@ -915,10 +917,9 @@ def update_loss_details(loss_details, new_data, batch_len, prefix=None): prior_n_obs = loss_details['n_obs'] new_n_obs = prior_n_obs + batch_len - for key, new_value in new_data.items(): - key = key if prefix is None else prefix + key - new_value = (new_value if not isinstance(new_value, tf.Tensor) else - new_value.numpy()) + for k, v in new_data.items(): + key = k if prefix is None else prefix + k + new_value = (v if not isinstance(v, tf.Tensor) else v.numpy()) if key in loss_details: saved_value = loss_details[key] @@ -1063,7 +1064,6 @@ def finish_epoch(self, """ self.log_loss_details(loss_details) - self._history.at[epoch, 'elapsed_time'] = time.time() - t0 for key, value in loss_details.items(): if key != 'n_obs': @@ -1090,6 +1090,15 @@ def finish_epoch(self, for k, v in extras.items(): self._history.at[epoch, k] = v + if self._tb_writer is not None: + with self._tb_writer.as_default(): + for col in self._history.columns: + tf.summary.scalar(col, self._history.at[epoch, col], epoch) + for name, value in extras.items(): + tf.summary.scalar(name, value, epoch) + for name, value in self._timing_details.items(): + tf.summary.scalar(name, value, epoch) + return stop def run_gradient_descent(self, @@ -1135,19 +1144,23 @@ def run_gradient_descent(self, loss_details : dict Namespace of the breakdown of loss components """ - t0 = time.time() if optimizer is None: optimizer = self.optimizer if not multi_gpu or len(self.gpu_list) == 1: + start = time.time() grad, loss_details = self.get_single_grad(low_res, hi_res_true, training_weights, **calc_loss_kwargs) + self._timing_details['dt:get_single_grad'] = time.time() - start + start = time.time() optimizer.apply_gradients(zip(grad, training_weights)) + self._timing_details['dt:apply_gradients'] = time.time() - start t1 = time.time() logger.debug(f'Finished single gradient descent step ' f'in {(t1 - t0):.3f}s') + self._timing_details['dt:run_gradient_descent'] = t1 - t0 else: futures = [] @@ -1178,6 +1191,7 @@ def run_gradient_descent(self, t1 = time.time() logger.debug(f'Finished {len(futures)} gradient descent steps on ' f'{len(self.gpu_list)} GPUs in {(t1 - t0):.3f}s') + self._timing_details['dt:run_gradient_descent'] = t1 - t0 return loss_details @@ -1238,6 +1252,20 @@ def _reshape_norm_exo(self, hi_res, hi_res_exo, exo_name, norm_in=True): return hi_res_exo + def _init_tensorboard_writer(self, out_dir): + """Initialize the ``tf.summary.SummaryWriter`` to use for writing + tensorboard compatible log files. + + Parameters + ---------- + out_dir : str + Standard out_dir where model epochs are saved. e.g. 
'./gan_{epoch}'
+        """
+        tb_log_dir = os.path.join(
+            os.path.abspath(os.path.join(out_dir, os.pardir)), 'logs')
+        os.makedirs(tb_log_dir, exist_ok=True)
+        self._tb_writer = tf.summary.create_file_writer(tb_log_dir)
+
     def generate(self,
                  low_res,
                  norm_in=True,
@@ -1398,16 +1426,24 @@ def get_single_grad(self,
         loss_details : dict
             Namespace of the breakdown of loss components
         """
-        with tf.device(device_name):
-            with tf.GradientTape(watch_accessed_variables=False) as tape:
-                tape.watch(training_weights)
-
-                hi_res_exo = self.get_high_res_exo_input(hi_res_true)
-                hi_res_gen = self._tf_generate(low_res, hi_res_exo)
-                loss_out = self.calc_loss(hi_res_true, hi_res_gen,
-                                          **calc_loss_kwargs)
-                loss, loss_details = loss_out
-
-            grad = tape.gradient(loss, training_weights)
+        with tf.device(device_name), tf.GradientTape(
+                watch_accessed_variables=False) as tape:
+            t0 = time.time()
+            tape.watch(training_weights)
+            self._timing_details['dt:tape.watch'] = time.time() - t0
+            t0 = time.time()
+            hi_res_exo = self.get_high_res_exo_input(hi_res_true)
+            self._timing_details['dt:get_high_res_exo_input'] = time.time() - t0
+            t0 = time.time()
+            hi_res_gen = self._tf_generate(low_res, hi_res_exo)
+            self._timing_details['dt:tf.generate'] = time.time() - t0
+            t0 = time.time()
+            loss_out = self.calc_loss(hi_res_true, hi_res_gen,
+                                      **calc_loss_kwargs)
+            self._timing_details['dt:calc_loss'] = time.time() - t0
+            loss, loss_details = loss_out
+            t0 = time.time()
+            grad = tape.gradient(loss, training_weights)
+            self._timing_details['dt:tape.gradient'] = time.time() - t0
 
         return grad, loss_details
 
diff --git a/sup3r/models/base.py b/sup3r/models/base.py
index d055c582f..8348a7c8e 100644
--- a/sup3r/models/base.py
+++ b/sup3r/models/base.py
@@ -794,7 +794,8 @@ def train(self,
               early_stop_n_epoch=5,
               adaptive_update_bounds=(0.9, 0.99),
               adaptive_update_fraction=0.0,
-              multi_gpu=False):
+              multi_gpu=False,
+              tensorboard_log=True):
         """Train the GAN model on real low res data and real high res data
 
         Parameters
@@ -856,7 +857,14 @@ def train(self,
             constitute a single gradient descent step with the nominal
             learning rate that the model was initialized with. If true and
             multiple gpus are found, default_device device should be set to
             /gpu:0
+        tensorboard_log : bool
+            Whether to write log file for use with tensorboard. Log data can
+            be viewed with ``tensorboard --logdir <logdir>`` where
+            ``<logdir>`` is the parent directory of ``out_dir``, and pointing
+            the browser to the printed address.
""" + if tensorboard_log: + self._init_tensorboard_writer(out_dir) self.set_norm_stats(batch_handler.means, batch_handler.stds) self.set_model_params( @@ -889,9 +897,10 @@ def train(self, train_disc, disc_loss_bounds, multi_gpu=multi_gpu) - + train_n_obs = loss_details['n_obs'] loss_details = self.calc_val_loss(batch_handler, weight_gen_advers, loss_details) + val_n_obs = loss_details['n_obs'] msg = f'Epoch {epoch} of {epochs[-1]} ' msg += 'gen/disc train loss: {:.2e}/{:.2e} '.format( @@ -911,6 +920,8 @@ def train(self, self.optimizer_disc)['learning_rate'] extras = { + 'train_n_obs': train_n_obs, + 'val_n_obs': val_n_obs, 'weight_gen_advers': weight_gen_advers, 'disc_loss_bound_0': disc_loss_bounds[0], 'disc_loss_bound_1': disc_loss_bounds[1], diff --git a/sup3r/models/conditional_moments.py b/sup3r/models/conditional_moments.py index ec7559e6d..ccd41c07c 100644 --- a/sup3r/models/conditional_moments.py +++ b/sup3r/models/conditional_moments.py @@ -347,7 +347,8 @@ def train(self, batch_handler, early_stop_on=None, early_stop_threshold=0.005, early_stop_n_epoch=5, - multi_gpu=False): + multi_gpu=False, + tensorboard_log=True): """Train the model on real low res data and real high res data Parameters @@ -388,7 +389,15 @@ def train(self, batch_handler, between the GPUs and the resulting gradient from each GPU will constitute a single gradient descent step with the nominal learning rate that the model was initialized with. + tensorboard_log : bool + Whether to write log file for use with tensorboard. Log data can + be viewed with ``tensorboard --logdir `` where ```` + is the parent directory of ``out_dir``, and pointing the browser to + the printed address. """ + if tensorboard_log: + self._init_tensorboard_writer(out_dir) + self.set_norm_stats(batch_handler.means, batch_handler.stds) self.set_model_params( input_resolution=input_resolution, diff --git a/tests/training/test_train_gan_lr_era.py b/tests/training/test_train_gan_lr_era.py index 52c1974ff..1957254c5 100644 --- a/tests/training/test_train_gan_lr_era.py +++ b/tests/training/test_train_gan_lr_era.py @@ -163,7 +163,7 @@ def test_train_st(n_epoch=3, log=False): s_enhance=3, t_enhance=4, n_batches=5, - worker_kwargs=dict(max_workers=1), + worker_kwargs={'max_workers': 1}, ) assert batch_handler.norm_workers == 1 From 1b4778e8ab24533afc68b87f9253e44b4b5f6766 Mon Sep 17 00:00:00 2001 From: bnb32 Date: Tue, 9 Apr 2024 09:58:05 -0600 Subject: [PATCH 2/6] linting --- sup3r/models/abstract.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/sup3r/models/abstract.py b/sup3r/models/abstract.py index a69b23c6b..21888ce9f 100644 --- a/sup3r/models/abstract.py +++ b/sup3r/models/abstract.py @@ -1009,6 +1009,27 @@ def save(self, out_dir): if it does not already exist. """ + def _log_to_tensorboard(self, epoch, extras=None): + """Write data to tensorboard log file. Includes history values, some + timing info, and provided extras. + + Parameters + ---------- + epoch : int + Current epoch to write info for + extras : dict | None + Extra kwargs/parameters to save in the epoch history. 
+ """ + if self._tb_writer is not None: + with self._tb_writer.as_default(): + for col in self._history.columns: + tf.summary.scalar(col, self._history.at[epoch, col], epoch) + for name, value in self._timing_details.items(): + tf.summary.scalar(name, value, epoch) + if extras is not None: + for name, value in extras.items(): + tf.summary.scalar(name, value, epoch) + def finish_epoch(self, epoch, epochs, @@ -1062,7 +1083,6 @@ def finish_epoch(self, stop : bool Flag to early stop training. """ - self.log_loss_details(loss_details) self._history.at[epoch, 'elapsed_time'] = time.time() - t0 for key, value in loss_details.items(): @@ -1090,14 +1110,7 @@ def finish_epoch(self, for k, v in extras.items(): self._history.at[epoch, k] = v - if self._tb_writer is not None: - with self._tb_writer.as_default(): - for col in self._history.columns: - tf.summary.scalar(col, self._history.at[epoch, col], epoch) - for name, value in extras.items(): - tf.summary.scalar(name, value, epoch) - for name, value in self._timing_details.items(): - tf.summary.scalar(name, value, epoch) + self._log_to_tensorboard(epoch, extras=extras) return stop @@ -1433,7 +1446,8 @@ def get_single_grad(self, self._timing_details['dt:tape.watch'] = time.time() - t0 t0 = time.time() hi_res_exo = self.get_high_res_exo_input(hi_res_true) - self._timing_details['dt:get_high_res_exo_input'] = time.time() - t0 + self._timing_details[ + 'dt:get_high_res_exo_input'] = time.time() - t0 t0 = time.time() hi_res_gen = self._tf_generate(low_res, hi_res_exo) self._timing_details['dt:tf.generate'] = time.time() - t0 From 59c227ba8da8f0dc165ff9d806d062012a176ddb Mon Sep 17 00:00:00 2001 From: bnb32 Date: Tue, 9 Apr 2024 10:52:18 -0600 Subject: [PATCH 3/6] test fix with tb logging dtype --- sup3r/models/abstract.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sup3r/models/abstract.py b/sup3r/models/abstract.py index 21888ce9f..3eb301789 100644 --- a/sup3r/models/abstract.py +++ b/sup3r/models/abstract.py @@ -1023,7 +1023,11 @@ def _log_to_tensorboard(self, epoch, extras=None): if self._tb_writer is not None: with self._tb_writer.as_default(): for col in self._history.columns: - tf.summary.scalar(col, self._history.at[epoch, col], epoch) + val = self._history.at[epoch, col] + if isinstance(val, str): + tf.summary.text(col, val, epoch) + else: + tf.summary.scalar(col, val, epoch) for name, value in self._timing_details.items(): tf.summary.scalar(name, value, epoch) if extras is not None: From acf05a48f9100e071e3510d00a963d692ffe59c2 Mon Sep 17 00:00:00 2001 From: bnb32 Date: Thu, 11 Apr 2024 09:52:56 -0600 Subject: [PATCH 4/6] tensorboard profiling --- sup3r/models/abstract.py | 128 ++++++++++++++++++++++----------------- sup3r/models/base.py | 70 ++++++++++++--------- 2 files changed, 115 insertions(+), 83 deletions(-) diff --git a/sup3r/models/abstract.py b/sup3r/models/abstract.py index 3eb301789..caac8a510 100644 --- a/sup3r/models/abstract.py +++ b/sup3r/models/abstract.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -""" -Abstract class to define the required interface for Sup3r model subclasses -""" +"""Abstract class defining the required interface for Sup3r model subclasses""" import json import logging import os @@ -371,9 +369,8 @@ def hr_exo_features(self): # pylint: disable=E1101 features = [] if hasattr(self, '_gen'): - for layer in self._gen.layers: - if isinstance(layer, (Sup3rAdder, Sup3rConcat)): - features.append(layer.name) + features = [layer.name for layer in self._gen.layers + if isinstance(layer, 
(Sup3rAdder, Sup3rConcat))]
         return features
 
     @property
@@ -485,8 +482,27 @@ def __init__(self):
         self._means = None
         self._stdevs = None
         self._tb_writer = None
+        self._tb_log_dir = None
+        self._write_tb_profile = False
+        self._total_batches = None
         self._timing_details = {}
 
+    @property
+    def total_batches(self):
+        """Record of total number of batches for logging."""
+        if self._total_batches is None and self._history is None:
+            self._total_batches = 0
+        elif self._total_batches is None and 'total_batches' in self._history:
+            self._total_batches = self._history['total_batches'].values[-1]
+        elif self._total_batches is None and self._history is not None:
+            self._total_batches = 0
+        return self._total_batches
+
+    @total_batches.setter
+    def total_batches(self, value):
+        """Set total number of batches."""
+        self._total_batches = value
+
     def load_network(self, model, name):
         """Load a CustomNetwork object from hidden layers config, .json file
         config, or .pkl file saved pre-trained model.
@@ -745,13 +761,13 @@ def init_optimizer(optimizer, learning_rate):
         """
         if isinstance(optimizer, dict):
             class_name = optimizer['name']
-            OptimizerClass = getattr(optimizers, class_name)
-            sig = signature(OptimizerClass)
+            optimizer_class = getattr(optimizers, class_name)
+            sig = signature(optimizer_class)
             optimizer_kwargs = {
                 k: v
                 for k, v in optimizer.items() if k in sig.parameters
             }
-            optimizer = OptimizerClass.from_config(optimizer_kwargs)
+            optimizer = optimizer_class.from_config(optimizer_kwargs)
         elif optimizer is None:
             optimizer = optimizers.Adam(learning_rate=learning_rate)
 
@@ -1009,30 +1025,35 @@ def save(self, out_dir):
             if it does not already exist.
         """
 
-    def _log_to_tensorboard(self, epoch, extras=None):
-        """Write data to tensorboard log file. Includes history values, some
-        timing info, and provided extras.
+    def dict_to_tensorboard(self, entry):
+        """Write data to tensorboard log file. This is usually a loss_details
+        dictionary.
 
         Parameters
         ----------
-        epoch : int
-            Current epoch to write info for
-        extras : dict | None
-            Extra kwargs/parameters to save in the epoch history.
+        entry : dict
+            Dictionary of values to write to tensorboard log file
         """
         if self._tb_writer is not None:
             with self._tb_writer.as_default():
-                for col in self._history.columns:
-                    val = self._history.at[epoch, col]
-                    if isinstance(val, str):
-                        tf.summary.text(col, val, epoch)
+                for name, value in entry.items():
+                    if isinstance(value, str):
+                        tf.summary.text(name, value, self.total_batches)
                     else:
-                        tf.summary.scalar(col, val, epoch)
-                for name, value in self._timing_details.items():
-                    tf.summary.scalar(name, value, epoch)
-                if extras is not None:
-                    for name, value in extras.items():
-                        tf.summary.scalar(name, value, epoch)
+                        tf.summary.scalar(name, value, self.total_batches)
+
+    def profile_to_tensorboard(self, name):
+        """Write profile data to tensorboard log file.
+ + Parameters + ---------- + name : str + Tag name to use for profile info + """ + if self._tb_writer is not None and self._write_tb_profile: + with self._tb_writer.as_default(): + tf.summary.trace_export(name=name, step=self.total_batches, + profiler_outdir=self._tb_log_dir) def finish_epoch(self, epoch, @@ -1114,8 +1135,6 @@ def finish_epoch(self, for k, v in extras.items(): self._history.at[epoch, k] = v - self._log_to_tensorboard(epoch, extras=extras) - return stop def run_gradient_descent(self, @@ -1166,19 +1185,14 @@ def run_gradient_descent(self, optimizer = self.optimizer if not multi_gpu or len(self.gpu_list) == 1: - start = time.time() + grad, loss_details = self.get_single_grad(low_res, hi_res_true, training_weights, **calc_loss_kwargs) - self._timing_details['dt:get_single_grad'] = time.time() - start - start = time.time() optimizer.apply_gradients(zip(grad, training_weights)) - self._timing_details['dt:apply_gradients'] = time.time() - start t1 = time.time() logger.debug(f'Finished single gradient descent step ' f'in {(t1 - t0):.3f}s') - self._timing_details['dt:run_gradient_descent'] = t1 - t0 - else: futures = [] lr_chunks = np.array_split(low_res, len(self.gpu_list)) @@ -1208,8 +1222,7 @@ def run_gradient_descent(self, t1 = time.time() logger.debug(f'Finished {len(futures)} gradient descent steps on ' f'{len(self.gpu_list)} GPUs in {(t1 - t0):.3f}s') - self._timing_details['dt:run_gradient_descent'] = t1 - t0 - + self._timing_details['dt:run_gradient_descent'] = t1 - t0 return loss_details def _reshape_norm_exo(self, hi_res, hi_res_exo, exo_name, norm_in=True): @@ -1278,10 +1291,10 @@ def _init_tensorboard_writer(self, out_dir): out_dir : str Standard out_dir where model epochs are saved. e.g. './gan_{epoch}' """ - tb_log_dir = os.path.join( + self._tb_log_dir = os.path.join( os.path.abspath(os.path.join(out_dir, os.pardir)), 'logs') - os.makedirs(tb_log_dir, exist_ok=True) - self._tb_writer = tf.summary.create_file_writer(tb_log_dir) + os.makedirs(self._tb_log_dir, exist_ok=True) + self._tb_writer = tf.summary.create_file_writer(self._tb_log_dir) def generate(self, low_res, @@ -1328,8 +1341,10 @@ def generate(self, low_res = self.norm_input(low_res) hi_res = self.generator.layers[0](low_res) - for i, layer in enumerate(self.generator.layers[1:]): - try: + layer_num = 1 + try: + for i, layer in enumerate(self.generator.layers[1:]): + layer_num = i + 1 if isinstance(layer, (Sup3rAdder, Sup3rConcat)): msg = (f'layer.name = {layer.name} does not match any ' 'features in exogenous_data ' @@ -1344,11 +1359,11 @@ def generate(self, hi_res = layer(hi_res, hi_res_exo) else: hi_res = layer(hi_res) - except Exception as e: - msg = ('Could not run layer #{} "{}" on tensor of shape {}'. - format(i + 1, layer, hi_res.shape)) - logger.error(msg) - raise RuntimeError(msg) from e + except Exception as e: + msg = ('Could not run layer #{} "{}" on tensor of shape {}'. 
+ format(layer_num, layer, hi_res.shape)) + logger.error(msg) + raise RuntimeError(msg) from e hi_res = hi_res.numpy() @@ -1386,8 +1401,10 @@ def _tf_generate(self, low_res, hi_res_exo=None): Synthetically generated high-resolution data """ hi_res = self.generator.layers[0](low_res) - for i, layer in enumerate(self.generator.layers[1:]): - try: + layer_num = 1 + try: + for i, layer in enumerate(self.generator.layers[1:]): + layer_num = i + 1 if isinstance(layer, (Sup3rAdder, Sup3rConcat)): msg = (f'layer.name = {layer.name} does not match any ' f'features in exogenous_data ({list(hi_res_exo)})') @@ -1396,11 +1413,11 @@ def _tf_generate(self, low_res, hi_res_exo=None): hi_res = layer(hi_res, hr_exo) else: hi_res = layer(hi_res) - except Exception as e: - msg = ('Could not run layer #{} "{}" on tensor of shape {}'. - format(i + 1, layer, hi_res.shape)) - logger.error(msg) - raise RuntimeError(msg) from e + except Exception as e: + msg = ('Could not run layer #{} "{}" on tensor of shape {}'. + format(layer_num, layer, hi_res.shape)) + logger.error(msg) + raise RuntimeError(msg) from e return hi_res @@ -1454,14 +1471,13 @@ def get_single_grad(self, 'dt:get_high_res_exo_input'] = time.time() - t0 t0 = time.time() hi_res_gen = self._tf_generate(low_res, hi_res_exo) - self._timing_details['dt:tf.generate'] = time.time() - t0 + self._timing_details['dt:_tf_generate'] = time.time() - t0 t0 = time.time() loss_out = self.calc_loss(hi_res_true, hi_res_gen, **calc_loss_kwargs) self._timing_details['dt:calc_loss'] = time.time() - t0 - loss, loss_details = loss_out t0 = time.time() + loss, loss_details = loss_out grad = tape.gradient(loss, training_weights) self._timing_details['dt:tape.gradient'] = time.time() - t0 - return grad, loss_details diff --git a/sup3r/models/base.py b/sup3r/models/base.py index 8348a7c8e..bc2c3862b 100644 --- a/sup3r/models/base.py +++ b/sup3r/models/base.py @@ -233,14 +233,16 @@ def discriminate(self, hi_res, norm_in=False): hi_res = (hi_res - mean_arr) / std_arr out = self.discriminator.layers[0](hi_res) - for i, layer in enumerate(self.discriminator.layers[1:]): - try: + layer_num = 1 + try: + for i, layer in enumerate(self.discriminator.layers[1:]): out = layer(out) - except Exception as e: - msg = ('Could not run layer #{} "{}" on tensor of shape {}'. - format(i + 1, layer, out.shape)) - logger.error(msg) - raise RuntimeError(msg) from e + layer_num = i + 1 + except Exception as e: + msg = ('Could not run layer #{} "{}" on tensor of shape {}'. + format(layer_num, layer, out.shape)) + logger.error(msg) + raise RuntimeError(msg) from e out = out.numpy() @@ -263,16 +265,17 @@ def _tf_discriminate(self, hi_res): out : np.ndarray Discriminator output logits """ - out = self.discriminator.layers[0](hi_res) - for i, layer in enumerate(self.discriminator.layers[1:]): - try: + layer_num = 1 + try: + for i, layer in enumerate(self.discriminator.layers[1:]): + layer_num = i + 1 out = layer(out) - except Exception as e: - msg = ('Could not run layer #{} "{}" on tensor of shape {}'. - format(i + 1, layer, out.shape)) - logger.error(msg) - raise RuntimeError(msg) from e + except Exception as e: + msg = ('Could not run layer #{} "{}" on tensor of shape {}'. 
+ format(layer_num, layer, out.shape)) + logger.error(msg) + raise RuntimeError(msg) from e return out @@ -302,14 +305,14 @@ def update_optimizer(self, option='generator', **kwargs): if 'gen' in option.lower() or 'all' in option.lower(): conf = self.get_optimizer_config(self.optimizer) conf.update(**kwargs) - OptimizerClass = getattr(optimizers, conf['name']) - self._optimizer = OptimizerClass.from_config(conf) + optimizer_class = getattr(optimizers, conf['name']) + self._optimizer = optimizer_class.from_config(conf) if 'disc' in option.lower() or 'all' in option.lower(): conf = self.get_optimizer_config(self.optimizer_disc) conf.update(**kwargs) - OptimizerClass = getattr(optimizers, conf['name']) - self._optimizer_disc = OptimizerClass.from_config(conf) + optimizer_class = getattr(optimizers, conf['name']) + self._optimizer_disc = optimizer_class.from_config(conf) @property def meta(self): @@ -669,6 +672,8 @@ def train_epoch(self, only_gen = train_gen and not train_disc only_disc = train_disc and not train_gen + if self._write_tb_profile: + tf.summary.trace_on(graph=True, profiler=True) for ib, batch in enumerate(batch_handler): trained_gen = False trained_disc = False @@ -707,26 +712,29 @@ def train_epoch(self, b_loss_details['gen_trained_frac'] = float(trained_gen) b_loss_details['disc_trained_frac'] = float(trained_disc) + self.dict_to_tensorboard(b_loss_details) + self.dict_to_tensorboard(self._timing_details) loss_details = self.update_loss_details(loss_details, b_loss_details, len(batch), prefix='train_') - logger.debug('Batch {} out of {} has epoch-average ' '(gen / disc) loss of: ({:.2e} / {:.2e}). ' 'Trained (gen / disc): ({} / {})'.format( - ib, len(batch_handler), + ib + 1, len(batch_handler), loss_details['train_loss_gen'], loss_details['train_loss_disc'], trained_gen, trained_disc)) - if all([not trained_gen, not trained_disc]): msg = ('For some reason none of the GAN networks trained ' 'during batch {} out of {}!'.format( ib, len(batch_handler))) logger.warning(msg) warn(msg) + self.total_batches += 1 + loss_details['total_batches'] = int(self.total_batches) + self.profile_to_tensorboard('training_epoch') return loss_details def update_adversarial_weights(self, history, adaptive_update_fraction, @@ -795,7 +803,8 @@ def train(self, adaptive_update_bounds=(0.9, 0.99), adaptive_update_fraction=0.0, multi_gpu=False, - tensorboard_log=True): + tensorboard_log=True, + tensorboard_profile=False): """Train the GAN model on real low res data and real high res data Parameters @@ -862,9 +871,14 @@ def train(self, be viewed with ``tensorboard --logdir `` where ```` is the parent directory of ``out_dir``, and pointing the browser to the printed address. + tensorboard_profile : bool + Whether to export profiling information to tensorboard. 
This can
+            then be viewed in the tensorboard dashboard under the profile tab.
         """
         if tensorboard_log:
             self._init_tensorboard_writer(out_dir)
+        if tensorboard_profile:
+            self._write_tb_profile = True
 
         self.set_norm_stats(batch_handler.means, batch_handler.stds)
         self.set_model_params(
@@ -898,7 +912,8 @@ def train(self,
                                                  disc_loss_bounds,
                                                  multi_gpu=multi_gpu)
             train_n_obs = loss_details['n_obs']
-            loss_details = self.calc_val_loss(batch_handler, weight_gen_advers,
+            loss_details = self.calc_val_loss(batch_handler,
+                                              weight_gen_advers,
                                               loss_details)
             val_n_obs = loss_details['n_obs']
 
@@ -915,7 +930,8 @@ def train(self,
 
             logger.info(msg)
 
-            lr_g = self.get_optimizer_config(self.optimizer)['learning_rate']
+            lr_g = self.get_optimizer_config(
+                self.optimizer)['learning_rate']
             lr_d = self.get_optimizer_config(
                 self.optimizer_disc)['learning_rate']
 
             extras = {
@@ -930,8 +946,8 @@ def train(self,
             }
 
             weight_gen_advers = self.update_adversarial_weights(
-                loss_details, adaptive_update_fraction, adaptive_update_bounds,
-                weight_gen_advers, train_disc)
+                loss_details, adaptive_update_fraction,
+                adaptive_update_bounds, weight_gen_advers, train_disc)
 
             stop = self.finish_epoch(epoch,
                                      epochs,
 
From d7b1e7c43aa2db3907fa3c6153daf0c3d5b9a2d7 Mon Sep 17 00:00:00 2001
From: bnb32
Date: Fri, 19 Apr 2024 10:58:23 -0600
Subject: [PATCH 5/6] split tensorboard methods into mixin class. added timer
 class to clean up timing/logging

---
 sup3r/models/abstract.py     | 164 +++++++++++++++++------------------
 sup3r/models/base.py         |   2 +-
 sup3r/utilities/utilities.py |  38 ++++++--
 3 files changed, 113 insertions(+), 91 deletions(-)

diff --git a/sup3r/models/abstract.py b/sup3r/models/abstract.py
index caac8a510..2fdade0b8 100644
--- a/sup3r/models/abstract.py
+++ b/sup3r/models/abstract.py
@@ -21,10 +21,82 @@
 import sup3r.utilities.loss_metrics
 from sup3r.preprocessing.data_handling.exogenous_data_handling import ExoData
 from sup3r.utilities import VERSION_RECORD
+from sup3r.utilities.utilities import Timer
 
 logger = logging.getLogger(__name__)
 
 
+class TensorboardMixIn:
+    """MixIn class for tensorboard logging and profiling."""
+
+    def __init__(self):
+        self._tb_writer = None
+        self._tb_log_dir = None
+        self._write_tb_profile = False
+        self._total_batches = None
+        self.timer = Timer()
+
+    @property
+    def total_batches(self):
+        """Record of total number of batches for logging."""
+        if self._total_batches is None and self._history is None:
+            self._total_batches = 0
+        elif self._total_batches is None and 'total_batches' in self._history:
+            self._total_batches = self._history['total_batches'].values[-1]
+        elif self._total_batches is None and self._history is not None:
+            self._total_batches = 0
+        return self._total_batches
+
+    @total_batches.setter
+    def total_batches(self, value):
+        """Set total number of batches."""
+        self._total_batches = value
+
+    def dict_to_tensorboard(self, entry):
+        """Write data to tensorboard log file. This is usually a loss_details
+        dictionary.
+
+        Parameters
+        ----------
+        entry : dict
+            Dictionary of values to write to tensorboard log file
+        """
+        if self._tb_writer is not None:
+            with self._tb_writer.as_default():
+                for name, value in entry.items():
+                    if isinstance(value, str):
+                        tf.summary.text(name, value, self.total_batches)
+                    else:
+                        tf.summary.scalar(name, value, self.total_batches)
+
+    def profile_to_tensorboard(self, name):
+        """Write profile data to tensorboard log file.
+ + Parameters + ---------- + name : str + Tag name to use for profile info + """ + if self._tb_writer is not None and self._write_tb_profile: + with self._tb_writer.as_default(): + tf.summary.trace_export(name=name, step=self.total_batches, + profiler_outdir=self._tb_log_dir) + + def _init_tensorboard_writer(self, out_dir): + """Initialize the ``tf.summary.SummaryWriter`` to use for writing + tensorboard compatible log files. + + Parameters + ---------- + out_dir : str + Standard out_dir where model epochs are saved. e.g. './gan_{epoch}' + """ + tb_log_pardir = os.path.abspath(os.path.join(out_dir, os.pardir)) + self._tb_log_dir = os.path.join(tb_log_pardir, 'logs') + os.makedirs(self._tb_log_dir, exist_ok=True) + self._tb_writer = tf.summary.create_file_writer(self._tb_log_dir) + + class AbstractInterface(ABC): """ Abstract class to define the required interface for Sup3r model subclasses @@ -462,13 +534,14 @@ def save_params(self, out_dir): # pylint: disable=E1101,W0201,E0203 -class AbstractSingleModel(ABC): +class AbstractSingleModel(ABC, TensorboardMixIn): """ Abstract class to define the required training interface for Sup3r model subclasses """ def __init__(self): + super().__init__() self.gpu_list = tf.config.list_physical_devices('GPU') self.default_device = '/cpu:0' self._version_record = VERSION_RECORD @@ -481,27 +554,6 @@ def __init__(self): self._gen = None self._means = None self._stdevs = None - self._tb_writer = None - self._tb_log_dir = None - self._write_tb_profile = False - self._total_batches = None - self._timing_details = {} - - @property - def total_batches(self): - """Record of total number of batches for logging.""" - if self._total_batches is None and self._history is None: - self._total_batches = 0 - elif self._history is None and 'total_batches' in self._history: - self._total_batches = self._history['total_batches'].values[-1] - elif self._total_batches is None and self._history is not None: - self._total_batches = 0 - return self._total_batches - - @total_batches.setter - def total_batches(self, value): - """Set total number of batches.""" - self._total_batches = value def load_network(self, model, name): """Load a CustomNetwork object from hidden layers config, .json file @@ -1025,36 +1077,6 @@ def save(self, out_dir): if it does not already exist. """ - def dict_to_tensorboard(self, entry): - """Write data to tensorboard log file. This is usually a loss_details - dictionary. - - Parameters - ---------- - entry: dict - Dictionary of values to write to tensorboard log file - """ - if self._tb_writer is not None: - with self._tb_writer.as_default(): - for name, value in entry.items(): - if isinstance(value, str): - tf.summary.text(name, value, self.total_batches) - else: - tf.summary.scalar(name, value, self.total_batches) - - def profile_to_tensorboard(self, name): - """Write profile data to tensorboard log file. 
- - Parameters - ---------- - name : str - Tag name to use for profile info - """ - if self._tb_writer is not None and self._write_tb_profile: - with self._tb_writer.as_default(): - tf.summary.trace_export(name=name, step=self.total_batches, - profiler_outdir=self._tb_log_dir) - def finish_epoch(self, epoch, epochs, @@ -1222,7 +1244,6 @@ def run_gradient_descent(self, t1 = time.time() logger.debug(f'Finished {len(futures)} gradient descent steps on ' f'{len(self.gpu_list)} GPUs in {(t1 - t0):.3f}s') - self._timing_details['dt:run_gradient_descent'] = t1 - t0 return loss_details def _reshape_norm_exo(self, hi_res, hi_res_exo, exo_name, norm_in=True): @@ -1282,20 +1303,6 @@ def _reshape_norm_exo(self, hi_res, hi_res_exo, exo_name, norm_in=True): return hi_res_exo - def _init_tensorboard_writer(self, out_dir): - """Initialize the ``tf.summary.SummaryWriter`` to use for writing - tensorboard compatible log files. - - Parameters - ---------- - out_dir : str - Standard out_dir where model epochs are saved. e.g. './gan_{epoch}' - """ - self._tb_log_dir = os.path.join( - os.path.abspath(os.path.join(out_dir, os.pardir)), 'logs') - os.makedirs(self._tb_log_dir, exist_ok=True) - self._tb_writer = tf.summary.create_file_writer(self._tb_log_dir) - def generate(self, low_res, norm_in=True, @@ -1462,22 +1469,11 @@ def get_single_grad(self, """ with tf.device(device_name), tf.GradientTape( watch_accessed_variables=False) as tape: - t0 = time.time() - tape.watch(training_weights) - self._timing_details['dt:tape.watch'] = time.time() - t0 - t0 = time.time() - hi_res_exo = self.get_high_res_exo_input(hi_res_true) - self._timing_details[ - 'dt:get_high_res_exo_input'] = time.time() - t0 - t0 = time.time() - hi_res_gen = self._tf_generate(low_res, hi_res_exo) - self._timing_details['dt:_tf_generate'] = time.time() - t0 - t0 = time.time() - loss_out = self.calc_loss(hi_res_true, hi_res_gen, - **calc_loss_kwargs) - self._timing_details['dt:calc_loss'] = time.time() - t0 - t0 = time.time() + self.timer(tape.watch(training_weights)) + hi_res_exo = self.timer(self.get_high_res_exo_input(hi_res_true)) + hi_res_gen = self.timer(self._tf_generate(low_res, hi_res_exo)) + loss_out = self.timer(self.calc_loss(hi_res_true, hi_res_gen, + **calc_loss_kwargs)) loss, loss_details = loss_out - grad = tape.gradient(loss, training_weights) - self._timing_details['dt:tape.gradient'] = time.time() - t0 + grad = self.timer(tape.gradient(loss, training_weights)) return grad, loss_details diff --git a/sup3r/models/base.py b/sup3r/models/base.py index bc2c3862b..2988e3314 100644 --- a/sup3r/models/base.py +++ b/sup3r/models/base.py @@ -713,7 +713,7 @@ def train_epoch(self, b_loss_details['gen_trained_frac'] = float(trained_gen) b_loss_details['disc_trained_frac'] = float(trained_disc) self.dict_to_tensorboard(b_loss_details) - self.dict_to_tensorboard(self._timing_details) + self.dict_to_tensorboard(self.timer.log) loss_details = self.update_loss_details(loss_details, b_loss_details, len(batch), diff --git a/sup3r/utilities/utilities.py b/sup3r/utilities/utilities.py index 1e4c7b4a3..37ddbd04d 100644 --- a/sup3r/utilities/utilities.py +++ b/sup3r/utilities/utilities.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- -"""Utilities module for preparing training data - -@author: bbenton -""" +"""Miscellaneous utilities for computing features, preparing training data, +timing functions, etc """ import glob import logging @@ -10,6 +8,7 @@ import random import re import string +import time from fnmatch import fnmatch from warnings import warn @@ 
-20,14 +19,41 @@ from packaging import version from scipy import ndimage as nd from scipy.interpolate import RegularGridInterpolator, interp1d -from scipy.ndimage import zoom -from scipy.ndimage import gaussian_filter +from scipy.ndimage import gaussian_filter, zoom np.random.seed(42) logger = logging.getLogger(__name__) +class Timer: + """Timer class for timing and storing function call times.""" + + def __init__(self): + self.log = {} + + def __call__(self, fun, *args, **kwargs): + """Time function call and store elapsed time in self.log. + + Parameters + ---------- + func : function + *args : list + positional arguments for func + **kwargs : dict + keyword arguments for func + + Returns + ------- + output of func + """ + t0 = time.time() + out = fun(*args, **kwargs) + t_elap = time.time() - t0 + self.log[f'elapsed:{fun.__name__}'] = t_elap + return out + + def generate_random_string(length): """Generate random string with given length. Used for naming temporary files to avoid collisions.""" From e2d62aee00b7a16f4c25643d2f9096d9a52e5436 Mon Sep 17 00:00:00 2001 From: bnb32 Date: Fri, 19 Apr 2024 11:24:23 -0600 Subject: [PATCH 6/6] linting --- sup3r/models/abstract.py | 13 +++++++------ sup3r/utilities/utilities.py | 9 +++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/sup3r/models/abstract.py b/sup3r/models/abstract.py index 2fdade0b8..ff6dbbdaf 100644 --- a/sup3r/models/abstract.py +++ b/sup3r/models/abstract.py @@ -34,6 +34,7 @@ def __init__(self): self._tb_log_dir = None self._write_tb_profile = False self._total_batches = None + self._history = None self.timer = Timer() @property @@ -1469,11 +1470,11 @@ def get_single_grad(self, """ with tf.device(device_name), tf.GradientTape( watch_accessed_variables=False) as tape: - self.timer(tape.watch(training_weights)) - hi_res_exo = self.timer(self.get_high_res_exo_input(hi_res_true)) - hi_res_gen = self.timer(self._tf_generate(low_res, hi_res_exo)) - loss_out = self.timer(self.calc_loss(hi_res_true, hi_res_gen, - **calc_loss_kwargs)) + self.timer(tape.watch, training_weights) + hi_res_exo = self.timer(self.get_high_res_exo_input, hi_res_true) + hi_res_gen = self.timer(self._tf_generate, low_res, hi_res_exo) + loss_out = self.timer(self.calc_loss, hi_res_true, hi_res_gen, + **calc_loss_kwargs) loss, loss_details = loss_out - grad = self.timer(tape.gradient(loss, training_weights)) + grad = self.timer(tape.gradient, loss, training_weights) return grad, loss_details diff --git a/sup3r/utilities/utilities.py b/sup3r/utilities/utilities.py index 37ddbd04d..b0ac20a62 100644 --- a/sup3r/utilities/utilities.py +++ b/sup3r/utilities/utilities.py @@ -37,15 +37,16 @@ def __call__(self, fun, *args, **kwargs): Parameters ---------- - func : function + fun : function + Function to time *args : list - positional arguments for func + positional arguments for fun **kwargs : dict - keyword arguments for func + keyword arguments for fun Returns ------- - output of func + output of fun """ t0 = time.time() out = fun(*args, **kwargs)
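---

Usage sketch (illustrative only, not part of the patches above): the ``Timer``
class added in patch 5 times any callable passed to it and stores the elapsed
seconds in its ``log`` dict under an ``elapsed:<function name>`` key, and
``train_epoch`` forwards that dict to tensorboard via ``dict_to_tensorboard``.
The ``slow_add`` helper and the ``./run`` output directory below are
hypothetical placeholders.

    import time

    from sup3r.utilities.utilities import Timer

    def slow_add(a, b):
        """Hypothetical function to time."""
        time.sleep(0.1)
        return a + b

    timer = Timer()
    result = timer(slow_add, 1, 2)  # calls slow_add(1, 2), returns 3
    print(timer.log)                # {'elapsed:slow_add': 0.1...}

    # With tensorboard_log=True (the default) and out_dir='./run/gan_{epoch}',
    # Sup3rGan.train() writes per-batch summaries to './run/logs'; view them
    # with: tensorboard --logdir ./run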