tensorboard profiling
bnb32 committed Apr 11, 2024
1 parent 59c227b commit acf05a4
Showing 2 changed files with 115 additions and 83 deletions.
128 changes: 72 additions & 56 deletions sup3r/models/abstract.py
@@ -1,7 +1,5 @@
-# -*- coding: utf-8 -*-
-"""
-Abstract class to define the required interface for Sup3r model subclasses
-"""
"""Abstract class defining the required interface for Sup3r model subclasses"""
import json
import logging
import os
@@ -371,9 +369,8 @@ def hr_exo_features(self):
# pylint: disable=E1101
features = []
if hasattr(self, '_gen'):
-            for layer in self._gen.layers:
-                if isinstance(layer, (Sup3rAdder, Sup3rConcat)):
-                    features.append(layer.name)
            features = [layer.name for layer in self._gen.layers
                        if isinstance(layer, (Sup3rAdder, Sup3rConcat))]
return features

@property
@@ -485,8 +482,27 @@ def __init__(self):
self._means = None
self._stdevs = None
self._tb_writer = None
self._tb_log_dir = None
self._write_tb_profile = False
self._total_batches = None
self._timing_details = {}

@property
def total_batches(self):
"""Record of total number of batches for logging."""
        if self._total_batches is None:
            if (self._history is not None
                    and 'total_batches' in self._history):
                self._total_batches = self._history['total_batches'].values[-1]
            else:
                self._total_batches = 0
        return self._total_batches

@total_batches.setter
def total_batches(self, value):
"""Set total number of batches."""
self._total_batches = value
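
Note: the property above restores the global batch counter from a saved training history so that TensorBoard steps keep increasing across restarts. A minimal standalone sketch of the same restore-or-initialize logic (the history DataFrame here is hypothetical):

    import pandas as pd

    # Hypothetical saved training history with a 'total_batches' column.
    history = pd.DataFrame({'total_batches': [100, 200, 300]})

    total_batches = None
    if total_batches is None:
        if history is not None and 'total_batches' in history:
            # Resume the counter from the last recorded value.
            total_batches = history['total_batches'].values[-1]
        else:
            # No history yet: start counting from zero.
            total_batches = 0
    print(total_batches)  # -> 300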

def load_network(self, model, name):
"""Load a CustomNetwork object from hidden layers config, .json file
config, or .pkl file saved pre-trained model.
@@ -745,13 +761,13 @@ def init_optimizer(optimizer, learning_rate):
"""
if isinstance(optimizer, dict):
class_name = optimizer['name']
-            OptimizerClass = getattr(optimizers, class_name)
-            sig = signature(OptimizerClass)
            optimizer_class = getattr(optimizers, class_name)
            sig = signature(optimizer_class)
            optimizer_kwargs = {
                k: v
                for k, v in optimizer.items() if k in sig.parameters
            }
-            optimizer = OptimizerClass.from_config(optimizer_kwargs)
            optimizer = optimizer_class.from_config(optimizer_kwargs)
elif optimizer is None:
optimizer = optimizers.Adam(learning_rate=learning_rate)
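
Note: the dict branch filters the config down to keys the optimizer constructor actually accepts before rebuilding it. A minimal sketch of that signature-based filtering, assuming TensorFlow is installed (the extra key is hypothetical):

    from inspect import signature

    from tensorflow.keras import optimizers

    config = {'name': 'Adam', 'learning_rate': 1e-4, 'not_a_kwarg': 42}

    optimizer_class = getattr(optimizers, config['name'])
    sig = signature(optimizer_class)
    # Drop keys the constructor would reject, e.g. 'not_a_kwarg'.
    kwargs = {k: v for k, v in config.items() if k in sig.parameters}
    optimizer = optimizer_class.from_config(kwargs)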

@@ -1009,30 +1025,35 @@ def save(self, out_dir):
if it does not already exist.
"""

-    def _log_to_tensorboard(self, epoch, extras=None):
-        """Write data to tensorboard log file. Includes history values, some
-        timing info, and provided extras.
    def dict_to_tensorboard(self, entry):
        """Write data to tensorboard log file. This is usually a loss_details
        dictionary.

        Parameters
        ----------
-        epoch : int
-            Current epoch to write info for
-        extras : dict | None
-            Extra kwargs/parameters to save in the epoch history.
        entry : dict
            Dictionary of values to write to tensorboard log file
        """
        if self._tb_writer is not None:
            with self._tb_writer.as_default():
-                for col in self._history.columns:
-                    val = self._history.at[epoch, col]
-                    if isinstance(val, str):
-                        tf.summary.text(col, val, epoch)
                for name, value in entry.items():
                    if isinstance(value, str):
                        tf.summary.text(name, value, self.total_batches)
                    else:
-                        tf.summary.scalar(col, val, epoch)
-                for name, value in self._timing_details.items():
-                    tf.summary.scalar(name, value, epoch)
-                if extras is not None:
-                    for name, value in extras.items():
                        tf.summary.scalar(name, value, self.total_batches)
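
Note: dict_to_tensorboard keys every summary to self.total_batches rather than the epoch, so the TensorBoard x-axis is the global batch count. A minimal standalone sketch of the summary-writer pattern it relies on (log path and values are hypothetical):

    import tensorflow as tf

    writer = tf.summary.create_file_writer('./logs')
    entry = {'train_loss_gen': 0.42, 'config': 'some run metadata'}
    step = 300  # global batch counter

    with writer.as_default():
        for name, value in entry.items():
            if isinstance(value, str):
                tf.summary.text(name, value, step=step)
            else:
                tf.summary.scalar(name, value, step=step)
    writer.flush()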

def profile_to_tensorboard(self, name):
"""Write profile data to tensorboard log file.
Parameters
----------
name : str
Tag name to use for profile info
"""
if self._tb_writer is not None and self._write_tb_profile:
with self._tb_writer.as_default():
tf.summary.trace_export(name=name, step=self.total_batches,
profiler_outdir=self._tb_log_dir)
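
Note: trace_export only writes data if tracing was started earlier with tf.summary.trace_on, which is expected to happen elsewhere in the training loop (not shown in this diff). A minimal sketch of the full pairing (the step function and paths are hypothetical):

    import tensorflow as tf

    @tf.function
    def train_step(x):
        return x * x  # stand-in for a real training step

    writer = tf.summary.create_file_writer('./logs')

    # Start collecting graph and profiler information.
    tf.summary.trace_on(graph=True, profiler=True)
    train_step(tf.constant(2.0))
    with writer.as_default():
        # Export the trace; profiler data lands in profiler_outdir.
        tf.summary.trace_export(name='train_step', step=0,
                                profiler_outdir='./logs')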

def finish_epoch(self,
epoch,
@@ -1114,8 +1135,6 @@ def finish_epoch(self,
for k, v in extras.items():
self._history.at[epoch, k] = v

-        self._log_to_tensorboard(epoch, extras=extras)

return stop

def run_gradient_descent(self,
@@ -1166,19 +1185,14 @@ def run_gradient_descent(self,
optimizer = self.optimizer

if not multi_gpu or len(self.gpu_list) == 1:
-            start = time.time()

            grad, loss_details = self.get_single_grad(low_res, hi_res_true,
                                                      training_weights,
                                                      **calc_loss_kwargs)
-            self._timing_details['dt:get_single_grad'] = time.time() - start
-            start = time.time()
            optimizer.apply_gradients(zip(grad, training_weights))
-            self._timing_details['dt:apply_gradients'] = time.time() - start
            t1 = time.time()
            logger.debug(f'Finished single gradient descent step '
                         f'in {(t1 - t0):.3f}s')
-            self._timing_details['dt:run_gradient_descent'] = t1 - t0

else:
futures = []
lr_chunks = np.array_split(low_res, len(self.gpu_list))
@@ -1208,8 +1222,7 @@ def run_gradient_descent(self,
            t1 = time.time()
            logger.debug(f'Finished {len(futures)} gradient descent steps on '
                         f'{len(self.gpu_list)} GPUs in {(t1 - t0):.3f}s')
-            self._timing_details['dt:run_gradient_descent'] = t1 - t0

        self._timing_details['dt:run_gradient_descent'] = t1 - t0
return loss_details
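
Note: in the multi-GPU branch the batch is split along the observation axis with np.array_split and each chunk is submitted to an executor, one future per GPU. A simplified sketch of that dispatch (the gradient function here is a stand-in):

    from concurrent.futures import ThreadPoolExecutor

    import numpy as np

    def get_single_grad(chunk):
        return chunk.mean()  # stand-in for the real gradient computation

    low_res = np.random.rand(8, 10, 10, 3)
    gpu_list = [0, 1]  # hypothetical visible GPUs

    # One chunk of the batch per GPU.
    lr_chunks = np.array_split(low_res, len(gpu_list))
    with ThreadPoolExecutor(max_workers=len(gpu_list)) as exe:
        futures = [exe.submit(get_single_grad, chunk) for chunk in lr_chunks]
        results = [f.result() for f in futures]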

def _reshape_norm_exo(self, hi_res, hi_res_exo, exo_name, norm_in=True):
@@ -1278,10 +1291,10 @@ def _init_tensorboard_writer(self, out_dir):
out_dir : str
Standard out_dir where model epochs are saved. e.g. './gan_{epoch}'
"""
-        tb_log_dir = os.path.join(
        self._tb_log_dir = os.path.join(
            os.path.abspath(os.path.join(out_dir, os.pardir)), 'logs')
-        os.makedirs(tb_log_dir, exist_ok=True)
-        self._tb_writer = tf.summary.create_file_writer(tb_log_dir)
        os.makedirs(self._tb_log_dir, exist_ok=True)
        self._tb_writer = tf.summary.create_file_writer(self._tb_log_dir)
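
Note: the log directory is now stored on the instance so profile_to_tensorboard can hand it to trace_export. The path arithmetic places a 'logs' directory next to the model output directory; a quick check of that resolution (out_dir value hypothetical):

    import os

    out_dir = './gan_00'  # hypothetical model output directory
    tb_log_dir = os.path.join(
        os.path.abspath(os.path.join(out_dir, os.pardir)), 'logs')
    # For './gan_00' this resolves to '<cwd>/logs', a sibling of out_dir.
    print(tb_log_dir)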

def generate(self,
low_res,
@@ -1328,8 +1341,10 @@ def generate(self,
low_res = self.norm_input(low_res)

hi_res = self.generator.layers[0](low_res)
-        for i, layer in enumerate(self.generator.layers[1:]):
-            try:
        layer_num = 1
        try:
            for i, layer in enumerate(self.generator.layers[1:]):
                layer_num = i + 1
if isinstance(layer, (Sup3rAdder, Sup3rConcat)):
msg = (f'layer.name = {layer.name} does not match any '
'features in exogenous_data '
@@ -1344,11 +1359,11 @@
hi_res = layer(hi_res, hi_res_exo)
else:
hi_res = layer(hi_res)
-            except Exception as e:
-                msg = ('Could not run layer #{} "{}" on tensor of shape {}'.
-                       format(i + 1, layer, hi_res.shape))
-                logger.error(msg)
-                raise RuntimeError(msg) from e
        except Exception as e:
            msg = ('Could not run layer #{} "{}" on tensor of shape {}'.
                   format(layer_num, layer, hi_res.shape))
            logger.error(msg)
            raise RuntimeError(msg) from e

hi_res = hi_res.numpy()
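
Note: generate and _tf_generate both hoist the try block out of the layer loop and track layer_num by hand, so a single handler covers every layer while the error message still names the one that failed. A minimal sketch of the pattern (the "layers" are stand-ins):

    layers = [abs, len]  # the second "layer" fails on a float input

    x = -3.0
    layer_num = 0
    try:
        for i, layer in enumerate(layers):
            layer_num = i + 1
            x = layer(x)
    except Exception as e:
        # One handler outside the loop; it still knows which layer failed.
        print(f'Could not run layer #{layer_num} "{layer.__name__}": {e}')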

@@ -1386,8 +1401,10 @@ def _tf_generate(self, low_res, hi_res_exo=None):
Synthetically generated high-resolution data
"""
hi_res = self.generator.layers[0](low_res)
-        for i, layer in enumerate(self.generator.layers[1:]):
-            try:
        layer_num = 1
        try:
            for i, layer in enumerate(self.generator.layers[1:]):
                layer_num = i + 1
if isinstance(layer, (Sup3rAdder, Sup3rConcat)):
msg = (f'layer.name = {layer.name} does not match any '
f'features in exogenous_data ({list(hi_res_exo)})')
@@ -1396,11 +1413,11 @@ def _tf_generate(self, low_res, hi_res_exo=None):
hi_res = layer(hi_res, hr_exo)
else:
hi_res = layer(hi_res)
-            except Exception as e:
-                msg = ('Could not run layer #{} "{}" on tensor of shape {}'.
-                       format(i + 1, layer, hi_res.shape))
-                logger.error(msg)
-                raise RuntimeError(msg) from e
        except Exception as e:
            msg = ('Could not run layer #{} "{}" on tensor of shape {}'.
                   format(layer_num, layer, hi_res.shape))
            logger.error(msg)
            raise RuntimeError(msg) from e

return hi_res

@@ -1454,14 +1471,13 @@ def get_single_grad(self,
'dt:get_high_res_exo_input'] = time.time() - t0
t0 = time.time()
hi_res_gen = self._tf_generate(low_res, hi_res_exo)
-            self._timing_details['dt:tf.generate'] = time.time() - t0
            self._timing_details['dt:_tf_generate'] = time.time() - t0
t0 = time.time()
loss_out = self.calc_loss(hi_res_true, hi_res_gen,
**calc_loss_kwargs)
self._timing_details['dt:calc_loss'] = time.time() - t0
            loss, loss_details = loss_out
            t0 = time.time()
-            loss, loss_details = loss_out
grad = tape.gradient(loss, training_weights)
self._timing_details['dt:tape.gradient'] = time.time() - t0

return grad, loss_details
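
Note: get_single_grad times each stage of the forward and backward pass into _timing_details (the generate key is renamed here to match the method name). A stripped-down sketch of the tape-plus-timing pattern, with a scalar loss standing in for the generator and loss calculation:

    import time

    import tensorflow as tf

    timing = {}
    weights = [tf.Variable(2.0)]

    with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(weights)
        t0 = time.time()
        loss = weights[0] ** 2  # stand-in for generate + calc_loss
        timing['dt:calc_loss'] = time.time() - t0

    t0 = time.time()
    grad = tape.gradient(loss, weights)
    timing['dt:tape.gradient'] = time.time() - t0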