incremental changes
train_direct.py: use argparse instead of tf.flags

mypy.ini: stricter and more thorough checking
chrisyeh96 committed Sep 29, 2020
1 parent bd83af1 commit 6454170
Showing 4 changed files with 149 additions and 72 deletions.
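Before the per-file diffs, a quick sketch of how the new entry point feeds parsed arguments into run_training_wrapper — a hedged illustration of the pattern, not an excerpt (the real _parse_args defines many more flags):

import argparse

def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=64)
    return parser.parse_args([])  # empty argv keeps the demo self-contained

def run_training_wrapper(**params) -> None:
    print(params)  # {'batch_size': 64}

# vars() turns the Namespace into the same kind of params dict that the old
# dir(flags.FLAGS) scrape produced, without tf.flags' extra attributes
run_training_wrapper(**vars(_parse_args()))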
8 changes: 4 additions & 4 deletions batchers/tfrecord_paths_utils.py
@@ -60,11 +60,11 @@ def dhs_ooc(dataset: str, split: str) -> np.ndarray:
tfrecord_paths = []
for split in splits:
for country_year in survey_names[split]:
glob_path = os.path.join(DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
glob_path = os.path.join(
DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
tfrecord_paths.extend(glob(glob_path))
tfrecord_paths = np.sort(tfrecord_paths)
# assert len(tfrecord_paths) == SIZES[dataset][split] # TODO: uncomment this
return tfrecord_paths
assert len(tfrecord_paths) == SIZES[dataset][split]
return np.sort(tfrecord_paths)


def lsms_ooc(cys: Optional[Iterable[str]] = None) -> List[str]:
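For reference, a minimal self-contained sketch of the revised dhs_ooc logic — the constants and directory layout here are hypothetical stand-ins, not the repo's actual values:

import os
from glob import glob
import numpy as np

DHS_TFRECORDS_PATH_ROOT = '/data/dhs_tfrecords'   # hypothetical
SIZES = {'DHS_OOC_A': {'train': 2}}               # hypothetical counts
survey_names = {'train': ['angola_2011']}         # hypothetical surveys

def gather(dataset: str, split: str) -> np.ndarray:
    tfrecord_paths = []
    for country_year in survey_names[split]:
        glob_path = os.path.join(
            DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
        tfrecord_paths.extend(glob(glob_path))
    # the commit turns the TODO comment back into a live size check
    assert len(tfrecord_paths) == SIZES[dataset][split]
    return np.sort(tfrecord_paths)  # sort for a deterministic ordering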
12 changes: 9 additions & 3 deletions mypy.ini
@@ -1,3 +1,9 @@
[mypy]
ignore_missing_imports = True
namespace_packages = True
[mypy]
allow_redefinition = True
disallow_incomplete_defs = True
ignore_missing_imports = True
namespace_packages = True
warn_redundant_casts = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
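The added options are meaningfully stricter. As a rough illustration (hypothetical snippet, not from the repo), two of them flag code like this:

# disallow_incomplete_defs: a partially annotated signature is an error
def scale(x: float, factor):  # error: function is missing type annotations
    return x * factor

# warn_unreachable: statements mypy can prove never run are reported
def to_str(x: int) -> str:
    if isinstance(x, int):
        return str(x)
    return ''  # error: statement is unreachable

allow_redefinition, meanwhile, permits rebinding a name to a different type within a block — the list-then-np.sort rebinding that dhs_ooc used before this commit is exactly that pattern.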
151 changes: 105 additions & 46 deletions train_direct.py
@@ -1,4 +1,4 @@
'''
r'''
This script trains ResNet CNN models to estimate wealth for DHS and LSMS
locations. Model checkpoints and TensorBoard training logs are saved to
`out_dir`.
@@ -9,7 +9,7 @@
--model_name resnet --num_layers 18 \
--lr_decay 0.96 --batch_size 64 \
--gpu 0 --num_threads 5 \
--cache train,train_eval,val \
--cache train train_eval val \
--augment True --eval_every 1 --print_every 40 \
--ooc {ooc} --max_epochs {max_epochs} \
--out_dir {out_dir} \
@@ -25,6 +25,7 @@
`preprocessing/1_process_tfrecords.ipynb` and
`preprocessing/2_create_incountry_folds.ipynb`.
'''
import argparse
import json
import os
from pprint import pprint
@@ -234,7 +235,7 @@ def get_batcher(tfrecord_paths: tf.Tensor, shuffle: bool, augment: bool,

def run_training_wrapper(**params: Any) -> None:
'''
params is a dict with keys matching the FLAGS defined below
params is a dict with keys matching the arguments from _parse_args()
'''
start = time.time()
print('Current time:', start)
@@ -262,16 +263,18 @@ def run_training_wrapper(**params: Any) -> None:
params['experiment_name'], params['batch_size'],
params['fc_reg'], params['conv_reg'], params['lr'])
out_dir = os.path.join(params['out_dir'], full_experiment_name)
os.makedirs(out_dir, exist_ok=True)
print(f'Outputs directory: {out_dir}')

params_filepath = os.path.join(out_dir, 'params.json')
assert not os.path.exists(params_filepath), f'Stopping. Found previous run at: {params_filepath}'
if os.path.exists(params_filepath):
print(f'Stopping. Found previous run at: {params_filepath}')
return

print(f'Outputs directory: {out_dir}')
os.makedirs(out_dir, exist_ok=True)
with open(params_filepath, 'w') as config_file:
json.dump(params, config_file, indent=4)

# Create session
# - MUST set os.environ['CUDA_VISIBLE_DEVICES'] before creating tf.Session object
# - MUST set os.environ['CUDA_VISIBLE_DEVICES'] before creating tf.Session
if params['gpu'] is None: # restrict to CPU only
os.environ['CUDA_VISIBLE_DEVICES'] = ''
else:
@@ -321,58 +324,114 @@ def run_training_wrapper(**params: Any) -> None:
print('Time elapsed (sec.):', end - start)


def main(_: Any) -> None:
params = {
key: flags.FLAGS.__getattr__(key)
for key in dir(flags.FLAGS)
}
run_training_wrapper(**params)


if __name__ == '__main__':
flags = tf.app.flags
def _parse_args() -> argparse.Namespace:
"""Parses arguments."""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Run end-to-end training.')

# paths
flags.DEFINE_string('experiment_name', 'new_experiment', 'name of the experiment being run')
flags.DEFINE_string('out_dir', os.path.join(ROOT_DIR, 'outputs/'), 'path to output directory for saving checkpoints and TensorBoard logs')
parser.add_argument(
'--experiment_name', default='new_experiment',
help='name of experiment being run')
parser.add_argument(
'--out_dir', default=os.path.join(ROOT_DIR, 'outputs/'),
help='path to output directory for saving checkpoints and TensorBoard '
'logs')

# initialization
flags.DEFINE_string('init_ckpt_dir', None, 'path to checkpoint prefix from which to initialize weights (default None)')
flags.DEFINE_string('imagenet_weights_path', None, 'path to ImageNet weights for initialization (default None)')
flags.DEFINE_string('hs_weight_init', None, 'method for initializing weights of non-RGB bands in 1st conv layer, one of [None (default), "random", "same", "samescaled"]')
flags.DEFINE_boolean('exclude_final_layer', False, 'whether to use checkpoint to initialize final layer')
parser.add_argument(
'--init_ckpt_dir',
help='path to checkpoint prefix from which to initialize weights')
parser.add_argument(
'--imagenet_weights_path',
help='path to ImageNet weights for initialization')
parser.add_argument(
'--hs_weight_init', choices=[None, 'random', 'same', 'samescaled'],
help='method for initializing weights of non-RGB bands in 1st conv '
'layer')
parser.add_argument(
'--exclude_final_layer', action='store_true',
help='whether to use checkpoint to initialize final layer')

# learning parameters
flags.DEFINE_string('label_name', 'wealthpooled', 'name of label to use from the TFRecord files')
flags.DEFINE_integer('batch_size', 64, 'batch size')
flags.DEFINE_boolean('augment', True, 'whether to use data augmentation')
flags.DEFINE_float('fc_reg', 1e-3, 'Regularization penalty factor for fully connected layers')
flags.DEFINE_float('conv_reg', 1e-3, 'Regularization penalty factor for convolution layers')
flags.DEFINE_float('lr', 1e-3, 'Learning rate for optimizer')
flags.DEFINE_float('lr_decay', 1.0, 'Decay rate of the learning rate (default 1.0 for no decay)')
parser.add_argument(
'--label_name', default='wealthpooled',
help='name of label to use from the TFRecord files')
parser.add_argument(
'--batch_size', type=int, default=64,
help='batch size')
parser.add_argument(
'--augment', action='store_true',
help='whether to use data augmentation')
parser.add_argument(
'--fc_reg', type=float, default=1e-3,
help='Regularization penalty factor for fully connected layers')
parser.add_argument(
'--conv_reg', type=float, default=1e-3,
help='Regularization penalty factor for convolution layers')
parser.add_argument(
'--lr', type=float, default=1e-3,
help='Learning rate for optimizer')
parser.add_argument(
'--lr_decay', type=float, default=1.0,
help='Decay rate of the learning rate')

# high-level model control
flags.DEFINE_string('model_name', 'resnet', 'name of the model to be used, currently only "resnet" is supported')
parser.add_argument(
'--model_name', default='resnet', choices=['resnet'],
help='name of model architecture')

# resnet-only params
flags.DEFINE_integer('num_layers', 18, 'Number of ResNet layers, one of [18, 34, 50]')
parser.add_argument(
'--num_layers', type=int, default=18, choices=[18, 34, 50],
help='number of ResNet layers')

# data params
flags.DEFINE_string('dataset', 'DHS_OOC_A', 'dataset to use') # TODO
flags.DEFINE_boolean('ooc', True, 'whether to use out-of-country split')
flags.DEFINE_float('keep_frac', 1.0, 'fraction of training data to use (default 1.0 uses all data)')
flags.DEFINE_string('ls_bands', None, 'Landsat bands to use, one of [None (default), "rgb", "ms"]')
flags.DEFINE_string('nl_band', None, 'nightlights band, one of [None (default), "merge", "split"]')
parser.add_argument(
'--dataset', default='DHS_OOC_A', # TODO: choices?
help='dataset to use')
parser.add_argument(
'--ooc', action='store_true',
help='whether to use out-of-country split')
parser.add_argument(
'--keep_frac', type=float, default=1.0,
help='fraction of training data to use')
parser.add_argument(
'--ls_bands', choices=[None, 'rgb', 'ms'],
help='Landsat bands to use')
parser.add_argument(
'--nl_band', choices=[None, 'merge', 'split'],
help='nightlights band')

# system
flags.DEFINE_integer('gpu', None, 'which GPU to use (default None)')
flags.DEFINE_integer('num_threads', 1, 'number of threads for batcher')
flags.DEFINE_list('cache', [], 'comma-separated list (no spaces) of datasets to cache in memory, choose from [None, "train", "train_eval", "val"]')
parser.add_argument(
'--gpu', type=int,
help='which GPU to use')
parser.add_argument(
'--num_threads', type=int, default=1,
help='number of threads for batcher')
parser.add_argument(
'--cache', nargs='*', default=[], choices=['train', 'train_eval', 'val'],
help='list of datasets to cache in memory')

# Misc
flags.DEFINE_integer('max_epochs', 150, 'maximum number of epochs for training')
flags.DEFINE_integer('eval_every', 1, 'evaluate the model on the validation set after every so many epochs of training')
flags.DEFINE_integer('print_every', 40, 'print training statistics after every so many steps')
flags.DEFINE_integer('seed', 123, 'seed for random initialization and shuffling')
parser.add_argument(
'--max_epochs', type=int, default=150,
help='maximum number of epochs for training')
parser.add_argument(
'--eval_every', type=int, default=1,
help='evaluate the model on the validation set after every so many '
'epochs of training')
parser.add_argument(
'--print_every', type=int, default=40,
help='print training statistics after every so many steps')
parser.add_argument(
'--seed', type=int, default=123,
help='seed for random initialization and shuffling')
return parser.parse_args()


tf.app.run()
if __name__ == '__main__':
args = _parse_args()
run_training_wrapper(**vars(args))
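A behavioral note on the migration (hypothetical sanity check, runnable as-is): boolean flags defined with action='store_true' are bare switches, so invocations like `--augment True` from the old docstring become just `--augment`, and `--cache` now takes space-separated values, matching the updated usage string at the top of the file.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--augment', action='store_true')
parser.add_argument('--cache', nargs='*', default=[],
                    choices=['train', 'train_eval', 'val'])
parser.add_argument('--gpu', type=int)  # stays None when omitted

args = parser.parse_args(['--augment', '--cache', 'train', 'val'])
assert args.augment is True
assert args.cache == ['train', 'val']
assert args.gpu is None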
50 changes: 31 additions & 19 deletions utils/run.py
@@ -2,7 +2,7 @@
from glob import glob
import os
import time
from typing import Any, DefaultDict, Dict, Iterable, List, Mapping, Optional
from typing import Any, Callable, DefaultDict, Dict, Iterable, Mapping, Optional

import numpy as np
import tensorflow as tf
@@ -41,7 +41,8 @@ def get_full_experiment_name(experiment_name: str, batch_size: int,
fc_str = param_to_str(fc_reg)
conv_str = param_to_str(conv_reg)
lr_str = param_to_str(lr)
full_experiment_name = f'{experiment_name}_b{batch_size}_fc{fc_str}_conv{conv_str}_lr{lr_str}'
full_experiment_name = (
f'{experiment_name}_b{batch_size}_fc{fc_str}_conv{conv_str}_lr{lr_str}')

if tag is not None:
full_experiment_name += f'_{tag}'
@@ -50,7 +51,8 @@


def checkpoint_path_exists(ckpt_path: str) -> bool:
'''Checks whether a TensorFlow model checkpoint exists at the given path.'''
'''Checks whether a TensorFlow model checkpoint exists at the given path.
'''
if ckpt_path[-6:] == '.index':
ckpt_path = ckpt_path[:-6]
if ckpt_path[-5:] == '.meta':
@@ -70,22 +72,23 @@ def load(sess: tf.Session, saver: tf.train.Saver, checkpoint_dir: str) -> bool:
- saver: tf.train.Saver
- checkpoint_dir: str, path to directory containing checkpoint(s)
Returns: bool, True if successful at restoring checkpoint from given directory
Returns: bool, True if successful at restoring checkpoint from given dir
'''
print(f'Reading from checkpoint dir: {checkpoint_dir}')
if checkpoint_dir is None:
raise ValueError('No checkpoint path given, cannot load checkpoint')
if not os.path.isdir(checkpoint_dir):
raise ValueError('Given path is not a valid directory.')

# read the CheckpointState proto from the 'checkpoint' file in checkpoint_dir
# read the CheckpointState proto from 'checkpoint' file in checkpoint_dir
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
print(f'Loading checkpoint: {ckpt_name}')
if not checkpoint_path_exists(ckpt.model_checkpoint_path):
raise LoadNoFileError('Checkpoint could not be loaded because it does not exist,'
' but its information is in the checkpoint meta-data file.')
raise LoadNoFileError(
'Checkpoint could not be loaded because it does not exist,'
' but its information is in the checkpoint meta-data file.')
saver.restore(sess, ckpt.model_checkpoint_path)
return True
return False
Expand All @@ -95,18 +98,19 @@ def print_number_of_parameters(verbose: bool = True) -> None:
'''Prints the total number of trainable parameters.
Args
- verbose: bool, whether to print name and shape info for every trainable var
- verbose: bool, whether to print name & shape info for every trainable var
'''
total_parameters = 0 # total # of trainable params in the current graph
num_none_vars = 0 # num variables in the graph with a shape that is not fully defined
num_none_vars = 0 # variables in graph with shape that is not fully defined

for variable in tf.trainable_variables():
name = variable.name
shape = tuple(d.value for d in variable.shape) # each d is a tf.Dimension
num_params = np.prod(variable.shape).value

if verbose:
print(f'Variable name: {name}, shape: {shape}, num_params: {num_params}')
print(f'Variable name: {name}, shape: {shape}, '
f'num_params: {num_params}')

if num_params is None:
num_none_vars += 1
@@ -146,7 +150,8 @@ def run_batches(sess: tf.Session, tensors_dict_ops: Mapping[str, tf.Tensor],
curr_batch += 1
if verbose:
speed = curr_batch / (time.time() - start_time)
print(f'\rRan {curr_batch} batches ({speed:.3f} batch/s)', end='')
print(f'\rRan {curr_batch} batches ({speed:.3f} batch/s)',
end='')
if curr_batch >= max_nbatches:
break
except tf.errors.OutOfRangeError:
@@ -178,7 +183,8 @@ def save_results(dir_path: str, np_dict: dict, filename: str = 'features.npz'
np.savez_compressed(npz_path, **np_dict)


def check_existing(model_dirs: Iterable[str], outputs_root_dir: str, test_filename: str) -> bool:
def check_existing(model_dirs: Iterable[str], outputs_root_dir: str,
test_filename: str) -> bool:
'''Checks a list of model directories to ensure that they contain model
checkpoints but not a given filename.
@@ -188,10 +194,12 @@ def check_existing(model_dirs: Iterable[str], outputs_root_dir: str, test_filename: str) -> bool:
Args
- model_dirs: list of str, model directories within outputs_root_dir
- outputs_root_dir: str, path to root directory for saving logs and checkpoints
- outputs_root_dir: str, path to root directory for saving logs and
checkpoints
- test_filename: str, name of file to check for
Returns: bool, True if ckpts exist and no test_filename files found, otherwise False
Returns: bool, True if ckpts exist and no test_filename files found,
otherwise False
'''
ret = True
for model_dir in model_dirs:
@@ -213,7 +221,7 @@ def check_existing(model_dirs: Iterable[str], outputs_root_dir: str, test_filename: str) -> bool:


def run_extraction_on_models(model_dirs: Iterable[str],
ModelClass: Any,
ModelClass: Callable,
model_params: Mapping,
batcher: batcher.Batcher,
batches_per_epoch: int,
@@ -226,7 +234,8 @@ def run_extraction_on_models(model_dirs: Iterable[str],
features as a compressed numpy .npz file.
Args
- model_dirs: list of str, names of folders where models are saved
- model_dirs: list of str, names of folders where models are saved, should
be subfolders of out_root_dir
- ModelClass: class, an instance `model` of ModelClass which has attributes
model.features_layer: tf.Tensor
model.outputs: tf.Tensor
Expand All @@ -252,7 +261,8 @@ def run_extraction_on_models(model_dirs: Iterable[str],
tensors_dict_ops[key] = batch_op[key]

saver = tf.train.Saver(var_list=None)
var_init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]
var_init_ops = [tf.global_variables_initializer(),
tf.local_variables_initializer()]

print('Creating session...')
config_proto = tf.ConfigProto()
@@ -270,5 +280,7 @@

# run the saved model, then save to *.npz files
all_tensors = run_batches(
sess, tensors_dict_ops, max_nbatches=batches_per_epoch, verbose=True)
save_results(dir_path=out_dir, np_dict=all_tensors, filename=save_filename)
sess, tensors_dict_ops, max_nbatches=batches_per_epoch,
verbose=True)
save_results(
dir_path=out_dir, np_dict=all_tensors, filename=save_filename)

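Since checkpoint_path_exists tests for a checkpoint by stripping TF1 suffixes first, here is a small sketch of the intended suffix handling (hypothetical helper and paths, shown for clarity):

def strip_ckpt_suffix(ckpt_path: str) -> str:
    # TF1 saves <prefix>.index, <prefix>.meta, and <prefix>.data-*;
    # existence should be tested against the bare <prefix>
    for suffix in ('.index', '.meta'):
        if ckpt_path.endswith(suffix):
            return ckpt_path[:-len(suffix)]
    return ckpt_path

assert strip_ckpt_suffix('/ckpts/model.ckpt-100.index') == '/ckpts/model.ckpt-100'
assert strip_ckpt_suffix('/ckpts/model.ckpt-100') == '/ckpts/model.ckpt-100'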