incremental changes
train_direct.py: use argparse instead of tf.flags

mypy.ini: stricter and more thorough checking
chrisyeh96 committed Sep 29, 2020
1 parent bd83af1 commit 6454170
Showing 4 changed files with 149 additions and 72 deletions.
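Before the per-file diffs, a quick sketch of how the new entry point feeds parsed arguments into run_training_wrapper — a hedged illustration of the pattern, not an excerpt (the real _parse_args defines many more flags):

import argparse

def _parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=64)
    return parser.parse_args([])  # empty argv keeps the demo self-contained

def run_training_wrapper(**params) -> None:
    print(params)  # {'batch_size': 64}

# vars() turns the Namespace into the same kind of params dict that the old
# dir(flags.FLAGS) scrape produced, without tf.flags' extra attributes
run_training_wrapper(**vars(_parse_args()))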
8 changes: 4 additions & 4 deletions batchers/tfrecord_paths_utils.py
@@ -60,11 +60,11 @@ def dhs_ooc(dataset: str, split: str) -> np.ndarray:
tfrecord_paths = []
for split in splits:
for country_year in survey_names[split]:
glob_path = os.path.join(DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
glob_path = os.path.join(
DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
tfrecord_paths.extend(glob(glob_path))
tfrecord_paths = np.sort(tfrecord_paths)
# assert len(tfrecord_paths) == SIZES[dataset][split] # TODO: uncomment this
return tfrecord_paths
assert len(tfrecord_paths) == SIZES[dataset][split]
return np.sort(tfrecord_paths)


def lsms_ooc(cys: Optional[Iterable[str]] = None) -> List[str]:
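For reference, a minimal self-contained sketch of the revised dhs_ooc logic — the constants and directory layout here are hypothetical stand-ins, not the repo's actual values:

import os
from glob import glob
import numpy as np

DHS_TFRECORDS_PATH_ROOT = '/data/dhs_tfrecords'   # hypothetical
SIZES = {'DHS_OOC_A': {'train': 2}}               # hypothetical counts
survey_names = {'train': ['angola_2011']}         # hypothetical surveys

def gather(dataset: str, split: str) -> np.ndarray:
    tfrecord_paths = []
    for country_year in survey_names[split]:
        glob_path = os.path.join(
            DHS_TFRECORDS_PATH_ROOT, country_year + '*', '*.tfrecord.gz')
        tfrecord_paths.extend(glob(glob_path))
    # the commit turns the TODO comment back into a live size check
    assert len(tfrecord_paths) == SIZES[dataset][split]
    return np.sort(tfrecord_paths)  # sort for a deterministic ordering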
12 changes: 9 additions & 3 deletions mypy.ini
@@ -1,3 +1,9 @@
[mypy]
ignore_missing_imports = True
namespace_packages = True
[mypy]
allow_redefinition = True
disallow_incomplete_defs = True
ignore_missing_imports = True
namespace_packages = True
warn_redundant_casts = True
warn_unreachable = True
warn_unused_configs = True
warn_unused_ignores = True
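The added options are meaningfully stricter. As a rough illustration (hypothetical snippet, not from the repo), two of them flag code like this:

# disallow_incomplete_defs: a partially annotated signature is an error
def scale(x: float, factor):  # error: function is missing type annotations
    return x * factor

# warn_unreachable: statements mypy can prove never run are reported
def to_str(x: int) -> str:
    if isinstance(x, int):
        return str(x)
    return ''  # error: statement is unreachable

allow_redefinition, meanwhile, permits rebinding a name to a different type within a block — the list-then-np.sort rebinding that dhs_ooc used before this commit is exactly that pattern.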
151 changes: 105 additions & 46 deletions train_direct.py
@@ -1,4 +1,4 @@
'''
r'''
This script trains ResNet CNN models to estimate wealth for DHS and LSMS
locations. Model checkpoints and TensorBoard training logs are saved to
`out_dir`.
@@ -9,7 +9,7 @@
--model_name resnet --num_layers 18 \
--lr_decay 0.96 --batch_size 64 \
--gpu 0 --num_threads 5 \
--cache train,train_eval,val \
--cache train train_eval val \
--augment True --eval_every 1 --print_every 40 \
--ooc {ooc} --max_epochs {max_epochs} \
--out_dir {out_dir} \
@@ -25,6 +25,7 @@
`preprocessing/1_process_tfrecords.ipynb` and
`preprocessing/2_create_incountry_folds.ipynb`.
'''
import argparse
import json
import os
from pprint import pprint
@@ -234,7 +235,7 @@ def get_batcher(tfrecord_paths: tf.Tensor, shuffle: bool, augment: bool,

def run_training_wrapper(**params: Any) -> None:
'''
params is a dict with keys matching the FLAGS defined below
params is a dict with keys matching the arguments from _parse_args()
'''
start = time.time()
print('Current time:', start)
@@ -262,16 +263,18 @@ def run_training_wrapper(**params: Any) -> None:
params['experiment_name'], params['batch_size'],
params['fc_reg'], params['conv_reg'], params['lr'])
out_dir = os.path.join(params['out_dir'], full_experiment_name)
os.makedirs(out_dir, exist_ok=True)
print(f'Outputs directory: {out_dir}')

params_filepath = os.path.join(out_dir, 'params.json')
assert not os.path.exists(params_filepath), f'Stopping. Found previous run at: {params_filepath}'
if os.path.exists(params_filepath):
print(f'Stopping. Found previous run at: {params_filepath}')
return

print(f'Outputs directory: {out_dir}')
os.makedirs(out_dir, exist_ok=True)
with open(params_filepath, 'w') as config_file:
json.dump(params, config_file, indent=4)

# Create session
# - MUST set os.environ['CUDA_VISIBLE_DEVICES'] before creating tf.Session object
# - MUST set os.environ['CUDA_VISIBLE_DEVICES'] before creating tf.Session
if params['gpu'] is None: # restrict to CPU only
os.environ['CUDA_VISIBLE_DEVICES'] = ''
else:
@@ -321,58 +324,114 @@ def run_training_wrapper(**params: Any) -> None:
print('Time elapsed (sec.):', end - start)


def main(_: Any) -> None:
params = {
key: flags.FLAGS.__getattr__(key)
for key in dir(flags.FLAGS)
}
run_training_wrapper(**params)


if __name__ == '__main__':
flags = tf.app.flags
def _parse_args() -> argparse.Namespace:
"""Parses arguments."""
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description='Run end-to-end training.')

# paths
flags.DEFINE_string('experiment_name', 'new_experiment', 'name of the experiment being run')
flags.DEFINE_string('out_dir', os.path.join(ROOT_DIR, 'outputs/'), 'path to output directory for saving checkpoints and TensorBoard logs')
parser.add_argument(
'--experiment_name', default='new_experiment',
help='name of experiment being run')
parser.add_argument(
'--out_dir', default=os.path.join(ROOT_DIR, 'outputs/'),
help='path to output directory for saving checkpoints and TensorBoard '
'logs')

# initialization
flags.DEFINE_string('init_ckpt_dir', None, 'path to checkpoint prefix from which to initialize weights (default None)')
flags.DEFINE_string('imagenet_weights_path', None, 'path to ImageNet weights for initialization (default None)')
flags.DEFINE_string('hs_weight_init', None, 'method for initializing weights of non-RGB bands in 1st conv layer, one of [None (default), "random", "same", "samescaled"]')
flags.DEFINE_boolean('exclude_final_layer', False, 'whether to use checkpoint to initialize final layer')
parser.add_argument(
'--init_ckpt_dir',
help='path to checkpoint prefix from which to initialize weights')
parser.add_argument(
'--imagenet_weights_path',
help='path to ImageNet weights for initialization')
parser.add_argument(
'--hs_weight_init', choices=[None, 'random', 'same', 'samescaled'],
help='method for initializing weights of non-RGB bands in 1st conv '
'layer')
parser.add_argument(
'--exclude_final_layer', action='store_true',
help='whether to use checkpoint to initialize final layer')

# learning parameters
flags.DEFINE_string('label_name', 'wealthpooled', 'name of label to use from the TFRecord files')
flags.DEFINE_integer('batch_size', 64, 'batch size')
flags.DEFINE_boolean('augment', True, 'whether to use data augmentation')
flags.DEFINE_float('fc_reg', 1e-3, 'Regularization penalty factor for fully connected layers')
flags.DEFINE_float('conv_reg', 1e-3, 'Regularization penalty factor for convolution layers')
flags.DEFINE_float('lr', 1e-3, 'Learning rate for optimizer')
flags.DEFINE_float('lr_decay', 1.0, 'Decay rate of the learning rate (default 1.0 for no decay)')
parser.add_argument(
'--label_name', default='wealthpooled',
help='name of label to use from the TFRecord files')
parser.add_argument(
'--batch_size', type=int, default=64,
help='batch size')
parser.add_argument(
'--augment', action='store_true',
help='whether to use data augmentation')
parser.add_argument(
'--fc_reg', type=float, default=1e-3,
help='Regularization penalty factor for fully connected layers')
parser.add_argument(
'--conv_reg', type=float, default=1e-3,
help='Regularization penalty factor for convolution layers')
parser.add_argument(
'--lr', type=float, default=1e-3,
help='Learning rate for optimizer')
parser.add_argument(
'--lr_decay', type=float, default=1.0,
help='Decay rate of the learning rate')

# high-level model control
flags.DEFINE_string('model_name', 'resnet', 'name of the model to be used, currently only "resnet" is supported')
parser.add_argument(
'--model_name', default='resnet', choices=['resnet'],
help='name of model architecture')

# resnet-only params
flags.DEFINE_integer('num_layers', 18, 'Number of ResNet layers, one of [18, 34, 50]')
parser.add_argument(
'--num_layers', type=int, default=18, choices=[18, 34, 50],
help='number of ResNet layers')

# data params
flags.DEFINE_string('dataset', 'DHS_OOC_A', 'dataset to use') # TODO
flags.DEFINE_boolean('ooc', True, 'whether to use out-of-country split')
flags.DEFINE_float('keep_frac', 1.0, 'fraction of training data to use (default 1.0 uses all data)')
flags.DEFINE_string('ls_bands', None, 'Landsat bands to use, one of [None (default), "rgb", "ms"]')
flags.DEFINE_string('nl_band', None, 'nightlights band, one of [None (default), "merge", "split"]')
parser.add_argument(
'--dataset', default='DHS_OOC_A', # TODO: choices?
help='dataset to use')
parser.add_argument(
'--ooc', action='store_true',
help='whether to use out-of-country split')
parser.add_argument(
'--keep_frac', type=float, default=1.0,
help='fraction of training data to use')
parser.add_argument(
'--ls_bands', choices=[None, 'rgb', 'ms'],
help='Landsat bands to use')
parser.add_argument(
'--nl_band', choices=[None, 'merge', 'split'],
help='nightlights band')

# system
flags.DEFINE_integer('gpu', None, 'which GPU to use (default None)')
flags.DEFINE_integer('num_threads', 1, 'number of threads for batcher')
flags.DEFINE_list('cache', [], 'comma-separated list (no spaces) of datasets to cache in memory, choose from [None, "train", "train_eval", "val"]')
parser.add_argument(
'--gpu', type=int,
help='which GPU to use')
parser.add_argument(
'--num_threads', type=int, default=1,
help='number of threads for batcher')
parser.add_argument(
'--cache', nargs='*', default=[], choices=['train', 'train_eval', 'val'],
help='list of datasets to cache in memory')

# Misc
flags.DEFINE_integer('max_epochs', 150, 'maximum number of epochs for training')
flags.DEFINE_integer('eval_every', 1, 'evaluate the model on the validation set after every so many epochs of training')
flags.DEFINE_integer('print_every', 40, 'print training statistics after every so many steps')
flags.DEFINE_integer('seed', 123, 'seed for random initialization and shuffling')
parser.add_argument(
'--max_epochs', type=int, default=150,
help='maximum number of epochs for training')
parser.add_argument(
'--eval_every', type=int, default=1,
help='evaluate the model on the validation set after every so many '
'epochs of training')
parser.add_argument(
'--print_every', type=int, default=40,
help='print training statistics after every so many steps')
parser.add_argument(
'--seed', type=int, default=123,
help='seed for random initialization and shuffling')
return parser.parse_args()


tf.app.run()
if __name__ == '__main__':
args = _parse_args()
run_training_wrapper(**vars(args))
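A behavioral note on the migration (hypothetical sanity check, runnable as-is): boolean flags defined with action='store_true' are bare switches, so invocations like `--augment True` from the old docstring become just `--augment`, and `--cache` now takes space-separated values, matching the updated usage string at the top of the file.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--augment', action='store_true')
parser.add_argument('--cache', nargs='*', default=[],
                    choices=['train', 'train_eval', 'val'])
parser.add_argument('--gpu', type=int)  # stays None when omitted

args = parser.parse_args(['--augment', '--cache', 'train', 'val'])
assert args.augment is True
assert args.cache == ['train', 'val']
assert args.gpu is None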
50 changes: 31 additions & 19 deletions utils/run.py
@@ -2,7 +2,7 @@
from glob import glob
import os
import time
from typing import Any, DefaultDict, Dict, Iterable, List, Mapping, Optional
from typing import Any, Callable, DefaultDict, Dict, Iterable, Mapping, Optional

import numpy as np
import tensorflow as tf
@@ -41,7 +41,8 @@ def get_full_experiment_name(experiment_name: str, batch_size: int,
fc_str = param_to_str(fc_reg)
conv_str = param_to_str(conv_reg)
lr_str = param_to_str(lr)
full_experiment_name = f'{experiment_name}_b{batch_size}_fc{fc_str}_conv{conv_str}_lr{lr_str}'
full_experiment_name = (
f'{experiment_name}_b{batch_size}_fc{fc_str}_conv{conv_str}_lr{lr_str}')

if tag is not None:
full_experiment_name += f'_{tag}'
@@ -50,7 +51,8 @@


def checkpoint_path_exists(ckpt_path: str) -> bool:
'''Checks whether a TensorFlow model checkpoint exists at the given path.'''
'''Checks whether a TensorFlow model checkpoint exists at the given path.
'''
if ckpt_path[-6:] == '.index':
ckpt_path = ckpt_path[:-6]
if ckpt_path[-5:] == '.meta':
@@ -70,22 +72,23 @@ def load(sess: tf.Session, saver: tf.train.Saver, checkpoint_dir: str) -> bool:
- saver: tf.train.Saver
- checkpoint_dir: str, path to directory containing checkpoint(s)
Returns: bool, True if successful at restoring checkpoint from given directory
Returns: bool, True if successful at restoring checkpoint from given dir
'''
print(f'Reading from checkpoint dir: {checkpoint_dir}')
if checkpoint_dir is None:
raise ValueError('No checkpoint path given, cannot load checkpoint')
if not os.path.isdir(checkpoint_dir):
raise ValueError('Given path is not a valid directory.')

# read the CheckpointState proto from the 'checkpoint' file in checkpoint_dir
# read the CheckpointState proto from 'checkpoint' file in checkpoint_dir
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
print(f'Loading checkpoint: {ckpt_name}')
if not checkpoint_path_exists(ckpt.model_checkpoint_path):
raise LoadNoFileError('Checkpoint could not be loaded because it does not exist,'
' but its information is in the checkpoint meta-data file.')
raise LoadNoFileError(
'Checkpoint could not be loaded because it does not exist,'
' but its information is in the checkpoint meta-data file.')
saver.restore(sess, ckpt.model_checkpoint_path)
return True
return False
Expand All @@ -95,18 +98,19 @@ def print_number_of_parameters(verbose: bool = True) -> None:
'''Prints the total number of trainable parameters.
Args
- verbose: bool, whether to print name and shape info for every trainable var
- verbose: bool, whether to print name & shape info for every trainable var
'''
total_parameters = 0 # total # of trainable params in the current graph
num_none_vars = 0 # num variables in the graph with a shape that is not fully defined
num_none_vars = 0 # variables in graph with shape that is not fully defined

for variable in tf.trainable_variables():
name = variable.name
shape = tuple(d.value for d in variable.shape) # each d is a tf.Dimension
num_params = np.prod(variable.shape).value

if verbose:
print(f'Variable name: {name}, shape: {shape}, num_params: {num_params}')
print(f'Variable name: {name}, shape: {shape}, '
f'num_params: {num_params}')

if num_params is None:
num_none_vars += 1
@@ -146,7 +150,8 @@ def run_batches(sess: tf.Session, tensors_dict_ops: Mapping[str, tf.Tensor],
curr_batch += 1
if verbose:
speed = curr_batch / (time.time() - start_time)
print(f'\rRan {curr_batch} batches ({speed:.3f} batch/s)', end='')
print(f'\rRan {curr_batch} batches ({speed:.3f} batch/s)',
end='')
if curr_batch >= max_nbatches:
break
except tf.errors.OutOfRangeError:
@@ -178,7 +183,8 @@ def save_results(dir_path: str, np_dict: dict, filename: str = 'features.npz'
np.savez_compressed(npz_path, **np_dict)


def check_existing(model_dirs: Iterable[str], outputs_root_dir: str, test_filename: str) -> bool:
def check_existing(model_dirs: Iterable[str], outputs_root_dir: str,
test_filename: str) -> bool:
'''Checks a list of model directories to ensure that they contain model
checkpoints but not a given filename.
@@ -188,10 +194,12 @@ def check_existing(model_dirs: Iterable[str], outputs_root_dir: str, test_filename: str) -> bool:
Args
- model_dirs: list of str, model directories within outputs_root_dir
- outputs_root_dir: str, path to root directory for saving logs and checkpoints
- outputs_root_dir: str, path to root directory for saving logs and
checkpoints
- test_filename: str, name of file to check for
Returns: bool, True if ckpts exist and no test_filename files found, otherwise False
Returns: bool, True if ckpts exist and no test_filename files found,
otherwise False
'''
ret = True
for model_dir in model_dirs:
@@ -213,7 +221,7 @@ def check_existing(model_dirs: Iterable[str], outputs_root_dir: str, test_filename: str) -> bool:


def run_extraction_on_models(model_dirs: Iterable[str],
ModelClass: Any,
ModelClass: Callable,
model_params: Mapping,
batcher: batcher.Batcher,
batches_per_epoch: int,
@@ -226,7 +234,8 @@ def run_extraction_on_models(model_dirs: Iterable[str],
features as a compressed numpy .npz file.
Args
- model_dirs: list of str, names of folders where models are saved
- model_dirs: list of str, names of folders where models are saved, should
be subfolders of out_root_dir
- ModelClass: class, an instance `model` of ModelClass which has attributes
model.features_layer: tf.Tensor
model.outputs: tf.Tensor
Expand All @@ -252,7 +261,8 @@ def run_extraction_on_models(model_dirs: Iterable[str],
tensors_dict_ops[key] = batch_op[key]

saver = tf.train.Saver(var_list=None)
var_init_ops = [tf.global_variables_initializer(), tf.local_variables_initializer()]
var_init_ops = [tf.global_variables_initializer(),
tf.local_variables_initializer()]

print('Creating session...')
config_proto = tf.ConfigProto()
@@ -270,5 +280,7 @@

# run the saved model, then save to *.npz files
all_tensors = run_batches(
sess, tensors_dict_ops, max_nbatches=batches_per_epoch, verbose=True)
save_results(dir_path=out_dir, np_dict=all_tensors, filename=save_filename)
sess, tensors_dict_ops, max_nbatches=batches_per_epoch,
verbose=True)
save_results(
dir_path=out_dir, np_dict=all_tensors, filename=save_filename)

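Since checkpoint_path_exists tests for a checkpoint by stripping TF1 suffixes first, here is a small sketch of the intended suffix handling (hypothetical helper and paths, shown for clarity):

def strip_ckpt_suffix(ckpt_path: str) -> str:
    # TF1 saves <prefix>.index, <prefix>.meta, and <prefix>.data-*;
    # existence should be tested against the bare <prefix>
    for suffix in ('.index', '.meta'):
        if ckpt_path.endswith(suffix):
            return ckpt_path[:-len(suffix)]
    return ckpt_path

assert strip_ckpt_suffix('/ckpts/model.ckpt-100.index') == '/ckpts/model.ckpt-100'
assert strip_ckpt_suffix('/ckpts/model.ckpt-100') == '/ckpts/model.ckpt-100'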