Merge pull request #59 from moskomule/dev

Accumulated updates
moskomule · Jul 10, 2021 · 1d6ffa2 · 1d6ffa2
2 parents 815750e + ed9697a
commit 1d6ffa2
Show file tree

Hide file tree

Showing 35 changed files with 166 additions and 53 deletions.
diff --git a/.github/workflows/ghpage.yml b/.github/workflows/ghpage.yml
@@ -22,7 +22,7 @@ jobs:
  . venv/bin/activate
  pip install -U pip
  pip install Sphinx sphinx-rtd-theme
- pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ pip install torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
  pip install -U .
 
  - name: build

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -12,6 +12,7 @@ jobs:
  matrix:
  python: [ '3.9' ]
  torch: [ 'torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html',
+ 'torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html',
  '--pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html' ]
 
  steps:

diff --git a/README.md b/README.md
@@ -91,7 +91,7 @@ with trainers.SupervisedTrainer(model,
  trainer.run(train_loader, test_loader,
  total_iterations=1_000, val_intervals=10)
 
- print(f"Max Accuracy={max(trainer.history['accuracy']['test'])}")
+ print(f"Max Accuracy={max(trainer.history['accuracy']['tests'])}")
 ```
 
 You can customize `iteration` of `trainer` as follows.

diff --git a/examples/imagenet.py b/examples/imagenet.py
@@ -3,14 +3,15 @@
 from torch.nn import functional as F
 from torchvision.models import resnet50
 
-from homura import distributed_ready_main, enable_accimage, get_num_nodes, is_distributed, lr_scheduler, optim, \
+from homura import distributed_ready_main, enable_accimage, get_world_size, is_distributed, lr_scheduler, optim, \
  reporters
 from homura.trainers import SupervisedTrainer
 from homura.vision.data import DATASET_REGISTRY
 
 
 @chika.config
 class Config:
+ base_lr: float = 0.1
  epochs: int = 90
  batch_size: int = 256
  enable_accimage: bool = False
@@ -30,10 +31,10 @@ def main(cfg: Config):
  enable_accimage()
 
  model = resnet50()
- optimizer = optim.SGD(lr=1e-1 * cfg.batch_size * get_num_nodes() / 256, momentum=0.9, weight_decay=1e-4)
- scheduler = lr_scheduler.MultiStepLR([30, 60, 80])
- train_loader, test_loader = DATASET_REGISTRY("fast_imagenet" if cfg.use_fast_collate else
-  "imagenet")(cfg.batch_size,
+ optimizer = optim.SGD(lr=cfg.base_lr * cfg.batch_size * get_world_size() / 256, momentum=0.9, weight_decay=1e-4,
+  multi_tensor=True)
+ scheduler = lr_scheduler.MultiStepLR([30, 60, 90])
+ train_loader, test_loader = DATASET_REGISTRY("imagenet")(cfg.batch_size,
  train_size=cfg.batch_size * 50 if cfg.debug else None,
  test_size=cfg.batch_size * 50 if cfg.debug else None,
  num_workers=cfg.num_workers)

diff --git a/homura/__init__.py b/homura/__init__.py
@@ -1,8 +1,8 @@
 from .register import Registry
-from .utils import TensorDataClass, TensorTuple, distributed_print, enable_accimage, get_args, get_environ, \
- get_git_hash, get_global_rank, get_local_rank, get_num_nodes, get_world_size, if_is_master, init_distributed, \
- is_accimage_available, is_distributed, is_distributed_available, is_faiss_available, is_master, set_deterministic, \
- set_seed, distributed_ready_main
+from .utils import TensorDataClass, TensorTuple, disable_tf32, disable_tf32_locally, distributed_print, \
+ distributed_ready_main, enable_accimage, get_args, get_environ, get_git_hash, get_global_rank, get_local_rank, \
+ get_num_nodes, get_world_size, if_is_master, init_distributed, is_accimage_available, is_distributed, \
+ is_distributed_available, is_faiss_available, is_master, set_deterministic, set_seed
 
 Registry.import_modules('homura.vision')
 # to avoid circular import

diff --git a/homura/modules/ema.py b/homura/modules/ema.py
@@ -41,9 +41,17 @@ def __init__(self,
 
  self._original_model = original_model
  self._ema_model = copy.deepcopy(original_model)
- for p in self._ema_model.parameters():
+ for p in self.ema_model.parameters():
  p.requires_grad_(False)
 
+ def __getattr__(self,
+ item: str):
+ # fallback: a name the regular lookup cannot resolve (AttributeError) is fetched from the original model
+ try:
+ return super().__getattr__(item)
+ except AttributeError:
+ return getattr(self.original_model, item)
+
  @property
  def original_model(self) -> nn.Module:
  return self._original_model
@@ -53,14 +61,15 @@ def ema_model(self) -> nn.Module:
  return self._ema_model
 
  def parameters(self, recurse: bool = True) -> Iterator[nn.Parameter]:
- # this makes it simple, but may incur unexpected behavior
  return self._original_model.parameters(recurse)
 
- def requires_grad_(self, requires_grad: bool = True):
+ def requires_grad_(self, requires_grad: bool = True) -> nn.Module:
  return self._original_model.requires_grad_(requires_grad)
 
  @torch.no_grad()
  def _update(self):
+ if torch.cuda.is_available():
+ torch.cuda.synchronize()
  # _foreach_** is n times faster than for loops
  o_p = [p.data for p in self._original_model.parameters() if isinstance(p, torch.Tensor)]
  e_p = [p.data for p in self._ema_model.parameters() if isinstance(p, torch.Tensor)]

diff --git a/homura/reporters.py b/homura/reporters.py
@@ -452,4 +452,4 @@ def _clear_epoch_hist(self
  def exit(self
  ) -> None:
  # expected to be used in TrainerBase.exit
- self._persistent_hist = defaultdict(list)
+ ReporterList._persistent_hist = defaultdict(list)  # NOTE(review): rebinds the attribute on ReporterList itself, not on this instance - presumably the enclosing class; confirm
diff --git a/homura/trainers.py b/homura/trainers.py
@@ -53,6 +53,7 @@ def __init__(self,
  use_sync_bn: bool = False,
  tqdm_ncols: int = 120,
  debug: bool = False,
+ dist_kwargs: Optional[dict] = None,
  **kwargs):
 
  if kwargs.get("update_scheduler_by_epoch"):
@@ -106,7 +107,8 @@ def __init__(self,
  self.logger.info(f"cuda: False (torch.cuda.is_available()={torch.cuda.is_available()})")
 
  if is_distributed():
- self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[rank])
+ dist_kwargs = dist_kwargs or {}
+ self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[rank], **dist_kwargs)
  self.logger.debug(f"model converted to DistributedDataParallel at rank={rank}")
 
  # self.accessible_model is useful for e.g., checkpointing
@@ -139,7 +141,7 @@ def __init__(self,
 
  # to nest, leave=False (https://github.com/tqdm/tqdm/blob/master/examples/simple_examples.py#L19)
  self._tqdm = lambda x: x
- if self._verbose:
+ if self.verbose:
  self._tqdm = Partial(tqdm, ncols=tqdm_ncols, leave=False)
  set_tqdm_stdout_stderr()
  self.logger.debug("verbose: setup tqdm")
@@ -156,6 +158,11 @@ def __init__(self,
  setattr(self, k, v)
  self.logger.debug(f"trainer sets {k} as a new attribute")
 
+ @property
+ def verbose(self
+ ) -> bool:
+ return self._verbose  # public read access to the private `_verbose` flag
+
  @property
  def step(self
  ) -> int:
@@ -447,6 +454,7 @@ def __init__(self,
  use_amp=False,
  use_channel_last=False,
  report_accuracy_topk: Optional[int or List[int]] = None,
+ update_scheduler_iter: bool = False,
  **kwargs):
  if isinstance(model, dict):
  raise TypeError(f"{type(self)} does not support dict model")
@@ -469,6 +477,11 @@ def __init__(self,
  if report_accuracy_topk is not None and not isinstance(report_accuracy_topk, Iterable):
  report_accuracy_topk = [report_accuracy_topk]
  self._report_topk = report_accuracy_topk
+ self.update_scheduler_iter = update_scheduler_iter & (scheduler is not None)
+ if self.update_scheduler_iter:
+ self.logger.info("scheduler is set to be updated after every iteration")
+ else:
+ self.logger.debug("self.update_scheduler_iter=False. Update scheduler manually")
 
  def iteration(self,
  data: Tuple[Tensor, Tensor]
@@ -487,6 +500,8 @@ def iteration(self,
  else:
  loss.backward()
  self.optimizer.step()
+ if self.update_scheduler_iter:
+ self.scheduler.step()
  if self._is_debug and torch.isnan(loss):
  self.logger.warning("loss is NaN")
 

diff --git a/homura/utils/__init__.py b/homura/utils/__init__.py
@@ -4,6 +4,6 @@
 from .distributed import (distributed_print, distributed_ready_main, get_global_rank, get_local_rank, get_num_nodes,
  get_world_size, if_is_master, init_distributed, is_distributed, is_distributed_available,
  is_master)
-from .environment import (enable_accimage, get_args, get_environ, get_git_hash, is_accimage_available,
- is_faiss_available)
+from .environment import (disable_tf32, disable_tf32_locally, enable_accimage, get_args, get_environ, get_git_hash,
+ is_accimage_available, is_faiss_available)
 from .reproducibility import set_deterministic, set_seed
diff --git a/homura/utils/distributed.py b/homura/utils/distributed.py
@@ -125,7 +125,8 @@ def distributed_ready_main(func: Callable = None,
  """ Wrap a main function to make it distributed ready
  """
 
- init_distributed(backend=backend, init_method=init_method, disable_distributed_print=disable_distributed_print)
+ if is_distributed():
+ init_distributed(backend=backend, init_method=init_method, disable_distributed_print=disable_distributed_print)
 
  @wraps(func)
  def inner(*args, **kwargs):

diff --git a/homura/utils/environment.py b/homura/utils/environment.py
@@ -7,6 +7,8 @@
 import sys as python_sys
 from typing import Any, Optional
 
+import torch
+
 from homura.liblog import get_logger
 
 logger = get_logger("homura.environment")
@@ -44,6 +46,90 @@ def is_opteinsum_available() -> bool:
  return importlib.util.find_spec("opt_einsum") is not None
 
 
+# TF32
+def _enable_tf32(mode: bool) -> None:
+    """ Set both TF32 switches, `torch.backends.cuda.matmul.allow_tf32` and `torch.backends.cudnn.allow_tf32`, to `mode`.
+
+    This is best effort: an exception raised while setting the flags is logged, not propagated.
+    """
+
+    try:
+        torch.backends.cuda.matmul.allow_tf32 = mode
+        torch.backends.cudnn.allow_tf32 = mode
+        if mode:
+            logger.info("TF32 is enabled")
+        else:
+            logger.info("TF32 is disabled")
+
+    except Exception as e:
+        logger.exception(e)
+
+
+def disable_tf32() -> None:
+    """ Globally disable TF32
+
+    """
+
+    _enable_tf32(False)
+
+
+class disable_tf32_locally(object):
+    """ Locally disable TF32, as a context manager or as a decorator
+
+    >>> with disable_tf32_locally():
+    >>>     ...
+
+
+    or
+
+    >>> @disable_tf32_locally()
+    >>> def function():
+    >>>     ...
+
+    On exit, the TF32 flags found on entry are put back, so TF32 stays off if it was already off
+    (e.g., after `disable_tf32()`).
+
+    """
+
+    def __init__(self):
+        # flags saved by each `__enter__`; kept as a stack so that one instance can be nested
+        self._saved = []
+
+    def __call__(self, func=None):
+        """ Decorate `func` so that it runs with TF32 disabled.
+
+        A bare call without `func` keeps the previous behavior: disable TF32 and return `None`.
+        """
+
+        if func is None:
+            return _enable_tf32(False)
+
+        from functools import wraps  # local import, not to rely on a module-level one
+
+        @wraps(func)
+        def inner(*args, **kwargs):
+            with self:
+                return func(*args, **kwargs)
+
+        return inner
+
+    def __enter__(self):
+        try:
+            flags = (torch.backends.cuda.matmul.allow_tf32, torch.backends.cudnn.allow_tf32)
+        except Exception as e:
+            # same best-effort policy as `_enable_tf32`: log and go on, nothing to restore later
+            logger.exception(e)
+            flags = None
+        self._saved.append(flags)
+        _enable_tf32(False)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # put the saved flags back instead of unconditionally re-enabling TF32
+        flags = self._saved.pop()
+        if flags is not None:
+            torch.backends.cuda.matmul.allow_tf32, torch.backends.cudnn.allow_tf32 = flags
+
+
 # get environment information
 
 def get_git_hash() -> str:

diff --git a/homura/utils/reproducibility.py b/homura/utils/reproducibility.py
@@ -47,24 +47,19 @@ def set_seed(seed: Optional[int] = None,
 @contextlib.contextmanager
 def set_deterministic(seed: Optional[int] = None,
  by_rank: bool = False):
- """ Set seed of `torch`, `random` and `numpy` to `seed` for making it deterministic. Because of CUDA's limitation, this
- does not make everything deterministic, however.
+ """ Set seed of `torch`, `random` and `numpy` to `seed` for making it deterministic. Because of CUDA's limitation,
+ this may not make everything deterministic, however.
  """
 
- has_set_deterministic = hasattr(torch, "set_deterministic")
  with set_seed(seed, by_rank):
  if seed is not None:
- if has_set_deterministic:
- torch.set_deterministic(True)
- else:
- torch.backends.cudnn.deterministic = True
- torch.backends.cudnn.benchmark = False
+ torch.set_deterministic(True)  # NOTE(review): now called without the hasattr guard; believed deprecated since PyTorch 1.8 in favor of torch.use_deterministic_algorithms - confirm
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
  logger.info("Set deterministic. But some GPU computations might be still non-deterministic. "
  "Also, this may affect the performance.")
  yield
- if has_set_deterministic:
- torch.set_deterministic(False)
- else:
- torch.backends.cudnn.deterministic = False
- torch.backends.cudnn.benchmark = True
+ torch.set_deterministic(False)  # NOTE(review): same unguarded call as above - confirm it exists on every supported torch
+ torch.backends.cudnn.deterministic = False
+ torch.backends.cudnn.benchmark = True  # NOTE(review): hard-coded True, not the value that was set before entering
  logger.info("Back to non-deterministic.")