Fixes a couple of issues to add fp16 training support (#488)

* Fixes a couple of issues to add fp16 training support (#476)

* Add half precision support to `nanodet_plus` head

  Moves the explicit `sigmoid` calculation inside the `dsl_assigner` so that `binary_cross_entropy_with_logits` can be used. This allows for the use of `autocast` to support training with `fp16` precision. If this is not done, `torch` will complain that using `binary_cross_entropy` with `fp16` is unstable and, as such, refuses to train the model in `fp16` precision.

* Add model precision settings to config

  Allows for setting the model precision during training using the config system.

Co-authored-by: RangiLyu <[email protected]>

* fix lint

* lightning version

Co-authored-by: Bjarne <[email protected]>
RangiLyu · Jan 20, 2023 · a59db3c · a59db3c
1 parent d8ba391
commit a59db3c
Show file tree

Hide file tree

Showing 6 changed files with 19 additions and 7 deletions.
diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
@@ -56,7 +56,7 @@ jobs:
  python -m pip install -U pip
  python -m pip install ninja opencv-python-headless onnx pytest-xdist codecov
  python -m pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
- python -m pip install Cython termcolor numpy tensorboard pycocotools matplotlib pyaml opencv-python tqdm pytorch-lightning torchmetrics codecov flake8 pytest timm
+ python -m pip install Cython termcolor numpy tensorboard pycocotools matplotlib pyaml opencv-python tqdm pytorch-lightning==1.8.0 torchmetrics codecov flake8 pytest timm
  python -m pip install -r requirements.txt
  - name: Setup
  run: rm -rf .eggs && python setup.py develop

diff --git a/nanodet/model/head/assigner/dsl_assigner.py b/nanodet/model/head/assigner/dsl_assigner.py
@@ -97,9 +97,9 @@ def assign(
  valid_pred_scores = valid_pred_scores.unsqueeze(1).repeat(1, num_gt, 1)
 
  soft_label = gt_onehot_label * pairwise_ious[..., None]
- scale_factor = soft_label - valid_pred_scores
+ scale_factor = soft_label - valid_pred_scores.sigmoid()
 
- cls_cost = F.binary_cross_entropy(
+ cls_cost = F.binary_cross_entropy_with_logits(
  valid_pred_scores, soft_label, reduction="none"
  ) * scale_factor.abs().pow(2.0)
 

diff --git a/nanodet/model/head/nanodet_plus_head.py b/nanodet/model/head/nanodet_plus_head.py
@@ -330,7 +330,7 @@ def target_assign_single_img(
  gt_bboxes_ignore = gt_bboxes_ignore.to(decoded_bboxes.dtype)
 
  assign_result = self.assigner.assign(
- cls_preds.sigmoid(),
+ cls_preds,
  center_priors,
  decoded_bboxes,
  gt_bboxes,

diff --git a/nanodet/util/config.py b/nanodet/util/config.py
@@ -14,6 +14,7 @@
 cfg.data.train = CfgNode(new_allowed=True)
 cfg.data.val = CfgNode(new_allowed=True)
 cfg.device = CfgNode(new_allowed=True)
+cfg.device.precision = 32
 # train
 cfg.schedule = CfgNode(new_allowed=True)
 

diff --git a/requirements.txt b/requirements.txt
@@ -7,7 +7,7 @@ onnx-simplifier
 opencv-python
 pyaml
 pycocotools
-pytorch-lightning>=1.7.0
+pytorch-lightning>=1.7.0,<1.9.0
 tabulate
 tensorboard
 termcolor

diff --git a/tools/train.py b/tools/train.py
@@ -113,9 +113,19 @@ def main(args):
  )
  if cfg.device.gpu_ids == -1:
  logger.info("Using CPU training")
- accelerator, devices, strategy = "cpu", None, None
+ accelerator, devices, strategy, precision = (
+ "cpu",
+ None,
+ None,
+ cfg.device.precision,
+ )
  else:
- accelerator, devices, strategy = "gpu", cfg.device.gpu_ids, None
+ accelerator, devices, strategy, precision = (
+ "gpu",
+ cfg.device.gpu_ids,
+ None,
+ cfg.device.precision,
+ )
 
  if devices and len(devices) > 1:
  strategy = "ddp"
@@ -135,6 +145,7 @@ def main(args):
  benchmark=cfg.get("cudnn_benchmark", True),
  gradient_clip_val=cfg.get("grad_clip", 0.0),
  strategy=strategy,
+ precision=precision,
  )
 
  trainer.fit(task, train_dataloader, val_dataloader)