Add Torch Profiler Support (#1226)

* format: flagged on pre-commit
* feat: add pytorch profiling
* Update NeoXArgs docs automatically
* Update NeoXArgs docs automatically

---------

Co-authored-by: github-actions <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
EleutherAI · May 21, 2024 · 2746d43 · 2746d43
1 parent 153e732
commit 2746d43
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -640,7 +640,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher,
 
 # Profiling
 
-We support profiling with Nsight Systems and PyTorch Memory Profiling.
+We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
 
 ## Nsight Systems Profiling
 
@@ -656,6 +656,15 @@ The generated output file can then be viewed with the Nsight Systems GUI:
 
 ![Alt text](images/nsight_profiling.png)
 
+## PyTorch Profiling
+
+To use the built-in PyTorch profiler, set the config option `profile` to enable profiling, and set `profile_step_start` and `profile_step_stop` to bound the range of training iterations to profile.
+
+The PyTorch profiler will save traces to the directory set by the `tensorboard_dir` config option. You can view these traces within
+TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+
+![Alt text](images/pytorch_profiling.png)
+
 ## PyTorch Memory Profiling
 
 To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`.

diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
- Default = 0d5992f
+ Default = b68ba6d
 
  current git hash of repository
 

diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png
diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp
@@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
  }
 
  } // for (auto sent_index=sent_index_first; ...
- } // if (num_remain_sent > 1) {
- } // for (int doc=0; doc < num_docs; ++doc) {
- } // for (int epoch=0; epoch < num_epochs; ++epoch) {
+ }  // if (num_remain_sent > 1) {
+ }  // for (int doc=0; doc < num_docs; ++doc) {
+ }  // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
  if (!second) {
  if (verbose) {
@@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
  num_sent = 0;
  }
  } // for (auto sent_index=sent_index_first; ...
- } // if (num_remain_sent > 1) {
- } // for (int doc=0; doc < num_docs; ++doc) {
- } // for (int epoch=0; epoch < num_epochs; ++epoch) {
+ }  // if (num_remain_sent > 1) {
+ }  // for (int doc=0; doc < num_docs; ++doc) {
+ }  // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
  if (!second) {
  if (verbose) {

diff --git a/megatron/training.py b/megatron/training.py
@@ -970,7 +970,28 @@ def train(
 
  # to monitor if we've skipped many iterations in a row and trigger an early exit
  overflow_monitor = OverflowMonitor(optimizer)
+
+ if neox_args.profile:
+ schedule = torch.profiler.schedule(
+ wait=neox_args.profile_step_start,
+ warmup=1,
+ active=neox_args.profile_step_stop - neox_args.profile_step_start,
+ )
+ prof = torch.profiler.profile(
+ schedule=schedule,
+ on_trace_ready=torch.profiler.tensorboard_trace_handler(
+ neox_args.tensorboard_dir
+ ),
+ record_shapes=True,
+ profile_memory=True,
+ with_flops=True,
+ with_modules=True,
+ with_stack=True,
+ )
+ prof.start()
  while iteration < neox_args.train_iters:
+ if neox_args.profile:
+ prof.step()
  if neox_args.profile and iteration == neox_args.profile_step_start:
  torch.cuda.cudart().cudaProfilerStart()
  loss_dict, skipped_iter = train_step(
@@ -983,6 +1004,7 @@ def train(
  )
  if neox_args.profile and iteration == neox_args.profile_step_stop:
  torch.cuda.cudart().cudaProfilerStop()
+ prof.stop()
  iteration += 1
  neox_args.iteration = iteration
  if neox_args.precision == "fp16":