Use NVTX filtering to limit NCU profile collection
Summary:
Previously, we used `--replay-mode range`, but that did not give us per-kernel
metrics, so it was changed to `--replay-mode kernel` (the default). However,
that can cause us to profile many kernels beyond the ones in the desired
benchmark.

It appears we can instead use NVTX filtering to solve this problem. Relevant docs:
https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#nvtx-filtering
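
For reference, the usage pattern is roughly the sketch below (the range name
matches the `_RANGE_NAME` constant introduced in this diff; the wrapper
function name is purely illustrative):

    import torch

    RANGE_NAME = "tritonbench_range"  # must match the value passed to --nvtx-include

    def profile_one_iteration(fn) -> None:
        # Kernels launched outside this NVTX range are ignored when ncu runs
        # with: --nvtx --nvtx-include "tritonbench_range/"
        with torch.cuda.nvtx.range(RANGE_NAME):
            fn()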

I also tacked on a minor change to the ncu invocation, adding `--import-source yes`.
This makes it easier to analyze the traces on a different machine from the one doing
the profiling.
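
The resulting ncu command assembled by `ncu_trace` looks roughly like this
(the output path and the target command at the end are placeholders):

    import subprocess

    ncu_args = [
        "ncu",
        "--set", "full",
        "--nvtx",
        "--nvtx-include", "tritonbench_range/",  # only collect kernels inside our NVTX range
        "--target-processes", "all",
        "--import-source", "yes",  # embed source so reports can be inspected on another machine
        "-f",
        "-o", "/tmp/ncu_output",  # placeholder output path
        "python", "run_benchmark.py",  # placeholder benchmark command
    ]
    subprocess.check_call(ncu_args)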

Reviewed By: chenyang78

Differential Revision: D58711358

fbshipit-source-id: 28aec4f71a736c7427b1886335297ece4a2a54a8
int3 authored and facebook-github-bot committed Jun 19, 2024
1 parent 62e2609 commit b2b4158
Showing 2 changed files with 34 additions and 23 deletions.
17 changes: 11 additions & 6 deletions torchbenchmark/_components/ncu/__init__.py
@@ -1,7 +1,14 @@
-
 from typing import Callable
 
-def do_bench_ncu_in_task(fn: Callable, warmup=25, grad_to_none=None, fast_flush=True, output_dir=None) -> None:
+
+def do_bench_ncu_in_task(
+    fn: Callable,
+    warmup=25,
+    grad_to_none=None,
+    fast_flush=True,
+    output_dir=None,
+    range_name: str = "",
+) -> None:
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
@@ -46,8 +53,6 @@ def do_bench_ncu_in_task(fn: Callable, warmup=25, grad_to_none=None, fast_flush=
     # Warm-up
     for _ in range(n_warmup):
         fn()
-    # Start ncu profiling
-    torch.cuda.cudart().cudaProfilerStart()
     # we don't want `fn` to accumulate gradient values
     # if it contains a backward pass. So we clear the
     # provided gradients
@@ -56,5 +61,5 @@ def do_bench_ncu_in_task(fn: Callable, warmup=25, grad_to_none=None, fast_flush=
             x.grad = None
     # we clear the L2 cache before run
     cache.zero_()
-    fn()
-    torch.cuda.cudart().cudaProfilerStop()
+    with torch.cuda.nvtx.range(range_name):
+        fn()
40 changes: 23 additions & 17 deletions torchbenchmark/util/triton_op.py
@@ -414,6 +414,10 @@ def __call__(cls, *args, **kwargs):
         obj.__post__init__()
         return obj
 
+
+_RANGE_NAME = "tritonbench_range"
+
+
 class BenchmarkOperator(metaclass=PostInitProcessor):
     mode: Mode = Mode.FWD
     test: str = "eval"
@@ -827,6 +831,7 @@ def _init_extra_metrics() -> Dict[str, Any]:
                     fn=fn,
                     warmup=warmup,
                     grad_to_none=self.get_grad_to_none(self.example_inputs),
+                    range_name=_RANGE_NAME,
                 )
                 metrics.extra_metrics["_ncu_trace_in_task"] = "success"
             # generate customized metrics
@@ -901,26 +906,27 @@ def ncu_trace(self, input_id: int, fn_name: str, replay: bool=False) -> str:
"ncu",
"--set",
"full",
"--replay-mode",
"kernel",
"--nvtx",
"--nvtx-include",
f"{_RANGE_NAME}/",
"--target-processes",
"all",
"--csv",
"-f",
"--log-file",
str(ncu_output_file.resolve()),
] if not replay else [
"ncu",
"--set",
"full",
"--replay-mode",
"kernel",
"--target-processes",
"all",
"-f",
"-o",
str(ncu_output_file.resolve()),
"--import-source",
"yes",
]
if replay:
ncu_args.extend([
"-f",
"-o",
str(ncu_output_file.resolve()),
])
else:
ncu_args.extend([
"--csv",
"-f",
"--log-file",
str(ncu_output_file.resolve()),
])
ncu_args.extend(op_task_args)
subprocess.check_call(ncu_args)
return str(ncu_output_file.resolve())
