More error handling improvements
Reviewed By: xuzhao9

Differential Revision: D57965945

fbshipit-source-id: 1ccd9e1e2f24426d073e72acc9aab1172bd505a8
int3 authored and facebook-github-bot committed Jun 3, 2024
1 parent c3d510d commit 0546be8
Showing 2 changed files with 104 additions and 92 deletions.
154 changes: 82 additions & 72 deletions torchbenchmark/util/triton_op.py
@@ -370,6 +370,10 @@ def parse_args(
         type=int,
         help="Number of example inputs.",
     )
+    parser.add_argument(
+        "--keep-going",
+        action="store_true",
+    )
     parser.add_argument(
         "--input-id",
         type=int,
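
For context, action="store_true" gives the new option a False default, so error handling is unchanged unless the flag is passed explicitly, at which point it surfaces as tb_args.keep_going. A minimal standalone sketch of that argparse behavior (illustrative only, not TorchBench code):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--keep-going", action="store_true")

    print(parser.parse_args([]).keep_going)                # False: default is fail-fast
    print(parser.parse_args(["--keep-going"]).keep_going)  # True: continue past errors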
@@ -466,85 +470,89 @@ def _get_bm_func(self, bm_func_name: str):
 
     def run(
         self, warmup=DEFAULT_WARMUP, rep=DEFAULT_RUN_ITERS, quantiles=DEFAULT_QUANTILES
-    ) -> BenchmarkOperatorResult:
+    ) -> None:
         """Benchmarking the operator and returning its metrics."""
         metrics = []
-        input_id_range = range(self._input_id, self._input_id+self._num_inputs)
-        if tqdm is not None:
-            input_id_range = tqdm(input_id_range)
-        if self._input_id:
-            for _dryrun_input_id in range(self._input_id):
-                self.example_inputs = self.get_example_inputs()
-        for input_id in input_id_range:
-            self.example_inputs = self.get_example_inputs()
-            if self.example_inputs is None:
-                warnings.warn(
-                    UserWarning(
-                        f"The input generator get_input_iter() has depleted at id {input_id}. Available number of inputs: {self._available_num_inputs}."
-                    )
-                )
-                break
-            # Move inputs to the device
-            self.example_inputs = input_cast(
-                lambda x: isinstance(x, torch.Tensor),
-                lambda x: x.to(self.device),
-                self.example_inputs,
-            )
-            self.baseline_fn = None
-            self.baseline_metrics = None
-            self._op_flops = {}
-            # Cast the input precisions
-            apply_decoration_args(self, self.dargs)
-            x_val = self.get_x_val(self.example_inputs)
-            if self._only:
-                benchmarks = self._only
-            else:
-                benchmarks = (
-                    [bm for bm in REGISTERED_BENCHMARKS[self.name]]
-                    if self.name in REGISTERED_BENCHMARKS
-                    else []
-                )
-            # Run the baseline first, if baseline exists
-            baseline_name = (
-                BASELINE_BENCHMARKS[self.name]
-                if self.name in BASELINE_BENCHMARKS
-                else None
-            )
-            if baseline_name and baseline_name in benchmarks:
-                benchmarks.remove(baseline_name)
-                benchmarks.insert(0, baseline_name)
-
-            # get metrics for for each registered benchmark
-            def _reduce_benchmarks(acc, bm_name: str):
-                baseline = (
-                    bm_name == BASELINE_BENCHMARKS[self.name]
-                    if self.name in BASELINE_BENCHMARKS
-                    else False
-                )
-                acc[bm_name] = self._do_bench(
-                    input_id=input_id,
-                    fn_name=bm_name,
-                    warmup=warmup,
-                    rep=rep,
-                    quantiles=quantiles,
-                    baseline=baseline,
-                )
-                if baseline:
-                    self.baseline_metrics = acc[bm_name]
-                return acc
-
-            y_vals: Dict[str, BenchmarkOperatorMetrics] = functools.reduce(
-                _reduce_benchmarks, benchmarks, {}
-            )
-            metrics.append((x_val, y_vals))
-            del self.example_inputs
-            gc.collect()
-        self.output = BenchmarkOperatorResult(
-            op_name=self.name,
-            metrics=self.required_metrics,
-            result=metrics,
-        )
-        return self.output
+        try:
+            input_id_range = range(self._input_id, self._input_id + self._num_inputs)
+            if tqdm is not None:
+                input_id_range = tqdm(input_id_range)
+            if self._input_id:
+                for _dryrun_input_id in range(self._input_id):
+                    self.example_inputs = self.get_example_inputs()
+            for input_id in input_id_range:
+                self.example_inputs = self.get_example_inputs()
+                if self.example_inputs is None:
+                    warnings.warn(
+                        f"The input generator get_input_iter() has depleted at id {input_id}. Available number of "
+                        f"inputs: {self._available_num_inputs}.",
+                        stacklevel=1
+                    )
+                    break
+                # Move inputs to the device
+                self.example_inputs = input_cast(
+                    lambda x: isinstance(x, torch.Tensor),
+                    lambda x: x.to(self.device),
+                    self.example_inputs,
+                )
+                self.baseline_fn = None
+                self.baseline_metrics = None
+                self._op_flops = {}
+                # Cast the input precisions
+                apply_decoration_args(self, self.dargs)
+                x_val = self.get_x_val(self.example_inputs)
+                if self._only:
+                    benchmarks = self._only
+                else:
+                    benchmarks = (
+                        [bm for bm in REGISTERED_BENCHMARKS[self.name]]
+                        if self.name in REGISTERED_BENCHMARKS
+                        else []
+                    )
+                # Run the baseline first, if baseline exists
+                baseline_name = (
+                    BASELINE_BENCHMARKS[self.name]
+                    if self.name in BASELINE_BENCHMARKS
+                    else None
+                )
+                if baseline_name and baseline_name in benchmarks:
+                    benchmarks.remove(baseline_name)
+                    benchmarks.insert(0, baseline_name)
+
+                # get metrics for for each registered benchmark
+                def _reduce_benchmarks(acc, bm_name: str):
+                    baseline = (
+                        bm_name == BASELINE_BENCHMARKS[self.name]
+                        if self.name in BASELINE_BENCHMARKS
+                        else False
+                    )
+                    acc[bm_name] = self._do_bench(
+                        input_id=input_id,
+                        fn_name=bm_name,
+                        warmup=warmup,
+                        rep=rep,
+                        quantiles=quantiles,
+                        baseline=baseline,
+                    )
+                    if baseline:
+                        self.baseline_metrics = acc[bm_name]
+                    return acc
+
+                y_vals: Dict[str, BenchmarkOperatorMetrics] = functools.reduce(
+                    _reduce_benchmarks, benchmarks, {}
+                )
+                metrics.append((x_val, y_vals))
+                del self.example_inputs
+                gc.collect()
+        except (KeyboardInterrupt, Exception):
+            warnings.warn("Caught exception, terminating early with partial results", stacklevel=1)
+            raise
+        finally:
+            self.output = BenchmarkOperatorResult(
+                op_name=self.name,
+                metrics=self.required_metrics,
+                result=metrics,
+            )
 
     def get_x_val(self, example_inputs) -> Any:
         raise NotImplementedError(
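
The reworked run() follows a common partial-results pattern: collect per-input metrics inside a try block, warn and re-raise on failure, and always materialize whatever was collected in a finally block. A minimal, self-contained sketch of that pattern, with made-up names rather than the TorchBench API:

    import warnings

    def run_all(inputs):
        """Collect a result per input; publish partial results even if something fails."""
        results = []
        try:
            for item in inputs:
                results.append((item, item * 2))  # stand-in for a real benchmark call
        except (KeyboardInterrupt, Exception):
            warnings.warn("Caught exception, terminating early with partial results", stacklevel=1)
            raise
        finally:
            # Runs whether or not an exception is propagating.
            print({"count": len(results), "rows": results})

    run_all([1, 2, 3])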
@@ -798,6 +806,8 @@ def _init_extra_metrics() -> Dict[str, Any]:
         except torch.cuda.OutOfMemoryError:
             metrics.error_msg = "CUDA OOM"
         except Exception as e:
+            if not self.tb_args.keep_going:
+                raise
             metrics.error_msg = str(e)
         return metrics
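
Inside _do_bench(), the flag acts as the usual keep-going switch: propagate immediately by default, or record the error message and move on when the flag is set. A rough standalone sketch of that behavior (simplified names, not the actual TorchBench types):

    def bench_one(fn, keep_going=False):
        error_msg = None
        try:
            value = fn()
        except Exception as e:
            if not keep_going:
                raise                              # default: fail fast, as before
            value, error_msg = None, str(e)        # --keep-going: record and continue
        return value, error_msg

    print(bench_one(lambda: 1 / 0, keep_going=True))  # (None, 'division by zero')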

42 changes: 22 additions & 20 deletions userbenchmark/triton/run.py
@@ -45,7 +45,7 @@ def parse_args(args):
     parser.add_argument("--ci", action="store_true", help="Run in the CI mode.")
     return parser.parse_known_args(args)
 
-def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorResult:
+def _run(args: argparse.Namespace, extra_args: List[str]) -> None:
     Opbench = load_opbench_by_name(args.op)
    if args.fwd_bwd:
        args.mode = "fwd_bwd"
@@ -56,27 +56,29 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorResult:
         device=args.device,
         extra_args=extra_args,
     )
-    metrics = opbench.run(args.warmup, args.iter)
-    if not args.skip_print:
-        if args.csv:
-            metrics.write_csv_to_file(sys.stdout)
-        else:
-            print(metrics)
-    if not hasattr(torch_version, "git_version") and args.log_scuba:
-        from userbenchmark.triton.fb import log_benchmark
+    try:
+        opbench.run(args.warmup, args.iter)
+    finally:
+        metrics = opbench.output
+        if not args.skip_print:
+            if args.csv:
+                metrics.write_csv_to_file(sys.stdout)
+            else:
+                print(metrics)
+        if not hasattr(torch_version, "git_version") and args.log_scuba:
+            from userbenchmark.triton.fb import log_benchmark
 
-        log_benchmark(metrics)
-    if args.plot:
-        try:
-            opbench.plot()
-        except NotImplementedError:
-            print(f"Plotting is not implemented for {args.op}")
+            log_benchmark(metrics)
+        if args.plot:
+            try:
+                opbench.plot()
+            except NotImplementedError:
+                print(f"Plotting is not implemented for {args.op}")
 
-    if args.dump_csv:
-        os.makedirs(TRITON_BENCH_CSV_DUMP_PATH, exist_ok=True)
-        path = metrics.write_csv(TRITON_BENCH_CSV_DUMP_PATH)
-        print(f"[TritonBench] Dumped csv to {path}")
-    return metrics
+        if args.dump_csv:
+            os.makedirs(TRITON_BENCH_CSV_DUMP_PATH, exist_ok=True)
+            path = metrics.write_csv(TRITON_BENCH_CSV_DUMP_PATH)
+            print(f"[TritonBench] Dumped csv to {path}")
 
 def run(args: List[str] = []):
     if args == []:
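
With both files changed, a failing benchmark no longer discards the work done so far: _run() reads whatever opbench.output holds in its finally block and still prints, logs, and optionally dumps it. A hedged usage sketch of the entry point shown above; the operator name and every flag spelling except --keep-going are illustrative guesses, and the import assumes the package layout matches the file path:

    # Hypothetical invocation; only --keep-going is taken directly from this commit.
    from userbenchmark.triton.run import run

    run(["--op", "softmax", "--keep-going", "--dump-csv"])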
