Log more errors + make CSV writing more robust
Summary:
Previously, we only caught specific CUDA OOM errors, but benchmarks can fail in
other ways too. Let's make it more robust by catching and logging all
exceptions.
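
A minimal, self-contained sketch of the pattern (the function and argument names are assumed for illustration; the real change is in triton_op.py below): keep the specific CUDA OOM label, and record any other exception's message instead of letting it escape.

    import torch

    # Sketch only: run one metric function, recording failures instead of raising.
    def run_metric(func, *args):
        result, error_msg = None, None
        try:
            result = func(*args)
        except torch.cuda.OutOfMemoryError:
            error_msg = "CUDA OOM"
        except Exception as e:  # benchmarks can fail in many other ways
            error_msg = str(e)
        return result, error_msg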

While there was already code to log exception messages, it often produced
malformed CSVs because fields were not quoted. We should use Python's csv
module, which handles quoting, to avoid this issue.
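
For illustration, csv.QUOTE_MINIMAL quotes any field that contains the delimiter, so an error message with embedded separators no longer breaks the column layout (the operator name and message below are made up):

    import csv
    import sys

    writer = csv.writer(sys.stdout, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["op", "latency", "error_msg"])
    # The message contains the ";" delimiter, so the writer quotes that field.
    writer.writerow(["addmm", "", "RuntimeError: shape mismatch; expected 2-D input"])
    # Output: addmm;;"RuntimeError: shape mismatch; expected 2-D input"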

Additionally, the previous logic would record the error message in each metric column
of the failed benchmark. This was redundant, so I've changed it to emit the message
only once.
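
In row-building terms (hypothetical metric names), a failed benchmark now contributes its message once and pads the remaining columns:

    # Hypothetical metric columns for one backend of a failed benchmark.
    key_metrics = ["latency", "tflops", "speedup"]
    error_msg = "CUDA OOM"
    row = [error_msg] + [None] * (len(key_metrics) - 1)
    # row == ["CUDA OOM", None, None]  -- one message, not one copy per column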

Finally, since Python's csv writer writes directly to a file object rather than
building a string first, the previous CSV file naming convention (a hash of the
file's contents) no longer applies. Instead I've used NamedTemporaryFile to get
a unique file name.
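
A sketch of that naming trick; this variant passes the target directory via dir= (the patch folds it into prefix= instead), and the helper name is assumed:

    import tempfile

    def unique_csv_path(dir_path: str, op_name: str) -> str:
        # NamedTemporaryFile is used only to pick a collision-free name;
        # delete=False keeps the file around after the with-block exits.
        with tempfile.NamedTemporaryFile(
            mode="w",
            dir=dir_path,
            prefix=f"op_{op_name}_",
            suffix=".csv",
            newline="",
            delete=False,
        ) as fileobj:
            fileobj.write("x_val;latency\n")  # a real caller writes the CSV here
        return fileobj.name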

Reviewed By: chenyang78

Differential Revision: D57785120

fbshipit-source-id: 73c76bba7661b60a7357aaba3d5b9659b533479e
int3 authored and facebook-github-bot committed May 27, 2024
1 parent d1a6363 commit d5d2762
Showing 2 changed files with 32 additions and 19 deletions.
36 changes: 28 additions & 8 deletions torchbenchmark/util/triton_op.py
@@ -3,6 +3,7 @@
import functools
import gc
import json
import os
import random
import time
import warnings
@@ -206,10 +207,11 @@ def select_metric(m):
row.append(x_only_metric_dict[x_only_metric])
for k in y_val_keys:
metrics_dict = asdict(y_val[k])
if metrics_dict["error_msg"]:
row.append(metrics_dict["error_msg"])
row.extend([None] * (len(key_metrics[k]) - 1))
continue
for metric in key_metrics[k]:
if metrics_dict["error_msg"]:
row.append(metrics_dict["error_msg"])
continue
_metrics_dict = (
metrics_dict["extra_metrics"]
if metric in metrics_dict["extra_metrics"]
@@ -224,12 +226,28 @@ def select_metric(m):
table.append(row)
return headers, table

@property
def csv(self):
def write_csv_to_file(self, fileobj):
import csv

headers, table = self._table()
headers = "; ".join(headers)
table = "\n".join(["; ".join([str(v) for v in row]) for row in table])
return f"{headers}\n{table}"
writer = csv.writer(fileobj, delimiter=";", quoting=csv.QUOTE_MINIMAL)
writer.writerow(headers)
writer.writerows(table)

def write_csv(self, dir_path):
import tempfile

# This is just a way to create a unique filename. It's not actually a
# temporary file (since delete=False).
with tempfile.NamedTemporaryFile(
mode='w',
prefix=os.path.join(dir_path, f"op_{self.op_name}_"),
suffix=".csv",
newline="",
delete=False,
) as fileobj:
self.write_csv_to_file(fileobj)
return fileobj.name

@property
def x_vals(self):
Expand Down Expand Up @@ -779,6 +797,8 @@ def _init_extra_metrics() -> Dict[str, Any]:
metrics.extra_metrics[metric_name] = func(fn, self.example_inputs, metrics)
except torch.cuda.OutOfMemoryError:
metrics.error_msg = "CUDA OOM"
except Exception as e:
metrics.error_msg = str(e)
return metrics

def get_peak_mem(
15 changes: 4 additions & 11 deletions userbenchmark/triton/run.py
@@ -59,7 +59,7 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
metrics = opbench.run(args.warmup, args.iter)
if not args.skip_print:
if args.csv:
print(metrics.csv)
metrics.write_csv_to_file(sys.stdout)
else:
print(metrics)
if not hasattr(torch_version, "git_version") and args.log_scuba:
@@ -73,16 +73,9 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
print(f"Plotting is not implemented for {args.op}")

if args.dump_csv:
if not os.path.exists(TRITON_BENCH_CSV_DUMP_PATH):
os.mkdir(TRITON_BENCH_CSV_DUMP_PATH)

csv_str = metrics.csv
csv_str_hash = abs(hash(csv_str)) % (10**8)
file_name = f"op_{args.op}_{csv_str_hash}.csv"
file_path = os.path.join(TRITON_BENCH_CSV_DUMP_PATH, file_name)
with open(file_path, "w") as f:
f.write(csv_str)
print(f"[TritonBench] Dumped csv to {file_path}")
os.makedirs(TRITON_BENCH_CSV_DUMP_PATH, exist_ok=True)
path = metrics.write_csv(TRITON_BENCH_CSV_DUMP_PATH)
print(f"[TritonBench] Dumped csv to {path}")
return metrics

def run(args: List[str] = []):
