Log more errors + make CSV writing more robust
Summary:
Previously, we only caught specific CUDA OOM errors, but benchmarks can fail in
other ways too. Let's make it more robust by catching and logging all
exceptions.
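
A minimal, self-contained sketch of the pattern (the function and argument names are assumed for illustration; the real change is in triton_op.py below): keep the specific CUDA OOM label, and record any other exception's message instead of letting it escape.

    import torch

    # Sketch only: run one metric function, recording failures instead of raising.
    def run_metric(func, *args):
        result, error_msg = None, None
        try:
            result = func(*args)
        except torch.cuda.OutOfMemoryError:
            error_msg = "CUDA OOM"
        except Exception as e:  # benchmarks can fail in many other ways
            error_msg = str(e)
        return result, error_msg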

While there was already code to log exception messages, it often produced
malformed CSVs because fields were not quoted. We should use Python's csv
module, which handles quoting, to avoid this issue.
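
For illustration, csv.QUOTE_MINIMAL quotes any field that contains the delimiter, so an error message with embedded separators no longer breaks the column layout (the operator name and message below are made up):

    import csv
    import sys

    writer = csv.writer(sys.stdout, delimiter=";", quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["op", "latency", "error_msg"])
    # The message contains the ";" delimiter, so the writer quotes that field.
    writer.writerow(["addmm", "", "RuntimeError: shape mismatch; expected 2-D input"])
    # Output: addmm;;"RuntimeError: shape mismatch; expected 2-D input"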

Additionally, the previous logic would record the error message in each metric column
of the failed benchmark. This was redundant, so I've changed it to emit the message
only once.
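
In row-building terms (hypothetical metric names), a failed benchmark now contributes its message once and pads the remaining columns:

    # Hypothetical metric columns for one backend of a failed benchmark.
    key_metrics = ["latency", "tflops", "speedup"]
    error_msg = "CUDA OOM"
    row = [error_msg] + [None] * (len(key_metrics) - 1)
    # row == ["CUDA OOM", None, None]  -- one message, not one copy per column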

Finally, since Python's csv writer writes directly to a file object rather than
building a string first, the previous CSV file naming convention (a hash of the
file's contents) no longer applies. Instead I've used NamedTemporaryFile to get
a unique file name.
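
A sketch of that naming trick; this variant passes the target directory via dir= (the patch folds it into prefix= instead), and the helper name is assumed:

    import tempfile

    def unique_csv_path(dir_path: str, op_name: str) -> str:
        # NamedTemporaryFile is used only to pick a collision-free name;
        # delete=False keeps the file around after the with-block exits.
        with tempfile.NamedTemporaryFile(
            mode="w",
            dir=dir_path,
            prefix=f"op_{op_name}_",
            suffix=".csv",
            newline="",
            delete=False,
        ) as fileobj:
            fileobj.write("x_val;latency\n")  # a real caller writes the CSV here
        return fileobj.name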

Reviewed By: chenyang78

Differential Revision: D57785120

fbshipit-source-id: 73c76bba7661b60a7357aaba3d5b9659b533479e
int3 authored and facebook-github-bot committed May 27, 2024
1 parent d1a6363 commit d5d2762
Showing 2 changed files with 32 additions and 19 deletions.
36 changes: 28 additions & 8 deletions torchbenchmark/util/triton_op.py
@@ -3,6 +3,7 @@
import functools
import gc
import json
import os
import random
import time
import warnings
@@ -206,10 +207,11 @@ def select_metric(m):
row.append(x_only_metric_dict[x_only_metric])
for k in y_val_keys:
metrics_dict = asdict(y_val[k])
if metrics_dict["error_msg"]:
row.append(metrics_dict["error_msg"])
row.extend([None] * (len(key_metrics[k]) - 1))
continue
for metric in key_metrics[k]:
if metrics_dict["error_msg"]:
row.append(metrics_dict["error_msg"])
continue
_metrics_dict = (
metrics_dict["extra_metrics"]
if metric in metrics_dict["extra_metrics"]
@@ -224,12 +226,28 @@ def select_metric(m):
table.append(row)
return headers, table

@property
def csv(self):
def write_csv_to_file(self, fileobj):
import csv

headers, table = self._table()
headers = "; ".join(headers)
table = "\n".join(["; ".join([str(v) for v in row]) for row in table])
return f"{headers}\n{table}"
writer = csv.writer(fileobj, delimiter=";", quoting=csv.QUOTE_MINIMAL)
writer.writerow(headers)
writer.writerows(table)

def write_csv(self, dir_path):
import tempfile

# This is just a way to create a unique filename. It's not actually a
# temporary file (since delete=False).
with tempfile.NamedTemporaryFile(
mode='w',
prefix=os.path.join(dir_path, f"op_{self.op_name}_"),
suffix=".csv",
newline="",
delete=False,
) as fileobj:
self.write_csv_to_file(fileobj)
return fileobj.name

@property
def x_vals(self):
Expand Down Expand Up @@ -779,6 +797,8 @@ def _init_extra_metrics() -> Dict[str, Any]:
metrics.extra_metrics[metric_name] = func(fn, self.example_inputs, metrics)
except torch.cuda.OutOfMemoryError:
metrics.error_msg = "CUDA OOM"
except Exception as e:
metrics.error_msg = str(e)
return metrics

def get_peak_mem(
15 changes: 4 additions & 11 deletions userbenchmark/triton/run.py
@@ -59,7 +59,7 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
metrics = opbench.run(args.warmup, args.iter)
if not args.skip_print:
if args.csv:
print(metrics.csv)
metrics.write_csv_to_file(sys.stdout)
else:
print(metrics)
if not hasattr(torch_version, "git_version") and args.log_scuba:
@@ -73,16 +73,9 @@ def _run(args: argparse.Namespace, extra_args: List[str]) -> BenchmarkOperatorRe
print(f"Plotting is not implemented for {args.op}")

if args.dump_csv:
if not os.path.exists(TRITON_BENCH_CSV_DUMP_PATH):
os.mkdir(TRITON_BENCH_CSV_DUMP_PATH)

csv_str = metrics.csv
csv_str_hash = abs(hash(csv_str)) % (10**8)
file_name = f"op_{args.op}_{csv_str_hash}.csv"
file_path = os.path.join(TRITON_BENCH_CSV_DUMP_PATH, file_name)
with open(file_path, "w") as f:
f.write(csv_str)
print(f"[TritonBench] Dumped csv to {file_path}")
os.makedirs(TRITON_BENCH_CSV_DUMP_PATH, exist_ok=True)
path = metrics.write_csv(TRITON_BENCH_CSV_DUMP_PATH)
print(f"[TritonBench] Dumped csv to {path}")
return metrics

def run(args: List[str] = []):
