Make the nightly workflow run on the main branch #2262

Closed
wants to merge 15 commits
7 changes: 3 additions & 4 deletions .github/workflows/v3-nightly.yml
@@ -1,4 +1,4 @@
name: TorchBench V3 nightly (A100)
name: TorchBench nightly (A100)
on:
workflow_dispatch:
schedule:
@@ -23,7 +23,6 @@ jobs:
- name: Checkout TorchBench v3.0 branch
uses: actions/checkout@v3
with:
ref: v3.0
path: benchmark
- name: Tune Nvidia GPU
run: |
@@ -40,14 +39,14 @@
. "${SETUP_SCRIPT}"
pushd benchmark
python install.py
- name: Run the torch-nightly userbenchmark
- name: Run the torch-nightly-test userbenchmark
run: |
. "${SETUP_SCRIPT}"
# remove old results
if [ -d benchmark-output ]; then rm -Rf benchmark-output; fi
pushd benchmark
if [ -d .userbenchmark ]; then rm -Rf .userbenchmark; fi
python run_benchmark.py torch-nightly -c v3-cuda-tests.yaml
python run_benchmark.py torch-nightly-test -c nightly.yaml
cp -r ./.userbenchmark/torch-nightly ../benchmark-output
- name: Detect potential regressions
continue-on-error: true
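The workflow now drives the renamed torch-nightly-test userbenchmark with its own nightly.yaml config. The benchmark's entry point itself is not part of this excerpt; as a hedged sketch only, assuming the usual userbenchmark convention of a `run(args)` function and the `run_benchmark_group_config` helper added later in this PR, it could be wired up roughly like this:

```python
# Hypothetical sketch of a userbenchmark/torch-nightly-test/run.py entry point
# (not shown in this diff). Assumes the conventional userbenchmark run(args)
# interface and the run_benchmark_group_config helper introduced in this PR.
import argparse
from typing import List

from userbenchmark.group_bench.run_config import run_benchmark_group_config


def run(args: List[str]) -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", required=True, help="Path to the group config YAML.")
    parser.add_argument("--dryrun", action="store_true", help="Only enumerate configs; do not run.")
    parsed = parser.parse_args(args)
    results = run_benchmark_group_config(parsed.config, dryrun=parsed.dryrun)
    for group_result in results:
        print(group_result)
```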
22 changes: 21 additions & 1 deletion torchbenchmark/util/experiment/instantiator.py
@@ -24,11 +24,13 @@

@dataclasses.dataclass
class TorchBenchModelConfig:
model_set: str
name: str
test: str
device: str
batch_size: Optional[int]
extra_args: List[str]
metrics: List[str]
extra_env: Optional[Dict[str, str]] = None
output_dir: Optional[pathlib.Path] = None

@@ -153,7 +155,25 @@ def list_extended_models(suite_name: str = "all") -> List[str]:
return list_extended_timm_models()
elif suite_name == "all":
return list_extended_huggingface_models() + list_extended_timm_models()
elif suite_name == "torchbench":
return list_models()
else:
assert (
False
), "Currently, we only support extended model set huggingface or timm."
), f"Currently, we only support model set torchbench, huggingface or timm, but get {suite_name}."


def get_model_set_from_model_name(model_name: str) -> str:
    from torchbenchmark.util.framework.huggingface.extended_configs import (
        list_extended_huggingface_models,
    )
    from torchbenchmark.util.framework.timm.extended_configs import (
        list_extended_timm_models,
    )
    if model_name in list_extended_huggingface_models():
        return "huggingface"
    if model_name in list_extended_timm_models():
        return "timm"
    if model_name in list_models():
        return "torchbench"
    assert False, f"Model {model_name} is not found in any model set."
106 changes: 70 additions & 36 deletions torchbenchmark/util/experiment/metrics.py
@@ -5,7 +5,7 @@
import dataclasses
import pathlib
import time
from typing import List, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union, Dict, Any

import torch
from torchbenchmark import ModelTask
@@ -22,12 +22,14 @@
class TorchBenchModelMetrics:
latencies: List[float]
throughputs: List[float]
accuracy: Optional[bool]
cpu_peak_mem: Optional[float]
gpu_peak_mem: Optional[float]
ttfb: Optional[float] # time-to-first-batch
pt2_compilation_time: Optional[float]
pt2_graph_breaks: Optional[float]
model_flops: Optional[float]
error_msg: Optional[str]


def get_latencies(
@@ -139,22 +141,25 @@ def work_func():

def get_model_test_metrics(
model: Union[BenchmarkModel, ModelTask],
metrics=[],
required_metrics=[],
export_metrics_file=False,
metrics_gpu_backend="nvml",
nwarmup=WARMUP_ROUNDS,
num_iter=BENCHMARK_ITERS,
) -> TorchBenchModelMetrics:
import os

latencies = None
throughputs = None
cpu_peak_mem = None
gpu_peak_mem = None
ttfb = None
pt2_compilation_time = None
pt2_graph_breaks = None
model_flops = None
metrics = TorchBenchModelMetrics(
latencies=[],
throughputs=[],
accuracy=None,
cpu_peak_mem=None,
gpu_peak_mem=None,
ttfb=None,
pt2_compilation_time=None,
pt2_graph_breaks=None,
model_flops=None,
error_msg=None,
)
if not (isinstance(model, BenchmarkModel) or isinstance(model, ModelTask)):
raise ValueError(
f"Expected BenchmarkModel or ModelTask, get type: {type(model)}"
@@ -167,51 +172,42 @@ def get_model_test_metrics(
if isinstance(model, BenchmarkModel)
else model.get_model_attribute("device")
)
if "latencies" in metrics or "throughputs" in metrics:
latencies = get_latencies(
if "latencies" in required_metrics or "throughputs" in required_metrics:
metrics.latencies = get_latencies(
model.invoke, device, nwarmup=nwarmup, num_iter=num_iter
)
if "cpu_peak_mem" in metrics or "gpu_peak_mem" in metrics:
cpu_peak_mem, _device_id, gpu_peak_mem = get_peak_memory(
if "cpu_peak_mem" in required_metrics or "gpu_peak_mem" in required_metrics:
metrics.cpu_peak_mem, _device_id, metrics.gpu_peak_mem = get_peak_memory(
model.invoke,
device,
export_metrics_file=export_metrics_file,
metrics_needed=metrics,
metrics_needed=required_metrics,
metrics_gpu_backend=metrics_gpu_backend,
cpu_monitored_pid=model_pid,
)
if "throughputs" in metrics:
throughputs = [model.batch_size * 1000 / latency for latency in latencies]
if "pt2_compilation_time" in metrics:
pt2_compilation_time = (
if "throughputs" in required_metrics:
metrics.throughputs = [model.batch_size * 1000 / latency for latency in metrics.latencies]
if "pt2_compilation_time" in required_metrics:
metrics.pt2_compilation_time = (
model.get_model_attribute("pt2_compilation_time")
if isinstance(model, ModelTask)
else model.pt2_compilation_time
)
if "pt2_graph_breaks" in metrics:
pt2_graph_breaks = (
if "pt2_graph_breaks" in required_metrics:
metrics.pt2_graph_breaks = (
model.get_model_attribute("pt2_graph_breaks")
if isinstance(model, ModelTask)
else model.pt2_graph_breaks
)
if "model_flops" in metrics:
model_flops = get_model_flops(model)
if "ttfb" in metrics:
ttfb = (
if "model_flops" in required_metrics:
metrics.model_flops = get_model_flops(model)
if "ttfb" in required_metrics:
metrics.ttfb = (
model.get_model_attribute("ttfb")
if isinstance(model, ModelTask)
else model.ttfb
)
return TorchBenchModelMetrics(
latencies,
throughputs,
cpu_peak_mem,
gpu_peak_mem,
ttfb,
pt2_compilation_time,
pt2_graph_breaks,
model_flops,
)
return metrics


def get_model_accuracy(
@@ -242,3 +238,41 @@ def get_model_accuracy(
accuracy = model.accuracy
del model
return accuracy


def run_config(config: TorchBenchModelConfig,
               as_dict: bool = False,
               dryrun: bool = False,
               ) -> Union[TorchBenchModelMetrics, Dict[str, Any]]:
    """Run a benchmark config and return its metrics as a TorchBenchModelMetrics object,
    or as a plain dict when as_dict=True."""
    print(f"Running config {config} ...", flush=True, end="")
    metrics = TorchBenchModelMetrics(
        latencies=[],
        throughputs=[],
        accuracy=None,
        cpu_peak_mem=None,
        gpu_peak_mem=None,
        ttfb=None,
        pt2_compilation_time=None,
        pt2_graph_breaks=None,
        model_flops=None,
        error_msg=None,
    )
    if dryrun:
        print("[skip_by_dryrun]", flush=True)
        return dataclasses.asdict(metrics) if as_dict else metrics
    required_metrics = config.metrics.copy()
    accuracy = None
    if "accuracy" in required_metrics:
        accuracy = get_model_accuracy(config)
        required_metrics.remove("accuracy")
    if required_metrics:
        from torchbenchmark.util.experiment.instantiator import (
            load_model_isolated,
        )
        model_task = load_model_isolated(config)
        metrics = get_model_test_metrics(model_task, required_metrics=required_metrics)
    if "accuracy" in config.metrics:
        metrics.accuracy = accuracy
    print("[done]", flush=True)
    return dataclasses.asdict(metrics) if as_dict else metrics
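For reference, a minimal sketch of driving the new run_config helper directly; the model, device, and metric choices below are illustrative:

```python
# Minimal sketch of calling run_config directly; field values are illustrative.
from torchbenchmark.util.experiment.instantiator import TorchBenchModelConfig
from torchbenchmark.util.experiment.metrics import run_config

config = TorchBenchModelConfig(
    model_set="torchbench",
    name="resnet50",
    test="eval",
    device="cuda",
    batch_size=None,   # None keeps the model's default batch size
    extra_args=[],
    metrics=["latencies", "cpu_peak_mem", "gpu_peak_mem"],
)

# dryrun=True only prints the config and returns empty metrics;
# drop it to actually load the model in a subprocess and measure.
print(run_config(config, as_dict=True, dryrun=True))
```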
6 changes: 3 additions & 3 deletions userbenchmark/group_bench/run.py
@@ -186,10 +186,10 @@ def load_group_config(config_file: str) -> TorchBenchGroupBenchConfig:
]
metrics = data["metrics"] if "metrics" in data else []
group_configs = {}
for group_name in data["test_group"]:
for group_name in data["test_groups"]:
group_configs[group_name] = []
group_extra_args = list(filter(lambda x: bool(x), data["test_group"][group_name].get("extra_args", "").split(" ")))
for subgroup in data["test_group"][group_name]["subgroup"]:
group_extra_args = list(filter(lambda x: bool(x), data["test_groups"][group_name].get("extra_args", "").split(" ")))
for subgroup in data["test_groups"][group_name]["subgroup"]:
subgroup_extra_args = subgroup.get("extra_args", "")
subgroup_extra_args = "" if subgroup_extra_args == None else subgroup_extra_args
subgroup_extra_args_list = list(filter(lambda x: bool(x), subgroup_extra_args.split(" ")))
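The rename only changes the YAML key that load_group_config reads; after yaml.safe_load the function now expects a structure along these lines (the group name, subgroups, and extra_args values here are made up for illustration):

```python
# Illustrative shape of the parsed group-bench config after the
# test_group -> test_groups rename; group and subgroup contents are hypothetical.
data = {
    "metrics": ["latencies"],
    "test_groups": {
        "eager": {
            "extra_args": "",
            "subgroup": [
                {"extra_args": ""},                         # baseline subgroup
                {"extra_args": "--torchdynamo inductor"},   # hypothetical compiled subgroup
            ],
        },
    },
}
```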
102 changes: 102 additions & 0 deletions userbenchmark/group_bench/run_config.py
@@ -0,0 +1,102 @@
import yaml
import numpy
import itertools
from typing import Any, Dict, List, Optional, Tuple
from torchbenchmark.util.experiment.instantiator import TorchBenchModelConfig, list_extended_models, get_model_set_from_model_name
from torchbenchmark.util.experiment.metrics import run_config


def _get_models(models: Optional[List[str]] = None, model_set: Optional[List[str]] = None) -> List[Tuple[str, str]]:
    result = set(map(lambda x: (get_model_set_from_model_name(x), x), models)) if models else set()
    if model_set:
        for s in model_set:
            result = result.union(set(map(lambda x: (s, x), list_extended_models(s))))
    return sorted(list(result))


def config_obj_to_model_configs(config: Dict[str, Any]) -> Dict[str, Dict[str, List[TorchBenchModelConfig]]]:
    models: List[Tuple[str, str]] = _get_models(models=config.get("model", None), model_set=config.get("model_set", None))
    batch_sizes = config.get("batch_size", [None])
    tests = config.get("test", ["train", "eval"])
    devices = config.get("device", ["cuda"])
    precisions = config.get("precision", [None])
    metrics = config["metrics"]
    test_groups = config["test_groups"]
    result = {}
    for group_name in test_groups.keys():
        extra_args = test_groups[group_name].get("extra_args", [])
        extra_args = [] if extra_args is None else extra_args.copy()
        cfgs = itertools.product(*[devices, tests, batch_sizes, precisions, models])
        for device, test, batch_size, precision, model_name_with_set in cfgs:
            # Build a per-config copy so precision flags do not accumulate across configs.
            config_extra_args = list(extra_args)
            if precision:
                config_extra_args.extend(["--precision", precision])
            if batch_size:
                batch_size = int(batch_size)
            common_key = (device, test, batch_size, precision)
            if common_key not in result:
                result[common_key] = {}
            if group_name not in result[common_key]:
                result[common_key][group_name] = []
            result[common_key][group_name].append(
                TorchBenchModelConfig(
                    model_set=model_name_with_set[0],
                    name=model_name_with_set[1],
                    device=device,
                    test=test,
                    batch_size=batch_size,
                    extra_args=config_extra_args,
                    extra_env=None,
                    metrics=metrics,
                )
            )
    return result


def _common_key_to_group_key(common_key: Tuple[str, str, int, str]):
    device, test, batch_size, precision = common_key
    key = {
        "device": device,
        "test": test,
        "batch_size": batch_size if batch_size else "default",
        "precision": precision if precision else "default",
    }
    return key


def _config_result_to_group_result(
    group_name: str,
    model_set: str,
    model_name: str,
    metrics: Dict[str, Any],
    required_metrics: List[str],
    metric_aggregation: str = "p50"):
    # output metric format: <model_set>_<model_name>[<group_name>]_<metric_name>
    result = {}
    for metric in required_metrics:
        metric_name = f"{model_set}_{model_name}[{group_name}]_{metric}"
        metric_value = metrics[metric]
        if isinstance(metrics[metric], list) and metric_aggregation == "p50":
            metric_value = numpy.median(metrics[metric])
        result[metric_name] = metric_value
    return result


def run_benchmark_group_config(group_config_file: str, dryrun: bool = False) -> List[Dict[str, Any]]:
    result = []
    with open(group_config_file, "r") as fp:
        config_obj = yaml.safe_load(fp)
    configs: Dict[str, Dict[str, List[TorchBenchModelConfig]]] = config_obj_to_model_configs(config_obj)
    for common_key in configs.keys():
        group_key = _common_key_to_group_key(common_key)
        group_result = {"group_key": group_key, "group_results": {}}
        for group_name in configs[common_key]:
            for x in configs[common_key][group_name]:
                group_result["group_results"].update(
                    _config_result_to_group_result(
                        group_name=group_name,
                        model_set=x.model_set,
                        model_name=x.name,
                        metrics=run_config(x, as_dict=True, dryrun=dryrun),
                        required_metrics=x.metrics))
        result.append(group_result)
    return result
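And a hedged end-to-end sketch of the new group runner against the nightly config added below; dryrun=True only enumerates the benchmark matrix without loading any model (the relative path assumes the repository root as the working directory):

```python
# Sketch: enumerate (or run) the benchmark group config shipped with the
# torch-nightly-test userbenchmark.
from userbenchmark.group_bench.run_config import run_benchmark_group_config

results = run_benchmark_group_config(
    "userbenchmark/torch-nightly-test/nightly.yaml", dryrun=True
)
for group in results:
    # Each entry pairs a group key (device/test/batch_size/precision) with a flat
    # dict of "<model_set>_<model_name>[<group_name>]_<metric>" values.
    print(group["group_key"], len(group["group_results"]))
```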
16 changes: 16 additions & 0 deletions userbenchmark/torch-nightly-test/nightly.yaml
@@ -0,0 +1,16 @@
model_set:
  - torchbench
  - huggingface
  - timm
test:
  - train
  - eval
device:
  - cuda
metrics:
  - latencies
  - cpu_peak_mem
  - gpu_peak_mem
test_groups:
  eager:
    extra_args:
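With an empty extra_args, the eager group runs every model in its default eager configuration, and the keys collected into group_results follow the `<model_set>_<model_name>[<group_name>]_<metric>` convention from _config_result_to_group_result. A small illustration (the model name is just an example from the torchbench suite):

```python
# Example of the result keys produced for this config; "resnet50" is only
# an illustrative torchbench model name.
model_set, model_name, group_name = "torchbench", "resnet50", "eager"
for metric in ["latencies", "cpu_peak_mem", "gpu_peak_mem"]:
    print(f"{model_set}_{model_name}[{group_name}]_{metric}")
# torchbench_resnet50[eager]_latencies
# torchbench_resnet50[eager]_cpu_peak_mem
# torchbench_resnet50[eager]_gpu_peak_mem
```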