[air/train/benchmark] Add TF GPU 4x4 benchmark (ray-project#26776)
Signed-off-by: Stefan van der Kleij <[email protected]>
krfricke authored and Stefan van der Kleij committed Aug 18, 2022
1 parent cfe34f2 commit be6eecc
Showing 2 changed files with 66 additions and 7 deletions.
36 changes: 29 additions & 7 deletions release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
@@ -187,6 +187,8 @@ def train_tf_vanilla(
str(rank),
"--worker-ip-ports",
ip_port_str,
"--batch-size",
str(config["batch_size"]),
]
+ (["--use-gpu"] if use_gpu else [])
for rank in range(num_workers)
@@ -216,18 +218,22 @@ def cli():
@click.option("--num-workers", type=int, default=4)
@click.option("--cpus-per-worker", type=int, default=8)
@click.option("--use-gpu", is_flag=True, default=False)
@click.option("--batch-size", type=int, default=64)
def run(
num_runs: int = 1,
num_epochs: int = 4,
num_workers: int = 4,
cpus_per_worker: int = 8,
use_gpu: bool = False,
batch_size: int = 64,
smoke_test: bool = False,
):
import ray
from benchmark_util import upload_file_to_all_nodes, run_command_on_all_nodes

config = CONFIG.copy()
config["epochs"] = num_epochs
config["batch_size"] = batch_size

ray.init("auto")
print("Preparing Tensorflow benchmark: Downloading MNIST")
@@ -241,6 +247,8 @@ def run(
times_vanilla = []
losses_vanilla = []
for run in range(1, num_runs + 1):
time.sleep(2)

print(f"[Run {run}/{num_runs}] Running Tensorflow Ray benchmark")

time_ray, loss_ray = train_tf_ray_air(
@@ -255,16 +263,27 @@ def run(
f"{time_ray:.2f} seconds. Observed loss = {loss_ray:.4f}"
)

time.sleep(5)
time.sleep(2)

print(f"[Run {run}/{num_runs}] Running Tensorflow vanilla benchmark")

time_vanilla, loss_vanilla = train_tf_vanilla(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
config=config,
)
# Todo: Vanilla runs are sometimes failing. We just retry here, but we should
# get to the bottom of it.
time_vanilla = loss_vanilla = 0.0
for i in range(3):
try:
time_vanilla, loss_vanilla = train_tf_vanilla(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
config=config,
)
except Exception as e:
if i >= 2:
raise RuntimeError("Vanilla TF run failed 3 times") from e
print("Vanilla TF run failed:", e)
continue
break

print(
f"[Run {run}/{num_runs}] Finished vanilla training ({num_epochs} epochs) "
@@ -332,16 +351,19 @@ def run(
@click.option("--num-workers", type=int, default=4)
@click.option("--rank", type=int, default=0)
@click.option("--worker-ip-ports", type=str, default="")
@click.option("--batch-size", type=int, default=64)
@click.option("--use-gpu", is_flag=True, default=False)
def worker(
num_epochs: int = 4,
num_workers: int = 4,
rank: int = 0,
worker_ip_ports: str = "",
batch_size: int = 64,
use_gpu: bool = False,
):
config = CONFIG.copy()
config["epochs"] = num_epochs
config["batch_size"] = batch_size

# Parse worker ip ports
worker_ip_port_list = worker_ip_ports.split(",")
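
Note on the retry logic added above: the flaky vanilla TensorFlow run is now wrapped in a bounded retry (up to three attempts, re-raising on the final failure). As a generic reference, here is a minimal standalone sketch of that pattern; run_with_retries and its signature are illustrative only and not part of this commit:

from typing import Callable, Optional, TypeVar

T = TypeVar("T")


def run_with_retries(fn: Callable[[], T], max_attempts: int = 3) -> T:
    # Call fn until it succeeds; give up and re-raise after max_attempts failures.
    last_exc: Optional[BaseException] = None
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as e:
            last_exc = e
            print(f"Attempt {attempt}/{max_attempts} failed: {e}")
    raise RuntimeError(f"All {max_attempts} attempts failed") from last_exc
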
37 changes: 37 additions & 0 deletions release/release_tests.yaml
@@ -490,6 +490,43 @@
alert: default


- name: air_benchmark_tensorflow_mnist_gpu_4x4
group: AIR tests
working_dir: air_tests/air_benchmarks

frequency: weekly
team: ml
env: staging

cluster:
cluster_env: app_config.yaml
cluster_compute: compute_gpu_4x4.yaml

run:
timeout: 3600
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 64 --use-gpu

wait_for_nodes:
num_nodes: 4

type: sdk_command
file_manager: job

smoke_test:
frequency: nightly

cluster:
cluster_compute: compute_gpu_2x2.yaml

run:
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu

wait_for_nodes:
num_nodes: 2

alert: default
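
For context on the new --batch-size 64 / --batch-size 512 arguments in the script lines above: the option is added to both the run and worker Click commands in tensorflow_benchmark.py and copied into the shared config dict that the Ray AIR trainer and the vanilla subprocess workers both read. A minimal, self-contained sketch of that wiring (the CONFIG contents here are placeholders, not the benchmark's real defaults):

import click

CONFIG = {"lr": 0.001}  # placeholder; the real CONFIG lives in tensorflow_benchmark.py


@click.command()
@click.option("--num-epochs", type=int, default=4)
@click.option("--batch-size", type=int, default=64)
def run(num_epochs: int, batch_size: int) -> None:
    # Copy the CLI values into the config dict handed to the training functions.
    config = CONFIG.copy()
    config["epochs"] = num_epochs
    config["batch_size"] = batch_size
    click.echo(f"Training config: {config}")


if __name__ == "__main__":
    run()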


- name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
group: AIR tests
working_dir: air_tests/air_benchmarks
