[air/train/benchmark] Add TF GPU 4x4 benchmark (ray-project#26776)
Signed-off-by: Stefan van der Kleij <[email protected]>
krfricke authored and Stefan van der Kleij committed Aug 18, 2022
1 parent cfe34f2 commit be6eecc
Showing 2 changed files with 66 additions and 7 deletions.
36 changes: 29 additions & 7 deletions release/air_tests/air_benchmarks/workloads/tensorflow_benchmark.py
@@ -187,6 +187,8 @@ def train_tf_vanilla(
str(rank),
"--worker-ip-ports",
ip_port_str,
"--batch-size",
str(config["batch_size"]),
]
+ (["--use-gpu"] if use_gpu else [])
for rank in range(num_workers)
@@ -216,18 +218,22 @@ def cli():
@click.option("--num-workers", type=int, default=4)
@click.option("--cpus-per-worker", type=int, default=8)
@click.option("--use-gpu", is_flag=True, default=False)
@click.option("--batch-size", type=int, default=64)
def run(
num_runs: int = 1,
num_epochs: int = 4,
num_workers: int = 4,
cpus_per_worker: int = 8,
use_gpu: bool = False,
batch_size: int = 64,
smoke_test: bool = False,
):
import ray
from benchmark_util import upload_file_to_all_nodes, run_command_on_all_nodes

config = CONFIG.copy()
config["epochs"] = num_epochs
config["batch_size"] = batch_size

ray.init("auto")
print("Preparing Tensorflow benchmark: Downloading MNIST")
@@ -241,6 +247,8 @@ def run(
times_vanilla = []
losses_vanilla = []
for run in range(1, num_runs + 1):
time.sleep(2)

print(f"[Run {run}/{num_runs}] Running Tensorflow Ray benchmark")

time_ray, loss_ray = train_tf_ray_air(
@@ -255,16 +263,27 @@ def run(
f"{time_ray:.2f} seconds. Observed loss = {loss_ray:.4f}"
)

time.sleep(5)
time.sleep(2)

print(f"[Run {run}/{num_runs}] Running Tensorflow vanilla benchmark")

time_vanilla, loss_vanilla = train_tf_vanilla(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
config=config,
)
# Todo: Vanilla runs are sometimes failing. We just retry here, but we should
# get to the bottom of it.
time_vanilla = loss_vanilla = 0.0
for i in range(3):
try:
time_vanilla, loss_vanilla = train_tf_vanilla(
num_workers=num_workers,
cpus_per_worker=cpus_per_worker,
use_gpu=use_gpu,
config=config,
)
except Exception as e:
if i >= 2:
raise RuntimeError("Vanilla TF run failed 3 times") from e
print("Vanilla TF run failed:", e)
continue
break

print(
f"[Run {run}/{num_runs}] Finished vanilla training ({num_epochs} epochs) "
@@ -332,16 +351,19 @@ def run(
@click.option("--num-workers", type=int, default=4)
@click.option("--rank", type=int, default=0)
@click.option("--worker-ip-ports", type=str, default="")
@click.option("--batch-size", type=int, default=64)
@click.option("--use-gpu", is_flag=True, default=False)
def worker(
num_epochs: int = 4,
num_workers: int = 4,
rank: int = 0,
worker_ip_ports: str = "",
batch_size: int = 64,
use_gpu: bool = False,
):
config = CONFIG.copy()
config["epochs"] = num_epochs
config["batch_size"] = batch_size

# Parse worker ip ports
worker_ip_port_list = worker_ip_ports.split(",")
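
Note on the retry logic added above: the flaky vanilla TensorFlow run is now wrapped in a bounded retry (up to three attempts, re-raising on the final failure). As a generic reference, here is a minimal standalone sketch of that pattern; run_with_retries and its signature are illustrative only and not part of this commit:

from typing import Callable, Optional, TypeVar

T = TypeVar("T")


def run_with_retries(fn: Callable[[], T], max_attempts: int = 3) -> T:
    # Call fn until it succeeds; give up and re-raise after max_attempts failures.
    last_exc: Optional[BaseException] = None
    for attempt in range(1, max_attempts + 1):
        try:
            return fn()
        except Exception as e:
            last_exc = e
            print(f"Attempt {attempt}/{max_attempts} failed: {e}")
    raise RuntimeError(f"All {max_attempts} attempts failed") from last_exc
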
37 changes: 37 additions & 0 deletions release/release_tests.yaml
@@ -490,6 +490,43 @@
alert: default


- name: air_benchmark_tensorflow_mnist_gpu_4x4
group: AIR tests
working_dir: air_tests/air_benchmarks

frequency: weekly
team: ml
env: staging

cluster:
cluster_env: app_config.yaml
cluster_compute: compute_gpu_4x4.yaml

run:
timeout: 3600
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 200 --num-workers 16 --cpus-per-worker 4 --batch-size 64 --use-gpu

wait_for_nodes:
num_nodes: 4

type: sdk_command
file_manager: job

smoke_test:
frequency: nightly

cluster:
cluster_compute: compute_gpu_2x2.yaml

run:
script: python workloads/tensorflow_benchmark.py run --num-runs 3 --num-epochs 60 --num-workers 4 --cpus-per-worker 4 --batch-size 512 --use-gpu

wait_for_nodes:
num_nodes: 2

alert: default
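
For context on the new --batch-size 64 / --batch-size 512 arguments in the script lines above: the option is added to both the run and worker Click commands in tensorflow_benchmark.py and copied into the shared config dict that the Ray AIR trainer and the vanilla subprocess workers both read. A minimal, self-contained sketch of that wiring (the CONFIG contents here are placeholders, not the benchmark's real defaults):

import click

CONFIG = {"lr": 0.001}  # placeholder; the real CONFIG lives in tensorflow_benchmark.py


@click.command()
@click.option("--num-epochs", type=int, default=4)
@click.option("--batch-size", type=int, default=64)
def run(num_epochs: int, batch_size: int) -> None:
    # Copy the CLI values into the config dict handed to the training functions.
    config = CONFIG.copy()
    config["epochs"] = num_epochs
    config["batch_size"] = batch_size
    click.echo(f"Training config: {config}")


if __name__ == "__main__":
    run()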


- name: air_benchmark_pytorch_training_e2e_gpu_1x1_20gb
group: AIR tests
working_dir: air_tests/air_benchmarks
