Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datasets] Make data nightly tests still work with lazy execution #31460

Merged
merged 2 commits into from
Jan 19, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Make data nightly tests still work with lazy execution
Signed-off-by: Cheng Su <[email protected]>
  • Loading branch information
c21 committed Jan 18, 2023
commit fb1f0e7e7be49f2f1a12c423c6f36d22580c0a22
6 changes: 3 additions & 3 deletions release/nightly_tests/dataset/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,17 +86,17 @@ def infer(batch):
ray_remote_args={"num_cpus": 0.5},
)
# Do a blocking map so that we can measure the download time.
ds = ds.map(lambda x: x)
ds = ds.map(lambda x: x).fully_executed()

end_download_time = time.time()
print("Preprocessing...")
ds = ds.map(preprocess)
ds = ds.map(preprocess).fully_executed()
end_preprocess_time = time.time()
print("Inferring...")
# NOTE: set a small batch size to avoid OOM on GRAM when doing inference.
ds = ds.map_batches(
infer, num_gpus=0.25, batch_size=128, batch_format="pandas", compute="actors"
)
).fully_executed()

end_time = time.time()

Expand Down
16 changes: 13 additions & 3 deletions release/nightly_tests/dataset/map_batches_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,22 @@ def map_batches(
batch_format: Literal["default", "pandas", "pyarrow", "numpy"],
compute: Optional[Union[str, ComputeStrategy]] = None,
num_calls: Optional[int] = 1,
is_eager_executed: Optional[bool] = False,
) -> Dataset:

ds = input_ds
if is_eager_executed:
ds.fully_executed()

for _ in range(num_calls):
ds = ds.map_batches(
lambda x: x,
batch_format=batch_format,
batch_size=batch_size,
compute=compute,
)
if is_eager_executed:
ds.fully_executed()
return ds


Expand All @@ -37,6 +43,7 @@ def run_map_batches_benchmark(benchmark: Benchmark):
"s3://air-example-data/ursa-labs-taxi-data/by_year/2018/01"
)
lazy_input_ds = input_ds.lazy()
input_ds.fully_executed()

batch_formats = ["pandas", "numpy"]
batch_sizes = [1024, 2048, 4096, None]
Expand All @@ -56,14 +63,15 @@ def run_map_batches_benchmark(benchmark: Benchmark):
continue

num_calls = 2
test_name = f"map-batches-{batch_format}-{batch_size}-{num_calls}-default"
test_name = f"map-batches-{batch_format}-{batch_size}-{num_calls}-eager"
benchmark.run(
test_name,
map_batches,
input_ds=input_ds,
batch_format=batch_format,
batch_size=batch_size,
num_calls=num_calls,
is_eager_executed=True,
)
test_name = f"map-batches-{batch_format}-{batch_size}-{num_calls}-lazy"
benchmark.run(
Expand All @@ -86,7 +94,7 @@ def run_map_batches_benchmark(benchmark: Benchmark):

test_name = (
f"map-batches-{batch_format}-{batch_size}-{num_calls}-"
f"{compute_strategy}-default"
f"{compute_strategy}-eager"
)
benchmark.run(
test_name,
Expand All @@ -96,6 +104,7 @@ def run_map_batches_benchmark(benchmark: Benchmark):
batch_size=batch_size,
compute=compute,
num_calls=num_calls,
is_eager_executed=True,
)
test_name = (
f"map-batches-{batch_format}-{batch_size}-{num_calls}-"
Expand Down Expand Up @@ -131,7 +140,8 @@ def run_map_batches_benchmark(benchmark: Benchmark):
# Test reading multiple files.
input_ds = ray.data.read_parquet(
"s3://air-example-data/ursa-labs-taxi-data/by_year/2018"
)
).fully_executed()

for batch_format in batch_formats:
for compute in ["tasks", "actors"]:
test_name = f"map-batches-{batch_format}-{compute}-multi-files"
Expand Down
4 changes: 3 additions & 1 deletion release/nightly_tests/dataset/ray_sgd_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@

def read_dataset(path: str) -> ray.data.Dataset:
    """Read a parquet dataset, repartition to 400 blocks, and shuffle it.

    The dataset is forced to execute eagerly before being returned so that
    downstream timing measures real work rather than deferred lazy execution.
    """
    print(f"reading data from {path}")
    shuffled = ray.data.read_parquet(path).repartition(400).random_shuffle()
    # Trigger full execution of the lazy pipeline up front.
    shuffled.fully_executed()
    return shuffled


class DataPreprocessor:
Expand Down
1 change: 1 addition & 0 deletions release/nightly_tests/dataset/sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def make_block(count: int, num_columns: int) -> Block:
ds = ds.random_shuffle()
else:
ds = ds.sort(key="c_0")
ds.fully_executed()
except Exception as e:
exc = e
pass
Expand Down