[Telemetry] Add Telemetry for Ray Train Utilities (ray-project#39363)

Signed-off-by: woshiyyya <[email protected]>
krfricke · Sep 7, 2023 · 449afc9 · 449afc9
1 parent 3e8a1dc
commit 449afc9
Show file tree

Hide file tree

Showing 6 changed files with 191 additions and 4 deletions.
diff --git a/python/ray/train/BUILD b/python/ray/train/BUILD
@@ -632,6 +632,14 @@ py_test(
  deps = [":train_lib"]
 )
 
+py_test(
+ name = "test_train_usage",
+ size = "medium",
+ srcs = ["tests/test_train_usage.py"],
+ tags = ["team:ml", "exclusive"],
+ deps = [":train_lib"],
+)
+
 py_test(
  name = "test_training_iterator",
  size = "large",

diff --git a/python/ray/train/huggingface/transformers/_transformers_utils.py b/python/ray/train/huggingface/transformers/_transformers_utils.py
@@ -20,6 +20,7 @@
  LegacyTransformersCheckpoint,
 )
 from ray.util import PublicAPI
+from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
 
 logger = logging.getLogger(__name__)
 
@@ -263,6 +264,10 @@ class RayTrainReportCallback(TrainerCallback):
 
  """
 
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ record_extra_usage_tag(TagKey.TRAIN_TRANSFORMERS_RAYTRAINREPORTCALLBACK, "1")
+
  def on_save(self, args, state, control, **kwargs):
  """Event called after a checkpoint save."""
  with TemporaryDirectory() as tmpdir:
@@ -331,4 +336,5 @@ def get_eval_dataloader(
 
  trainer.__class__ = RayTransformersTrainer
 
+ record_extra_usage_tag(TagKey.TRAIN_TRANSFORMERS_PREPARE_TRAINER, "1")
  return trainer
diff --git a/python/ray/train/lightning/_lightning_utils.py b/python/ray/train/lightning/_lightning_utils.py
@@ -4,6 +4,7 @@
 from ray.air.constants import MODEL_KEY
 from ray.data.dataset import DataIterator
 from ray.util import PublicAPI
+from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
 
 import logging
 import shutil
@@ -63,6 +64,10 @@ class RayDDPStrategy(DDPStrategy):
  https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.DDPStrategy.html
  """
 
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYDDPSTRATEGY, "1")
+
  @property
  def root_device(self) -> torch.device:
  return get_worker_root_device()
@@ -83,6 +88,10 @@ class RayFSDPStrategy(FSDPStrategy):
  https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.FSDPStrategy.html
  """
 
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYFSDPSTRATEGY, "1")
+
  @property
  def root_device(self) -> torch.device:
  return get_worker_root_device()
@@ -122,6 +131,10 @@ class RayDeepSpeedStrategy(DeepSpeedStrategy):
  https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.strategies.DeepSpeedStrategy.html
  """
 
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYDEEPSPEEDSTRATEGY, "1")
+
  @property
  def root_device(self) -> torch.device:
  return get_worker_root_device()
@@ -138,6 +151,10 @@ def distributed_sampler_kwargs(self) -> Dict[str, Any]:
 class RayLightningEnvironment(LightningEnvironment):
  """Setup Lightning DDP training environment for Ray cluster."""
 
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYLIGHTNINGENVIRONMENT, "1")
+
  def world_size(self) -> int:
  return train.get_context().get_world_size()
 
@@ -188,6 +205,7 @@ def prepare_trainer(trainer: pl.Trainer) -> pl.Trainer:
  f"but got {type(cluster_environment)}!"
  )
 
+ record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_PREPARE_TRAINER, "1")
  return trainer
 
 
@@ -203,6 +221,8 @@ def __init__(self) -> None:
  if os.path.isdir(self.tmpdir_prefix) and self.local_rank == 0:
  shutil.rmtree(self.tmpdir_prefix)
 
+ record_extra_usage_tag(TagKey.TRAIN_LIGHTNING_RAYTRAINREPORTCALLBACK, "1")
+
  def on_train_epoch_end(self, trainer, pl_module) -> None:
  # Creates a checkpoint dir with fixed name
  tmpdir = os.path.join(self.tmpdir_prefix, str(trainer.current_epoch))

diff --git a/python/ray/train/tests/test_train_usage.py b/python/ray/train/tests/test_train_usage.py
@@ -0,0 +1,138 @@
+import pytest
+import torch
+
+import ray
+from ray.train import ScalingConfig
+from ray.train.torch import TorchTrainer
+
+
+@pytest.fixture
+def shutdown_only():
+ yield None
+ ray.shutdown()
+
+
+def run_torch():
+ from torch.utils.data import DataLoader, TensorDataset
+ from ray.train.torch import get_device, prepare_model, prepare_data_loader
+
+ def train_func():
+ # Create dummy model and data loader
+ model = torch.nn.Linear(10, 10)
+ inputs, targets = torch.randn(128, 10), torch.randn(128, 1)
+ dataloader = DataLoader(TensorDataset(inputs, targets), batch_size=32)
+
+ # Test Torch Utilities
+ prepare_data_loader(dataloader)
+ prepare_model(model)
+ get_device()
+
+ trainer = TorchTrainer(
+ train_func, scaling_config=ScalingConfig(num_workers=2, use_gpu=False)
+ )
+ trainer.fit()
+
+
+def run_lightning():
+ import pytorch_lightning as pl
+ from ray.train.lightning import (
+ RayTrainReportCallback,
+ RayDDPStrategy,
+ RayFSDPStrategy,
+ RayDeepSpeedStrategy,
+ RayLightningEnvironment,
+ prepare_trainer,
+ )
+
+ def train_func():
+ # Test Lighting utilites
+ strategy = RayFSDPStrategy()
+ strategy = RayDeepSpeedStrategy()
+ strategy = RayDDPStrategy()
+ ray_environment = RayLightningEnvironment()
+ report_callback = RayTrainReportCallback()
+
+ trainer = pl.Trainer(
+ devices="auto",
+ accelerator="auto",
+ strategy=strategy,
+ plugins=[ray_environment],
+ callbacks=[report_callback],
+ )
+ trainer = prepare_trainer(trainer)
+
+ trainer = TorchTrainer(
+ train_func, scaling_config=ScalingConfig(num_workers=2, use_gpu=False)
+ )
+
+ trainer.fit()
+
+
+def run_transformers():
+ from datasets import Dataset
+ from transformers import Trainer, TrainingArguments
+ from ray.train.huggingface.transformers import (
+ prepare_trainer,
+ RayTrainReportCallback,
+ )
+
+ def train_func():
+ # Create dummy model and datasets
+ dataset = Dataset.from_dict({"text": ["text1", "text2"], "label": [0, 1]})
+ model = torch.nn.Linear(10, 10)
+
+ # Test Transformers utilites
+ training_args = TrainingArguments(output_dir="./results", no_cuda=True)
+ trainer = Trainer(model=model, args=training_args, train_dataset=dataset)
+
+ trainer.add_callback(RayTrainReportCallback())
+ trainer = prepare_trainer(trainer)
+
+ trainer = TorchTrainer(
+ train_func, scaling_config=ScalingConfig(num_workers=2, use_gpu=False)
+ )
+
+ trainer.fit()
+
+
+@pytest.mark.parametrize("framework", ["torch", "lightning", "transformers"])
+def test_torch_utility_usage_tags(shutdown_only, framework):
+ from ray._private.usage.usage_lib import TagKey, get_extra_usage_tags_to_report
+
+ ctx = ray.init()
+ gcs_client = ray._raylet.GcsClient(address=ctx.address_info["gcs_address"])
+
+ if framework == "torch":
+ run_torch()
+ expected_tags = [
+ TagKey.TRAIN_TORCH_GET_DEVICE,
+ TagKey.TRAIN_TORCH_PREPARE_MODEL,
+ TagKey.TRAIN_TORCH_PREPARE_DATALOADER,
+ ]
+ elif framework == "lightning":
+ run_lightning()
+ expected_tags = [
+ TagKey.TRAIN_LIGHTNING_PREPARE_TRAINER,
+ TagKey.TRAIN_LIGHTNING_RAYTRAINREPORTCALLBACK,
+ TagKey.TRAIN_LIGHTNING_RAYDDPSTRATEGY,
+ TagKey.TRAIN_LIGHTNING_RAYFSDPSTRATEGY,
+ TagKey.TRAIN_LIGHTNING_RAYDEEPSPEEDSTRATEGY,
+ TagKey.TRAIN_LIGHTNING_RAYLIGHTNINGENVIRONMENT,
+ ]
+ elif framework == "transformers":
+ run_transformers()
+ expected_tags = [
+ TagKey.TRAIN_TRANSFORMERS_PREPARE_TRAINER,
+ TagKey.TRAIN_TRANSFORMERS_RAYTRAINREPORTCALLBACK,
+ ]
+
+ result = get_extra_usage_tags_to_report(gcs_client)
+ assert set(result.keys()).issuperset(
+ {TagKey.Name(tag).lower() for tag in expected_tags}
+ )
+
+
+if __name__ == "__main__":
+ import sys
+
+ sys.exit(pytest.main(["-v", "-x", __file__]))
diff --git a/python/ray/train/torch/train_loop_utils.py b/python/ray/train/torch/train_loop_utils.py
@@ -3,21 +3,20 @@
 import random
 import types
 import collections
+import numpy as np
 from packaging.version import Version
-
 from typing import Any, Dict, List, Optional, Callable, Union
 
 from ray.train._internal import session
 from ray.train._internal.accelerator import Accelerator
-from torch.optim import Optimizer
 from ray.train._internal.session import get_accelerator, set_accelerator
 from ray.util.annotations import PublicAPI, Deprecated
-
-import numpy as np
+from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
 
 import torch
 from torch.cuda.amp import autocast, GradScaler
 from torch.nn.parallel import DistributedDataParallel
+from torch.optim import Optimizer
 
 if Version(torch.__version__) < Version("1.11.0"):
  FullyShardedDataParallel = None
@@ -67,6 +66,7 @@ def get_device() -> Union[torch.device, List[torch.device]]:
  """
  from ray.air._internal import torch_utils
 
+ record_extra_usage_tag(TagKey.TRAIN_TORCH_GET_DEVICE, "1")
  return torch_utils.get_device()
 
 
@@ -104,6 +104,7 @@ def prepare_model(
  "Run `pip install 'torch>=1.11.0'` to use FullyShardedDataParallel."
  )
 
+ record_extra_usage_tag(TagKey.TRAIN_TORCH_PREPARE_MODEL, "1")
  return get_accelerator(_TorchAccelerator).prepare_model(
  model,
  move_to_device=move_to_device,
@@ -138,6 +139,7 @@ def prepare_data_loader(
  regardless of the setting. This configuration will be ignored
  if ``move_to_device`` is False.
  """
+ record_extra_usage_tag(TagKey.TRAIN_TORCH_PREPARE_DATALOADER, "1")
  return get_accelerator(_TorchAccelerator).prepare_data_loader(
  data_loader,
  add_dist_sampler=add_dist_sampler,

diff --git a/src/ray/protobuf/usage.proto b/src/ray/protobuf/usage.proto
@@ -171,4 +171,17 @@ enum TagKey {
  // AIR entrypoint
  // One of: "Trainer.fit", "Tuner.fit", "tune.run", "tune.run_experiments"
  AIR_ENTRYPOINT = 508;
+
+ // Train Utilities
+ TRAIN_TORCH_GET_DEVICE = 509;
+ TRAIN_TORCH_PREPARE_MODEL = 510;
+ TRAIN_TORCH_PREPARE_DATALOADER = 511;
+ TRAIN_LIGHTNING_PREPARE_TRAINER = 512;
+ TRAIN_LIGHTNING_RAYTRAINREPORTCALLBACK = 513;
+ TRAIN_LIGHTNING_RAYDDPSTRATEGY = 514;
+ TRAIN_LIGHTNING_RAYFSDPSTRATEGY = 515;
+ TRAIN_LIGHTNING_RAYDEEPSPEEDSTRATEGY = 516;
+ TRAIN_LIGHTNING_RAYLIGHTNINGENVIRONMENT = 517;
+ TRAIN_TRANSFORMERS_PREPARE_TRAINER = 518;
+ TRAIN_TRANSFORMERS_RAYTRAINREPORTCALLBACK = 519;
 }