Skip to content

Commit

Permalink
Add functionality to automatically upload logs to Vertex Tensorboard
Browse files Browse the repository at this point in the history
  • Loading branch information
SurbhiJainUSC committed Apr 9, 2024
1 parent 57b3dcf commit 93d120a
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 1 deletion.
10 changes: 10 additions & 0 deletions MaxText/configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -251,3 +251,13 @@ target_eval_loss: 0. # early stop once reaching target eval_loss

# Goodput parameters
enable_goodput_recording: False

# Vertex AI Tensorboard Configurations - https://github.com/google/maxtext/tree/main/getting_started/Use_Vertex_AI_Tensorboard.md
# Set to True for GCE, False if running via XPK
use_vertex_tensorboard: False
# Project to create Vertex AI Tensorboard in for GCE, blank if project is set using 'gcloud config set project'
# Set this to blank if running via XPK
vertex_tensorboard_project: ""
# Region to create Vertex AI Tensorboard in for GCE, blank if running via XPK
# Vertex AI supported regions: https://cloud.google.com/vertex-ai/docs/general/locations#available-regions
vertex_tensorboard_region: ""
9 changes: 9 additions & 0 deletions MaxText/max_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import functools
import time
import socket
import subprocess

import max_logging

Expand Down Expand Up @@ -609,3 +610,11 @@ def print_model_vars(print_str, model_vars):
for k in model_vars:
print(f'{print_str} key{k}:')
print(f'\t {model_vars[k]}')

def get_project():
  """Return the default GCP project configured via gcloud, or None.

  Runs `gcloud config get project` and parses its stdout. When no default
  project is configured, logs guidance for the user and returns None.
  """
  result = subprocess.run(["gcloud", "config", "get", "project"], check=True, capture_output=True)
  output_lines = result.stdout.decode().strip().split('\n')
  # The project id (if any) is the last line of gcloud's output.
  last_line = output_lines[-1] if output_lines else ''
  if not last_line:
    max_logging.log("You must specify config.vertex_tensorboard_project or set 'gcloud config set project <project>'")
    return None
  return last_line
4 changes: 4 additions & 0 deletions MaxText/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import pyconfig
# pylint: disable-next=unused-import
import register_jax_proxy_backend
from vertex_tensorboard import VertexTensorboardManager

from input_pipeline.input_pipeline_interface import create_data_iterator_with_tokenizer
from layers import models
Expand Down Expand Up @@ -506,6 +507,9 @@ def main(argv: Sequence[str]) -> None:
config = pyconfig.config
validate_train_config(config)
os.environ["TFDS_DATA_DIR"] = config.dataset_path
vertex_tensorboard_manager = VertexTensorboardManager()
vertex_tensorboard_manager.configure_vertex_tensorboard(config)

debug_config = debug_configuration.DebugConfig(
stack_trace_config = stack_trace_configuration.StackTraceConfig(
collect_stack_trace = config.collect_stack_trace,
Expand Down
122 changes: 122 additions & 0 deletions MaxText/vertex_tensorboard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
Copyright 2023 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Utilities for Tensorboard in Vertex AI."""

import os

import jax

import max_logging
import max_utils

from cloud_accelerator_diagnostics import tensorboard
from cloud_accelerator_diagnostics import uploader


class VertexTensorboardManager:
  """Class to create Vertex AI Tensorboard and upload logs to that instance."""

  def __init__(self):
    # Tracks whether the background uploader thread was actually started, so
    # __del__ only stops a thread that exists.
    self.uploader_flag = False

  def __del__(self):
    """Stop the Tensorboard uploader thread."""
    if self.uploader_flag:
      uploader.stop_upload_to_tensorboard()

  def setup(self):
    """Creates Tensorboard instance and Experiment in Vertex AI.

    Reads TENSORBOARD_PROJECT, TENSORBOARD_REGION, TENSORBOARD_NAME and
    EXPERIMENT_NAME from the environment (set by configure_vertex_tensorboard
    or by XPK).

    Returns:
      URL to view Vertex Tensorboard created in Google Cloud Project, or None
      if required configuration is missing or instance creation fails.
    """
    max_logging.log("Setting up Tensorboard and Experiment in Vertex AI.")

    vertex_tensorboard_project = os.environ.get("TENSORBOARD_PROJECT")
    vertex_tensorboard_region = os.environ.get("TENSORBOARD_REGION")
    if not vertex_tensorboard_project or not vertex_tensorboard_region:
      max_logging.log("Either config.vertex_tensorboard_project or config.vertex_tensorboard_region is not set.")
      return None

    # Create Vertex Tensorboard instance
    vertex_tensorboard_name = os.environ.get("TENSORBOARD_NAME")
    instance_id = tensorboard.create_instance(project=vertex_tensorboard_project,
                                              location=vertex_tensorboard_region,
                                              tensorboard_name=vertex_tensorboard_name)
    # Failed to create Vertex Tensorboard instance
    if instance_id is None:
      return None

    # Create Vertex Experiment
    vertex_experiment_name = os.environ.get("EXPERIMENT_NAME")
    _, tensorboard_url = tensorboard.create_experiment(project=vertex_tensorboard_project,
                                                       location=vertex_tensorboard_region,
                                                       experiment_name=vertex_experiment_name,
                                                       tensorboard_name=vertex_tensorboard_name)
    return tensorboard_url

  def upload_data(self, tensorboard_dir):
    """Starts an uploader to continuously monitor and upload data to Vertex Tensorboard.

    Args:
      tensorboard_dir: directory that contains Tensorboard data.
    """
    tensorboard_project = os.environ.get("TENSORBOARD_PROJECT")
    tensorboard_region = os.environ.get("TENSORBOARD_REGION")
    tensorboard_name = os.environ.get("TENSORBOARD_NAME")
    experiment_name = os.environ.get("EXPERIMENT_NAME")

    if not tensorboard_project or not tensorboard_region or not tensorboard_name or not experiment_name:
      max_logging.log("Vertex Tensorboard configurations are not set. Data will not be uploaded to Vertex AI.")
      self.uploader_flag = False
      # Bug fix: without this return, execution fell through and started the
      # uploader with missing (None) configuration values.
      return

    max_logging.log(f"Data will be uploaded to Vertex Tensorboard instance: {tensorboard_name} "
                    f"and Experiment: {experiment_name} in {tensorboard_region}.")
    uploader.start_upload_to_tensorboard(project=tensorboard_project,
                                         location=tensorboard_region,
                                         experiment_name=experiment_name,
                                         tensorboard_name=tensorboard_name,
                                         logdir=tensorboard_dir)
    self.uploader_flag = True

  def configure_vertex_tensorboard(self, config):
    """Creates Vertex Tensorboard and start thread to upload data to Vertex Tensorboard.

    Only process 0 configures the environment and starts the uploader.

    Raises:
      ValueError: if the GCP project cannot be determined, or if Vertex
        Tensorboard/Experiment creation fails when use_vertex_tensorboard is set.
    """
    if jax.process_index() == 0:
      if not os.environ.get("TENSORBOARD_PROJECT"):
        if not config.vertex_tensorboard_project:
          vertex_tensorboard_project = max_utils.get_project()
          if vertex_tensorboard_project is None:
            # get_project() already logged guidance; fail fast with a clear
            # error instead of the obscure TypeError previously raised by
            # assigning None into os.environ.
            raise ValueError("Unable to determine the GCP project for Vertex AI Tensorboard.")
          os.environ["TENSORBOARD_PROJECT"] = vertex_tensorboard_project
        else:
          os.environ["TENSORBOARD_PROJECT"] = config.vertex_tensorboard_project

      if not os.environ.get("TENSORBOARD_REGION"):
        os.environ["TENSORBOARD_REGION"] = config.vertex_tensorboard_region

      if not os.environ.get("TENSORBOARD_NAME"):
        vertex_tensorboard_project = os.environ.get("TENSORBOARD_PROJECT")
        os.environ["TENSORBOARD_NAME"] = f"{vertex_tensorboard_project}-tb-instance"

      if not os.environ.get("EXPERIMENT_NAME"):
        os.environ["EXPERIMENT_NAME"] = config.run_name

      if config.use_vertex_tensorboard:  # running MaxText on GCE
        tensorboard_url = self.setup()
        if tensorboard_url is None:
          raise ValueError("Unable to create Tensorboard and Experiment in Vertex AI.")
        max_logging.log(f"View your Vertex AI Tensorboard at: {tensorboard_url}")
        self.upload_data(config.tensorboard_dir)
      elif os.environ.get("UPLOAD_DATA_TO_TENSORBOARD"):  # running MaxText via XPK
        self.upload_data(config.tensorboard_dir)
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,7 @@ global_parameter_scale=16 per_device_batch_size=4 steps=10000 learning_rate=1e-
base_output_directory=gs:https://my-output-bucket dataset_path=gs:https://my-dataset-bucket
```

In the save step of example 2 above we included exporting the compiler flag `LIBTPU_INIT_ARGS` and `learning_rate` because those affect the compiled object `my_compiled_train.pickle.` The sizes of the model (e.g. `global_parameter_scale`, `max_sequence_length` and `per_device_batch`) are fixed when you initially compile via `compile_train.py`, you will see a size error if you try to run the saved compiled object with different sizes than you compiled with. However a subtle note is that the **learning rate schedule** is also fixed when you run `compile_train` - which is determined by both `steps` and `learning_rate`. The optimizer parameters such as `adam_b1` are passed only as shaped objects to the compiler - thus their real values are determined when you run `train.py`, not during the compilation. If you do pass in different shapes (e.g. `per_device_batch`), you will get a clear error message reporting that the compiled signature has different expected shapes than what was input. If you attempt to run on different hardware than the compilation targets requested via `compile_topology`, you will get an error saying there is a failure to map the devices from the compiled to your real devices. Using different XLA flags or a LIBTPU than what was compiled will probably run silently with the environment you compiled in without error. However there is no guaranteed behavior in this case; you should run in the same environment you compiled in.
In the save step of example 2 above we included exporting the compiler flag `LIBTPU_INIT_ARGS` and `learning_rate` because those affect the compiled object `my_compiled_train.pickle.` The sizes of the model (e.g. `global_parameter_scale`, `max_sequence_length` and `per_device_batch`) are fixed when you initially compile via `compile_train.py`, you will see a size error if you try to run the saved compiled object with different sizes than you compiled with. However a subtle note is that the **learning rate schedule** is also fixed when you run `compile_train` - which is determined by both `steps` and `learning_rate`. The optimizer parameters such as `adam_b1` are passed only as shaped objects to the compiler - thus their real values are determined when you run `train.py`, not during the compilation. If you do pass in different shapes (e.g. `per_device_batch`), you will get a clear error message reporting that the compiled signature has different expected shapes than what was input. If you attempt to run on different hardware than the compilation targets requested via `compile_topology`, you will get an error saying there is a failure to map the devices from the compiled to your real devices. Using different XLA flags or a LIBTPU than what was compiled will probably run silently with the environment you compiled in without error. However there is no guaranteed behavior in this case; you should run in the same environment you compiled in.

## Automatically Upload Logs to Vertex Tensorboard
MaxText supports automatic upload of logs collected in a directory to a Tensorboard instance in Vertex AI. See the [user guide](getting_started/Use_Vertex_AI_Tensorboard.md) for more information.
60 changes: 60 additions & 0 deletions getting_started/Use_Vertex_AI_Tensorboard.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<!--
Copyright 2023 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
## Use Vertex AI Tensorboard
MaxText supports automatic upload of logs collected in a directory to a Tensorboard instance in Vertex AI. For more information on how MaxText supports this feature, visit [cloud-accelerator-diagnostics](https://pypi.org/project/cloud-accelerator-diagnostics) PyPI package documentation.

### What is Vertex AI Tensorboard and Vertex AI Experiment
Vertex AI Tensorboard is a fully managed and enterprise-ready version of open-source Tensorboard. To learn more about Vertex AI Tensorboard, visit [this](https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-introduction). Vertex AI Experiment is a tool that helps to track and analyze an experiment run on Vertex AI Tensorboard. To learn more about Vertex AI Experiments, visit [this](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments).

You can use a single Vertex AI Tensorboard instance to track and compare metrics from multiple Vertex AI Experiments. While you can view metrics from multiple Vertex AI Experiments within a single Tensorboard instance, the underlying log data for each experiment remains separate.

### Prerequisites
* Enable [Vertex AI API](https://cloud.google.com/vertex-ai/docs/start/cloud-environment#enable_vertexai_apis) in your Google Cloud console.
* Assign [Vertex AI User IAM role](https://cloud.google.com/vertex-ai/docs/general/access-control#aiplatform.user) to the service account used by the TPU VMs. This is required to create and access the Vertex AI Tensorboard in Google Cloud console. If you are using XPK for MaxText, the necessary Vertex AI User IAM role will be automatically assigned to your node pools by XPK – no need to assign it manually.

### Upload Logs to Vertex AI Tensorboard
**Scenario 1: Using XPK to run MaxText on GKE**

XPK simplifies MaxText's Vertex AI Tensorboard integration. A Vertex Tensorboard instance and Experiment are automatically created by XPK during workload scheduling. Also, XPK automatically sets the necessary environment variables, eliminating the need to manually configure this in MaxText. Set `use_vertex_tensorboard=False` to avoid setting up Vertex Tensorboard again in MaxText. This is what the configuration looks like when running MaxText via XPK:
```
use_vertex_tensorboard: False
vertex_tensorboard_project: ""
vertex_tensorboard_region: ""
```
The above configuration will upload logs in `config.tensorboard_dir` to Vertex Tensorboard instance set as an environment variable by XPK.

**Scenario 2: Running MaxText on GCE**

Set `use_vertex_tensorboard=True` to upload logs in `config.tensorboard_dir` to a Tensorboard instance in Vertex AI. You can manually create a Tensorboard instance named `<config.vertex_tensorboard_project>-tb-instance` and an Experiment named `config.run_name` in Vertex AI on Google Cloud console. Otherwise, MaxText will create those resources for you when `use_vertex_tensorboard=True`. Note that Vertex AI is available in only [these](https://cloud.google.com/vertex-ai/docs/general/locations#available-regions) regions.

**Scenario 2.1: Configuration to upload logs to Vertex AI Tensorboard**

```
run_name: "test-run"
use_vertex_tensorboard: True
vertex_tensorboard_project: "test-project" # or vertex_tensorboard_project: ""
vertex_tensorboard_region: "us-central1"
```
The above configuration will try to create a Vertex AI Tensorboard instance named `test-project-tb-instance` and a Vertex AI Experiment named `test-run` in the `us-central1` region of `test-project`. If you set `vertex_tensorboard_project=""`, then the default project (`gcloud config get project`) set on the VM will be used to create the Vertex AI resources. It will only create these resources if they do not already exist. Also, the logs in `config.tensorboard_dir` will be uploaded to `test-project-tb-instance` Tensorboard instance and `test-run` Experiment in Vertex AI.

**Scenario 2.2: Configuration to not upload logs to Vertex AI Tensorboard**

The following configuration will not upload any log data collected in `config.tensorboard_dir` to Tensorboard in Vertex AI.
```
use_vertex_tensorboard: False
vertex_tensorboard_project: ""
vertex_tensorboard_region: ""
```
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ orbax-checkpoint>=0.5.5
absl-py
array-record
aqtp
cloud-accelerator-diagnostics
cloud-tpu-diagnostics
google-cloud-storage
grain-nightly
Expand Down

0 comments on commit 93d120a

Please sign in to comment.