Mark nvidia devtools repo as trusted #15440

Workflow file for this run

.github/workflows/UnitTests.yml at 5a3d5bd

	# Copyright 2023 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# This workflow builds the MaxText dependency Docker image and runs the unit and end-to-end tests on self-hosted TPU and GPU runners.
	# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

	name: Unit Test

	# Triggers: every pull request, pushes to main, manual dispatch from the
	# Actions UI, and a recurring schedule.
	on:
	  pull_request:
	  push:
	    branches: ["main"]
	  workflow_dispatch:
	  schedule:
	    # Run the job every 2 hours (minute 0 of every second hour).
	    # A cron expression needs all five fields: minute hour day-of-month month day-of-week.
	    - cron: '0 */2 * * *'

	jobs:
	  # IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO 'gpu' job
	  #
	  # TPU job: builds the dependency image with docker_build_dependency_image.sh,
	  # then runs every test inside a `maxtext_base_image` container with the host
	  # directory /home/runner/actions-runner/_work/maxtext/maxtext mounted at /deps
	  # (presumably the runner's checkout of this repository - confirm on the runner).
	  tpu:
	    strategy:
	      # Let remaining matrix entries finish even if one of them fails.
	      fail-fast: false
	      matrix:
	        device-type: ["v4-8"]
	    name: "TPU test (${{ matrix.device-type }})"
	    # The device type doubles as a runner label, so each matrix entry is routed
	    # to a self-hosted runner carrying that label.
	    runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
	    steps:
	      - uses: actions/checkout@v3
	      - name: Cleanup old docker images
	        run: |
	          docker system prune --all --force
	      - name: Install dependencies
	        run: |
	          bash docker_build_dependency_image.sh
	      - name: Test gsutil installation
	        # Exits with code 24 when gsutil is not on PATH inside the image.
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}'
	      - name: Test with pytest
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest'
	      # The end-to-end steps below pass a run_name built from a timestamp plus
	      # $RANDOM, so each invocation gets a distinct run name.
	      - name: Test train.py with c4
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false'
	      - name: Test train.py with synthetic data
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false dataset_type=synthetic'
	      - name: Test train.py with per_device_batch_size < 1
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false'
	      - name: Test decode.py
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1'
	      - name: Test decode.py with per_device_batch_size < 1
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25'
	      - name: Test int8_training
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false'
	      - name: Test fp8_training
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=fp8 steps=2 enable_checkpointing=false'
	      - name: Test generate_param_only_checkpoint
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'bash end_to_end/tpu/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4'
	      - name: Test generate_param_only_checkpoint with int8 quantization
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'bash end_to_end/tpu/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8'
	      - name: Test grain checkpoint determinism
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'bash end_to_end/tpu/test_checkpointing.sh runner gs://runner-maxtext-logs gs://maxtext-dataset False c4-array_record'
	      - name: Test checkpoint compatibility
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'bash end_to_end/tpu/test_checkpoint_compatibility.sh runner gs://runner-maxtext-logs gs://maxtext-dataset'
	      - name: Validate Pedagogical Example, Shmap_collective_matmul
	        run: |
	          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
	          'python3 pedagogical_examples/shmap_collective_matmul.py'

	  # IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO 'tpu' job
	  #
	  # GPU job: same overall flow as the TPU job, but the image is built with
	  # DEVICE=gpu and tagged per build mode and commit, and containers are started
	  # with the NVIDIA runtime and all GPUs attached.
	  gpu:
	    strategy:
	      # Let remaining matrix entries finish even if one of them fails.
	      fail-fast: false
	      matrix:
	        device-type: ["a100-40gb-4"]
	        build-mode: ["pinned"]
	    name: "GPU test (${{ matrix.device-type }}, ${{ matrix.build-mode }})"
	    runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
	    env:
	      # Image tag includes the build mode and the commit SHA, so different
	      # commits and build modes do not share an image name.
	      LOCAL_IMAGE_NAME: "maxtext_base_image_${{ matrix.build-mode }}_${{ github.sha }}"
	    steps:
	      - uses: actions/checkout@v3
	      - name: Set up Docker Buildx
	        uses: docker/setup-buildx-action@v3
	      - name: Cleanup old docker images
	        run: |
	          docker system prune --all --force
	      - name: Install dependencies
	        run: |
	          bash docker_build_dependency_image.sh DEVICE=gpu MODE=${{ matrix.build-mode }} LOCAL_IMAGE_NAME="$LOCAL_IMAGE_NAME"
	      - name: Test gsutil installation
	        # Exits with code 24 when gsutil is not on PATH inside the image.
	        # NOTE(review): this step mounts the host directory at /app while every other
	        # step uses /deps; `which gsutil` does not read the mount, so it looks
	        # harmless - confirm whether /deps was intended.
	        run: |
	          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}'
	      # The remaining steps set XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 and
	      # TF_FORCE_GPU_ALLOW_GROWTH=true and give the container 2g of shared memory;
	      # presumably this keeps JAX and TensorFlow from exhausting GPU memory
	      # between them - see the JAX GPU memory allocation docs to confirm.
	      - name: Test with pytest
	        # Deselects tests carrying the `tpu` pytest marker.
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c 'cd MaxText;python3 -m pytest -m "not tpu"'
	      - name: Test train.py
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=dot_product'
	      - name: Test train.py with flash attention
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=cudnn_flash_te'
	      - name: Test train.py with per_device_batch_size < 1
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=dot_product'
	      - name: Test int8_training
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=dot_product'
	      - name: Test decode.py
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1'
	      - name: Test decode.py with per_device_batch_size < 1
	        run: |
	          docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
	          'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25'

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Mark nvidia devtools repo as trusted #15440

Workflow file

Mark nvidia devtools repo as trusted #15440

Jobs

Run details

Workflow file for this run