# Skip to content
#
# Input steps for convergence test #14127
#
# Input steps for convergence test
#
# Input steps for convergence test #14127
#
# Workflow file for this run

# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Unit Test

# Triggers: every pull request, pushes to main, manual dispatch from the
# Actions UI, and a recurring schedule.
on:
  pull_request:
  push:
    branches: ["main"]
  workflow_dispatch:
  schedule:
    # Run the job every 2 hours
    - cron: '0 */2 * * *'
jobs:
  # Static analysis job: type-checks MaxText/ with pytype, then lints it with
  # pylint. Runs on the runner image and Python version listed in the matrix.
  cpu:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # The ubuntu-20.04 hosted image has been retired by GitHub, so jobs
        # requesting it are never scheduled; ubuntu-22.04 still offers 3.10.
        os: [ubuntu-22.04]
        # Quoted on purpose: a bare 3.10 would be parsed as the float 3.1.
        python-version: ['3.10']
    steps:
      - uses: actions/checkout@v3
      - name: setup python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install Dependencies
        run: |
          pip install pytype
          pip install pylint
      - name: Typecheck the code with pytype
        # import-error is disabled because only the linters are installed
        # above, not the project's own dependencies.
        run: |
          pytype --jobs auto --disable import-error MaxText/
      - name: Analysing the code with pylint
        run: |
          pylint MaxText/
# IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO 'gpu' job
  # TPU integration job. Builds the dependency image, then runs every test
  # inside it via `docker run`. The host path mounted at /app is hard-coded;
  # presumably it is the runner's checkout directory (confirm on the runner).
  # Storage arguments are written as gs:// bucket URIs.
  tpu:
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        device-type: ["v4-8"]
    name: "TPU test (${{ matrix.device-type }})"
    # Selects a self-hosted runner carrying all three labels.
    runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
    steps:
      - uses: actions/checkout@v3
      - name: Cleanup old docker images
        run: |
          docker system prune --all --force
      - name: Install dependencies
        run: |
          bash docker_build_dependency_image.sh
      - name: Test gsutil installation
        # Exits with status 24 when gsutil is not found in the image.
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}'
      - name: Test with pytest
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest'
      - name: Test train.py
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false'
      - name: Test train.py with per_device_batch_size < 1
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false'
      - name: Test decode.py
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1'
      - name: Test decode.py with per_device_batch_size < 1
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25'
      - name: Test standalone_dataloader.py
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/standalone_dataloader.py MaxText/configs/base.yml run_name=standalone_dataloader_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=100 enable_checkpointing=false'
      - name: Test standalone_checkpointer.py
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/standalone_checkpointer.py MaxText/configs/base.yml run_name=standalone_checkpointer_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=200 checkpoint_period=50 enable_checkpointing=True async_checkpointing=False'
      - name: Test int8_training
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false'
      - name: Test generate_param_only_checkpoint
        run: |
          docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4'
# IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO 'tpu' job
  # GPU integration job. Builds the dependency image with DEVICE=gpu, then
  # runs every test inside it via `docker run --runtime=nvidia --gpus all`.
  # The host path mounted at /app is hard-coded; presumably it is the runner's
  # checkout directory (confirm on the runner).
  # Storage arguments are written as gs:// bucket URIs.
  gpu:
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        device-type: ["a100-40gb-4"]
    name: "GPU test (${{ matrix.device-type }})"
    # Selects a self-hosted runner carrying all three labels.
    runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
    steps:
      - uses: actions/checkout@v3
      - name: Cleanup old docker images
        run: |
          docker system prune --all --force
      - name: Install dependencies
        run: |
          bash docker_build_dependency_image.sh DEVICE=gpu
      - name: Test gsutil installation
        # Exits with status 24 when gsutil is not found in the image.
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}'
      - name: Test with pytest
        # The marker expression deselects tests marked "tpu".
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest -m "not tpu"'
      - name: Test train.py
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=dot_product'
      - name: Test train.py with per_device_batch_size < 1
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=dot_product'
      - name: Test int8_training
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=dot_product'
      - name: Test decode.py
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1'
      - name: Test decode.py with per_device_batch_size < 1
        run: |
          docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
            'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25'