Skip to content

Commit

Permalink
Merge pull request #630 from google:yooh/gpu-unit-tests
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 629837420
  • Loading branch information
maxtext authors committed May 1, 2024
2 parents 8f132d6 + 8a6f30d commit fcf48fe
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 10 deletions.
34 changes: 29 additions & 5 deletions .github/workflows/UnitTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,19 +81,19 @@ jobs:
- name: Test generate_param_only_checkpoint
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
'bash end_to_end/tpu/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4'
'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4'
- name: Test generate_param_only_checkpoint with int8 quantization
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
'bash end_to_end/tpu/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8'
'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8'
- name: Test grain checkpoint determinism
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
'bash end_to_end/tpu/test_checkpointing.sh runner gs://runner-maxtext-logs gs://maxtext-dataset False c4-array_record'
'bash end_to_end/test_checkpointing.sh runner gs://runner-maxtext-logs gs://maxtext-dataset False c4-array_record'
- name: Test checkpoint compatibility
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
'bash end_to_end/tpu/test_checkpoint_compatibility.sh runner gs://runner-maxtext-logs gs://maxtext-dataset'
'bash end_to_end/test_checkpoint_compatibility.sh runner gs://runner-maxtext-logs gs://maxtext-dataset'
- name: Validate Pedagogical Example, Shmap_collective_matmul
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged maxtext_base_image bash -c \
Expand Down Expand Up @@ -127,10 +127,14 @@ jobs:
- name: Test with pytest
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c 'cd MaxText;python3 -m pytest -m "not tpu"'
- name: Test train.py
- name: Test train.py with c4
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=dot_product'
- name: Test train.py with synthetic data
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=dot_product dataset_type=synthetic'
- name: Test train.py with flash attention
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
Expand All @@ -151,3 +155,23 @@ jobs:
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25'
- name: Test fp8_training
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=fp8 steps=2 enable_checkpointing=false attention=dot_product'
- name: Test generate_param_only_checkpoint
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -a dot_product'
- name: Test generate_param_only_checkpoint with int8 quantization
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M)-${RANDOM} -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8 -a dot_product'
- name: Test grain checkpoint determinism
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'bash end_to_end/test_checkpointing.sh runner gs://runner-maxtext-logs gs://maxtext-dataset False c4-array_record dot_product'
- name: Test checkpoint compatibility
run: |
docker run -e XLA_PYTHON_CLIENT_MEM_FRACTION=0.65 -e TF_FORCE_GPU_ALLOW_GROWTH=true --shm-size=2g --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/deps --rm --privileged "$LOCAL_IMAGE_NAME" bash -c \
'bash end_to_end/test_checkpoint_compatibility.sh runner gs://runner-maxtext-logs gs://maxtext-dataset dot_product'
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ fi
RUN_NAME=${1}-$(date +%Y-%m-%d-%H-%M)
OUTPUT_PATH=${2}
DATASET_PATH=${3}
ATTENTION=${4}
if [ -z "${4}" ]; then
ATTENTION='autoselected'
fi
model_params=" base_emb_dim=384 base_num_query_heads=8 base_num_kv_heads=8 base_mlp_dim=192 base_num_decoder_layers=8 head_dim=128"

echo "Mounting $DATASET_PATH to /tmp/gcsfuse/"
Expand All @@ -20,7 +24,7 @@ python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME steps=3 ${m
max_target_length=128 per_device_batch_size=1\
metrics_file=run_1_metrics.txt checkpoint_period=2 async_checkpointing=false\
dataset_path=/tmp/gcsfuse base_output_directory=$OUTPUT_PATH\
dataset_type=c4-array_record grain_worker_count=0\
dataset_type=c4-array_record grain_worker_count=0 attention=$ATTENTION\
dataset_name=array-record/c4/en/3.0.1 eval_dataset_name=array-record/c4/en/3.0.1

echo
Expand All @@ -29,7 +33,7 @@ echo "Run_2: Resuming using the tfds input pipeline"
echo

python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME steps=5 ${model_params}\
max_target_length=128 per_device_batch_size=1\
max_target_length=128 per_device_batch_size=1 attention=$ATTENTION\
metrics_file=run_2_metrics.txt checkpoint_period=2 async_checkpointing=false\
dataset_path=/tmp/gcsfuse base_output_directory=$OUTPUT_PATH\

Expand All @@ -42,7 +46,7 @@ python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME steps=7 ${m
max_target_length=128 per_device_batch_size=1\
metrics_file=run_3_metrics.txt checkpoint_period=2 async_checkpointing=false\
dataset_path=/tmp/gcsfuse base_output_directory=$OUTPUT_PATH\
dataset_type=c4-array_record grain_worker_count=0\
dataset_type=c4-array_record grain_worker_count=0 attention=$ATTENTION\
dataset_name=array-record/c4/en/3.0.1 eval_dataset_name=array-record/c4/en/3.0.1

python3 end_to_end/tpu/eval_assert.py test_start_step run_2_metrics.txt 3.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ OUTPUT_PATH=${2}
DATASET_PATH=${3}
COLLECT_STACK_TRACE=${4}
DATASET_TYPE=${5}
ATTENTION=${6}
if [ -z "${6}" ]; then
ATTENTION='autoselected'
fi
eval_metrics=checkpoint_save_restore
model_params=" base_emb_dim=384 base_num_query_heads=8 base_num_kv_heads=8 base_mlp_dim=192 base_num_decoder_layers=8 head_dim=128"
CMD_DATA=""
Expand All @@ -33,13 +37,13 @@ fi
#Train
CMD1="python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME steps=5 max_target_length=128 per_device_batch_size=1\
metrics_file=saved_metrics.txt checkpoint_period=3 base_output_directory=$OUTPUT_PATH dataset_path=$DATASET_PATH\
async_checkpointing=false collect_stack_trace=$COLLECT_STACK_TRACE"
async_checkpointing=false collect_stack_trace=$COLLECT_STACK_TRACE attention=$ATTENTION"
CMD1+=$model_params
CMD1+=$CMD_DATA

CMD2="python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME steps=5 max_target_length=128 per_device_batch_size=1\
metrics_file=restored_metrics.txt base_output_directory=$OUTPUT_PATH dataset_path=$DATASET_PATH\
async_checkpointing=false collect_stack_trace=$COLLECT_STACK_TRACE"
async_checkpointing=false collect_stack_trace=$COLLECT_STACK_TRACE attention=$ATTENTION"
CMD2+=$model_params
CMD2+=$CMD_DATA

Expand Down

0 comments on commit fcf48fe

Please sign in to comment.