Skip to content

Commit

Permalink
Update 16b 32b tests to use full remat and larger batch size (#84)
Browse files Browse the repository at this point in the history
Update 16b and 32b configs to use full remat and larger batch size

Also:
* Add a config that runs these tests with additional optimized RTO settings
* Remove datestring from the RUN_NAME so checkpointing works.
  • Loading branch information
gobbleturk committed Jul 10, 2023
1 parent ac50f08 commit 4dd7414
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 8 deletions.
10 changes: 10 additions & 0 deletions MaxText/configs/16b.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# 16b.sh: apply the RTO network setup, then launch the 16b-parameter TFLOP
# script as a plain training run (no performance threshold enforced).
#
# Usage: bash MaxText/configs/16b.sh RUN_NAME OUTPUT_PATH DATASET_PATH
#   $1 RUN_NAME      - forwarded as the first argument of the TFLOP script
#   $2 OUTPUT_PATH   - forwarded as the third argument
#   $3 DATASET_PATH  - forwarded as the fourth argument
#
# The helper scripts are referenced by relative path, so this must be started
# from a directory that contains rto_setup.sh and end_to_end/.
echo "Running 16b.sh"

RUN_NAME=${1}
OUTPUT_PATH=${2}
DATASET_PATH=${3}

# Network tuning step; its exit status is not checked here.
bash rto_setup.sh

TFLOP_THRESHOLD=0 # set to 0 since we are not actually running as a test.
# Every argument is quoted so that an empty value or one containing spaces
# keeps its position instead of shifting or splitting the callee's arguments.
bash end_to_end/test_tflops_16b_params.sh "${RUN_NAME}" "${TFLOP_THRESHOLD}" "${OUTPUT_PATH}" "${DATASET_PATH}"
10 changes: 10 additions & 0 deletions MaxText/configs/32b.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# 32b.sh: apply the RTO network setup, then launch the 32b-parameter TFLOP
# script as a plain training run (no performance threshold enforced).
#
# Usage: bash MaxText/configs/32b.sh RUN_NAME OUTPUT_PATH DATASET_PATH
#   $1 RUN_NAME      - forwarded as the first argument of the TFLOP script
#   $2 OUTPUT_PATH   - forwarded as the third argument
#   $3 DATASET_PATH  - forwarded as the fourth argument
#
# The helper scripts are referenced by relative path, so this must be started
# from a directory that contains rto_setup.sh and end_to_end/.
echo "Running 32b.sh"

RUN_NAME=${1}
OUTPUT_PATH=${2}
DATASET_PATH=${3}

# Network tuning step; its exit status is not checked here.
bash rto_setup.sh

TFLOP_THRESHOLD=0 # set to 0 since we are not actually running as a test.
# Every argument is quoted so that an empty value or one containing spaces
# keeps its position instead of shifting or splitting the callee's arguments.
bash end_to_end/test_tflops_32b_params.sh "${RUN_NAME}" "${TFLOP_THRESHOLD}" "${OUTPUT_PATH}" "${DATASET_PATH}"
8 changes: 4 additions & 4 deletions end_to_end/test_tflops_16b_params.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ DATASET_PATH=${4}

if [ -z ${5} ]
then
RUN_NAME=${USER}_$(date +%Y-%m-%d-%H-%M-%S)
RUN_NAME=${USER}
else
RUN_NAME=${5}_$(date +%Y-%m-%d-%H)
RUN_NAME=${5}
fi

# Train
export LIBTPU_INIT_ARGS="--xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME\
steps=150 per_device_batch_size=2 enable_checkpointing=false\
enable_profiler=false remat_policy=proj base_emb_dim=6144 base_mlp_dim=24576\
steps=150 per_device_batch_size=6 enable_checkpointing=false\
enable_profiler=false remat_policy=full base_emb_dim=6144 base_mlp_dim=24576\
base_num_heads=24 base_num_decoder_layers=36 head_dim=256\
max_target_length=2048 metrics_file='metrics.txt' base_output_directory=$OUTPUT_PATH\
dataset_path=$DATASET_PATH log_period=150
Expand Down
8 changes: 4 additions & 4 deletions end_to_end/test_tflops_32b_params.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ DATASET_PATH=${4}

if [ -z ${5} ]
then
RUN_NAME=${USER}_$(date +%Y-%m-%d-%H-%M-%S)
RUN_NAME=${USER}
else
RUN_NAME=${5}_$(date +%Y-%m-%d-%H)
RUN_NAME=${5}
fi

# Train
export LIBTPU_INIT_ARGS="--xla_tpu_enable_data_parallel_all_reduce_opt=true --xla_tpu_data_parallel_opt_different_sized_ops=true --xla_tpu_enable_async_collective_fusion=true --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true --xla_tpu_enable_async_collective_fusion_multiple_steps=true --xla_tpu_overlap_compute_collective_tc=true --xla_enable_async_all_gather=true"
python3 MaxText/train.py MaxText/configs/base.yml run_name=$RUN_NAME\
steps=150 per_device_batch_size=1 enable_checkpointing=false\
enable_profiler=false remat_policy=proj base_emb_dim=8192 base_mlp_dim=32768\
steps=150 per_device_batch_size=4 enable_checkpointing=false\
enable_profiler=false remat_policy=full base_emb_dim=8192 base_mlp_dim=32768\
base_num_heads=32 base_num_decoder_layers=40 head_dim=256\
max_target_length=2048 metrics_file='metrics.txt' base_output_directory=$OUTPUT_PATH\
dataset_path=$DATASET_PATH log_period=150
Expand Down
5 changes: 5 additions & 0 deletions rto_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# rto_setup.sh: host network tuning applied before a training run.
#   1. Re-applies the first route printed by `ip route show` with the extra
#      attribute `rto_min 5ms` (presumably the default route; confirm on the
#      target host).
#   2. Enables the `tx-nocache-copy` feature on a network interface via ethtool.
# Both changes are made through sudo. A failing step is reported by the tool
# itself and the script carries on, ending with the "finished" message.
#
# Environment:
#   RTO_SETUP_IFACE - interface handed to ethtool (default: ens9).
echo "Running rto_setup.sh..."

# Only the first line of the routing table is modified.
first_line_res=$(ip route show | head -n 1)

# The route description has to reach `ip route change` as separate words.
# Splitting into an array does this explicitly and, unlike an unquoted
# expansion, never performs pathname expansion on the words.
read -r -a route_words <<< "${first_line_res}"

if (( ${#route_words[@]} == 0 )); then
  # Without a route to name, `ip route change rto_min 5ms` would be malformed.
  echo "rto_setup: 'ip route show' returned no route; skipping rto_min change" >&2
else
  sudo ip route change "${route_words[@]}" rto_min 5ms
fi

sudo ethtool -K "${RTO_SETUP_IFACE:-ens9}" tx-nocache-copy on
echo "rto_setup finished"

0 comments on commit 4dd7414

Please sign in to comment.