From e84114dfe449de7ec4516c13ec40b9e8fadc16a3 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne
Date: Thu, 9 May 2024 17:11:16 -0700
Subject: [PATCH] Remove async XLA_FLAGS from A3 configs.

XLA PR https://github.com/openxla/xla/pull/11422 removed some XLA flags
relating to async collectives. This caused the A3 configs to fail to run,
so this change removes those flags from the A3 configs. The flags removed
are:

--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true
--xla_gpu_enable_async_all_reduce=true

These flags had no effect before the XLA PR, since the async collectives
were already enabled by default.
---
 MaxText/configs/a3/llama_2_7b/16vm.sh | 4 ++--
 MaxText/configs/a3/llama_2_7b/1vm.sh  | 6 +++---
 MaxText/configs/a3/llama_2_7b/2vm.sh  | 4 ++--
 MaxText/configs/a3/llama_2_7b/4vm.sh  | 6 +++---
 MaxText/configs/a3/llama_2_7b/8vm.sh  | 4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/MaxText/configs/a3/llama_2_7b/16vm.sh b/MaxText/configs/a3/llama_2_7b/16vm.sh
index fa07a470e..aad9ea8bf 100644
--- a/MaxText/configs/a3/llama_2_7b/16vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/16vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true
+--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
---xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/1vm.sh b/MaxText/configs/a3/llama_2_7b/1vm.sh
index 9120b9311..492362ee4 100644
--- a/MaxText/configs/a3/llama_2_7b/1vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/1vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true
---xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
- --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_latency_hiding_scheduler=true
+--xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
+ --xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/2vm.sh b/MaxText/configs/a3/llama_2_7b/2vm.sh
index d1ee728ac..4b15254fd 100644
--- a/MaxText/configs/a3/llama_2_7b/2vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/2vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true
+--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
---xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=67108864 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/4vm.sh b/MaxText/configs/a3/llama_2_7b/4vm.sh
index 7712d036a..fcf2f86f3 100644
--- a/MaxText/configs/a3/llama_2_7b/4vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/4vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true
---xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
---xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_latency_hiding_scheduler=true
+--xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
+--xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=536870912 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/8vm.sh b/MaxText/configs/a3/llama_2_7b/8vm.sh
index bb003f2f7..72090e0ac 100644
--- a/MaxText/configs/a3/llama_2_7b/8vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/8vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true
+--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
---xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
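
For reference, this is how the XLA_FLAGS export in 16vm.sh reads after the patch, reconstructed only from the hunk above. Only the lines visible in the hunk are shown: the closing quote of the string (and any flags that follow it) lies outside the hunk context, and OUTPUT_PATH and RUN_NAME are assumed to be set earlier in the script.

# Post-patch XLA_FLAGS in 16vm.sh (portion visible in the hunk; the quoted
# string continues past these lines in the actual config)
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
--xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true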