From e84114dfe449de7ec4516c13ec40b9e8fadc16a3 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne
Date: Thu, 9 May 2024 17:11:16 -0700
Subject: [PATCH] Remove async XLA_FLAGS from A3 configs.

XLA PR https://github.com/openxla/xla/pull/11422 removed some XLA flags
relating to async collectives. This caused the A3 configs to fail to run,
so this change removes those flags from the A3 configs. The flags removed
are:

--xla_gpu_enable_async_all_gather=true
--xla_gpu_enable_async_reduce_scatter=true
--xla_gpu_enable_async_all_reduce=true

These flags had no effect before the XLA PR, since the async collectives
were already enabled by default.
---
 MaxText/configs/a3/llama_2_7b/16vm.sh | 4 ++--
 MaxText/configs/a3/llama_2_7b/1vm.sh  | 6 +++---
 MaxText/configs/a3/llama_2_7b/2vm.sh  | 4 ++--
 MaxText/configs/a3/llama_2_7b/4vm.sh  | 6 +++---
 MaxText/configs/a3/llama_2_7b/8vm.sh  | 4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/MaxText/configs/a3/llama_2_7b/16vm.sh b/MaxText/configs/a3/llama_2_7b/16vm.sh
index fa07a470e..aad9ea8bf 100644
--- a/MaxText/configs/a3/llama_2_7b/16vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/16vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true
+--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
---xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/1vm.sh b/MaxText/configs/a3/llama_2_7b/1vm.sh
index 9120b9311..492362ee4 100644
--- a/MaxText/configs/a3/llama_2_7b/1vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/1vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true
---xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
- --xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_latency_hiding_scheduler=true
+--xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
+ --xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=134217728 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/2vm.sh b/MaxText/configs/a3/llama_2_7b/2vm.sh
index d1ee728ac..4b15254fd 100644
--- a/MaxText/configs/a3/llama_2_7b/2vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/2vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true
+--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
---xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=67108864 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/4vm.sh b/MaxText/configs/a3/llama_2_7b/4vm.sh
index 7712d036a..fcf2f86f3 100644
--- a/MaxText/configs/a3/llama_2_7b/4vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/4vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true
---xla_gpu_enable_async_reduce_scatter=true --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
---xla_gpu_graph_level=0 --xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_latency_hiding_scheduler=true
+--xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions
+--xla_gpu_graph_level=0 --xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=536870912 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
diff --git a/MaxText/configs/a3/llama_2_7b/8vm.sh b/MaxText/configs/a3/llama_2_7b/8vm.sh
index bb003f2f7..72090e0ac 100644
--- a/MaxText/configs/a3/llama_2_7b/8vm.sh
+++ b/MaxText/configs/a3/llama_2_7b/8vm.sh
@@ -25,9 +25,9 @@ export CUDA_DEVICE_MAX_CONNECTIONS=1
 export NVTE_FUSED_ATTN=1
 export NCCL_DEBUG=VERSION
 export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
---xla_gpu_enable_latency_hiding_scheduler=true --xla_gpu_enable_async_all_gather=true --xla_gpu_enable_async_reduce_scatter=true
+--xla_gpu_enable_latency_hiding_scheduler=true
 --xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
---xla_gpu_enable_async_all_reduce=true --xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_enable_highest_priority_async_stream=true
 --xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
 --xla_gpu_reduce_scatter_combine_threshold_bytes=67108864 --xla_gpu_enable_pipelined_all_gather=true
 --xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true
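
For reference, this is how the XLA_FLAGS export in 16vm.sh reads after the patch, reconstructed only from the hunk above. Only the lines visible in the hunk are shown: the closing quote of the string (and any flags that follow it) lies outside the hunk context, and OUTPUT_PATH and RUN_NAME are assumed to be set earlier in the script.

# Post-patch XLA_FLAGS in 16vm.sh (portion visible in the hunk; the quoted
# string continues past these lines in the actual config)
export XLA_FLAGS="--xla_dump_to=$OUTPUT_PATH/$RUN_NAME/HLO_dumps/
--xla_gpu_enable_latency_hiding_scheduler=true
--xla_gpu_enable_triton_gemm=false --xla_gpu_simplify_all_fp_conversions --xla_gpu_graph_level=0
--xla_gpu_enable_highest_priority_async_stream=true
--xla_gpu_all_reduce_combine_threshold_bytes=1073741824 --xla_gpu_all_gather_combine_threshold_bytes=134217728
--xla_gpu_reduce_scatter_combine_threshold_bytes=134217728 --xla_gpu_enable_pipelined_all_gather=true
--xla_gpu_enable_pipelined_reduce_scatter=true --xla_gpu_enable_pipelined_all_reduce=true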