Skip to content

Commit

Permalink
Merge branch 'AMD' of https://github.com/EleutherAI/gpt-neox into main
Browse files Browse the repository at this point in the history
  • Loading branch information
kyriemao committed May 22, 2023
2 parents 7222e30 + d1062f3 commit a315331
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 18 deletions.
2 changes: 1 addition & 1 deletion megatron/fused_kernels/scaled_masked_softmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <torch/extension.h>
#include <vector>

Expand Down
3 changes: 1 addition & 2 deletions megatron/fused_kernels/scaled_masked_softmax_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <hip/hip_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "scaled_masked_softmax.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <torch/extension.h>
#include <vector>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <hip/hip_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "scaled_upper_triang_masked_softmax.h"
Expand Down
25 changes: 13 additions & 12 deletions megatron/fused_kernels/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,26 @@ def _get_cuda_bare_metal_version(cuda_dir):

srcpath = Path(__file__).parent.absolute()
cc_flag = []
_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
#_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
#if int(bare_metal_major) >= 11:
# cc_flag.append("-gencode")
# cc_flag.append("arch=compute_80,code=sm_80")

nvcc_flags = [
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"--use_fast_math",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
# "-gencode",
# "arch=compute_70,code=sm_70",
# "--use_fast_math",
"-U__HIP_NO_HALF_OPERATORS__",
"-U__HIP_NO_HALF_CONVERSIONS__",
"-D__HIP_PLATFORM_AMD__=1",
# "--expt-relaxed-constexpr",
# "--expt-extended-lambda",
]
cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag}
layernorm_cuda_args = {
"cxx": ["-O3"],
"nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"],
"nvcc": nvcc_flags + cc_flag, # + ["-maxrregcount=50"],
}
setup(
name="fused_kernels",
Expand Down

0 comments on commit a315331

Please sign in to comment.