Skip to content

Commit

Permalink
Merge branch 'AMD' of https://github.com/EleutherAI/gpt-neox into main
Browse files Browse the repository at this point in the history
  • Loading branch information
kyriemao committed May 22, 2023
2 parents 7222e30 + d1062f3 commit a315331
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 18 deletions.
2 changes: 1 addition & 1 deletion megatron/fused_kernels/scaled_masked_softmax.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <torch/extension.h>
#include <vector>

Expand Down
3 changes: 1 addition & 2 deletions megatron/fused_kernels/scaled_masked_softmax_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <hip/hip_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "scaled_masked_softmax.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include <cuda_fp16.h>
#include <hip/hip_fp16.h>
#include <torch/extension.h>
#include <vector>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <hip/hip_fp16.h>
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "scaled_upper_triang_masked_softmax.h"
Expand Down
25 changes: 13 additions & 12 deletions megatron/fused_kernels/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,26 @@ def _get_cuda_bare_metal_version(cuda_dir):

srcpath = Path(__file__).parent.absolute()
cc_flag = []
_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
if int(bare_metal_major) >= 11:
cc_flag.append("-gencode")
cc_flag.append("arch=compute_80,code=sm_80")
#_, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
#if int(bare_metal_major) >= 11:
# cc_flag.append("-gencode")
# cc_flag.append("arch=compute_80,code=sm_80")

nvcc_flags = [
"-O3",
"-gencode",
"arch=compute_70,code=sm_70",
"--use_fast_math",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
"--expt-relaxed-constexpr",
"--expt-extended-lambda",
# "-gencode",
# "arch=compute_70,code=sm_70",
# "--use_fast_math",
"-U__HIP_NO_HALF_OPERATORS__",
"-U__HIP_NO_HALF_CONVERSIONS__",
"-D__HIP_PLATFORM_AMD__=1",
# "--expt-relaxed-constexpr",
# "--expt-extended-lambda",
]
cuda_ext_args = {"cxx": ["-O3"], "nvcc": nvcc_flags + cc_flag}
layernorm_cuda_args = {
"cxx": ["-O3"],
"nvcc": nvcc_flags + cc_flag + ["-maxrregcount=50"],
"nvcc": nvcc_flags + cc_flag, # + ["-maxrregcount=50"],
}
setup(
name="fused_kernels",
Expand Down

0 comments on commit a315331

Please sign in to comment.