Skip to content

Commit

Permalink
[OpenBLAS_jll] Update to new build with BFloat16 kernels (JuliaLang#5…
Browse files Browse the repository at this point in the history
…3059)

This also

* drops a patch (`deps/patches/neoverse-generic-kernels.patch`) not
needed anymore for an [old
bug](OpenMathLib/OpenBLAS#2998) fixed upstream
in OpenBLAS. This results in ~5x speedup in the computation of
`BLAS.nrm2` (and hence `LinearAlgebra.norm` for vectors longer than
`LinearAlgebra.NRM2_CUTOFF` (== 32) elements) when the neoversen1
kernels are used, e.g. by default on all Apple Silicon CPUs
* adds a regression test for the above bug
* updates other patches when building openblas from source

Corresponding PR in Yggdrasil:
JuliaPackaging/Yggdrasil#7202.
  • Loading branch information
giordano committed Jan 26, 2024
1 parent 5cf1021 commit 5d4d6ab
Show file tree
Hide file tree
Showing 9 changed files with 301 additions and 123 deletions.
184 changes: 92 additions & 92 deletions deps/checksums/openblas

Large diffs are not rendered by default.

19 changes: 16 additions & 3 deletions deps/openblas.mk
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ endif
# don't touch scheduler affinity since we manage this ourselves
OPENBLAS_BUILD_OPTS += NO_AFFINITY=1

# Build BFloat16 kernels
OPENBLAS_BUILD_OPTS += BUILD_BFLOAT16=1

# Build for all architectures - required for distribution
ifeq ($(SANITIZE_MEMORY),1)
OPENBLAS_BUILD_OPTS += TARGET=GENERIC
Expand Down Expand Up @@ -95,12 +98,22 @@ $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied: $(BUILDDIR)/
patch -p1 -f < $(SRCDIR)/patches/openblas-ofast-power.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-ofast-power.patch-applied
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
patch -p1 -f < $(SRCDIR)/patches/openblas-avx512bf-kernels.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-avx512bf-kernels.patch-applied
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
patch -p1 -f < $(SRCDIR)/patches/openblas-gemv-multithreading.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-gemv-multithreading.patch-applied
cd $(BUILDDIR)/$(OPENBLAS_SRC_DIR) && \
patch -p1 -f < $(SRCDIR)/patches/neoverse-generic-kernels.patch
patch -p1 -f < $(SRCDIR)/patches/openblas-darwin-sve.patch
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/neoverse-generic-kernels.patch-applied
$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/openblas-darwin-sve.patch-applied
echo 1 > $@

$(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-compiled: $(BUILDDIR)/$(OPENBLAS_SRC_DIR)/build-configured
Expand Down
19 changes: 0 additions & 19 deletions deps/patches/neoverse-generic-kernels.patch

This file was deleted.

107 changes: 107 additions & 0 deletions deps/patches/openblas-avx512bf-kernels.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
From 1dada6d65d89d19b2cf89b12169f6b2196c90f1d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <[email protected]>
Date: Fri, 12 Jan 2024 00:10:56 +0100
Subject: [PATCH 1/2] Add compiler test and flag for AVX512BF16 capability

---
c_check | 22 ++++++++++++++++++++++
1 file changed, 22 insertions(+)

diff --git a/c_check b/c_check
index b5e4a9ad00..3e507be818 100755
--- a/c_check
+++ b/c_check
@@ -244,6 +244,7 @@ case "$data" in
esac

no_avx512=0
+no_avx512bf=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
@@ -262,6 +263,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
}

rm -rf "$tmpd"
+ if [ "$no_avx512" -eq 0 ]; then
+ tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+ tmpf="$tmpd/a.c"
+ code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
+ printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
+ if [ "$compiler" = "PGI" ]; then
+ args=" -tp cooperlake -c -o $tmpf.o $tmpf"
+ else
+ args=" -march=cooperlake -c -o $tmpf.o $tmpf"
+ fi
+ no_avx512bf=0
+ {
+ $compiler_name $flags $args >/dev/null 2>&1
+ } || {
+ no_avx512bf=1
+ }
+
+ rm -rf "$tmpd"
+ fi
fi

no_rv64gv=0
@@ -409,6 +429,7 @@ done
[ "$makefile" = "-" ] && {
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
exit 0
@@ -437,6 +458,7 @@ done
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+ [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"

From 995a990e24fdcc8080128a8abc17b4ccc66bd4fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <[email protected]>
Date: Fri, 12 Jan 2024 00:12:46 +0100
Subject: [PATCH 2/2] Make AVX512 BFLOAT16 kernels conditional on compiler
capability

---
kernel/x86_64/KERNEL.COOPERLAKE | 3 ++-
kernel/x86_64/KERNEL.SAPPHIRERAPIDS | 2 ++
2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
index dba94aea86..22b042029f 100644
--- a/kernel/x86_64/KERNEL.COOPERLAKE
+++ b/kernel/x86_64/KERNEL.COOPERLAKE
@@ -1,5 +1,5 @@
include $(KERNELDIR)/KERNEL.SKYLAKEX
-
+ifneq ($(NO_AVX512BF16), 1)
SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c
SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c
SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c
@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index 3a832e9174..0ab2b4ddcf 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,5 +1,6 @@
include $(KERNELDIR)/KERNEL.COOPERLAKE

+ifneq ($(NO_AVX512BF16), 1)
SBGEMM_SMALL_M_PERMIT =
SBGEMM_SMALL_K_NN =
SBGEMM_SMALL_K_B0_NN =
@@ -20,3 +21,4 @@ SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
34 changes: 34 additions & 0 deletions deps/patches/openblas-darwin-sve.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
From 03688a42622cf76e696859ce384e45aa26d927fc Mon Sep 17 00:00:00 2001
From: Ian McInerney <[email protected]>
Date: Tue, 23 Jan 2024 10:29:57 +0000
Subject: [PATCH] Build with proper aarch64 flags on Neoverse Darwin

We aren't affected by the problems in AppleClang that prompted this
fallback to an older architecture.
---
Makefile.arm64 | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile.arm64 b/Makefile.arm64
index ed52a9424..a8f3cb0f0 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -135,11 +135,11 @@ ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
-ifneq ($(OSNAME), Darwin)
+#ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
-else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
-endif
+#else
+#CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+#endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
endif
--
2.43.0

22 changes: 22 additions & 0 deletions deps/patches/openblas-gemv-multithreading.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
From d2fc4f3b4d7f41527bc7dc8f62e9aa6229cfac89 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <[email protected]>
Date: Wed, 17 Jan 2024 20:59:24 +0100
Subject: [PATCH] Increase multithreading threshold by a factor of 50

---
interface/gemv.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/interface/gemv.c b/interface/gemv.c
index 1f07635799..2c121f1308 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,

#ifdef SMP

- if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
+ if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = num_cpu_avail(2);
28 changes: 20 additions & 8 deletions deps/patches/openblas-ofast-power.patch
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
diff --git a/Makefile.power b/Makefile.power
index 28a0bae0..b4869fbd 100644
index aa1ca080a..42c417a78 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,7 +11,7 @@ endif

ifeq ($(CORE), POWER10)
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
ifeq ($(C_COMPILER), GCC))
ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
else
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
endif
else
@@ -22,7 +22,7 @@ endif
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
endif
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
@@ -34,7 +34,7 @@ endif

ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)
Expand All @@ -20,7 +32,7 @@ index 28a0bae0..b4869fbd 100644
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@@ -59,7 +59,7 @@ endif
@@ -70,7 +70,7 @@ endif

ifeq ($(CORE), POWER8)
ifneq ($(C_COMPILER), PGI)
Expand Down
9 changes: 9 additions & 0 deletions stdlib/LinearAlgebra/test/blas.jl
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,15 @@ Random.seed!(100)
@test BLAS.iamax(b) == findmax(fabs, b)[2] * (step(ind) >= 0)
end
end
@testset "nrm2 with non-finite elements" begin
# These tests would have caught <https://github.com/OpenMathLib/OpenBLAS/issues/2998>
# when running on appropriate hardware.
a = zeros(elty,n)
a[begin] = elty(-Inf)
@test BLAS.nrm2(a) === abs2(elty(Inf))
a[begin] = elty(NaN)
@test BLAS.nrm2(a) === abs2(elty(NaN))
end
@testset "scal" begin
α = rand(elty)
a = rand(elty,n)
Expand Down
2 changes: 1 addition & 1 deletion stdlib/OpenBLAS_jll/Project.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name = "OpenBLAS_jll"
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.26+1"
version = "0.3.26+2"

[deps]
# See note in `src/OpenBLAS_jll.jl` about this dependency.
Expand Down

0 comments on commit 5d4d6ab

Please sign in to comment.