3x3 matrix multiply could potentially be faster #512

Open
KristofferC opened this issue Oct 6, 2018 · 28 comments
Labels: feature (features and feature requests), performance (runtime performance), vectorization

Comments

@KristofferC (Contributor) commented Oct 6, 2018

This issue is mostly to document that there is a potential speedup to be gained in e.g. 3x3 matrix multiply (and possibly other operations where the matrix size is "odd").

Current StaticArrays.jl:

julia> for n in (2,3,4)
           s = Ref(rand(SMatrix{n,n}))
           @btime $(s)[] * $(s)[]
       end
  2.709 ns (0 allocations: 0 bytes)
  10.274 ns (0 allocations: 0 bytes)
  6.059 ns (0 allocations: 0 bytes)

3x3 is quite slow (the Ref shenanigans are there to prevent spurious benchmark optimizations).

Handcoding something with SIMD.jl:

using StaticArrays
import SIMD
const SVec{N, T} = SIMD.Vec{N, T}

# load given range of linear indices into SVec
@generated function tosimd(D::NTuple{N, T}, ::Type{Val{strt}}, ::Type{Val{stp}}) where {N, T, strt, stp}
    expr = Expr(:tuple, [:(D[$i]) for i in strt:stp]...)
    M = length(expr.args)
    return quote
        $(Expr(:meta, :inline))
        @inbounds return SVec{$M, T}($expr)
    end
end

# constructor SMatrix from several SVecs
@generated function (::Type{SMatrix{dim, dim}})(r::NTuple{M, SVec{N}}) where {dim, M, N}
    return quote
        $(Expr(:meta, :inline))
        @inbounds return SMatrix{$dim, $dim}($(Expr(:tuple, [:(r[$j][$i]) for i in 1:N, j in 1:M]...)))
    end
end


function matmul3x3(a::SMatrix, b::SMatrix)
    D1 = a.data; D2 = b.data
    SV11 = tosimd(D1, Val{1}, Val{3})
    SV12 = tosimd(D1, Val{4}, Val{6})
    SV13 = tosimd(D1, Val{7}, Val{9})
    r1 = muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1]))
    r2 = muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4]))
    r3 = muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7]))
    return SMatrix{3,3}((r1, r2, r3))
end
julia> @btime matmul3x3($(Ref(s))[], $(Ref(s))[])
  4.391 ns (0 allocations: 0 bytes)

julia> matmul3x3(s,s) - s*s
3×3 SArray{Tuple{3,3},Float64,2,9}:
 0.0  0.0   0.0        
 0.0  0.0  -1.11022e-16
 0.0  0.0   0.0        

which is a ~2.5x speedup.

The native code for the hand-coded version is:

julia> @code_native matmul3x3(s,s)
	.section	__TEXT,__text,regular,pure_instructions
	vmovupd	(%rsi), %xmm0
	vmovsd	16(%rsi), %xmm1         ## xmm1 = mem[0],zero
	vinsertf128	$1, %xmm1, %ymm0, %ymm0
	vmovupd	24(%rsi), %xmm1
	vmovsd	40(%rsi), %xmm2         ## xmm2 = mem[0],zero
	vinsertf128	$1, %xmm2, %ymm1, %ymm1
	vmovupd	48(%rsi), %xmm2
	vmovsd	64(%rsi), %xmm3         ## xmm3 = mem[0],zero
	vinsertf128	$1, %xmm3, %ymm2, %ymm2
	vbroadcastsd	(%rdx), %ymm3
	vmulpd	%ymm3, %ymm0, %ymm3
	vbroadcastsd	8(%rdx), %ymm4
	vfmadd213pd	%ymm3, %ymm1, %ymm4
	vbroadcastsd	16(%rdx), %ymm3
	vfmadd213pd	%ymm4, %ymm2, %ymm3
	vbroadcastsd	24(%rdx), %ymm4
	vmulpd	%ymm4, %ymm0, %ymm4
	vbroadcastsd	32(%rdx), %ymm5
	vfmadd213pd	%ymm4, %ymm1, %ymm5
	vbroadcastsd	40(%rdx), %ymm4
	vfmadd213pd	%ymm5, %ymm2, %ymm4
	vbroadcastsd	48(%rdx), %ymm5
	vmulpd	%ymm5, %ymm0, %ymm0
	vbroadcastsd	56(%rdx), %ymm5
	vfmadd213pd	%ymm0, %ymm1, %ymm5
	vbroadcastsd	64(%rdx), %ymm0
	vfmadd213pd	%ymm5, %ymm2, %ymm0
	vbroadcastsd	%xmm4, %ymm1
	vblendpd	$8, %ymm1, %ymm3, %ymm1 ## ymm1 = ymm3[0,1,2],ymm1[3]
	vmovupd	%ymm1, (%rdi)
	vextractf128	$1, %ymm0, %xmm1
	vinsertf128	$1, %xmm0, %ymm0, %ymm0
	vpermpd	$233, %ymm4, %ymm2      ## ymm2 = ymm4[1,2,2,3]
	vblendpd	$12, %ymm0, %ymm2, %ymm0 ## ymm0 = ymm2[0,1],ymm0[2,3]
	vmovupd	%ymm0, 32(%rdi)
	vmovlpd	%xmm1, 64(%rdi)
	movq	%rdi, %rax
	vzeroupper
	retq
	nopw	%cs:(%rax,%rax)
;}

while for the standard * it is:

julia> @code_native s*s
	.section	__TEXT,__text,regular,pure_instructions
	vmovsd	(%rdx), %xmm0           ## xmm0 = mem[0],zero
	vmovsd	8(%rdx), %xmm7          ## xmm7 = mem[0],zero
	vmovsd	16(%rdx), %xmm6         ## xmm6 = mem[0],zero
	vmovsd	16(%rsi), %xmm11        ## xmm11 = mem[0],zero
	vmovsd	40(%rsi), %xmm12        ## xmm12 = mem[0],zero
	vmovsd	64(%rsi), %xmm9         ## xmm9 = mem[0],zero
	vmovsd	24(%rdx), %xmm3         ## xmm3 = mem[0],zero
	vmovupd	(%rsi), %xmm4
	vmovupd	8(%rsi), %xmm10
	vmovhpd	(%rsi), %xmm11, %xmm5   ## xmm5 = xmm11[0],mem[0]
	vinsertf128	$1, %xmm5, %ymm4, %ymm5
	vunpcklpd	%xmm3, %xmm0, %xmm1 ## xmm1 = xmm0[0],xmm3[0]
	vmovddup	%xmm0, %xmm0    ## xmm0 = xmm0[0,0]
	vinsertf128	$1, %xmm1, %ymm0, %ymm0
	vmulpd	%ymm0, %ymm5, %ymm0
	vmovsd	32(%rdx), %xmm5         ## xmm5 = mem[0],zero
	vmovupd	24(%rsi), %xmm8
	vmovhpd	24(%rsi), %xmm12, %xmm1 ## xmm1 = xmm12[0],mem[0]
	vinsertf128	$1, %xmm1, %ymm8, %ymm1
	vunpcklpd	%xmm5, %xmm7, %xmm2 ## xmm2 = xmm7[0],xmm5[0]
	vmovddup	%xmm7, %xmm7    ## xmm7 = xmm7[0,0]
	vinsertf128	$1, %xmm2, %ymm7, %ymm2
	vmulpd	%ymm2, %ymm1, %ymm1
	vaddpd	%ymm1, %ymm0, %ymm1
	vmovsd	40(%rdx), %xmm0         ## xmm0 = mem[0],zero
	vmovhpd	48(%rsi), %xmm9, %xmm2  ## xmm2 = xmm9[0],mem[0]
	vmovupd	48(%rsi), %xmm13
	vinsertf128	$1, %xmm2, %ymm13, %ymm2
	vunpcklpd	%xmm0, %xmm6, %xmm7 ## xmm7 = xmm6[0],xmm0[0]
	vmovddup	%xmm6, %xmm6    ## xmm6 = xmm6[0,0]
	vinsertf128	$1, %xmm7, %ymm6, %ymm6
	vmulpd	%ymm6, %ymm2, %ymm2
	vaddpd	%ymm2, %ymm1, %ymm14
	vmovsd	48(%rdx), %xmm1         ## xmm1 = mem[0],zero
	vmovsd	56(%rdx), %xmm2         ## xmm2 = mem[0],zero
	vmovsd	64(%rdx), %xmm6         ## xmm6 = mem[0],zero
	vinsertf128	$1, %xmm4, %ymm10, %ymm4
	vmovddup	%xmm3, %xmm3    ## xmm3 = xmm3[0,0]
	vmovddup	%xmm1, %xmm7    ## xmm7 = xmm1[0,0]
	vinsertf128	$1, %xmm7, %ymm3, %ymm3
	vmulpd	%ymm3, %ymm4, %ymm3
	vmovupd	32(%rsi), %xmm4
	vinsertf128	$1, %xmm8, %ymm4, %ymm4
	vmovddup	%xmm5, %xmm5    ## xmm5 = xmm5[0,0]
	vmovddup	%xmm2, %xmm7    ## xmm7 = xmm2[0,0]
	vinsertf128	$1, %xmm7, %ymm5, %ymm5
	vmulpd	%ymm5, %ymm4, %ymm4
	vaddpd	%ymm4, %ymm3, %ymm3
	vmovupd	56(%rsi), %xmm4
	vinsertf128	$1, %xmm13, %ymm4, %ymm4
	vmovddup	%xmm0, %xmm0    ## xmm0 = xmm0[0,0]
	vmovddup	%xmm6, %xmm5    ## xmm5 = xmm6[0,0]
	vinsertf128	$1, %xmm5, %ymm0, %ymm0
	vmulpd	%ymm0, %ymm4, %ymm0
	vaddpd	%ymm0, %ymm3, %ymm0
	vmulsd	%xmm1, %xmm11, %xmm1
	vmulsd	%xmm2, %xmm12, %xmm2
	vaddsd	%xmm2, %xmm1, %xmm1
	vmulsd	%xmm6, %xmm9, %xmm2
	vaddsd	%xmm2, %xmm1, %xmm1
	vmovupd	%ymm14, (%rdi)
	vmovupd	%ymm0, 32(%rdi)
	vmovsd	%xmm1, 64(%rdi)
	movq	%rdi, %rax
	vzeroupper
	retq
	nop
;}

We can see how heavily the xmm registers are used here compared to the SIMD version. Comparing to the 4x4 case:

julia> s = rand(SMatrix{4,4})

julia> @code_native s*s
	.section	__TEXT,__text,regular,pure_instructions
	vbroadcastsd	(%rdx), %ymm4
	vmovupd	(%rsi), %ymm3
	vmovupd	32(%rsi), %ymm2
	vmovupd	64(%rsi), %ymm1
	vmovupd	96(%rsi), %ymm0
	vmulpd	%ymm4, %ymm3, %ymm4
	vbroadcastsd	8(%rdx), %ymm5
	vmulpd	%ymm5, %ymm2, %ymm5
	vaddpd	%ymm5, %ymm4, %ymm4
        ...
	vbroadcastsd	96(%rdx), %ymm7
	vmulpd	%ymm7, %ymm3, %ymm3
	vbroadcastsd	104(%rdx), %ymm7
	vmulpd	%ymm7, %ymm2, %ymm2
	vaddpd	%ymm2, %ymm3, %ymm2
	vbroadcastsd	112(%rdx), %ymm3
	vmulpd	%ymm3, %ymm1, %ymm1
	vaddpd	%ymm1, %ymm2, %ymm1
	vbroadcastsd	120(%rdx), %ymm2
	vmulpd	%ymm2, %ymm0, %ymm0
	vaddpd	%ymm0, %ymm1, %ymm0
	vmovupd	%ymm4, (%rdi)
	vmovupd	%ymm5, 32(%rdi)
	vmovupd	%ymm6, 64(%rdi)
	vmovupd	%ymm0, 96(%rdi)
	movq	%rdi, %rax
	vzeroupper
	retq
	nopl	(%rax)

we can now see that everything is using the ymm registers, which is what we want.

@andyferris (Member)

Very nice!

Do you think this is something we should import SIMD.jl for, or could we rather specialize the 3x3 matrix multiplication case? What happens with SIMD and large matrices (when SIMD.jl was brand new I heard LLVM at the time used to crash for large odd LLVM vectors like size 7... is that fixed)? What about when the eltype is not a SIMD type?

Also, is the 3x3 matrix * 3 vector code we generate optimal, or does it suffer from similar issues?

@KristofferC (Contributor, Author)

Not sure what the best thing to do is. FWIW, this just seems to be the SLP vectorizer doing a bad job on this code; it would be interesting to write the loop in C++ and see what code is generated.

We should only dispatch to SIMD-stuff if the eltype is one of the native LLVM types.
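
Such a guard could be sketched roughly like this (hypothetical names; matmul3x3 is the SIMD.jl-based function from the first post):

# Sketch only: restrict a hypothetical SIMD fast path to eltypes LLVM handles natively.
const NativeEltype = Union{Float32, Float64, Int8, Int16, Int32, Int64,
                           UInt8, UInt16, UInt32, UInt64}

maybe_simd_mul(a::SMatrix{3,3,T}, b::SMatrix{3,3,T}) where {T} =
    T <: NativeEltype ? matmul3x3(a, b) : a * b   # otherwise fall back to the generic method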

Regarding matvec, it currently does not get vectorized at all and forcing it to do so doesn't seem to give any real benefit.
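
For concreteness, forcing the 3x3 matvec to vectorize looks roughly like the sketch below (reusing the tosimd helper from the first post; as noted, it did not show a clear win):

function matvec3x3(a::SMatrix{3,3}, x::SVector{3})
    D1 = a.data
    SV11 = tosimd(D1, Val{1}, Val{3})   # columns of a as SIMD vectors
    SV12 = tosimd(D1, Val{4}, Val{6})
    SV13 = tosimd(D1, Val{7}, Val{9})
    r = muladd(SV13, x[3], muladd(SV12, x[2], SV11 * x[1]))
    return SVector{3}(r[1], r[2], r[3])
end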

I'm not sure if opening the SIMD.jl door is a good idea or not. It would be cool to see where more performance can be squeezed out, but explicit SIMD is arguably a bit messy.

@chriselrod (Contributor) commented Oct 9, 2018

We can do much better at many different sizes!
I benchmarked padded statically sized arrays vs StaticArrays.jl here.
Padding allowed applying optimized matrix multiplication kernels, which are much faster.

Now, as the next step, SIMD.jl provides masked load and store operations. I stole all the code from SIMD.jl, and also switched it to simply use NTuple{N,Core.VecElement{T}} rather than wrapping it in a single field struct.
This has the advantage of being easier on LLVM. SIMD.jl results in redundant mov operations that do not always get eliminated by the compiler. It also results in more segmentation faults when used with libraries like SLEEF.
The disadvantage is that extending any base methods would be type piracy. Therefore SIMDPirates.jl does not extend any base methods, instead defining functions like vmul for multiplication. I added an @pirate macro that will replace expressions like a * b with vmul(a, b), in case someone prefers writing with the nice overloaded syntax of SIMD.jl. It is untested, so I won't promise it works.

I would propose an eventual PR incorporating some of what follows, or at least its consideration.

Sorry for the messiness of the following code, but in working on jBLAS.jl, I wanted a single function to generate the desired kernel for dense matrix multiplication. There, it optionally prefetches the following kernel, decides between initializing the product (D = A * X) or updating it (D += A * X), etc.
Much of that machinery can be trimmed away, and odds are the code could be simplified regardless.

I also only wrapped a direct call to the kernel here. Note that this only works so long as at least one register is left unoccupied while the entire product matrix and a full column of matrix A are held in the CPU's registers.
If the product matrix is larger than that, you'd have to start looping over the kernel and applying it in chunks. I'm not doing that here for simplicity, meaning 8x6 products is the maximum size for which this code is fast on avx-2 machines, and 16x14 products is the maximum for avx-512.

I would however recommend looping over the kernel, and only falling back to BLAS for matrices > 100x100. On that note, lots of other performance improvements are possible for MMatrices. That is, simply use new() to initialize undefined MMatrices, and then fill them with a loop. Also, replace the getindex call with an unsafe_load. These are all far faster to compile for large MArrays.
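
A rough sketch of those MMatrix suggestions (a hypothetical helper, not the kernel below): build an undef MMatrix and move data through pointers instead of getindex/setindex!.

using StaticArrays

function mul_into_mmatrix(A::MMatrix{M,N,T}, X::MMatrix{N,P,T}) where {M,N,P,T}
    D = MMatrix{M,P,T,M*P}(undef)                # uninitialized output
    pD, pA, pX = pointer(D), pointer(A), pointer(X)
    GC.@preserve D A X begin
        for p ∈ 0:P-1, m ∈ 0:M-1
            acc = zero(T)
            for n ∈ 0:N-1
                # column-major linear indices: A[m+1, n+1] and X[n+1, p+1]
                acc = muladd(unsafe_load(pA, n*M + m + 1), unsafe_load(pX, p*N + n + 1), acc)
            end
            unsafe_store!(pD, acc, p*M + m + 1)
        end
    end
    return D
end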

Without further ado,

using StaticArrays, BenchmarkTools, SIMDPirates, LinearAlgebra, Base.Cartesian, CpuId
using Core.Intrinsics: llvmcall

# These constants can be defined in a build script, so that CpuId is only a build dependency.
const REGISTER_SIZE = simdbytes()
const CACHELINE_SIZE = cachelinesize()

struct PrefetchA
    A::Int
end
struct PrefetchX
    X::Int
end
struct PrefetchAX
    A::Int
    X::Int
end
prefetch_A(::Any)    = (nothing, nothing)
prefetch_X(::Any)    = (nothing, nothing, nothing)
function prefetch_A(::Type{PF}) where {PF <: Union{PrefetchA, PrefetchAX}}
    (
        :(@nexprs $Qₚ q -> prefetch(pA + pf.A + $CACHELINE_SIZE*(q-1), Val(3), Val(0))),
        :(@nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0)))
    )
end
function prefetch_X(::Type{PrefetchX})
    (
        :(@nexprs $Pₖ p -> prefetch(pX + pf.X + (p-1)*$X_stride, Val(3), Val(0))),
        :(@nexprs $Pₖ p -> prefetch(pX + pf.X + n₁*$T_size + (p-1)*$X_stride, Val(3), Val(0))),
        :(prefetch(pX + pf.X + $(N*T_size) + (p-1)*$X_stride, Val(3), Val(0)))
    )
end
@generated function prefetch(address, ::Val{Locality} = Val(1), ::Val{RorW} = Val(0)) where {Locality, RorW}
    prefetch_call_string = """%addr = inttoptr i64 %0 to i8*
    call void @llvm.prefetch(i8* %addr, i32 $RorW, i32 $Locality, i32 1)
    ret void"""
    quote
        $(Expr(:meta, :inline))
        llvmcall(("declare void @llvm.prefetch(i8* , i32 , i32 , i32 )",
        $prefetch_call_string), Cvoid, Tuple{Ptr{Cvoid}}, address)
    end
end
mask_expr(W, r) = :($(Expr(:tuple, [i > r ? Core.VecElement{Bool}(false) : Core.VecElement{Bool}(true) for i ∈ 1:W]...)))

function mulinit(V, WT, Q, Pₖ, X_stride, r, mask_expr, inline_expr, pfA_1)
    if r == 0
        q_load_expr = :(@nexprs $Q q -> vA_q = vload($V, pA + $WT*(q-1)))
    else
        q_load_expr = quote
            @nexprs $(Q-1) q -> vA_q = vload($V, pA + $WT*(q-1))
            $(Symbol(:vA_, Q)) = vload($V, pA + $(WT*(Q-1)), $mask_expr)
        end
    end

    quote
        $inline_expr
        $q_load_expr
        @nexprs $Pₖ p -> begin
            vX = vbroadcast($V, unsafe_load(pX + (p-1)*$X_stride))
            @nexprs $Q q -> Dx_p_q = SIMDPirates.vmul(vA_q, vX)
        end
        $pfA_1
    end
end
function gemminit(V, WT, Q, Pₖ, AD_stride, r, mask_expr, inline_expr)
    if r == 0
        q = quote
            $inline_expr
            @nexprs $Pₖ p -> @nexprs $Q q -> Dx_p_q = vload($V, pD + $WT*(q-1) + $AD_stride*(p-1))
        end
    else
        q = quote
            $inline_expr
        end
        for p ∈ 1:Pₖ
            for q ∈ 1:Q-1
                push!(q.args, :($(Symbol(:Dx_,p,:_,q)) = vload($V, pD + $(WT*(q-1) + AD_stride*(p-1)))))
            end
            push!(q.args, :($(Symbol(:Dx_,p,:_,Q)) = vload($V, pD + $(WT*(Q-1) + AD_stride*(p-1)),$mask_expr)))
        end
    end
    q
end

function kernel_quote(Mₖ,Pₖ,stride_AD,stride_X,N,T,init,inline = false, pf = nothing)
    T_size = sizeof(T)
    AD_stride = stride_AD * T_size
    X_stride = stride_X * T_size
    W = REGISTER_SIZE ÷ T_size
    while W > 2Mₖ
        W >>= 1
    end
    WT = W * T_size
    Q, r = divrem(Mₖ, W) #Assuming Mₖ is a multiple of W
    V = Vec{W,T}
    if r == 0
        mask = :()
        A_load_expr = :(@nexprs $Q q -> vA_q = vload($V, pA + n*$AD_stride + $WT*(q-1)))
        D_store = :(@nexprs $Q q -> vstore(Dx_p_q, pD + $WT*(q-1) + $AD_stride*(p-1)))
    else
        mask = mask_expr(W, r)
        if Q == 0
            Q = 1
            A_load_expr = :($(Symbol(:vA_, Q)) = vload($V, pA + n*$AD_stride + $(WT*(Q-1)), $mask))
        else
            A_load_expr = quote
                @nexprs $Q q -> vA_q = vload($V, pA + n*$AD_stride + $WT*(q-1))
            end
            Q += 1
            push!(A_load_expr.args, :($(Symbol(:vA_, Q)) = vload($V, pA + n*$AD_stride + $(WT*(Q-1)), $mask)))
        end
        D_store = quote
            @nexprs $(Q-1) q -> vstore(Dx_p_q, pD + $WT*(q-1) + $AD_stride*(p-1))
            vstore($(Symbol(:Dx_p_, Q)), pD + $(WT*(Q-1)) + $AD_stride*(p-1), $mask)
        end
    end
    C = CACHELINE_SIZE ÷ T_size
    Qₚ = cld(Mₖ, C)
    # Check whether we are prefetching A and/or X.
    pfA_1, pfA_2 = prefetch_A(pf)
    pfX_1, pfX_2, pfX_3 = prefetch_X(pf)
    inline_expr = inline ? Expr(:meta, :inline) : :(nothing)
    if init
        q = mulinit(V, WT, Q, Pₖ, X_stride, r, mask, inline_expr, pfA_1)
    else
        q = gemminit(V, WT, Q, Pₖ, AD_stride, r, mask, inline_expr)
    end

    if pfX_1 == nothing
        push!(q.args,
        quote
            for n ∈ $(init ? 1 : 0):$(N-1)
                $A_load_expr
                @nexprs $Pₖ p -> begin
                    vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                    @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                end
                $pfA_2
                # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            end
            @nexprs $Pₖ p -> $D_store
            nothing
        end)
    else
        push!(q.args,
        quote
            # @nexprs $Qₚ q -> prefetch(pA + pf.A + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            for n ∈ $(init ? 1 : 0):$(C-1)
                $A_load_expr
                @nexprs $Pₖ p -> begin
                    vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                    @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                end
                $pfA_2
                # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            end
            # @nexprs $Pₖ p -> prefetch(pX + pf.X + (p-1)*$X_stride, Val(3), Val(0))
            $pfX_1

            for n₁ ∈ $C:$C:$(N-C)
                for n ∈ n₁:n₁+$(C-1)
                    $A_load_expr
                    @nexprs $Pₖ p -> begin
                        vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                        @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                    end
                    $pfA_2
                    # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
                end
                # @nexprs $Pₖ p -> prefetch(pX + pf.X + n₁*$T_size + (p-1)*$X_stride, Val(3), Val(0))
                $pfX_2
            end
            for n ∈ $(N - (N % C)):$(N-1)
                $A_load_expr
                @nexprs $Pₖ p -> begin
                    vX = vbroadcast($V, unsafe_load(pX + n*$T_size + (p-1)*$X_stride))
                    @nexprs $Q q -> Dx_p_q = vfma(vA_q, vX, Dx_p_q)
                end
                $pfA_2
                # @nexprs $Qₚ q -> prefetch(pA + pf.A + n*$AD_stride + $CACHELINE_SIZE*(q-1), Val(3), Val(0))
            end
            @nexprs $Pₖ p -> begin
                # prefetch(pX + pf.X + $(N*T_size) + (p-1)*$X_stride, Val(3), Val(0))
                $pfX_3
                $D_store
            end
            nothing
        end)
    end
    q
end
@generated function fastmul!(D::MMatrix{M,P,T},A::MMatrix{M,N,T},X::MMatrix{N,P,T}) where {M,N,P,T}
    quote
        pD, pA, pX = pointer(D), pointer(A), pointer(X)
        # Mₖ,Pₖ,stride_AD,stride_X,N,T,init
        $(kernel_quote(M,P,M,N,N,T,true,true))
    end
end

max_cols = REGISTER_SIZE == 32 ? 6 : 14
for n ∈ 2:(REGISTER_SIZE == 32 ? 8 : 16)
    m1 = @MMatrix randn(n,n)
    m2 = @MMatrix randn(n,min(n,max_cols))
    m3 = @MMatrix randn(n,min(n,max_cols))
    @show size(m3), size(m1), size(m2)
    println("MMatrix Multiplication:")
    @btime mul!($m3, $m1, $m2)
    println("SMatrix Multiplication:")
    s3 = @btime $m1 * $m2
    println("fastmul!:")
    @btime fastmul!($m3, $m1, $m2)
    @show m3 - s3
    println("")
end

On a computer with avx-512, this yields:

(size(m3), size(m1), size(m2)) = ((2, 2), (2, 2), (2, 2))
MMatrix Multiplication:
  3.135 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  1.584 ns (0 allocations: 0 bytes)
fastmul!:
  1.896 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0; 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((3, 3), (3, 3), (3, 3))
MMatrix Multiplication:
  7.383 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  8.274 ns (0 allocations: 0 bytes)
fastmul!:
  3.237 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0; 0.0 0.0 0.0; 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((4, 4), (4, 4), (4, 4))
MMatrix Multiplication:
  10.524 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  3.528 ns (0 allocations: 0 bytes)
fastmul!:
  4.154 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((5, 5), (5, 5), (5, 5))
MMatrix Multiplication:
  19.038 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  22.561 ns (0 allocations: 0 bytes)
fastmul!:
  5.894 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((6, 6), (6, 6), (6, 6))
MMatrix Multiplication:
  30.316 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  39.103 ns (0 allocations: 0 bytes)
fastmul!:
  7.837 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((7, 7), (7, 7), (7, 7))
MMatrix Multiplication:
  51.105 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  62.420 ns (0 allocations: 0 bytes)
fastmul!:
  11.871 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((8, 8), (8, 8), (8, 8))
MMatrix Multiplication:
  36.552 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  12.940 ns (0 allocations: 0 bytes)
fastmul!:
  12.794 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((9, 9), (9, 9), (9, 9))
MMatrix Multiplication:
  68.433 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  73.896 ns (0 allocations: 0 bytes)
fastmul!:
  24.042 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((10, 10), (10, 10), (10, 10))
MMatrix Multiplication:
  106.568 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  109.546 ns (0 allocations: 0 bytes)
fastmul!:
  31.296 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((11, 11), (11, 11), (11, 11))
MMatrix Multiplication:
  161.298 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  151.703 ns (0 allocations: 0 bytes)
fastmul!:
  38.405 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((12, 12), (12, 12), (12, 12))
MMatrix Multiplication:
  210.829 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  216.941 ns (0 allocations: 0 bytes)
fastmul!:
  47.986 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((13, 13), (13, 13), (13, 13))
MMatrix Multiplication:
  315.835 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  302.625 ns (0 allocations: 0 bytes)
fastmul!:
  65.856 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((14, 14), (14, 14), (14, 14))
MMatrix Multiplication:
  466.087 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  396.795 ns (0 allocations: 0 bytes)
fastmul!:
  66.755 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((15, 14), (15, 15), (15, 14))
MMatrix Multiplication:
  548.775 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  450.668 ns (0 allocations: 0 bytes)
fastmul!:
  67.804 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

(size(m3), size(m1), size(m2)) = ((16, 14), (16, 16), (16, 14))
MMatrix Multiplication:
  545.207 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  83.752 ns (0 allocations: 0 bytes)
fastmul!:
  65.457 ns (0 allocations: 0 bytes)
m3 - s3 = [0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0; 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]

EDIT:
Unfortunately, I'm not sure how we could actually use the masked store instructions with SArrays, since they aren't heap allocated, meaning we can't get pointers.
Still, this could make using preallocated MArrays extremely fast.

Anyone more familiar with LLVM know how?

Also, FWIW:

julia> n = 3;

julia> m1 = @MMatrix randn(n,n);

julia> m2 = @MMatrix randn(n,n);

julia> m3 = @MMatrix randn(n,n);

julia> @code_native fastmul!(m3, m1, m2)
	.text
; Function fastmul! {
; Location: REPL[17]:2
	movq	%rsi, -8(%rsp)
	movq	8(%rsi), %rax
	movq	16(%rsi), %rcx
; Function macro expansion; {
; Location: REPL[17]:5
; Function macro expansion; {
; Location: REPL[14]:6
; Function vload; {
; Location: memory.jl:80
; Function vload; {
; Location: memory.jl:80
; Function macro expansion; {
; Location: memory.jl:109
	movb	$7, %dl
	kmovd	%edx, %k1
	vmovupd	(%rax), %ymm0 {%k1} {z}
;}}}}
; Function macro expansion; {
; Location: llvmwrap.jl:81
	vmulpd	48(%rcx){1to4}, %ymm0, %ymm1
	vmulpd	24(%rcx){1to4}, %ymm0, %ymm2
	vmulpd	(%rcx){1to4}, %ymm0, %ymm0
	movq	(%rsi), %rdx
;}
; Function macro expansion; {
; Location: REPL[14]:16
; Function macro expansion; {
; Location: REPL[16]:48
; Function vload; {
; Location: memory.jl:80
; Function vload; {
; Location: memory.jl:80
; Function macro expansion; {
; Location: memory.jl:109
	vmovupd	24(%rax), %ymm3 {%k1} {z}
;}}}}
; Function macro expansion; {
; Location: llvmwrap.jl:170
	vfmadd231pd	8(%rcx){1to4}, %ymm3, %ymm0
	vfmadd231pd	32(%rcx){1to4}, %ymm3, %ymm2
	vfmadd231pd	56(%rcx){1to4}, %ymm3, %ymm1
;}
; Function macro expansion; {
; Location: REPL[16]:48
; Function vload; {
; Location: memory.jl:80
; Function vload; {
; Location: memory.jl:80
; Function macro expansion; {
; Location: memory.jl:109
	vmovupd	48(%rax), %ymm3 {%k1} {z}
;}}}
; Location: REPL[16]:51
; Function vfma; {
; Location: floating_point_arithmetic.jl:48
; Function llvmwrap; {
; Location: llvmwrap.jl:148
; Function llvmwrap; {
; Location: llvmwrap.jl:148
; Function macro expansion; {
; Location: llvmwrap.jl:170
	vfmadd231pd	16(%rcx){1to4}, %ymm3, %ymm0
	vfmadd231pd	40(%rcx){1to4}, %ymm3, %ymm2
	vfmadd231pd	64(%rcx){1to4}, %ymm3, %ymm1
;}}}}
; Location: REPL[16]:30
; Function vstore; {
; Location: memory.jl:179
; Function vstore; {
; Location: memory.jl:179
; Function macro expansion; {
; Location: memory.jl:207
	vmovupd	%ymm0, (%rdx) {%k1}
	vmovupd	%ymm2, 24(%rdx) {%k1}
	vmovupd	%ymm1, 48(%rdx) {%k1}
;}}}}}
	movabsq	$139651999485960, %rax  # imm = 0x7F0343D25008
	vzeroupper
	retq
;}}

Worth pointing out that a bit of thermal throttling is obvious with these benchmarks.
Maybe I should add sleep(X) into the for loop between each mul? For example, the minimum time for 14x14 was 66.755ns, but when I rerun just that benchmark after letting my computer rest for a few seconds:

julia> n = 14;

julia> m1 = @MMatrix randn(n,n);

julia> m2 = @MMatrix randn(n,n);

julia> m3 = @MMatrix randn(n,n);

julia> @benchmark fastmul!($m3, $m1, $m2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     61.836 ns (0.00% GC)
  median time:      61.975 ns (0.00% GC)
  mean time:        63.593 ns (0.00% GC)
  maximum time:     102.706 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     981

Not sure what constitutes fair benchmarking -- thermal throttling is something we'd be concerned with in practice. But the function benchmarked last in the loop is likely to suffer more than the others.
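
Concretely, the sleep idea would just be the same loop as above with a pause between sizes (a sketch, same names as before):

for n ∈ 2:(REGISTER_SIZE == 32 ? 8 : 16)
    m1 = @MMatrix randn(n,n)
    m2 = @MMatrix randn(n,min(n,max_cols))
    m3 = @MMatrix randn(n,min(n,max_cols))
    @btime mul!($m3, $m1, $m2)
    @btime $m1 * $m2
    @btime fastmul!($m3, $m1, $m2)
    sleep(2.0)   # let clock speeds recover before benchmarking the next size
end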

EDIT:
Interestingly, KristofferC's method is faster, but I can't replicate the performance with MMatrices:

julia> import SIMD

julia> const SVec{N, T} = SIMD.Vec{N, T}
SIMD.Vec{N,T} where T where N

julia> # load given range of linear indices into SVec
       @generated function tosimd(D::NTuple{N, T}, ::Type{Val{strt}}, ::Type{Val{stp}}) where {N, T, strt, stp}
           expr = Expr(:tuple, [:(D[$i]) for i in strt:stp]...)
           M = length(expr.args)
           return quote
               $(Expr(:meta, :inline))
               @inbounds return SVec{$M, T}($expr)
           end
       end
tosimd (generic function with 1 method)

julia> # constructor SMatrix from several SVecs
       @generated function (::Type{SMatrix{dim, dim}})(r::NTuple{M, SVec{N}}) where {dim, M, N}
           return quote
               $(Expr(:meta, :inline))
               @inbounds return SMatrix{$dim, $dim}($(Expr(:tuple, [:(r[$j][$i]) for i in 1:N, j in 1:M]...)))
           end
       end

julia> n = 3;

julia> m1 = @MMatrix randn(n,n);

julia> m2 = @MMatrix randn(n,n);

julia> m3 = @MMatrix randn(n,n);

julia> function matmul3x3(a::MMatrix{3,3,T}, b::MMatrix{3,3,T}) where T
          D1 = a.data; D2 = b.data
          SV11 = tosimd(D1, Val{1}, Val{3})
          SV12 = tosimd(D1, Val{4}, Val{6})
          SV13 = tosimd(D1, Val{7}, Val{9})
          r1 = muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1]))
          r2 = muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4]))
          r3 = muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7]))
          return SMatrix{3,3}((r1, r2, r3))
       end
matmul3x3 (generic function with 2 methods)

julia> function matmul3x3_vstore!(C::MMatrix{3,3,T}, A::MMatrix{3,3,T}, B::MMatrix{3,3,T}) where T
          D1 = A.data; D2 = B.data
          SV11 = tosimd(D1, Val{1}, Val{3})
          SV12 = tosimd(D1, Val{4}, Val{6})
          SV13 = tosimd(D1, Val{7}, Val{9})
          pC = pointer(C)
          SIMD.vstore(muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1])), pC)
          SIMD.vstore(muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4])), pC + 3sizeof(T))
          SIMD.vstore(muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7])), pC + 6sizeof(T))
          return C
       end
matmul3x3_vstore! (generic function with 1 method)

julia> function matmul3x3_unsafe_store!(C::MMatrix{3,3,T}, A::MMatrix{3,3,T}, B::MMatrix{3,3,T}) where T
          D1 = A.data; D2 = B.data
          SV11 = tosimd(D1, Val{1}, Val{3})
          SV12 = tosimd(D1, Val{4}, Val{6})
          SV13 = tosimd(D1, Val{7}, Val{9})
          r1 = muladd(SV13, D2[3], muladd(SV12, D2[2], SV11 * D2[1]))
          r2 = muladd(SV13, D2[6], muladd(SV12, D2[5], SV11 * D2[4]))
          r3 = muladd(SV13, D2[9], muladd(SV12, D2[8], SV11 * D2[7]))
          S = SMatrix{3,3}((r1, r2, r3))
          unsafe_store!(Base.unsafe_convert(Ptr{typeof(S)}, pointer(C)), S)
          return C
       end
matmul3x3_unsafe_store! (generic function with 1 method)

julia> @btime fastmul!($m3, $m1, $m2);
  4.031 ns (0 allocations: 0 bytes)

julia> @btime matmul3x3_vstore!($m3, $m1, $m2);
  12.027 ns (0 allocations: 0 bytes)

julia> @btime matmul3x3($m1, $m2); #KristofferC's function
  2.527 ns (0 allocations: 0 bytes)

julia> @btime matmul3x3_unsafe_store!($m3, $m1, $m2);
  13.783 ns (0 allocations: 0 bytes)

This implies to me that we can do better than the masked load/stores I demonstrated in this post.

@KristofferC (Contributor, Author) commented Oct 18, 2018

Seems clang kinda barfs on e.g. a 3x3 pattern as well: https://godbolt.org/z/TGCusg.

It is interesting to note that when the sizes correspond to the width of the SIMD registers, the SLP vectorizer does just as well as the hand-written version.

(size(m3), size(m1), size(m2)) = ((8, 8), (8, 8), (8, 8))
MMatrix Multiplication:
  36.552 ns (0 allocations: 0 bytes)
SMatrix Multiplication:
  12.940 ns (0 allocations: 0 bytes)
fastmul!:
  12.794 ns (0 allocations: 0 bytes)

A nice wish would be for LLVM to just improve how it generates code for these odd matrix sizes (or implement something like http://lists.llvm.org/pipermail/llvm-dev/2018-October/126871.html, which presumably will be fast if we emit it).

@c42f mentioned this issue Oct 23, 2018
@tkoolen (Contributor) commented Jan 19, 2019

To what extent are these speedups due to using muladd? StaticArrays currently doesn't use muladd at all, and using it constitutes a tradeoff.

@ChrisRackauckas (Member)

> and using it constitutes a tradeoff.

How so? At least from my understanding of it, on setups where it can be done on hardware, it is a quicker operation and has less numerical error. Where it can't be done on hardware it just defaults back to the normal way.

@tkoolen (Contributor) commented Jan 19, 2019

From ?muladd:

The result can be different on different machines and can also be different on the same machine due to constant propagation or other optimizations.
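
A minimal constructed example of that difference (assuming a CPU with hardware FMA, so muladd lowers to an actual fused multiply-add):

julia> a, b, c = 1.0 + 2.0^-52, 1.0 - 2.0^-52, -1.0;

julia> a*b + c                        # a*b rounds 1 - 2^-104 up to 1.0, so the sum is exactly 0
0.0

julia> muladd(a, b, c) == -2.0^-104   # fused: a single rounding keeps the tiny cross term
true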

@chriselrod (Contributor) commented Jan 19, 2019 via email

@KristofferC (Contributor, Author)

I always saw not using muladd in StaticArrays as just a missed optimization. Are you saying it is an intended choice? Seems quite out of spirit with other choices made in this package.

@tkoolen (Contributor) commented Jan 19, 2019

I don't know if it was intended. I tend to agree.

@c42f (Member) commented Jan 21, 2019

Highly unlikely that muladd was avoided on purpose (I didn't write that code but I was sitting next to @andyferris when he originally wrote the package :-) )

Given that it's faster and has less rounding error on newer CPUs, a patch to use muladd would be very appropriate for this package, I think.

@chriselrod (Contributor) commented Jan 21, 2019

Here are three fairly simple possibilities. The first two are more or less identical, except one wraps items in VecElements. The third unsuccessfully tries to encourage the compiler to use masked load/stores.

@generated function Base.:*(A::SMatrix{M,N,T}, B::SMatrix{N,P,T}) where {M,N,P,T}
    outtup = Vector{Expr}(undef, M*P)
    i = 0
    for p ∈ 1:P, m ∈ 1:M
        i += 1
        outtup[i] = :(@inbounds $(Symbol(:C_, p))[$m] )
    end
    quote
        $(Expr(:meta, :inline))
        A_col = $(Expr(:tuple, [:(@inbounds A[$m,1]) for m ∈ 1:M]...))
        Base.Cartesian.@nexprs $P p -> begin
            @inbounds B_p = B[1,p]
            C_p = $(Expr(:tuple, [:(@inbounds A_col[$n] * B_p) for n ∈ 1:N]...))
        end
        @fastmath @inbounds for n ∈ 2:$N
            A_col = $(Expr(:tuple, [:(@inbounds A[$m,n]) for m ∈ 1:M]...))
            Base.Cartesian.@nexprs $P p -> begin
                @inbounds B_p = B[n,p]
                C_p = $(Expr(:tuple, [:(@inbounds A_col[$n] * B_p + C_p[$n]) for n ∈ 1:N]...))
            end
        end
        SMatrix{$M,$P,$T}(
            $(Expr(:tuple, outtup...))
        )
    end
end



@generated function Base.:*(A::SMatrix{M,N,T}, B::SMatrix{N,P,T}) where {M,N,P,T}
    outtup = Vector{Expr}(undef, M*P)
    i = 0
    for p ∈ 1:P, m ∈ 1:M
        i += 1
        outtup[i] = :(@inbounds $(Symbol(:C_, p))[$m].value )
    end
    quote
        $(Expr(:meta, :inline))
        A_col = $(Expr(:tuple, [:(@inbounds Core.VecElement(A[$m,1])) for m ∈ 1:M]...))
        Base.Cartesian.@nexprs $P p -> begin
            @inbounds B_p = B[1,p]
            C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$n].value * B_p)) for n ∈ 1:N]...))
        end
        @fastmath @inbounds for n ∈ 2:$N
            A_col = $(Expr(:tuple, [:(@inbounds Core.VecElement(A[$m,n])) for m ∈ 1:M]...))
            Base.Cartesian.@nexprs $P p -> begin
                @inbounds B_p = B[n,p]
                C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$n].value * B_p + C_p[$n].value)) for n ∈ 1:N]...))
            end
        end
        SMatrix{$M,$P,$T}(
            $(Expr(:tuple, outtup...))
        )
    end
end

@generated function Base.:*(A::SMatrix{M,N,T}, B::SMatrix{N,P,T}) where {M,N,P,T}
    outtup = Vector{Expr}(undef, M*P)
    Mpow2 = cld(M,2) * 2
    Mrem = Mpow2 - M
    remaining_zeros = [Core.VecElement(zero(T)) for i ∈ 1:Mrem]
    i = 0
    for p ∈ 1:P, m ∈ 1:M
        i += 1
        outtup[i] = :(@inbounds $(Symbol(:C_, p))[$(m+Mrem)].value )
    end
    quote
        $(Expr(:meta, :inline))
        Adata = A.data
        A_col = $(Expr(:tuple, remaining_zeros..., [:(@inbounds Core.VecElement(Adata[$m])) for m ∈ 1:M]...))
        Base.Cartesian.@nexprs $P p -> begin
            @inbounds B_p = B[1,p]
            C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$m].value * B_p)) for m ∈ 1:Mpow2]...))
        end
        @fastmath @inbounds for n ∈ 2:$N
            A_col = $(Expr(:tuple, [:(@inbounds Core.VecElement(Adata[$(m - Mpow2) + n*$M])) for m ∈ 1:Mpow2]...))
            Base.Cartesian.@nexprs $P p -> begin
                @inbounds B_p = B[n,p]
                C_p = $(Expr(:tuple, [:(@inbounds Core.VecElement(A_col[$m].value * B_p + C_p[$m].value)) for m ∈ 1:Mpow2]...))
            end
        end
        SMatrix{$M,$P,$T}(
            $(Expr(:tuple, outtup...))
        )
    end
end

However, they are all pretty bad whenever M is not a multiple of the SIMD vector width. Trying the third version:

julia> s = @SMatrix randn(3,3);

julia> t = @SMatrix randn(3,3);

julia> @btime matmul3x3($s,$t)
  2.525 ns (0 allocations: 0 bytes)
3×3 SArray{Tuple{3,3},Float64,2,9}:
 0.600566  -1.94865   0.100573
 1.0341    -1.76276  -1.42636 
 0.34147   -3.58425   0.46392 

julia> @btime $s * $t
  8.270 ns (0 allocations: 0 bytes)
3×3 SArray{Tuple{3,3},Float64,2,9}:
 0.600566  -1.94865   0.100573
 1.0341    -1.76276  -1.42636 
 0.34147   -3.58425   0.46392 

Or, looking at vector width x vector width (try 4x4 if your computer has avx2; this one has avx512):

julia> A = @SMatrix randn(8,8);

julia> B = @SMatrix randn(8,8);

julia> C = @SMatrix randn(7,7);

julia> D = @SMatrix randn(7,7);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     11.009 ns (0.00% GC)
  median time:      11.172 ns (0.00% GC)
  mean time:        11.552 ns (0.00% GC)
  maximum time:     42.911 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     66.681 ns (0.00% GC)
  median time:      66.690 ns (0.00% GC)
  mean time:        68.901 ns (0.00% GC)
  maximum time:     103.893 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     978

For 7x7, this is actually worse than the current method:

julia> A = @SMatrix randn(8,8);

julia> B = @SMatrix randn(8,8);

julia> C = @SMatrix randn(7,7);

julia> D = @SMatrix randn(7,7);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     16.281 ns (0.00% GC)
  median time:      16.390 ns (0.00% GC)
  mean time:        16.973 ns (0.00% GC)
  maximum time:     63.614 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     998

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     61.545 ns (0.00% GC)
  median time:      61.663 ns (0.00% GC)
  mean time:        63.357 ns (0.00% GC)
  maximum time:     107.715 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     981

The assembly looks really good though in the 8x8 case:

julia> @code_native A * B
	.text
; ┌ @ REPL[88]:2 within `*'
; │┌ @ REPL[88]:17 within `macro expansion'
; ││┌ @ REPL[88]:2 within `*'
	vmovupd	(%rsi), %zmm7
	vmulpd	(%rdx){1to8}, %zmm7, %zmm0
	vmulpd	64(%rdx){1to8}, %zmm7, %zmm1
	vmulpd	128(%rdx){1to8}, %zmm7, %zmm2
	vmulpd	192(%rdx){1to8}, %zmm7, %zmm3
	vmulpd	256(%rdx){1to8}, %zmm7, %zmm4
	vmulpd	320(%rdx){1to8}, %zmm7, %zmm5
	vmulpd	384(%rdx){1to8}, %zmm7, %zmm6
	vmulpd	448(%rdx){1to8}, %zmm7, %zmm7
	movq	$-56, %rax
	nopw	%cs:(%rax,%rax)
; │└└
; │┌ @ fastmath.jl:163 within `macro expansion'
L80:
	vmovupd	512(%rsi,%rax,8), %zmm8
; │└
; │┌ @ REPL[88]:23 within `macro expansion'
; ││┌ @ fastmath.jl:161 within `add_fast'
	vfmadd231pd	64(%rdx,%rax){1to8}, %zmm8, %zmm0
	vfmadd231pd	128(%rdx,%rax){1to8}, %zmm8, %zmm1
	vfmadd231pd	192(%rdx,%rax){1to8}, %zmm8, %zmm2
	vfmadd231pd	256(%rdx,%rax){1to8}, %zmm8, %zmm3
	vfmadd231pd	320(%rdx,%rax){1to8}, %zmm8, %zmm4
	vfmadd231pd	384(%rdx,%rax){1to8}, %zmm8, %zmm5
	vfmadd231pd	448(%rdx,%rax){1to8}, %zmm8, %zmm6
	vfmadd231pd	512(%rdx,%rax){1to8}, %zmm8, %zmm7
; ││└
; ││┌ @ range.jl:595 within `iterate'
; │││┌ @ promotion.jl:403 within `=='
	addq	$8, %rax
; ││└└
	jne	L80
; ││ @ REPL[88]:26 within `macro expansion'
	vmovupd	%zmm0, (%rdi)
	vmovupd	%zmm1, 64(%rdi)
	vmovupd	%zmm2, 128(%rdi)
	vmovupd	%zmm3, 192(%rdi)
	vmovupd	%zmm4, 256(%rdi)
	vmovupd	%zmm5, 320(%rdi)
	vmovupd	%zmm6, 384(%rdi)
	vmovupd	%zmm7, 448(%rdi)
	movq	%rdi, %rax
	vzeroupper
	retq
	nopl	(%rax)
; └└

so it may be worth checking if M is a multiple of two in the generated function.

I spent a couple more hours experimenting, trying to get the compiler to emit masked loads for SArrays. No luck.

Masked loads and stores would be a great option for MArrays, though.

For StaticArrays, I think the best solution is still to pad them.

julia> using SIMDArrays, BenchmarkTools

julia> A = @Static randn(8,8);

julia> B = @Static randn(8,8);

julia> C = @Static randn(7,7);

julia> D = @Static randn(7,7);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     11.665 ns (0.00% GC)
  median time:      11.735 ns (0.00% GC)
  mean time:        12.215 ns (0.00% GC)
  maximum time:     43.882 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     8.550 ns (0.00% GC)
  median time:      8.591 ns (0.00% GC)
  mean time:        8.900 ns (0.00% GC)
  maximum time:     39.067 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     999

julia> Array(A) * Array(B) .≈ A * B
8×8 BitArray{2}:
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1
 1  1  1  1  1  1  1  1

julia> Array(C) * Array(D) .≈ C * D
7×7 BitArray{2}:
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1
 1  1  1  1  1  1  1

The trick is simply to pad the matrices with a few extra rows (fewer than the SIMD vector width), so that each column starts on a SIMD-friendly stride.

julia> length(C.data), length(C)
(56, 49)

julia> C
7×7 SIMDArrays.StaticSIMDArray{Tuple{7,7},Float64,2,8,56}:
  0.319496    0.428686  -1.11775   -1.3375    -0.279712   1.66079     0.401079
 -0.0888366  -0.776635   0.656009  -0.632125  -0.354105   2.75417    -0.299715
 -0.496308   -0.825651  -1.23162    0.996198   0.574873  -0.0737101   1.29759 
  1.30891     0.904274  -0.64274    1.07277   -0.222821   0.778396   -1.35605 
  0.783055   -1.40736    0.26168    1.54089   -1.21921    0.69172    -0.628125
  0.96238    -0.950408  -0.684878   0.272511  -0.188624  -0.0396921   0.494139
 -0.814892    0.769325   0.673766  -0.402637   1.01501   -1.94723     0.770561

julia> C.data[1:10]
(0.31949627322535246, -0.0888366159816679, -0.49630848286450396, 1.3089137129729653, 0.7830545118498307, 0.9623798624797774, -0.8148924827876292, 0.18360020071865166, 0.42868648749010024, -0.776634962264874)

C is a 7x7 matrix, but has 56 elements with a stride of 8. Notice that elements 7 through 9 of the underlying tuple are -0.8148924827876292, 0.18360020071865166, 0.42868648749010024. The last element of the first column, padding, and then the first element of the second column.
Because of this, we get efficient asm:

julia> @code_native C * D
	.text
; ┌ @ blas.jl:117 within `*'
; │┌ @ blas.jl:117 within `macro expansion'
	vmovupd	(%rsi), %zmm6
; ││ @ blas.jl:23 within `macro expansion'
; ││┌ @ floating_point_arithmetic.jl:182 within `evmul'
; │││┌ @ llvmwrap.jl:62 within `llvmwrap' @ llvmwrap.jl:62
; ││││┌ @ llvmwrap.jl:81 within `macro expansion'
	vmulpd	(%rdx){1to8}, %zmm6, %zmm0
	vmulpd	64(%rdx){1to8}, %zmm6, %zmm1
	vmulpd	128(%rdx){1to8}, %zmm6, %zmm2
	vmulpd	192(%rdx){1to8}, %zmm6, %zmm3
	vmulpd	256(%rdx){1to8}, %zmm6, %zmm4
	vmulpd	320(%rdx){1to8}, %zmm6, %zmm5
	vmulpd	384(%rdx){1to8}, %zmm6, %zmm6
	movq	$-48, %rax
	nopl	(%rax)
; ││└└└
; ││ @ blas.jl:28 within `macro expansion'
L64:
	vmovupd	448(%rsi,%rax,8), %zmm7
; ││ @ blas.jl:31 within `macro expansion'
; ││┌ @ SIMDPirates.jl:33 within `vfma'
; │││┌ @ floating_point_arithmetic.jl:427 within `macro expansion'
; ││││┌ @ SIMDPirates.jl:33 within `vfma'
; │││││┌ @ floating_point_arithmetic.jl:349 within `macro expansion'
; ││││││┌ @ llvmwrap.jl:148 within `llvmwrap' @ llvmwrap.jl:148
; │││││││┌ @ llvmwrap.jl:170 within `macro expansion'
	vfmadd231pd	56(%rdx,%rax){1to8}, %zmm7, %zmm0
	vfmadd231pd	120(%rdx,%rax){1to8}, %zmm7, %zmm1
	vfmadd231pd	184(%rdx,%rax){1to8}, %zmm7, %zmm2
	vfmadd231pd	248(%rdx,%rax){1to8}, %zmm7, %zmm3
	vfmadd231pd	312(%rdx,%rax){1to8}, %zmm7, %zmm4
	vfmadd231pd	376(%rdx,%rax){1to8}, %zmm7, %zmm5
	vfmadd231pd	440(%rdx,%rax){1to8}, %zmm7, %zmm6
; │└└└└└└└
; │┌ @ promotion.jl:403 within `macro expansion'
	addq	$8, %rax
; │└
; │ @ blas.jl:31 within `*'
	jne	L64
; │ @ blas.jl:36 within `*'
	vmovupd	%zmm0, (%rdi)
	vmovupd	%zmm1, 64(%rdi)
	vmovupd	%zmm2, 128(%rdi)
	vmovupd	%zmm3, 192(%rdi)
	vmovupd	%zmm4, 256(%rdi)
	vmovupd	%zmm5, 320(%rdi)
	vmovupd	%zmm6, 384(%rdi)
	movq	%rdi, %rax
	vzeroupper
	retq
	nopl	(%rax)
; └

Or, for the 3x3 case:

julia> S = @Static randn(3,3)
3×3 SIMDArrays.StaticSIMDArray{Tuple{3,3},Float64,2,4,12}:
 -1.72684   -0.053865   -0.541157
  0.020725  -0.0593686  -0.1204  
  0.587405  -0.667932    0.623411

julia> T = @Static randn(3,3)
3×3 SIMDArrays.StaticSIMDArray{Tuple{3,3},Float64,2,4,12}:
 -0.00492318   1.76432  -2.19455 
 -0.359605     1.9209    0.167068
  0.722771    -1.35129   0.520166

julia> @benchmark $S * $T
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     1.849 ns (0.00% GC)
  median time:      1.873 ns (0.00% GC)
  mean time:        1.950 ns (0.00% GC)
  maximum time:     18.867 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000

The downside is that IndexStyle must now be IndexCartesian, to avoid breaking code where these arrays interact with other matrix types.
However, they are allowed to act like IndexLinear while interacting with each other. I've not messed with custom broadcasting much, but it's probably possible to handle that there.

EDIT:
I'd like to make another comment about manual unrolling. My code above just uses for loops, because they're at least as fast to run for bigger matrices, while compiling far more quickly. For smaller matrices, LLVM will automatically unroll for you anyway:

julia> W = @Static randn(4,4)
4×4 SIMDArrays.StaticSIMDArray{Tuple{4,4},Float64,2,4,16}:
 -1.13708   -0.480141  -0.330682   0.820593
  0.482933  -1.05726    0.536663   0.344132
  0.322102   1.18024    0.930196  -0.58379 
  0.272023   0.567929  -0.504169   1.01676 

julia> Y = @Static randn(4,4)
4×4 SIMDArrays.StaticSIMDArray{Tuple{4,4},Float64,2,4,16}:
  0.492714  1.19693    1.33252    0.0144647
  0.901112  1.52111   -0.507832  -1.94051  
 -0.265069  1.10911    0.862128  -0.739434 
 -0.498172  0.293976  -1.07335    0.623734 

julia> @code_native W * Y
	.text
; ┌ @ blas.jl:117 within `*'
; │┌ @ blas.jl:117 within `macro expansion'
	vmovupd	(%rsi), %ymm0
; ││ @ blas.jl:28 within `macro expansion'
	vmovupd	32(%rsi), %ymm1
	vmovupd	64(%rsi), %ymm2
	vmovupd	96(%rsi), %ymm3
; ││ @ blas.jl:23 within `macro expansion'
; ││┌ @ floating_point_arithmetic.jl:182 within `evmul'
; │││┌ @ llvmwrap.jl:62 within `llvmwrap' @ llvmwrap.jl:62
; ││││┌ @ llvmwrap.jl:81 within `macro expansion'
	vmulpd	96(%rdx){1to4}, %ymm0, %ymm4
	vmulpd	64(%rdx){1to4}, %ymm0, %ymm5
	vmulpd	32(%rdx){1to4}, %ymm0, %ymm6
	vmulpd	(%rdx){1to4}, %ymm0, %ymm0
; │└└└└
; │┌ @ llvmwrap.jl:170 within `macro expansion'
	vfmadd231pd	8(%rdx){1to4}, %ymm1, %ymm0
	vfmadd231pd	40(%rdx){1to4}, %ymm1, %ymm6
	vfmadd231pd	72(%rdx){1to4}, %ymm1, %ymm5
	vfmadd231pd	104(%rdx){1to4}, %ymm1, %ymm4
	vfmadd231pd	16(%rdx){1to4}, %ymm2, %ymm0
	vfmadd231pd	48(%rdx){1to4}, %ymm2, %ymm6
	vfmadd231pd	80(%rdx){1to4}, %ymm2, %ymm5
	vfmadd231pd	112(%rdx){1to4}, %ymm2, %ymm4
	vfmadd231pd	24(%rdx){1to4}, %ymm3, %ymm0
	vfmadd231pd	56(%rdx){1to4}, %ymm3, %ymm6
	vfmadd231pd	88(%rdx){1to4}, %ymm3, %ymm5
	vfmadd231pd	120(%rdx){1to4}, %ymm3, %ymm4
; │└
; │ @ blas.jl:36 within `*'
	vmovupd	%ymm0, (%rdi)
	vmovupd	%ymm6, 32(%rdi)
	vmovupd	%ymm5, 64(%rdi)
	vmovupd	%ymm4, 96(%rdi)
	movq	%rdi, %rax
	vzeroupper
	retq
	nopl	(%rax)
; └

I'm inclined to just trust LLVM on whether or not to unroll the loop.

@c42f (Member) commented Jan 21, 2019

Regarding unrolling, it's important to keep in mind interactions between unrolling and type inference. In particular, any type inference opportunities which are exposed by unrolling will be invisible to the julia compiler if we leave it to the LLVM layer. This can be a problem in highly generic code.

That's not to say we don't want to remove unrolling from this package: I'd love it if we could remove all unrolling-related generated functions. At the time StaticArrays was started (julia-0.5 era) that definitely wasn't an option, but the compiler has improved fantastically since then.

@c42f (Member) commented Jan 21, 2019

Regarding padding - adding any would completely change the memory layout of Vector{SVector{...}} which is somewhat questionable in my mind. But more importantly the memory overhead would be excessive for small vectors and matrices. And what about objects which are not floats or ints? You should be able to have an SVector{2,BigFloat} and the implementation of this should not be burdened with SIMD-related stuff.

So I feel this is not a task for StaticArrays itself, but rather a task for an array wrapper in the spirit of https://github.com/piever/StructArrays.jl (see also StructsOfArrays.jl) which can present a SIMD-friendly data layout internally, while also presenting a human-friendly API as a simple array of structs. For more on how this type of technique can decouple API from memory layout in a really powerful way, see @Keno's part of the Celeste presentation from juliacon 2017: https://www.youtube.com/watch?v=uecdcADM3hY

@chriselrod (Contributor) commented Jan 21, 2019

For 7x7 matrices, the padded version requires 1.14 times more memory (8 rows / 7 rows), but multiplying two of them is 7 times faster (61 ns / 8.6 ns).
The 8x7 matrix actually requires fewer registers than the 7x7, and registers are the most precious memory.
The 7x7 matrix will either be stored in pieces (eg, 2 xmm and a ymm), or assembling/disassembling it into the larger registers takes time and temporarily occupies several of them.

Regarding Vector{SVector{...}}, because Vectors are heap allocated, you can use masked load and store operations (for the architectures that support them). This would allow a 7 x N matrix to be converted to a Vector{SVector{7}}, with a getindex function defined on Arrays of StaticArrays so that a masked load is used to produce a SVector{7} that has 8 elements under the hood.

The generated function determining the padding size would already have to use sizeof(T) to determine padding. Would be easy to add

function determine_padding(T)
    isbitstype(T) || return 0
    ...
end

Although odds are for most types that aren't floats or ints, you'd either just want to return 0 (because they're big), or would prefer some other fancy scheme (like StructArrays?).

For me, it's easier to just use SIMDArrays for the padding, and other functionality. But I haven't gotten around to writing unit tests or documentation for any of my libraries. And it is unlikely I'll find the time to do so before I graduate (hopefully after). I'd like to see some of it registered/upstreamed/or otherwise somehow made official, eventually.
As is, when I start work in an exploratory fashion, I rely on well-supported and robust libraries like StaticArrays. But when I feel like optimizing code, I switch to my hacked-together and undocumented junk.

I'll have to look at that StructArrays library more, but by the looks of it, it doesn't support StaticArrays (eg, storing a vector of SVector{4,Float64} under the hood as 4 vectors of Float64). I wrote ScatteredArrays to do that, because often it is simple matrix-type data structures (with indices, not fields) that I want to break up under the hood in struct-of-arrays fashion.

julia> using ScatteredArrays, StaticArrays, Random

julia> sm = ScatteredMatrix{Float64,2,SMatrix{3,4,Float64,12}}(undef, 5, 6);

julia> Random.randn!(sm.data); typeof(sm.data)
Array{Float64,3}

julia> size(sm.data)
(5, 6, 12)

julia> sm[1,1]
3×4 SArray{Tuple{3,4},Float64,2,12}:
 -0.410485  0.14147   -0.612619    0.767951 
 -1.48007   0.907609  -0.0631815   0.20726  
 -0.104371  0.812684  -0.296792   -0.0752005

julia> sm.data[1,1,:]
12-element Array{Float64,1}:
 -0.41048499089682966
 -1.480068431968409  
 -0.10437062724429408
  0.14146994042866523
  0.9076085592863015 
  0.8126842410603605 
 -0.6126191772222825 
 -0.06318146709474118
 -0.29679219500032633
  0.7679508044942372 
  0.20725992583359093
 -0.07520049352014255

An N-dimensional ScatteredArray is stored under the hood as an N+1-dimensional array, where the last dimension corresponds to the (linearized) dimensions of the isbits array elements.
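
Roughly, getindex is then just a gather along that last dimension; a stripped-down sketch (illustrative names, not ScatteredArrays' actual internals):

using StaticArrays

# data has size (I, J, L): an I×J collection of SMatrix{M,N,T,L} elements stored
# struct-of-arrays style, with the trailing dimension holding the L matrix entries.
function scattered_getindex(data::Array{T,3}, i, j, ::Type{SMatrix{M,N,T,L}}) where {M,N,T,L}
    return SMatrix{M,N,T,L}(ntuple(k -> data[i, j, k], Val(L)))
end

# e.g. scattered_getindex(sm.data, 1, 1, SMatrix{3,4,Float64,12}) rebuilds the element shown above.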

Similarly, SIMDArray's padding also occurs "under the hood", obscured from the user.

Also, Julia's compiler actually seems rather bad at the type of vectorization StructArrays and ScatteredArrays enable. For example, this thread on discourse featured a fairly simple problem that Julia failed to vectorize, but Fortran compilers did.
I discussed a slightly more complicated problem on the Rust discourse, where I also posted benchmarks and code + assembly for a list of languages/compilers:
C++ (GNU, Clang, icpc), Rust, Fortran (GNU, Flang, ifort), and ispc.
Clang did the best, at 2 microseconds. icpc, Flang, and ispc were close behind, at around 2.4 microseconds. Then the GNU compilers at around 4.2, and Rust at about 4.6. ifort took about 16 microseconds.

And Julia took about 20 microseconds in all the comparable permutations. "Cheating", via a macro that forcibly vectorizes the loop by applying SIMD intrinsics, I got 2.9 microseconds (at the time of last updating the comment on the Rust forum, it was about 3.5 microseconds; I also reported a 2.6 microsecond Julia time in that post via the macro, but it is invalid due to losing accuracy).

This benchmark was dominated by LLVM-based compilers (Clang, Flang, and ispc are all LLVM front ends), with Rust also at least managing to vectorize code. It was striking that Julia could not.
Could I file a "missed optimization" issue on Julia's github about this sort of thing?
Ideally, all it would take is a simple @simd or @fastmath, and not a macro from a library:

function juliatest3!(X::AbstractMatrix{T}, Data::AbstractMatrix{T}) where T
     @inbounds @fastmath for i ∈ 1:size(Data,1)

        x1,x2,x3 = Data[i,1],Data[i,2],Data[i,3]
        S11,S12,S22,S13,S23,S33 = Data[i,5],Data[i,6],Data[i,7],Data[i,8],Data[i,9],Data[i,10]

        Ui33 = 1 / sqrt(S33)
        U13 = S13 * Ui33
        U23 = S23 * Ui33
        Ui22 = 1 / sqrt(S22 - U23*U23)
        U12 = (S12 - U13*U23) * Ui22

        Ui33x3 = Ui33 * x3

        Ui11 = 1 / sqrt(S11 - U12*U12 - U13*U13)
        Ui12 = - U12 * Ui11 * Ui22
        Ui13x3 = - (U13 * Ui11 + U23 * Ui12) * Ui33x3
        Ui23x3 = - U23 * Ui22 * Ui33x3

        X[i,1] = Ui11*x1 + Ui12*x2 + Ui13x3
        X[i,2] = Ui22*x2 + Ui23x3
        X[i,3] = Ui33x3
    end
end

@inline function pdbacksolve(x1,x2,x3,S11,S12,S22,S13,S23,S33)
    @pirate begin
        Ui33 = rsqrt(S33)
        U13 = S13 * Ui33
        U23 = S23 * Ui33
        Ui22 = rsqrt(S22 - U23*U23)
        U12 = (S12 - U13*U23) * Ui22

        Ui33x3 = Ui33 * x3

        Ui11 = rsqrt(S11 - U12*U12 - U13*U13)
        Ui12 = - U12 * Ui11 * Ui22
        Ui13x3 = - (U13 * Ui11 + U23 * Ui12) * Ui33x3
        Ui23x3 = - U23 * Ui22 * Ui33x3

        (
            Ui11*x1 + Ui12*x2 + Ui13x3,
            Ui22*x2 + Ui23x3,
            Ui33x3
        )
    end
end


@generated function juliatest!(X::AbstractMatrix{T}, Data::AbstractMatrix{T}) where T
    quote
        @vectorize $T for i ∈ 1:size(Data,1)
            X[i,:] .= pdbacksolve(
                Data[i,1],Data[i,2],Data[i,3],
                Data[i,5],Data[i,6],Data[i,7],Data[i,8],Data[i,9],Data[i,10]
            )
        end
    end
end

julia> @benchmark juliatest3!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     19.657 μs (0.00% GC)
  median time:      20.168 μs (0.00% GC)
  mean time:        20.803 μs (0.00% GC)
  maximum time:     54.302 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark juliatest!($X32,  $BPP32)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.845 μs (0.00% GC)
  median time:      2.888 μs (0.00% GC)
  mean time:        3.020 μs (0.00% GC)
  maximum time:     6.644 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> @benchmark clangtest($X32,  $BPP32,  $N)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     2.033 μs (0.00% GC)
  median time:      2.053 μs (0.00% GC)
  mean time:        2.157 μs (0.00% GC)
  maximum time:     5.858 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     9

julia> (@macroexpand @vectorize Float32 for i ∈ 1:size(Data,1)
                   X[i,:] .= pdbacksolve(
                       Data[i,1],Data[i,2],Data[i,3],
                       Data[i,5],Data[i,6],Data[i,7],Data[i,8],Data[i,9],Data[i,10]
                   )
               end) |> striplines
quote
    ##N#442 = size(Data, 1)
    (Q, r) = (LoopVectorization.VectorizationBase).size_loads(Data, 1, Val{16}())
    begin
        ##stride#447 = LoopVectorization.stride_row(X)
        ##stride#449 = LoopVectorization.stride_row(Data)
    end
    ##pData#448 = vectorizable(Data)
    ##pX#443 = vectorizable(X)
    begin
        for ##i#441 = 1:16:Q * 16
            ##iter#440 = ##i#441
            begin
                ####pData#448_i#450 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 0##stride#449)
                ####pData#448_i#451 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 1##stride#449)
                ####pData#448_i#452 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 2##stride#449)
                ####pData#448_i#453 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 4##stride#449)
                ####pData#448_i#454 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 5##stride#449)
                ####pData#448_i#455 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 6##stride#449)
                ####pData#448_i#456 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 7##stride#449)
                ####pData#448_i#457 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 8##stride#449)
                ####pData#448_i#458 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 9##stride#449)
                begin
                    ##B#444 = LoopVectorization.extract_data.(pdbacksolve(####pData#448_i#450, ####pData#448_i#451, ####pData#448_i#452, ####pData#448_i#453, ####pData#448_i#454, ####pData#448_i#455, ####pData#448_i#456, ####pData#448_i#457, ####pData#448_i#458))
                    for ##j#446 = 0:SIMDPirates.vsub(length(##B#444), 1)
                        begin
                            $(Expr(:inbounds, true))
                            local #119#val = SIMDPirates.vstore(getindex(##B#444, SIMDPirates.vadd(1, ##j#446)), ##pX#443, SIMDPirates.vadd(##iter#440, SIMDPirates.vmul(##stride#447, ##j#446)))
                            $(Expr(:inbounds, :pop))
                            #119#val
                        end
                    end
                end
            end
        end
    end
    begin
        if r > 0
            mask = SIMDPirates.vless_or_equal(SIMDPirates.vsub((Core.VecElement{Int32}(1), Core.VecElement{Int32}(2), Core.VecElement{Int32}(3), Core.VecElement{Int32}(4), Core.VecElement{Int32}(5), Core.VecElement{Int32}(6), Core.VecElement{Int32}(7), Core.VecElement{Int32}(8), Core.VecElement{Int32}(9), Core.VecElement{Int32}(10), Core.VecElement{Int32}(11), Core.VecElement{Int32}(12), Core.VecElement{Int32}(13), Core.VecElement{Int32}(14), Core.VecElement{Int32}(15), Core.VecElement{Int32}(16)), unsafe_trunc(Int32, r)), zero(Int32))
            ##i#441 = (##N#442 - r) + 1
            ##iter#440 = ##i#441
            begin
                ####pData#448_i#450 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 0##stride#449, mask)
                ####pData#448_i#451 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 1##stride#449, mask)
                ####pData#448_i#452 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 2##stride#449, mask)
                ####pData#448_i#453 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 4##stride#449, mask)
                ####pData#448_i#454 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 5##stride#449, mask)
                ####pData#448_i#455 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 6##stride#449, mask)
                ####pData#448_i#456 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 7##stride#449, mask)
                ####pData#448_i#457 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 8##stride#449, mask)
                ####pData#448_i#458 = SIMDPirates.vload(NTuple{16,VecElement{Float32}}, ##pData#448, ##iter#440 + 9##stride#449, mask)
                begin
                    ##B#444 = LoopVectorization.extract_data.(pdbacksolve(####pData#448_i#450, ####pData#448_i#451, ####pData#448_i#452, ####pData#448_i#453, ####pData#448_i#454, ####pData#448_i#455, ####pData#448_i#456, ####pData#448_i#457, ####pData#448_i#458))
                    for ##j#446 = 0:SIMDPirates.vsub(length(##B#444), 1)
                        begin
                            $(Expr(:inbounds, true))
                            local #120#val = SIMDPirates.vstore(getindex(##B#444, SIMDPirates.vadd(1, ##j#446)), ##pX#443, SIMDPirates.vadd(##iter#440, SIMDPirates.vmul(##stride#447, ##j#446)), mask)
                            $(Expr(:inbounds, :pop))
                            #120#val
                        end
                    end
                end
            end
        end
    end
end

gensyms aren't the easiest thing to read.
[EDIT: The mask calculation can be improved. ]

But I really like the "StructArrays"-style thinking when it comes to vectorization. Similarly, I like the SPMD style, which recognizes that often the best way to vectorize is across loop iterations, not within them (on that note, I enjoyed this series of blog posts on the history of ispc).
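
As a toy illustration of the across-iterations point: keep the per-point recipe scalar and let SIMD run over many independent iterations of a struct-of-arrays layout (the example is mine, not from the benchmarks above):

# Each iteration is scalar 3D work; with xs/ys/zs as separate arrays the loop
# vectorizes naturally across iterations, not within a single point.
function norms!(out, xs, ys, zs)
    @inbounds @simd for i in eachindex(out, xs, ys, zs)
        out[i] = sqrt(xs[i]^2 + ys[i]^2 + zs[i]^2)
    end
    return out
end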

PS.
Can't believe I didn't see that video until now. Incredible stuff!
I did think Newton's Trust Region was the "coolest" method from Optim, and found it normally needed far fewer iterations. What surprised me, however, is that a second order method can scale better than a first order method, despite needing explicit Hessians, due to the different scaling in the number of iterations needed.

The ILP stuff was interesting; viewing code regions in terms of the number of instructions that can be run in parallel, to figure out how far you have to "step back" to vectorize it.
Which then leads naturally to things like the StructArrays / managing memory layout to get good vectorization.

@c42f
Copy link
Member

c42f commented Jan 22, 2019

@chriselrod The detailed investigation you're doing here is great! Don't let me discourage you, it would be really great if StaticArrays would vectorize more effectively, and I am by no means an expert (nor have the time right now to become one).

What I'm pointing out are some of the constraints of a generic small matrices library which I feel make adding padding difficult right now and possibly undesirable in general.

A technical API-related reason why adding padding is difficult with the current state of things is that the size of the storage .data member of SArray cannot be computed from the shape automatically, and users are already forced to manually deal with the total data size and number of dimensions in some circumstances. For example, the concrete type of an SMatrix{2,3,Int} is SArray{Tuple{2,3},Int,2,6}. On the occasion that users are forced to write down the concrete type, at least the 6 is simply 2x3 right now. With padding it wouldn't even be fixed and might vary per architecture! At the very least we need the compiler to improve so that these details can be computed by the library, or change the library so that users never write this concrete type by hand. The desire to write the curly bracket notation is strong though and Core.apply_type can't be overloaded. For extensive discussion, see JuliaLang/julia#18466.
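
To make that concrete (the padded layout below is hypothetical):

using StaticArrays

# Today the trailing L parameter is always prod(size), so it is at least predictable:
SMatrix{2,3,Int,6} === SArray{Tuple{2,3},Int,2,6}   # true; L = 2*3 = 6

# With padding, the same logical 3×3 Float64 matrix might be
#   SArray{Tuple{3,3},Float64,2,9}  on one machine (unpadded), but
#   SArray{Tuple{3,3},Float64,2,12} on another (rows padded to a multiple of 4),
# so a concrete type written by hand would no longer be portable.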

This would allow a 7 x N matrix to be converted to a Vector{SVector{7}}, with a getindex function defined on Arrays of StaticArrays so that a masked load is used to produce a SVector{7} that has 8 elements under the hood.

The memory layout of Vector{SVector{7}} is not independent of the padding which goes in a SVector{7}. If our type SVector{7} has padding, that padding will end up in the Vector{SVector{7}}. This is true of any other heap allocated data structures as well.

This is why I think adding padding might be undesirable: it makes the ABI of anything using StaticArrays architecture dependent and makes the memory usage unpredictable and surprising for users. A lot of people will be using StaticArrays for 3x3 matrices or length-3 vectors, in which case the overhead of an additional 1/3 (say) seems considerable to me. In the case that you need a SIMD-friendly heap layout, you can use some kind of SimdFriendlyVector <: AbstractVector{<:StaticArray} to achieve that in the spirit of StructArrays et al.
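
Something like the following is the kind of wrapper I have in mind (SimdFriendlyVector does not exist; this is just a sketch for length-3 Float64 vectors padded to 4 internally):

using StaticArrays

# Internally a 4×n matrix (one padding row); externally an AbstractVector of SVector{3,Float64}.
struct PaddedVec3Vector <: AbstractVector{SVector{3,Float64}}
    data::Matrix{Float64}          # size (4, n); row 4 is padding
end
Base.size(v::PaddedVec3Vector) = (size(v.data, 2),)
Base.getindex(v::PaddedVec3Vector, i::Int) =
    @inbounds SVector(v.data[1, i], v.data[2, i], v.data[3, i])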

Now, avoiding padding also means that StaticArrays allocated on the stack have the wrong layout for SIMD. But on the other hand, reorganizing this purely local storage layout via compiler transformations seems plausible at the LLVM level (at least to someone slightly naive like myself ;-)) and doesn't interfere with the user-visible API or ABI.

So the path forward that I'm suggesting is some combination of:

  1. Improve the code emitted by StaticArrays (eg, unrolling strategies) or use of the likes of @simd so that the julia LLVM passes can make something useful of it
  2. Improve the LLVM optimization passes in the julia compiler as required to support (1)
  3. Implement SIMD-friendly containers for heap data which are API-compatible with more naive dense storage.

Does that make sense? Is there something fundamental I'm overlooking here?

@chriselrod
Copy link
Contributor

The memory layout of Vector{SVector{7}} is not independent of the padding which goes in a SVector{7}. If our type SVector{7} has padding, that padding will end up in the Vector{SVector{7}}. This is true of any other heap allocated data structures as well.

What I was trying to say there only makes this point worse:

With padding it wouldn't even be fixed and might vary per architecture!

However...
Let SVector{N,T,L} be a static vector of length N, type T, and total length (with padding) of L.
What I was essentially suggesting is that you reinterpret a 7 x P matrix of Float64 to get a P-length
Vector{SVector{7,Float64,7}}
and that getindex(::Vector{<:SArray}, I...) is overloaded, so that it will return an SVector{7,Float64,8}. This can be achieved using masked load instructions, so that loading from the end of the vector neither segfaults nor loads performance-killing subnormals.
Masked store instructions would also be used to avoid segfaults.
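
In plain Julia terms (ignoring the actual masked instructions), the getindex I am describing behaves like this hypothetical helper, which reads a length-7 column out of a dense 7 x P matrix but hands back 8 slots:

# Sketch only: the 8th slot here is explicit zero padding; a real masked vload would
# simply not touch memory past the column rather than materializing a zero.
function padded_column(A::Matrix{Float64}, j)
    @assert size(A, 1) == 7
    ntuple(i -> i <= 7 ? (@inbounds A[i, j]) : 0.0, Val(8))
end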

If the arrays have enough extra memory allocated at the end to avoid segfaults, and if subnormals aren't a problem, regular loads/stores could be used. However, the former seems hard to guarantee, and for the latter, I wouldn't want to change global state via set_zero_subnormals.
[Unfortunately, I don't think masked operations are fast on all architectures?]

This would avoid the memory overhead of storing extra data, while yielding considerable performance boosts. The 3x3 padded matrix matmul took about 2 nanoseconds, versus the > 8 ns on my machine currently for 3x3.
The 2.5 ns from Kristoffer's method is already much better than the status quo, though, so it'd be great to at least see that added.
I think 3 rows is one of the most common use cases (3d points).

However, I do think your points are reasonable.
It may be unwise to add features, especially ones that can be relatively opaque, by default.
Instead, adding tools and documentation to let people opt in to the ones that would help would probably serve people better, via SimdFriendlyVector, etc.

Now, avoiding padding also means that StaticArrays allocated on the stack have the wrong layout for SIMD. But on the other hand, reorganizing this purely local storage layout via compiler transformations seems plausible at the LLVM level (at least to someone slightly naive like myself ;-)) and doesn't interfere with the user-visible API or ABI.

I don't know either, but because LLVM can already figure out that it wants to assemble SIMD vectors, and currently does so with very inefficient code, it seems superficially plausible that in the future it could do so "earlier" by representing them differently. Maybe that doesn't involve much more than a vectorization pass after eliminating all the higher-level static array machinery. But I'm no compiler expert!
But probably still only when they don't escape function boundaries.
I've seen compilers emit mask instructions for things like vectorized square roots. Wish I saw them do it more often for loads/stores.

Improve the code emitted by StaticArrays (eg, unrolling strategies) or use of the likes of @simd so that the julia LLVM passes can make something useful of it

@fastmath and @simd both give permission to use fma instructions, unroll loops, etc. But @fastmath is more generally applicable (ie, it doesn't need a loop).
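
For example (assuming an fma-capable CPU; the instruction names are typical, not guaranteed):

f(a, b, c) = a * b + c               # plain: keeps a separate multiply and add
g(a, b, c) = @fastmath a * b + c     # fastmath: free to contract into a single fma

# @code_native f(1.0, 2.0, 3.0)      # typically vmulsd followed by vaddsd
# @code_native g(1.0, 2.0, 3.0)      # typically a single vfmadd instruction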

Does that make sense? Is there something fundamental I'm overlooking here?

Yes. While I do want padded stack matrices to be well supported somewhere, I think I agree with your suggested approach.

@c42f
Copy link
Member

c42f commented Jan 22, 2019

What I was essentially suggesting is that you reinterpret a 7 x P matrix of Float64 to get a P-length
Vector{SVector{7,Float64,7}} and that getindex(::Vector{<:SArray}, I...) is overloaded, so that it will return an SVector{7,Float64,8}.

Oh I'd completely missed this: you're suggesting that the L parameter of SArray might be a feature rather than an irritating inconvenience forced on us by the compiler. Clever idea!

We could possibly use this if it could be made transparent to the average user.

@andyferris
Copy link
Member

Yes, that is interesting. What I feel we should really be using is something like SArray{axes, T, N, L} where, for example, axes = (OneTo(3),) for a standard length-3 static vector. (However, we've chosen the Tuple{...} "hack" to make a nicer SVector and so on.)

The assertions about L could be made just to check that it's at least large enough for the length of the array - this doesn't seem problematic to me. I guess you might want strides and so on in the most general case.

More broadly (and sorry I've been super duper busy this month), @c42f is generally on the mark here about the intent of the code. The code here was originally written for a much older version of Julia, when complete unrolling was the only way to get success - just like for FixedSizeArrays.jl and ImmutableArrays.jl. More importantly, a major design goal was to be generic with respect to the eltype - for example, the arrays are small, so there shouldn't be too many problems with unrolling them even when the elements are very complex (note that we don't/can't force + or * on the elements to inline). I will admit to laziness with respect to Float32 and Float64; in my opinion Julia in general and LLVM in particular should be well tuned for doing things like geometry with these eltypes, and the auto-vectorizer is sometimes quite pleasing.

Basically - if you want to support SIMD better, sure go ahead of course we'd all love that, but keep in mind that (a) the optimal approach depends on the user's architecture (which is why I liked leaving this to LLVM to sort out) and (b) such methods will need to be for specific element types, not the generic versions.
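
One way to honor (b) is to make the SIMD path opt-in per eltype via dispatch; a sketch (fast_mul is a hypothetical name, and matmul3x3 is the SIMD.jl kernel from the top of this issue, assumed to be loaded):

using StaticArrays

# SIMD kernel only for native float eltypes; everything else keeps the generic code.
fast_mul(a::SMatrix{3,3,T}, b::SMatrix{3,3,T}) where {T<:Union{Float32,Float64}} =
    matmul3x3(a, b)
fast_mul(a::SMatrix{3,3}, b::SMatrix{3,3}) = a * b   # BigFloat, Complex, Rational, ...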

@chriselrod
Copy link
Contributor

chriselrod commented Feb 3, 2019

@fastmath lets LLVM use fma instructions, when the host architecture has them, and gives it a little more license to reorder floating point computations.

For my own libraries, I have VectorizationBase.jl which has a build script that depends on CpuId to create a file with some useful info, such as simdbytes(). Then the appropriate vector length can be calculated for Float32 and Float64. But this is not very generic; I wouldn't use it for anything other than the native float or integer types.
This is also useful for knowing the number of registers / how much data can be held in registers before spilling.

For example, with avx2, an 8x8 * 8x8 matrix multiplication does not fit in the registers (16 ymm registers of 4 doubles each, so the 8x8 accumulator alone would fill the whole register file). Therefore, it computes the product blockwise (unlike earlier posts, this comment is from a chip with only avx2, no avx512):

julia> using BenchmarkTools, SIMDArrays, StaticArrays

julia> A = @Static randn(8,8);

julia> B = @Static randn(8,8);

julia> C = @SMatrix randn(8,8);

julia> D = @SMatrix randn(8,8);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     37.530 ns (0.00% GC)
  median time:      40.827 ns (0.00% GC)
  mean time:        41.024 ns (0.00% GC)
  maximum time:     67.167 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     992

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     60.743 ns (0.00% GC)
  median time:      63.097 ns (0.00% GC)
  mean time:        63.510 ns (0.00% GC)
  maximum time:     246.020 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     982

The code isn't totally optimal. After computing each block, LLVM moves part of the computed block onto the stack. Then at the end, it moves it back off the stack, into registers, and back again onto the stack:

	vmovups	-56(%rsp), %ymm0
	vmovups	-88(%rsp), %ymm2
	vmovups	-120(%rsp), %ymm1
	movq	%rdi, %rax
	vmovups	%ymm0, (%rdi)
	vmovups	%ymm2, 32(%rdi)
	vmovups	%ymm1, 64(%rdi)
	vmovupd	%ymm3, 96(%rdi)
	vmovupd	%ymm4, 128(%rdi)
	vmovupd	%ymm5, 160(%rdi)
	vmovupd	%ymm7, 192(%rdi)

Not the end of the world. Still faster than my heap-allocated array function for some reason:

julia> using LinearAlgebra

julia> X = @Sized randn(8,8);

julia> Y = @Sized randn(8,8);

julia> Z = @Sized randn(8,8);

julia> @benchmark mul!($Z, $X, $Y)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     46.849 ns (0.00% GC)
  median time:      50.203 ns (0.00% GC)
  mean time:        51.597 ns (0.00% GC)
  maximum time:     64.032 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     987

So I'll have to look into that.

The reduced copying becomes more important as we start to get a little larger:

julia> X = @Sized randn(16,16);

julia> Y = @Sized randn(16,16);

julia> Z = @Sized randn(16,16);

julia> @benchmark mul!($Z, $X, $Y)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     299.849 ns (0.00% GC)
  median time:      314.733 ns (0.00% GC)
  mean time:        321.455 ns (0.00% GC)
  maximum time:     528.492 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     258

julia> A = @Static randn(16,16);

julia> B = @Static randn(16,16);

julia> C = @SMatrix randn(16,16);

julia> D = @SMatrix randn(16,16);

julia> @benchmark $A * $B
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     442.076 ns (0.00% GC)
  median time:      462.576 ns (0.00% GC)
  mean time:        473.638 ns (0.00% GC)
  maximum time:     841.722 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     198

julia> @benchmark $C * $D
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     568.087 ns (0.00% GC)
  median time:      585.087 ns (0.00% GC)
  mean time:        602.290 ns (0.00% GC)
  maximum time:     1.098 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     183

But at these small sizes, they are all still faster than OpenBLAS:

julia> aA = Array(A); aB = Array(B); AB = similar(aA);

julia> @benchmark mul!($AB, $aA, $aB)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     745.484 ns (0.00% GC)
  median time:      770.881 ns (0.00% GC)
  mean time:        790.861 ns (0.00% GC)
  maximum time:     1.837 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     126

So I think the size range over which StaticArrays is more efficient than OpenBLAS can be made much much larger. Although, there will come a point where you wouldn't want the arrays to be parameterized by size anymore (ie, where you wouldn't want to recompile things for each new size).

These blocking patterns are of course architecture dependent. Have any suggestions on a good way to handle that?

This, and optimal vector widths, are the sort of things we'd like to know to really take advantage of "L as a feature". So if you're onboard with that plan, should I create a PR?
Do you want to update sizing from Tuple{M,N,P} to Tuple{OneTo(M),OneTo(N),OneTo(P)}, or what do you have in mind there?
I just more or less copied StaticArrays in SIMDArrays, except I added another "stride" parameter:

julia> A = @Static randn(15,16);

julia> typeof(A) # Tuple{# rows,# columns}, T, dim, stride, length of underlying tuple
SIMDArrays.StaticSIMDArray{Tuple{15,16},Float64,2,16,256}

@c42f c42f added performance runtime performance vectorization feature features and feature requests labels Jul 31, 2019
@KristofferC
Copy link
Contributor Author

KristofferC commented Feb 8, 2020

Apparently, LLVM has matrix multiplication intrinsics now:

https://llvm.org/docs/LangRef.html#llvm-matrix-multiply-intrinsic

@chriselrod
Copy link
Contributor

I look forward to trying them.
So far, I haven't been able to successfully build Julia with LLVM 10.

@c42f
Copy link
Member

c42f commented Feb 10, 2020

Yes we should try those out when we get the chance. More immediately (julia 1.4), we can try the idea of copying into an appropriately-shaped container of VecElement and seeing whether that helps things. XRef https://discourse.julialang.org/t/ann-loopvectorization/32843/66?u=chris_foster
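
A minimal sketch of the VecElement idea (the helper name is mine): a homogeneous NTuple of VecElement is what Julia lowers to an LLVM vector, so copying an SVector's tuple into one hands the optimizer a ready-made <4 x double>:

const Vec4 = NTuple{4,VecElement{Float64}}
tovec4(x::NTuple{4,Float64}) = map(VecElement, x)   # e.g. tovec4((1.0, 2.0, 3.0, 4.0))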

@andyferris
Copy link
Member

For reference, IIRC last time I thought about using VecElement LLVM would crash for odd lengths 7 and up, and the fact that 5 didn’t crash was brand new. It didn’t seem reliable for generic code.

@chriselrod
Copy link
Contributor

chriselrod commented Feb 10, 2020

This should be fixed on Julia master: JuliaLang/julia#34473
Julia and LLVM didn't always agree on alignment of structs containing tuples of VecElements, which would often cause crashes (due to an assertion) and data corruption before the assertions were turned on (that data corruption also often caused crashes).

This PR is also important: JuliaLang/julia#34490
Without it, llvmcall-defined methods would only work for those supported lengths (other lengths would be llvm-arrays instead of llvm-vectors).

@KristofferC
Copy link
Contributor Author

Yeah, both LLVM and Julia have gotten significantly better at LLVM Vectors.

@hyrodium
Copy link
Collaborator

hyrodium commented Aug 3, 2022

Is this issue still active? The current benchmark shows:

julia> for n in (2,3,4,5,6,7)
           s = Ref(rand(SMatrix{n,n}))
           @btime $(s)[] * $(s)[]
       end
  3.907 ns (0 allocations: 0 bytes)
  6.091 ns (0 allocations: 0 bytes)
  10.159 ns (0 allocations: 0 bytes)
  24.795 ns (0 allocations: 0 bytes)
  42.373 ns (0 allocations: 0 bytes)
  84.822 ns (0 allocations: 0 bytes)

(@v1.7) pkg> st StaticArrays
      Status `~/.julia/environments/v1.7/Project.toml`
  [90137ffa] StaticArrays v1.5.2 `~/.julia/dev/StaticArrays`

julia> versioninfo()
Julia Version 1.7.3
Commit 742b9abb4d (2022-05-06 12:58 UTC)
Platform Info:
  OS: Linux (x86_64-pc-linux-gnu)
  CPU: AMD Ryzen 7 2700X Eight-Core Processor
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, znver1)

@chriselrod
Copy link
Contributor

chriselrod commented Aug 3, 2022

Is this issue still active? The current benchmark shows:

Yes.

julia> @benchmark ($(Ref(A3))[]*$(Ref(A3))[])
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  4.645 ns … 16.075 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     4.662 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   4.670 ns ±  0.149 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▁▄ ▅▅ █ ▇█ ▇▆ ▄▃ ▁ ▁
  ▂▂▁▂▂▁▂▂▁▃▄▁▄▁▆▇▁██▁██▁█▁██▁██▁██▁█▁█▇▁▇▅▁▄▄▁▄▁▄▃▁▃▃▁▃▃▁▂▂ ▄
  4.64 ns        Histogram: frequency by time        4.68 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

julia> @benchmark matmul3x3($(Ref(A3))[],$(Ref(A3))[])
BenchmarkTools.Trial: 10000 samples with 1000 evaluations.
 Range (min … max):  3.160 ns … 17.475 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     3.205 ns              ┊ GC (median):    0.00%
 Time  (mean ± σ):   3.212 ns ±  0.212 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

                                          ▁▁▁    ▂▃▆▇█▆▆▂
  ▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▂▂▂▃▃▃▃▂▂▂▁▃▃▅▆████▇▇█▁████████▇▄▃ ▄
  3.16 ns        Histogram: frequency by time        3.21 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.

Where matmul3x3 is as defined in the issue + @inline.
