src/matrix_multiply_add.jl

# import LinearAlgebra.MulAddMul

abstract type MulAddMul{TA,TB} end

struct AlphaBeta{TA,TB} <: MulAddMul{TA,TB}
    α::TA
    β::TB
end
@inline alpha(ab::AlphaBeta) = ab.α
@inline beta(ab::AlphaBeta) = ab.β

struct NoMulAdd{TA,TB} <: MulAddMul{TA,TB} end
@inline alpha(ma::NoMulAdd{TA,TB}) where {TA,TB} = one(TA)
@inline beta(ma::NoMulAdd{TA,TB}) where {TA,TB} = zero(TB)

"""
    StaticMatMulLike

Static wrappers used for multiplication dispatch.
"""
const StaticMatMulLike{s1, s2, T} = Union{
    StaticMatrix{s1, s2, T},
    Symmetric{T, <:StaticMatrix{s1, s2, T}},
    Hermitian{T, <:StaticMatrix{s1, s2, T}},
    LowerTriangular{T, <:StaticMatrix{s1, s2, T}},
    UpperTriangular{T, <:StaticMatrix{s1, s2, T}},
    UnitLowerTriangular{T, <:StaticMatrix{s1, s2, T}},
    UnitUpperTriangular{T, <:StaticMatrix{s1, s2, T}},
    Adjoint{T, <:StaticMatrix{s1, s2, T}},
    Transpose{T, <:StaticMatrix{s1, s2, T}},
    Diagonal{T, <:StaticVector{s1, T}}}

"""
    gen_by_access(expr_gen, a::Type{<:AbstractArray}, asym = :wrapped_a)

Statically generate outer code for fully unrolled multiplication loops.
Returned code does wrapper-specific tests (for example if a symmetric matrix view is
`U` or `L`) and the body of the if expression is then generated by function `expr_gen`.
The function `expr_gen` receives access pattern description symbol as its argument
and this symbol is then consumed by uplo_access to generate the right code for matrix
element access.

The name of the matrix to test is indicated by `asym`.
"""
function gen_by_access(expr_gen, a::Type{<:StaticVecOrMat}, asym = :wrapped_a)
    return expr_gen(:any)
end
function gen_by_access(expr_gen, a::Type{<:Symmetric{<:Any, <:StaticMatrix}}, asym = :wrapped_a)
    return quote
        if $(asym).uplo == 'U'
            $(expr_gen(:up))
        else
            $(expr_gen(:lo))
        end
    end
end
function gen_by_access(expr_gen, a::Type{<:Hermitian{<:Any, <:StaticMatrix}}, asym = :wrapped_a)
    return quote
        if $(asym).uplo == 'U'
            $(expr_gen(:up_herm))
        else
            $(expr_gen(:lo_herm))
        end
    end
end
function gen_by_access(expr_gen, a::Type{<:UpperTriangular{<:Any, <:StaticMatrix}}, asym = :wrapped_a)
    return expr_gen(:upper_triangular)
end
function gen_by_access(expr_gen, a::Type{<:LowerTriangular{<:Any, <:StaticMatrix}}, asym = :wrapped_a)
    return expr_gen(:lower_triangular)
end
function gen_by_access(expr_gen, a::Type{<:UnitUpperTriangular{<:Any, <:StaticMatrix}}, asym = :wrapped_a)
    return expr_gen(:unit_upper_triangular)
end
function gen_by_access(expr_gen, a::Type{<:UnitLowerTriangular{<:Any, <:StaticMatrix}}, asym = :wrapped_a)
    return expr_gen(:unit_lower_triangular)
end
function gen_by_access(expr_gen, a::Type{<:Transpose{<:Any, <:StaticVecOrMat}}, asym = :wrapped_a)
    return expr_gen(:transpose)
end
function gen_by_access(expr_gen, a::Type{<:Adjoint{<:Any, <:StaticVecOrMat}}, asym = :wrapped_a)
    return expr_gen(:adjoint)
end
function gen_by_access(expr_gen, a::Type{<:Diagonal{<:Any, <:StaticVector}}, asym = :wrapped_a)
    return expr_gen(:diagonal)
end
"""
    gen_by_access(expr_gen, a::Type{<:AbstractArray}, b::Type{<:AbstractArray})

Similar to gen_by_access with only one type argument. The difference is that tests for both
arrays of type `a` and `b` are generated and `expr_gen` receives two access arguments,
first for matrix `a` and the second for matrix `b`.
"""
function gen_by_access(expr_gen, a::Type{<:StaticMatrix}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:any, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:Symmetric{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        if wrapped_a.uplo == 'U'
            return $(gen_by_access(b, :wrapped_b) do access_b
                expr_gen(:up, access_b)
            end)
        else
            return $(gen_by_access(b, :wrapped_b) do access_b
                expr_gen(:lo, access_b)
            end)
        end
    end
end
function gen_by_access(expr_gen, a::Type{<:Hermitian{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        if wrapped_a.uplo == 'U'
            return $(gen_by_access(b, :wrapped_b) do access_b
                expr_gen(:up_herm, access_b)
            end)
        else
            return $(gen_by_access(b, :wrapped_b) do access_b
                expr_gen(:lo_herm, access_b)
            end)
        end
    end
end
function gen_by_access(expr_gen, a::Type{<:UpperTriangular{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:upper_triangular, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:LowerTriangular{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:lower_triangular, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:UnitUpperTriangular{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:unit_upper_triangular, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:UnitLowerTriangular{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:unit_lower_triangular, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:Transpose{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:transpose, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:Adjoint{<:Any, <:StaticMatrix}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:adjoint, access_b)
        end)
    end
end
function gen_by_access(expr_gen, a::Type{<:Diagonal{<:Any, <:StaticVector}}, b::Type)
    return quote
        return $(gen_by_access(b, :wrapped_b) do access_b
            expr_gen(:diagonal, access_b)
        end)
    end
end


""" Size that stores whether a Matrix is a Transpose
Useful when selecting multiplication methods, and avoiding allocations when dealing with
the `Transpose` type by passing around the original matrix.
Should pair with `parent`.
"""
struct TSize{S,T}
    function TSize{S,T}() where {S,T}
        new{S::Tuple{Vararg{StaticDimension}},T::Symbol}()
    end
end
TSize(A::Type{<:StaticArrayLike}) = TSize{size(A), gen_by_access(identity, A)}()
TSize(A::StaticArrayLike) = TSize(typeof(A))
TSize(S::Size{s}, T=:any) where s = TSize{s,T}()
TSize(s::Number) = TSize(Size(s))
istranspose(::TSize{<:Any,T}) where T = (T === :transpose)
size(::TSize{S}) where S = S
Size(::TSize{S}) where S = Size{S}()
access_type(::TSize{<:Any,T}) where T = T
Base.transpose(::TSize{S,:transpose}) where {S} = TSize{reverse(S),:any}()
Base.transpose(::TSize{S,:any}) where {S} = TSize{reverse(S),:transpose}()

# Get the parent of transposed arrays, or the array itself if it has no parent
# Different from Base.parent because we only want to get rid of Transpose and Adjoint
# The two last methods can't be combined into one for StaticVecOrMat because then dispatch
# goes wrong for SizedArray
@inline mul_parent(A::Union{StaticMatMulLike, Adjoint{<:Any,<:StaticVector}, Transpose{<:Any,<:StaticVector}}) = Base.parent(A)
@inline mul_parent(A::StaticMatrix) = A
@inline mul_parent(A::StaticVector) = A

# Using the full StaticVecOrMatLike in dest of that one method of mul! takes a lot of load time
const StaticVecOrMatLikeForFiveArgMulDest{T} = Union{
    StaticVector{<:Any, T},
    StaticMatrix{<:Any, <:Any, T},
    Transpose{T, <:StaticVecOrMat{T}},
    Adjoint{T, <:StaticVecOrMat{T}}
}

# 5-argument matrix multiplication
#    To avoid allocations, strip away Transpose type and store transpose info in Size
@inline LinearAlgebra.mul!(dest::StaticVecOrMatLikeForFiveArgMulDest, A::StaticVecOrMatLike, B::StaticVecOrMatLike,
    α::Number, β::Number) = _mul!(TSize(dest), mul_parent(dest), Size(A), Size(B), A, B,
    AlphaBeta(α,β))

# See https://github.com/JuliaArrays/StaticArrays.jl/issues/857.
# `StaticVecOrMatLike` is a very large union, and this caused Julia 1.0.x to
# use a large amount of memory when compiling StaticArrays in the presence
# of other packages that also extend `LinearAlgebra.mul!` in a similar
# fashion. As an example, this led to JuMP using more than 4 GB of RAM when
# running `using JuMP`, causing its CI to crash.
#
# Computing the eltypes within the function, rather than specifying them in
# the type arguments greatly improves the issue, with `using JuMP` going
# from:
#     julia> @time using JuMP
#     [ Info: Precompiling JuMP [4076af6c-e467-56ae-b986-b466b2749572]
#     Killed
# to:
#     julia> @time using JuMP
#     [ Info: Precompiling JuMP [4076af6c-e467-56ae-b986-b466b2749572]
#     101.723598 seconds (12.85 M allocations: 812.902 MiB, 0.31% gc time)
@inline function LinearAlgebra.mul!(
    dest::StaticVecOrMatLike,
    A::StaticVecOrMatLike,
    B::StaticVecOrMatLike,
)
    TDest, TA, TB = eltype(dest), eltype(A), eltype(B)
    TMul = promote_op(matprod, TA, TB)
    return _mul!(TSize(dest), mul_parent(dest), Size(A), Size(B), A, B, NoMulAdd{TMul, TDest}())
end

"Calculate the product of the dimensions being multiplied. Useful as a heuristic for unrolling."
@inline multiplied_dimension(A::Type{<:StaticVecOrMatLike}, B::Type{<:StaticVecOrMatLike}) =
    prod(size(A)) * size(B,2)

"Validate the dimensions of a matrix multiplication, including matrix-vector products"
function check_dims(::Size{sc}, ::Size{sa}, ::Size{sb}) where {sa,sb,sc}
    if sb[1] != sa[2] || sc[1] != sa[1]
        return false
    elseif length(sc) == 2 || length(sb) == 2
        sc2 = length(sc) == 1 ? 1 : sc[2]
        sb2 = length(sb) == 1 ? 1 : sb[2]
        if sc2 != sb2
            return false
        end
    end
    return true
end

"""
    uplo_access(sa, asym, k, j, uplo)

Generate code for matrix element access, for a matrix of size `sa` locally referred to
as `asym` in the context where the result will be used. Both indices `k` and `j` need to be
statically known for this function to work. `uplo` is the access pattern mode generated
by the `gen_by_access` function.
"""
function uplo_access(sa, asym, k, j, uplo)
    TAsym = Symbol("T"*string(asym))
    if uplo == :any
        return :($asym[$(LinearIndices(sa)[k, j])])
    elseif uplo == :up
        if k < j
            return :($asym[$(LinearIndices(sa)[k, j])])
        elseif k == j
            return :(LinearAlgebra.symmetric($asym[$(LinearIndices(sa)[k, j])], :U))
        else
            return :(transpose($asym[$(LinearIndices(sa)[j, k])]))
        end
    elseif uplo == :lo
        if k > j
            return :($asym[$(LinearIndices(sa)[k, j])])
        elseif k == j
            return :(LinearAlgebra.symmetric($asym[$(LinearIndices(sa)[k, j])], :L))
        else
            return :(transpose($asym[$(LinearIndices(sa)[j, k])]))
        end
    elseif uplo == :up_herm
        if k < j
            return :($asym[$(LinearIndices(sa)[k, j])])
        elseif k == j
            return :(LinearAlgebra.hermitian($asym[$(LinearIndices(sa)[k, j])], :U))
        else
            return :(adjoint($asym[$(LinearIndices(sa)[j, k])]))
        end
    elseif uplo == :lo_herm
        if k > j
            return :($asym[$(LinearIndices(sa)[k, j])])
        elseif k == j
            return :(LinearAlgebra.hermitian($asym[$(LinearIndices(sa)[k, j])], :L))
        else
            return :(adjoint($asym[$(LinearIndices(sa)[j, k])]))
        end
    elseif uplo == :upper_triangular
        if k <= j
            return :($asym[$(LinearIndices(sa)[k, j])])
        else
            return :(zero($TAsym))
        end
    elseif uplo == :lower_triangular
        if k >= j
            return :($asym[$(LinearIndices(sa)[k, j])])
        else
            return :(zero($TAsym))
        end
    elseif uplo == :unit_upper_triangular
        if k < j
            return :($asym[$(LinearIndices(sa)[k, j])])
        elseif k == j
            return :(oneunit($TAsym))
        else
            return :(zero($TAsym))
        end
    elseif uplo == :unit_lower_triangular
        if k > j
            return :($asym[$(LinearIndices(sa)[k, j])])
        elseif k == j
            return :(oneunit($TAsym))
        else
            return :(zero($TAsym))
        end
    elseif uplo == :upper_hessenberg
        if k <= j+1
            return :($asym[$(LinearIndices(sa)[k, j])])
        else
            return :(zero($TAsym))
        end
    elseif uplo == :transpose
        return :(transpose($asym[$(LinearIndices(reverse(sa))[j, k])]))
    elseif uplo == :adjoint
        return :(adjoint($asym[$(LinearIndices(reverse(sa))[j, k])]))
    elseif uplo == :diagonal
        if k == j
            return :($asym[$k])
        else
            return :(zero($TAsym))
        end
    else
        error("Unknown uplo: $uplo")
    end
end

""" Combine left and right sides of an assignment expression, short-cutting
        lhs = α * rhs + β * lhs,
    element-wise.
If α = 1, the multiplication by α is removed. If β = 0, the second rhs term is removed.
"""
function _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:AlphaBeta})
    @assert length(lhs) == length(rhs)
    n = length(rhs)
    rhs = [:(α * $(expr)) for expr in rhs]
    rhs = [:($(lhs[k]) * β + $(rhs[k])) for k = 1:n]
    exprs = [:($(lhs[k]) = $(rhs[k])) for k = 1:n]
    _assign(lhs, rhs)
    return exprs
end

@inline _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:MulAddMul}) = _assign(lhs, rhs)

@inline function _assign(lhs::Array{Expr}, rhs::Array{Expr})
    @assert length(lhs) == length(rhs)
    [:($(lhs[k]) = $(rhs[k])) for k = 1:length(lhs)]
end

"Obtain an expression for the linear index of var[k,j], taking transposes into account"
function _lind(var::Symbol, A::Type{TSize{sa,tA}}, k::Int, j::Int) where {sa,tA}
    ula = uplo_access(sa, var, k, j, tA)
    if ula.head == :call && ula.args[1] == :transpose
        # TODO: can this be properly fixed at all?
        return ula.args[2]
    end
    return ula
end

function combine_products(expr_list)
    filtered = filter(expr_list) do expr
        if expr.head != :call || expr.args[1] !== :*
            error("expected call to *")
        end
        for arg in expr.args[2:end]
            if isa(arg, Expr) && arg.head == :call && arg.args[1] === :zero
                return false
            end
        end
        return true
    end
    if isempty(filtered)
        return :(zero(T))
    else
        return reduce(filtered) do ex1, ex2
            if ex2.head != :call || ex2.args[1] !== :*
                error("expected call to *")
            end

            return :(muladd($(ex2.args[2]), $(ex2.args[3]), $ex1))
        end
    end
end

# Matrix-vector multiplication
@generated function _mul!(Sc::TSize{sc}, c::StaticVecOrMatLike, Sa::Size{sa}, Sb::Size{sb},
        wrapped_a::StaticMatMulLike{<:Any, <:Any, Ta}, b::StaticVector{<:Any, Tb}, _add::MulAddMul,
        ::Val{col}=Val(1)) where {sa, sb, sc, col, Ta, Tb}
    if sa[2] != sb[1] || sc[1] != sa[1]
        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
    end

    if sa[2] != 0
        assign_expr = gen_by_access(wrapped_a) do access_a
            lhs = [_lind(:c,Sc,k,col) for k = 1:sa[1]]
            ab = [combine_products([:($(uplo_access(sa, :a, k, j, access_a)) * b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
            exprs = _muladd_expr(lhs, ab, _add)

            return :(@inbounds $(Expr(:block, exprs...)))
        end
    else
        exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]]
        assign_expr = :(@inbounds $(Expr(:block, exprs...)))
    end

    return quote
        # @_inline_meta
        α = alpha(_add)
        β = beta(_add)
        a = mul_parent(wrapped_a)
        $assign_expr
        return c
    end
end

# Outer product
@generated function _mul!(::TSize{sc}, c::StaticMatrix, tsa::Size{sa}, tsb::Size{sb},
        a::StaticVector, b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}, _add::MulAddMul) where {sa, sb, sc}
    if sc[1] != sa[1] || sc[2] != sb[2]
        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
    end

    conjugate_b = b <: Adjoint

    lhs = [:(c[$(LinearIndices(sc)[i,j])]) for i = 1:sa[1], j = 1:sb[2]]
    if conjugate_b
        ab = [:(a[$i] * adjoint(b[$j])) for i = 1:sa[1], j = 1:sb[2]]
    else
        ab = [:(a[$i] * transpose(b[$j])) for i = 1:sa[1], j = 1:sb[2]]
    end

    exprs = _muladd_expr(lhs, ab, _add)

    return quote
        @_inline_meta
        α = alpha(_add)
        β = beta(_add)
        @inbounds $(Expr(:block, exprs...))
        return c
    end
end

# Matrix-matrix multiplication
@generated function _mul!(Sc::TSize{sc}, c::StaticMatMulLike,
        Sa::Size{sa}, Sb::Size{sb},
        a::StaticMatMulLike, b::StaticMatMulLike,
        _add::MulAddMul) where {sa, sb, sc}
    Ta,Tb,Tc = eltype(a), eltype(b), eltype(c)
    can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat && a <: Union{StaticMatrix,Transpose} && b <: Union{StaticMatrix,Transpose}

    mult_dim = multiplied_dimension(a,b)
    a_tri_mul = a <: LinearAlgebra.AbstractTriangular ? 2 : 1
    b_tri_mul = b <: LinearAlgebra.AbstractTriangular ? 2 : 1
    ab_tri_mul = (a == 2 && b == 2) ? 2 : 1
    if mult_dim < 4*4*4*a_tri_mul*b_tri_mul*ab_tri_mul || a <: Diagonal || b <: Diagonal
        return quote
            @_inline_meta
            muladd_unrolled_all!(Sc, c, Sa, Sb, a, b, _add)
            return c
        end
    elseif mult_dim < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating)
        return quote
            @_inline_meta
            muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add)
            return c
        end
    else
        if can_blas
            return quote
                @_inline_meta
                mul_blas!(Sc, c, TSize(a), TSize(b), mul_parent(a), mul_parent(b), _add)
                return c
            end
        else
            return quote
                @_inline_meta
                muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add)
                return c
            end
        end
    end
end


@generated function muladd_unrolled_all!(Sc::TSize{sc}, wrapped_c::StaticMatMulLike, Sa::Size{sa}, Sb::Size{sb},
        wrapped_a::StaticMatMulLike{<:Any,<:Any,Ta}, wrapped_b::StaticMatMulLike{<:Any,<:Any,Tb}, _add::MulAddMul) where {sa, sb, sc, Ta, Tb}
    if !check_dims(Size(sc),Size(sa),Size(sb))
        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
    end

    if sa[2] != 0
        lhs = [_lind(:c, Sc, k1, k2) for k1 = 1:sa[1], k2 = 1:sb[2]]

        assign_expr = gen_by_access(wrapped_a, wrapped_b) do access_a, access_b

            ab = [combine_products([:($(uplo_access(sa, :a, k1, j, access_a)) * $(uplo_access(sb, :b, j, k2, access_b))) for j = 1:sa[2]]
                ) for k1 = 1:sa[1], k2 = 1:sb[2]]

            exprs = _muladd_expr(lhs, ab, _add)
            return :(@inbounds $(Expr(:block, exprs...)))
        end
    else
        exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sc[1]*sc[2]]
        assign_expr = :(@inbounds $(Expr(:block, exprs...)))
    end

    return quote
        @_inline_meta
        # α = _add.alpha
        # β = _add.beta
        α = alpha(_add)
        β = beta(_add)
        c = mul_parent(wrapped_c)
        a = mul_parent(wrapped_a)
        b = mul_parent(wrapped_b)
        T = promote_op(matprod,Ta,Tb)
        $assign_expr
        return c
    end
end


@generated function muladd_unrolled_chunks!(Sc::TSize{sc}, wrapped_c::StaticMatMulLike, ::Size{sa}, Sb::Size{sb},
        wrapped_a::StaticMatMulLike{<:Any,<:Any,Ta}, wrapped_b::StaticMatMulLike{<:Any,<:Any,Tb}, _add::MulAddMul) where {sa, sb, sc, Ta, Tb}
    if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2]
        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc"))
    end

    # This will not work for Symmetric and Hermitian wrappers of c
    lhs = [_lind(:c, Sc, k1, k2) for k1 = 1:sa[1], k2 = 1:sb[2]]

    #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]]

    # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster
    tmp_type = SVector{sb[1], eltype(wrapped_c)}

    assign_expr = gen_by_access(wrapped_a, wrapped_b) do access_a, access_b
        vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(Size{sa}()), $(Size{(sb[1],)}()),
            a, $(Expr(:call, tmp_type, [uplo_access(sb, :b, i, k2, access_b) for i = 1:sb[1]]...)), $(Val(access_a)))) for k2 = 1:sb[2]]

        # exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]]
        rhs = [:($(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]]
        exprs = _muladd_expr(lhs, rhs, _add)

        return quote
            @inbounds $(Expr(:block, vect_exprs...))
            @inbounds $(Expr(:block, exprs...))
        end
    end

    return quote
        @_inline_meta
        α = alpha(_add)
        β = beta(_add)
        c = mul_parent(wrapped_c)
        a = mul_parent(wrapped_a)
        b = mul_parent(wrapped_b)
        $assign_expr
    end
end

# @inline partly_unrolled_multiply(Sa::Size, Sb::Size, a::StaticMatrix, b::StaticArray) where {sa, sb, Ta, Tb} =
#     partly_unrolled_multiply(TSize(Sa), TSize(Sb), a, b)
@generated function partly_unrolled_multiply(Sa::Size{sa}, ::Size{sb}, a::StaticMatMulLike{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}, ::Val{access_a}) where {sa, sb, Ta, Tb, access_a}
    if sa[2] != sb[1]
        throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb"))
    end

    if sa[2] != 0
        exprs = [combine_products([:($(uplo_access(sa, :a, k, j, access_a))*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]]
    else
        exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]]
    end

    return quote
        $(Expr(:meta,:noinline))
        @inbounds return SVector(tuple($(exprs...)))
    end
end

@inline _get_raw_data(A::SizedArray) = A.data
@inline _get_raw_data(A::StaticArray) = A
# we need something heap-allocated to make sure BLAS calls are safe
@inline _get_raw_data(A::SArray) = MArray(A)

function mul_blas!(::TSize{<:Any,:any}, c::StaticMatrix,
        Sa::Union{TSize{<:Any,:any}, TSize{<:Any,:transpose}}, Sb::Union{TSize{<:Any,:any}, TSize{<:Any,:transpose}},
        a::StaticMatrix, b::StaticMatrix, _add::MulAddMul)
    mat_char(s) = istranspose(s) ? 'T' : 'N'
    T = eltype(a)
    A = _get_raw_data(a)
    B = _get_raw_data(b)
    C = _get_raw_data(c)
    BLAS.gemm!(mat_char(Sa), mat_char(Sb), T(alpha(_add)), A, B, T(beta(_add)), C)
end

# if C is transposed, transpose the entire expression
@inline mul_blas!(Sc::TSize{<:Any,:transpose}, c::StaticMatrix, Sa::TSize, Sb::TSize,
        a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) =
    mul_blas!(transpose(Sc), c, transpose(Sb), transpose(Sa), b, a, _add)