Support for Julia 1.11 #2241

maleadt · 2024-01-17T13:21:57Z

MWEs for that issue (in descending order of abstraction):

using CUDA

# Target of the `@cuda dynamic=true` launch in `outer`; takes no arguments and
# returns `nothing`.
inner() = nothing

# First reproducer: constructs a `CuDeviceStream` and launches `inner` through
# `@cuda dynamic=true`, passing that stream together with `shmem=1`.
function outer()
    s = CuDeviceStream()
    @cuda dynamic=true stream=s shmem=1 inner()
    return
end

using InteractiveUtils
# Runs `code_llvm` on `outer()` twice -- once through InteractiveUtils and once
# through CUDA -- so that the two results can be compared side by side.
function main()
    InteractiveUtils.code_llvm(outer, Tuple{})
    CUDA.code_llvm(outer, Tuple{})
end
# Run automatically only when the session is not interactive.
isinteractive() || main()

using CUDA

# Target of the `@cuda dynamic=true` launch in `outer`; accepts seven positional
# arguments, ignores them, and returns `nothing`.
inner(a, b, c, d, e, f, h) = nothing

# Second reproducer: no stream or shared-memory keywords, only a dynamic launch
# of `inner` with seven integer literals.
function outer()
    @cuda dynamic=true inner(1, 1, 1, 1, 1, 1, 1)
    return
end

using InteractiveUtils
# Runs `code_llvm` on `outer()` twice -- once through InteractiveUtils and once
# through CUDA -- so that the two results can be compared side by side.
function main()
    InteractiveUtils.code_llvm(outer, Tuple{})
    CUDA.code_llvm(outer, Tuple{})
end
# Run automatically only when the session is not interactive.
isinteractive() || main()

using CUDA

# Stub named `cudacall` defined by this reproducer: accepts a callable, a type,
# any further positional arguments and any keywords, and returns `nothing`.
cudacall(f, types::Type, args...; kwargs...) = nothing

# Third reproducer: a call-site `@inline` on a keyword call to the stub above,
# with an empty `Tuple{}` signature and the keywords `stream=Ref(42)`, `shmem=1`.
function outer(f)
    @inline cudacall(f, Tuple{}; stream=Ref(42), shmem=1)
    return
end

using InteractiveUtils
# Runs `code_llvm` on `outer(::Nothing)` twice -- once through InteractiveUtils
# and once through CUDA -- so that the two results can be compared side by side.
function main()
    InteractiveUtils.code_llvm(outer, Tuple{Nothing})
    CUDA.code_llvm(outer, Tuple{Nothing})
end
# Run automatically only when the session is not interactive.
isinteractive() || main()

maleadt · 2024-06-10T08:54:21Z

Testing on backports-release-1.11@JuliaLang/julia@b69fc57, which includes JuliaLang/julia#54323, this does not seem to fix the third MWE here:

julia> InteractiveUtils.code_llvm(outer, Tuple{Nothing})
; Function Signature: outer(Nothing)
;  @ REPL[16]:1 within `outer`
define void @julia_outer_15341() #0 {
top:
  ret void
}

julia> CUDA.code_llvm(outer, Tuple{Nothing})
warning: linking module flags 'Dwarf Version': IDs have conflicting values ('i32 4' from globals with 'i32 2' from start)
;  @ REPL[16]:1 within `outer`
define void @julia_outer_15455() local_unnamed_addr {
top:
  %jlcallframe1 = alloca [4 x {}*], align 8
  %jlcallframe1.sub = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 0
;  @ REPL[16]:2 within `outer`
; ┌ @ REPL[15]:1 within `cudacall`
; │┌ @ iterators.jl:279 within `pairs`
; ││┌ @ essentials.jl:459 within `Pairs`
; │││┌ @ namedtuple.jl:234 within `eltype`
; ││││┌ @ namedtuple.jl:236 within `nteltype`
; │││││┌ @ tuple.jl:271 within `eltype`
; ││││││┌ @ tuple.jl:291 within `_compute_eltype`
; │││││││┌ @ promotion.jl:175 within `promote_typejoin`
          %0 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 256) to {}**), align 8
          %1 = call fastcc nonnull {}* @julia_typejoin_15462({}* readonly %0, {}* readonly inttoptr (i64 126395047196048 to {}*))
; ││││││││ @ promotion.jl:176 within `promote_typejoin`
          %2 = load {}*, {}** bitcast (i8* getelementptr (i8, i8* @jl_small_typeof, i64 64) to {}**), align 8
          store {}* %2, {}** %jlcallframe1.sub, align 8
          %3 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 1
          store {}* %0, {}** %3, align 8
          %4 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 2
          store {}* inttoptr (i64 126395047196048 to {}*), {}** %4, align 8
          %5 = getelementptr inbounds [4 x {}*], [4 x {}*]* %jlcallframe1, i64 0, i64 3
          store {}* %1, {}** %5, align 8
          %6 = call nonnull {}* @jl_f_apply_type({}* null, {}** nonnull %jlcallframe1.sub, i32 4)
; └└└└└└└└
;  @ REPL[16]:3 within `outer`
  ret void
}

@aviatesk I thought you mentioned otherwise in JuliaLang/julia#54322 (comment)?

aviatesk · 2024-06-10T09:33:20Z

I have confirmed that the original issue (JuliaLang/julia#52938) has been fixed in backports-release-1.11 with the following code:

const CC = Core.Compiler
using Core: MethodInstance, CodeInstance, CodeInfo, MethodTable

## interpreter

# Choose the method-table view depending on whether `Core.Compiler` (aliased as
# `CC`) defines `CachedMethodTable`: when it does, the overlay table is wrapped
# in a cached view; otherwise the overlay table is used directly.
# `get_method_table_view(world, mt)` builds the matching view for world age
# `world` over method table `mt`.
if isdefined(CC, :CachedMethodTable)
    const ExternalMethodTableView = CC.CachedMethodTable{CC.OverlayMethodTable}
    get_method_table_view(world::UInt, mt::MethodTable) =
        CC.CachedMethodTable(CC.OverlayMethodTable(world, mt))
else
    const ExternalMethodTableView = CC.OverlayMethodTable
    get_method_table_view(world::UInt, mt::MethodTable) = CC.OverlayMethodTable(world, mt)
end

# Abstract interpreter that carries its own world age, method-table view and
# inference-result cache.
struct ExternalInterpreter <: CC.AbstractInterpreter
    # world age; this is what `CC.get_inference_world` returns for this type
    world::UInt
    # method-table view; this is what `CC.method_table` returns for this type
    method_table::ExternalMethodTableView

    # code_cache
    # per-interpreter results; this is what `CC.get_inference_cache` returns
    inf_cache::Vector{CC.InferenceResult}
end

# Convenience constructor.
#
# `world` defaults to the current world counter and must not lie beyond it.
# `method_table` (required keyword) is the raw method table; it is wrapped in
# the view type selected by `get_method_table_view`. The interpreter starts
# out with an empty inference cache.
function ExternalInterpreter(world::UInt=Base.get_world_counter(); method_table)
    @assert world <= Base.get_world_counter()
    return ExternalInterpreter(
        world,
        get_method_table_view(world, method_table),
        CC.InferenceResult[],
    )
end

# `Core.Compiler` methods specialized for `ExternalInterpreter`.

# Default inference/optimization parameters; world and inference cache come
# straight from the interpreter's fields.
CC.InferenceParams(interp::ExternalInterpreter) = CC.InferenceParams()
CC.OptimizationParams(interp::ExternalInterpreter) = CC.OptimizationParams()
CC.get_inference_world(interp::ExternalInterpreter) = interp.world
CC.get_inference_cache(interp::ExternalInterpreter) = interp.inf_cache
# The cache owner is a fixed symbol named after the upstream issue.
CC.cache_owner(interp::ExternalInterpreter) = Symbol("JuliaLang/julia#52938")

# No need to do any locking since we're not putting our results into the runtime cache
CC.lock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing
CC.unlock_mi_inference(interp::ExternalInterpreter, mi::MethodInstance) = nothing

# Inference remarks are only emitted as `@debug` log messages.
function CC.add_remark!(interp::ExternalInterpreter, sv::CC.InferenceState, msg)
    @debug "Inference remark during External compilation of $(sv.linfo): $msg"
end

# Behavioural switches: optimization, compression and tree discarding are all
# enabled; verbose statement info is disabled.
CC.may_optimize(interp::ExternalInterpreter) = true
CC.may_compress(interp::ExternalInterpreter) = true
CC.may_discard_trees(interp::ExternalInterpreter) = true
CC.verbose_stmt_info(interp::ExternalInterpreter) = false
# Method lookup goes through the interpreter's own method-table view.
CC.method_table(interp::ExternalInterpreter) = interp.method_table


# main

# Declare the overlay method table handed to the interpreter below.
Base.Experimental.@MethodTable(GLOBAL_METHOD_TABLE)

# `inner` ignores every argument and returns `nothing`; `outer` calls it with a
# call-site `@inline`, an empty `Tuple{}` and two keyword arguments.
inner(f, types::Type, args...; kwargs...) = nothing
outer(f) = @inline inner(f, Tuple{}; foo=Ref(42), bar=1)

# Build the interpreter over the overlay table and take the single result that
# `Base.code_ircode` produces for `outer(::Nothing)` under it.
interp = ExternalInterpreter(; method_table=GLOBAL_METHOD_TABLE)
only(Base.code_ircode(outer, Tuple{Nothing}; interp))

1 ─     return nothing                                                             │
  => Nothing

So, it seems likely that there is some interaction with the implementation of CUDA's external abstract interpreter?

aviatesk · 2024-06-10T10:45:29Z

@maleadt Can you share with me an example that runs the external abstract interpreter used by CUDA.jl, in a way that doesn't require any CUDA driver to be installed?

maleadt · 2024-06-12T08:36:35Z

Looks like one of the quirks is a contributing factor here. Adding the following overlay mimics that:

# Overlay `Base.throw_boundserror` in GLOBAL_METHOD_TABLE with an `@inline`
# method whose whole body is a call to `error()`.
Base.Experimental.@overlay GLOBAL_METHOD_TABLE @inline Base.throw_boundserror(A, I) = error()

aviatesk · 2024-06-12T09:39:29Z

I see, in that case, it seems necessary to use JuliaLang/julia#54322 and mark throw_boundserror as @consistent_overlay. Since throw_boundserror always just throws, it should be @consistent_overlay. I'm checking it locally.

aviatesk · 2024-06-12T09:52:56Z

Yeah, I confirmed it fixes the issue if we overlay it as @consistent_overlay CUDA_2241_MT @inline Base.throw_boundserror(A, I) = error().

maleadt · 2024-06-12T10:48:26Z

> Since throw_boundserror always just throws, it should be @consistent_overlay.

FWIW, the actual implementation does some more work:

CUDA.jl/src/device/quirks.jl

Lines 10 to 17 in e1e5be2

 # Excerpt of `@gputhrow`: the generated code stores `subtype` and `reason`
 # (each passed through `@strptr`) into `kernel_state().exception_info`, then
 # unconditionally executes `throw(nothing)`.
 macro gputhrow(subtype, reason)
     quote
         info = kernel_state().exception_info
         info.subtype = @strptr $subtype
         info.reason = @strptr $reason
         throw(nothing)
     end
 end

But it still ends in an unconditional throw, so I guess it's still consistent.

aviatesk · 2024-06-13T10:19:11Z

Ah I see, so our situation is trickier than I thought... In this case, both the original and @overlayed throw_boundserror will unconditionally throw, so it is indeed @consistent_overlay, but the @overlayed version modifies the global state of GPUCompiler.jl, so it is not effect_free and is actually not eligible for concrete evaluation.

In this case, the original version is :effect_free and eligible for concrete evaluation, so technically it's possible to concrete-evaluate it (which had been happening due to the bug we aimed to fix for v1.11), but allowing that doesn't seem like the correct approach. This is because it would require us to do something like

# Counter-example, not a recommendation: the body is wrapped in
# `Base.@assume_effects :effect_free` even though it writes to
# `kernel_state().exception_info`.
macro gputhrow(subtype, reason) 
    quote Base.@assume_effects :effect_free begin
        info = kernel_state().exception_info 
        info.subtype = @strptr $subtype 
        info.reason = @strptr $reason 
        throw(nothing) 
    end end
end

which is a totally wrong usage of Base.@assume_effects.

In this case I think it might be a good solution to incorporate an idea like LazyString into the implementation of @gputhrow. Is it possible to implement @gputhrow as follows?

# Sketch of an error object that merely records `subtype` and `reason`
# (both fields are untyped).
struct GPUError
    subtype
    reason
end
# Sketch of `@gputhrow`: expands to a `GPUError(subtype, reason)` constructor
# call. Note that the expansion as written contains no `throw`.
macro gputhrow(subtype, reason)
    :(GPUError($subtype, $reason))
end
# Deferred reporting for the sketch: the writes to
# `kernel_state().exception_info` happen only when the `GPUError` is displayed,
# not where it is constructed.
#
# The method extends `Base.showerror`, the function Julia calls to print an
# exception; `Base.show_error` does not exist, so a method defined under that
# name could not even be declared.
function Base.showerror(io::IO, x::GPUError) # delay the effect of `@gputhrow`
    info = kernel_state().exception_info
    info.subtype = @strptr x.subtype
    info.reason = @strptr x.reason
    # [...] show the proper exception using the updated kernel state here?
end

The goal of this implementation is to delay the computational effects of @gputhrow as much as possible and defer them until the error is actually shown.

maleadt · 2024-06-13T10:40:18Z

> In this case I think it might be a good solution to incorporate an idea like LazyString into the implementation of @gputhrow. Is it possible to implement @gputhrow as follows?

Sadly not, we don't support try/catch in GPU code (stack unwinding, and setjmp/longjmp are not supported), so we can't actually throw an error object for evaluation outside of the code generated by @gputhrow. In fact, the throw(nothing) that's generated there is lowered to what's basically @llvm.trap(), aborting execution of the GPU kernel after the exception has been reported. In addition, this pattern would require dynamic allocation of a GPUError object, which CUDA.jl happens to support, but other back-ends don't.

aviatesk · 2024-06-18T07:12:47Z

@maleadt After further investigation, it seems that simply using @consistent_overlay should suffice (there's no need to modify the implementation of the overlayed throw_boundserror and refine its effects).
CUDA's throw_boundserror does have effects, but those effects are ignored for this call graph at the @assume_effects :foldable annotation on typejoin. However, if @consistent_overlay is not used, the :nonoverlayed-bit of throw_boundserror gets tainted, which prevents typejoin from being concretely evaluated by GPUInterpreter. So just using @consistent_overlay would be sufficient and justified.

maleadt · 2024-06-18T07:47:29Z

> CUDA's throw_boundserror does have effects, but those effects are ignored for this call graph at the @assume_effects :foldable annotation on typejoin.

I see. I hope effect mismatches from our other overrides (i.e. outside of the typejoin context, such as our math intrinsics) don't pose problems. For example, with a simple llvmcall many effects are different, which I assume is fine:

julia> Base.infer_effects(Base.cos, (Float32,))
(+c,+e,!n,+t,+s,+m,+i)

julia> cuda_cos(x::Float32) = ccall("extern __nv_cosf", llvmcall, Cfloat, (Cfloat,), x)
cuda_cos (generic function with 1 method)

julia> Base.infer_effects(cuda_cos, (Float32,))
(!c,!e,!n,!t,!s,!m,+i)

Or our ^ implementation, which differs in :nothrow:

julia> Base.infer_effects(Base.:(^), (Float64, Int64))
(+c,+e,+n,+t,+s,+m,+i)

julia> function cuda_pow(x::Float64, y::Int64)
           y == -1 && return inv(x)
           y == 0 && return one(x)
           y == 1 && return x
           y == 2 && return x*x
           y == 3 && return x*x*x
           x ^ Float64(y)
       end
cuda_pow (generic function with 1 method)

julia> Base.infer_effects(cuda_pow, (Float64, Int64))
(+c,+e,!n,+t,+s,+m,+i)

Again, me having doubts here is purely because of not fully understanding the effects analysis. Which is why I expressed some reservations in JuliaLang/julia#54322 (comment).

aviatesk · 2024-06-18T07:55:11Z

Even if the effects mismatch, there is no problem using @consistent_overlay.
To summarize the usage conditions of @consistent_overlay explained in JuliaLang/julia#54322 concisely:

If f(x) returns a value, f′(x) must return the identical value.
If f(x) throws an exception, f′(x) must also throw an exception (although the exceptions do not need to be identical).

As long as these conditions are met, there is no problem using @consistent_overlay f, and we don't need to care about the finer details of effect analysis.

maleadt · 2024-06-18T08:26:48Z

Thanks, that really helps!

maleadt added bug Something isn't working upstream Somebody else's problem. labels Jan 17, 2024

jonas-schulze mentioned this issue Mar 28, 2024

BFloat16 support broken on Julia 1.11 #2306

Closed

maleadt mentioned this issue May 23, 2024

effects: add new @consistent_overlay macro JuliaLang/julia#54322

Merged

aviatesk added a commit to JuliaLang/julia that referenced this issue Jun 12, 2024

add test case from JuliaGPU/CUDA.jl#2241

ee313e5

aviatesk added a commit to JuliaLang/julia that referenced this issue Jun 15, 2024

add test case from JuliaGPU/CUDA.jl#2241

f3ff647

aviatesk added a commit to JuliaLang/julia that referenced this issue Jun 17, 2024

add test case from JuliaGPU/CUDA.jl#2241

6d5768f

aviatesk added a commit to JuliaLang/julia that referenced this issue Jun 18, 2024

add test cases from JuliaGPU/CUDA.jl#2241

3f6d21c

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Support for Julia 1.11 #2241

Support for Julia 1.11 #2241

maleadt commented Jan 17, 2024

maleadt commented Jun 10, 2024

aviatesk commented Jun 10, 2024

aviatesk commented Jun 10, 2024

maleadt commented Jun 12, 2024

aviatesk commented Jun 12, 2024

aviatesk commented Jun 12, 2024

maleadt commented Jun 12, 2024

aviatesk commented Jun 13, 2024

maleadt commented Jun 13, 2024

aviatesk commented Jun 18, 2024

maleadt commented Jun 18, 2024

aviatesk commented Jun 18, 2024 •

edited

Loading

maleadt commented Jun 18, 2024

Support for Julia 1.11 #2241

Support for Julia 1.11 #2241

Comments

maleadt commented Jan 17, 2024

maleadt commented Jun 10, 2024

aviatesk commented Jun 10, 2024

aviatesk commented Jun 10, 2024

maleadt commented Jun 12, 2024

aviatesk commented Jun 12, 2024

aviatesk commented Jun 12, 2024

maleadt commented Jun 12, 2024

aviatesk commented Jun 13, 2024

maleadt commented Jun 13, 2024

aviatesk commented Jun 18, 2024

maleadt commented Jun 18, 2024

aviatesk commented Jun 18, 2024 • edited Loading

maleadt commented Jun 18, 2024

aviatesk commented Jun 18, 2024 •

edited

Loading