
Compile-time regression #46

Closed
ali-ramadhan opened this issue Mar 28, 2019 · 31 comments · Fixed by #49

Comments

@ali-ramadhan
Collaborator

ali-ramadhan commented Mar 28, 2019

Loving the new @launch macro! But I noticed that my Oceananigans.jl kernels take much longer to compile after upgrading to GPUifyLoops v0.2.0, and one kernel in particular never finishes compiling: it gets stuck in an infinitely recursive call that ends with a StackOverflowError.

I was able to track this down to the following line

kernel = cufunction(contextualize(f), kernel_tt; compiler_kwargs...)

which I changed to

kernel = cufunction(f, kernel_tt; compiler_kwargs...) 

which got me up and running again, as none of my kernels require contextualize, and brought compilation time back down to 3.5 minutes from the 8.5 minutes it took with contextualize. I have no idea why the compiler got stuck when contextualize was turned on, as I don't have any function calls that should need converting.

I was wondering if it would be a good idea to make contextualize an option when launching a kernel? From my understanding, it lets you write exp(x) in your kernel and have it converted to CUDAnative.exp(x) when called via @launch CUDA(). I can see contextualize being useful, but it might be nice to be able to turn it on only for the kernels that need it, to speed up compilation.
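For reference, a minimal sketch of the idea behind contextualize (not GPUifyLoops' actual implementation, just the Cassette pattern it builds on; DeviceCtx and contextualize_sketch are made-up names):

using Cassette, CUDAnative

Cassette.@context DeviceCtx

# Hypothetical rule: inside the context, calls to Base.exp are redirected to the
# CUDAnative device intrinsic, so plain exp(x) in a kernel lowers to GPU code.
Cassette.overdub(::DeviceCtx, ::typeof(Base.exp), x::Union{Float32,Float64}) = CUDAnative.exp(x)

# A contextualize-style wrapper: returns a callable that runs f under the context.
contextualize_sketch(f) = (args...) -> Cassette.overdub(DeviceCtx(), f, args...)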

I'm happy to work on this. In case contextualize should always be on, I can just keep this change on my fork.
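A rough sketch of what an opt-out could look like (hypothetical helper, not the actual GPUifyLoops API; the real launch path does more work):

using GPUifyLoops, CUDAnative

# Hypothetical helper: compile f for the given arguments, optionally skipping
# the Cassette contextualize pass.
function cufunction_maybe_ctx(f, args...; contextualize::Bool = true, compiler_kwargs...)
    g  = contextualize ? GPUifyLoops.contextualize(f) : f
    tt = Tuple{map(Core.Typeof, args)...}
    return CUDAnative.cufunction(g, tt; compiler_kwargs...)
end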

I was able to reproduce this with Julia 1.1 and the latest nightly [Version 1.2.0-DEV.562 (2019-03-28) Commit 11156024da].

Unfortunately I'm not sure what the issue was so I can't provide a minimal working example.

Verbose details included below.


Problematic kernel (I'm working on making it less ugly!)

function calculate_interior_source_terms!(grid::Grid, constants, eos, cfg, u, v, w, T, S, pHY′, Gu, Gv, Gw, GT, GS, F)
    Nx, Ny, Nz = grid.Nx, grid.Ny, grid.Nz
    Δx, Δy, Δz = grid.Δx, grid.Δy, grid.Δz
    fCor = constants.f
    ρ₀ = eos.ρ₀
    𝜈h, 𝜈v, κh, κv = cfg.𝜈h, cfg.𝜈v, cfg.κh, cfg.κv
    @loop for k in (1:grid.Nz; blockIdx().z)
        @loop for j in (1:grid.Ny; (blockIdx().y - 1) * blockDim().y + threadIdx().y)
            @loop for i in (1:grid.Nx; (blockIdx().x - 1) * blockDim().x + threadIdx().x)
                # u-momentum equation
                @inbounds Gu[i, j, k] = (-u∇u(u, v, w, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + fCor*avg_xy(v, Nx, Ny, i, j, k)
                                            - δx_c2f(pHY′, Nx, i, j, k) / (Δx * ρ₀)
                                            + 𝜈∇²u(u, 𝜈h, 𝜈v, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + F.u(u, v, w, T, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k))
                # v-momentum equation
                @inbounds Gv[i, j, k] = (-u∇v(u, v, w, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            - fCor*avg_xy(u, Nx, Ny, i, j, k)
                                            - δy_c2f(pHY′, Ny, i, j, k) / (Δy * ρ₀)
                                            + 𝜈∇²v(v, 𝜈h, 𝜈v, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + F.v(u, v, w, T, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k))
                # w-momentum equation: comment about how pressure and buoyancy are handled
                @inbounds Gw[i, j, k] = (-u∇w(u, v, w, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + 𝜈∇²w(w, 𝜈h, 𝜈v, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + F.w(u, v, w, T, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k))
                # temperature equation
                @inbounds GT[i, j, k] = (-div_flux(u, v, w, T, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + κ∇²(T, κh, κv, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + F.T(u, v, w, T, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k))
                # salinity equation
                @inbounds GS[i, j, k] = (-div_flux(u, v, w, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + κ∇²(S, κh, κv, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k)
                                            + F.S(u, v, w, T, S, Nx, Ny, Nz, Δx, Δy, Δz, i, j, k))
            end
        end
    end
    @synchronize
end

Launching the kernel

@launch CUDA() calculate_interior_source_terms!(grid, constants, eos, cfg, uvw..., TS..., pr.pHY′.data, Gⁿ..., forcing, threads=(Tx, Ty), blocks=(Bx, By, Bz))

Error

ERROR: StackOverflowError:
Stacktrace:                                                                                                                                                                                       
 [1] snca_compress!(::Array{Core.Compiler.Node,1}, ::Array{UInt64,1}, ::UInt64, ::UInt64) at ./compiler/ssair/domtree.jl:204
 [2] snca_compress!(::Array{Core.Compiler.Node,1}, ::Array{UInt64,1}, ::UInt64, ::UInt64) at ./compiler/ssair/domtree.jl:207 (repeats 25909 times)
 [3] SNCA(::Core.Compiler.CFG) at ./compiler/ssair/domtree.jl:258
 [4] construct_domtree(::Core.Compiler.CFG) at ./compiler/ssair/domtree.jl:104
 [5] run_passes(::Core.CodeInfo, ::Int64, ::Core.Compiler.OptimizationState) at ./compiler/ssair/driver.jl:120
 [6] optimize(::Core.Compiler.OptimizationState, ::Any) at ./compiler/optimize.jl:167
 [7] typeinf(::Core.Compiler.InferenceState) at ./compiler/typeinfer.jl:33
 [8] typeinf_edge(::Method, ::Any, ::Core.SimpleVector, ::Core.Compiler.InferenceState) at ./compiler/typeinfer.jl:503
 [9] abstract_call_method(::Method, ::Any, ::Core.SimpleVector, ::Core.Compiler.InferenceState) at ./compiler/abstractinterpretation.jl:363
 [10] abstract_call_gf_by_type(::Any, ::Array{Any,1}, ::Any, ::Core.Compiler.InferenceState, ::Int64) at ./compiler/abstractinterpretation.jl:92
 [11] abstract_call(::Any, ::Nothing, ::Array{Any,1}, ::Array{Any,1}, ::Core.Compiler.InferenceState, ::Int64) at ./compiler/abstractinterpretation.jl:802
 [12] abstract_apply(::Any, ::Array{Any,1}, ::Array{Any,1}, ::Core.Compiler.InferenceState, ::Int64) at ./compiler/abstractinterpretation.jl:545
 [13] abstract_call(::Any, ::Array{Any,1}, ::Array{Any,1}, ::Array{Any,1}, ::Core.Compiler.InferenceState, ::Int64) at ./compiler/abstractinterpretation.jl:593
 [14] abstract_call(::Any, ::Array{Any,1}, ::Array{Any,1}, ::Array{Any,1}, ::Core.Compiler.InferenceState) at ./compiler/abstractinterpretation.jl:592
 [15] abstract_eval_call(::Array{Any,1}, ::Array{Any,1}, ::Array{Any,1}, ::Core.Compiler.InferenceState) at ./compiler/abstractinterpretation.jl:831
 [16] abstract_eval(::Any, ::Array{Any,1}, ::Core.Compiler.InferenceState) at ./compiler/abstractinterpretation.jl:901
 [17] typeinf_local(::Core.Compiler.InferenceState) at ./compiler/abstractinterpretation.jl:1159
 [18] typeinf_nocycle(::Core.Compiler.InferenceState) at ./compiler/abstractinterpretation.jl:1215
 [19] typeinf(::Core.Compiler.InferenceState) at ./compiler/typeinfer.jl:12
 [20] typeinf at ./compiler/typeinfer.jl:8 [inlined]
 [21] typeinf_type(::Method, ::Any, ::Core.SimpleVector, ::Core.Compiler.Params) at ./compiler/typeinfer.jl:614
 [22] return_types(::Any, ::Any) at ./reflection.jl:1068
 [23] check_method(::CUDAnative.CompilerContext) at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/compiler/validation.jl:12
 [24] compile(::CUDAnative.CompilerContext) at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/compiler/driver.jl:67
 [25] #compile#109(::Bool, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(CUDAnative.compile), ::VersionNumber, ::Any, ::Any) at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/compiler/driver.jl:45
 [26] compile at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/compiler/driver.jl:43 [inlined]
 [27] #compile#108(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(CUDAnative.compile), ::CUDAdrv.CuDevice, ::Function, ::Any) at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/compiler/driver.jl:18
 [28] compile at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/compiler/driver.jl:16 [inlined]
 [29] macro expansion at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/execution.jl:269 [inlined]
 [30] #cufunction#123(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(CUDAnative.cufunction), ::getfield(GPUifyLoops, Symbol("##16#17")){typeof(Oceananigans.calculate_interior_source_terms!)}, ::Type{Tuple{RegularCartesianGrid{Float64,StepRangeLen{Float64,Base.TwicePrecision{Float64},Base.TwicePrecision{Float64}}},PlanetaryConstants{Float64},LinearEquationOfState{Float64},ModelConfiguration{Float64},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},CUDAnative.CuDeviceArray{Float64,3,CUDAnative.AS.Global},Forcing{typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func)}}}) at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/execution.jl:240
 [31] cufunction(::Function, ::Type) at /home/alir_mit_edu/.julia/packages/CUDAnative/PFgO3/src/execution.jl:240
 [32] macro expansion at /home/alir_mit_edu/.julia/packages/GPUifyLoops/xOg1y/src/GPUifyLoops.jl:109 [inlined]
 [33] macro expansion at ./gcutils.jl:87 [inlined]
 [34] #launch#34(::Base.Iterators.Pairs{Symbol,Tuple{Int64,Int64,Vararg{Int64,N} where N},Tuple{Symbol,Symbol},NamedTuple{(:threads, :blocks),Tuple{Tuple{Int64,Int64},Tuple{Int64,Int64,Int64}}}}, ::typeof(GPUifyLoops.launch), ::GPUifyLoops.CUDA, ::typeof(Oceananigans.calculate_interior_source_terms!), ::RegularCartesianGrid{Float64,StepRangeLen{Float64,Base.TwicePrecision{Float64},Base.TwicePrecision{Float64}}}, ::Vararg{Any,N} where N) at /home/alir_mit_edu/.julia/packages/GPUifyLoops/xOg1y/src/GPUifyLoops.jl:106
 [35] (::getfield(GPUifyLoops, Symbol("#kw##launch")))(::NamedTuple{(:threads, :blocks),Tuple{Tuple{Int64,Int64},Tuple{Int64,Int64,Int64}}}, ::typeof(GPUifyLoops.launch), ::GPUifyLoops.CUDA, ::typeof(Oceananigans.calculate_interior_source_terms!), ::RegularCartesianGrid{Float64,StepRangeLen{Float64,Base.TwicePrecision{Float64},Base.TwicePrecision{Float64}}}, ::Vararg{Any,N} where N) at ./none:0
 [36] macro expansion at /home/alir_mit_edu/.julia/packages/GPUifyLoops/xOg1y/src/GPUifyLoops.jl:50 [inlined]
 [37] time_step!(::Model{GPU}, ::Int64, ::Int64) at /home/alir_mit_edu/gpu_test/Oceananigans.jl/src/time_steppers.jl:79
 [38] top-level scope at util.jl:156
@ali-ramadhan ali-ramadhan changed the title Possible to disable contextualize on CUDA kernels to avoid stack overflows and speed up compilation? Good idea to disable contextualize on CUDA kernels to avoid stack overflows and speed up compilation? Mar 28, 2019
@ali-ramadhan
Collaborator Author

Hmmm, now it seems like all the time-stepping kernels are being recompiled every time step, so code that @launches kernels is very slow, while GPU code that doesn't use @launch is still fast after the first call.

I might have done something bad...

@ali-ramadhan ali-ramadhan changed the title Good idea to disable contextualize on CUDA kernels to avoid stack overflows and speed up compilation? Good idea to disable contextualize on some CUDA kernels to avoid stack overflows and speed up compilation? Mar 28, 2019
@lcw
Collaborator

lcw commented Mar 28, 2019

If you remove the call to contextualize then I believe the kernel will run on the host and not on the GPU. If you pass in CuArrays, the scalar access will be slow. You can try setting CuArrays.allowscalar(false) to catch scalar access of CuArrays.
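For example, a quick sketch of the difference (assuming CuArrays is loaded; with scalar indexing disallowed, element-wise host access raises an error instead of silently being slow):

using CuArrays

CuArrays.allowscalar(false)      # disallow element-wise host access
a = CuArray(zeros(Float32, 16))
a .+= 1f0                        # fine: runs as a broadcast kernel on the GPU
a[1]                             # errors, pointing out the slow scalar access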

@ali-ramadhan
Collaborator Author

Ah, that would explain the slowdown, thanks for the tip! I had set CuArrays.allowscalar(true) for debugging but it came back to bite me here, haha.

I must have misunderstood what contextualize does. I'll add the call back in and try to debug some more.

The kernels still take much longer to compile than with GPUifyLoops v0.1.0, which I'm guessing is because of all the Cassette stuff, but I'm not sure.

I should probably get that last kernel working before worrying about speed, but 9-minute compilation times make debugging very slow :(

@lcw
Collaborator

lcw commented Mar 28, 2019

Yeah, the compile times are killing our kernels as well. I haven't been able to wait long enough for my kernels to even compile with v0.2.0. :-/

@ali-ramadhan
Collaborator Author

That sounds pretty bad... Do you know how long they took to compile with v0.1.0?

I'm noticing that they're taking ~2.5x more time to compile with v0.2.0.

@vchuravy
Owner

:/ Since v0.2.0 got rid of the @setup stuff, it does require contextualize for the entire kernel. Compile times in the minutes are bad either way; I haven't had any clever ideas yet.

@ali-ramadhan
Collaborator Author

Yeah I know it's a difficult problem, I don't know enough to have any ideas.

For now I think if I spin up 10 Google Cloud GPU instances I can debug 10x as fast!

@vchuravy
Owner

Can you post the output of CUDAnative.timings() for some of your more egregious functions?

@ali-ramadhan
Collaborator Author

Just got CUDAnative.timings() working. I'll try running it on v0.1.0 and v0.2.0 then try again on your new branches.

@ali-ramadhan
Collaborator Author

ali-ramadhan commented Mar 28, 2019

I used Oceananigans.jl to create an ocean model on a GPU and time step it once, which should compile and run most of the code and almost all of the CUDA kernels.

With GPUifyLoops 0.1.0

julia> using CUDAnative
julia> using Oceananigans
CUDA-enabled GPU(s) detected:
CuDevice(0): Tesla V100-SXM2-16GB

julia> model = Model(N=(128, 128, 128), L=(100, 100, 100), arch=:GPU, float_type=Float32);
julia> time_step!(model, 1, 1)
julia> CUDAnative.timings()
 ──────────────────────────────────────────────────────────
                                             Time          
                                     ──────────────────────
          Tot / % measured:                302s / 21.2%    

 Section                     ncalls     time   %tot     avg
 ──────────────────────────────────────────────────────────
 LLVM middle-end                 14    60.3s  94.4%   4.31s
   IR generation                 14    58.6s  91.7%   4.19s
     rewrite                     14    55.8s  87.3%   3.98s
     linking                     14    1.54s  2.41%   110ms
     emission                    14    1.26s  1.97%  89.8ms
     clean-up                    14   5.42ms  0.01%   387μs
   optimization                  14    1.66s  2.59%   118ms
   device library                14   86.4ms  0.14%  6.17ms
   runtime library               14   6.44ms  0.01%   460μs
   verification                  14   2.52ms  0.00%   180μs
 Julia front-end                 14    1.99s  3.11%   142ms
 CUDA object generation          14    945ms  1.48%  67.5ms
   linking                       14    805ms  1.26%  57.5ms
   compilation                   14    140ms  0.22%  10.0ms
 LLVM back-end                   14    643ms  1.01%  45.9ms
   machine-code generation       14    116ms  0.18%  8.27ms
   preparation                   14   35.0ms  0.05%  2.50ms
 ──────────────────────────────────────────────────────────

With GPUifyLoops 0.2.0 (without calling/compiling the most expensive kernel calculate_interior_source_terms! that stack overflowed)

julia> using CUDAnative
julia> using Oceananigans
CUDA-enabled GPU(s) detected:
CuDevice(0): Tesla V100-SXM2-16GB

julia> model = Model(N=(128, 128, 128), L=(100, 100, 100), arch=GPU(), float_type=Float32);
julia> time_step!(model, 1, 1)
julia> CUDAnative.timings()
 ──────────────────────────────────────────────────────────
                                             Time          
                                     ──────────────────────
          Tot / % measured:                649s / 78.4%    

 Section                     ncalls     time   %tot     avg
 ──────────────────────────────────────────────────────────
 LLVM middle-end                 13     501s  98.3%   38.5s
   IR generation                 13     493s  96.8%   37.9s
     rewrite                     13     402s  79.0%   31.0s
     emission                    13    51.2s  10.1%   3.94s
     linking                     13    38.8s  7.63%   2.99s
     clean-up                    13    134ms  0.03%  10.3ms
   optimization                  13    7.79s  1.53%   599ms
   device library                13    271ms  0.05%  20.8ms
   runtime library               13   8.55ms  0.00%   658μs
   verification                  13   5.36ms  0.00%   412μs
 Julia front-end                 13    7.26s  1.43%   558ms
 CUDA object generation          13    632ms  0.12%  48.6ms
   linking                       13    503ms  0.10%  38.7ms
   compilation                   13    129ms  0.03%  9.91ms
 LLVM back-end                   13    562ms  0.11%  43.3ms
   machine-code generation       13    104ms  0.02%  7.97ms
   preparation                   13   37.0ms  0.01%  2.84ms
 ──────────────────────────────────────────────────────────

Will try on the new branches now.

cc JuliaGPU/CUDAnative.jl#365

@maleadt

maleadt commented Mar 29, 2019

Oh wow, that time is actually spent inside the CUDAnative IR rewriting passes. That's surprising, but good news as well, since that should be easier to optimize than, say, getting inference to work effectively with Cassette.

Could you send me the modules you're processing? That should be easier than replicating your set-up:

diff --git a/src/compiler/irgen.jl b/src/compiler/irgen.jl
index a4e1f51..730b12b 100644
--- a/src/compiler/irgen.jl
+++ b/src/compiler/irgen.jl
@@ -173,6 +173,9 @@ function irgen(job::CompilerJob, linfo::Core.MethodInstance, world)
     globalUnique += 1
     llvmfn *= "_$globalUnique"
     LLVM.name!(entry, llvmfn)
+    open(joinpath(tempdir(), "module_$(globalUnique).bc"), "w") do io
+        write(io, mod)
+    end
 
     # minimal required optimization
     @timeit to[] "rewrite" ModulePassManager() do pm

@ali-ramadhan
Collaborator Author

That would be awesome!

I can easily modify irgen.jl with the change you suggested and send you the .bc files. Is that what you're suggesting?

@maleadt

maleadt commented Mar 29, 2019

Yes.

@ali-ramadhan
Collaborator Author

Got 24 modules/files out. I wasn't sure of the best way to share them, so I put them up in a public Google Drive folder: https://goo.gl/DARdUc


I was running with a Tesla V100-SXM2-16GB GPU if that matters.

   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.2.0-DEV.562 (2019-03-28)
 _/ |\__'_|_|_|\__'_|  |  Commit 11156024da (1 day old master)
|__/                   |
julia> using CUDAnative
julia> using Oceananigans
julia> model = Model(N=(128, 128, 128), L=(100, 100, 100), arch=GPU(), float_type=Float32);
julia> time_step!(model, 1, 1)
julia> CUDAnative.timings()
 ──────────────────────────────────────────────────────────
                                             Time          
                                     ──────────────────────
          Tot / % measured:                637s / 79.5%    
 Section                     ncalls     time   %tot     avg
 ──────────────────────────────────────────────────────────
 LLVM middle-end                 13     498s  98.4%   38.3s
   IR generation                 13     490s  96.8%   37.7s
     rewrite                     13     400s  79.0%   30.8s
     emission                    13    51.1s  10.1%   3.93s
     linking                     13    38.2s  7.55%   2.94s
     clean-up                    13    106ms  0.02%  8.13ms
   optimization                  13    7.78s  1.54%   598ms
   device library                13    265ms  0.05%  20.4ms
   runtime library               13   8.92ms  0.00%   686μs
   verification                  13   5.64ms  0.00%   434μs
 Julia front-end                 13    7.41s  1.46%   570ms
 LLVM back-end                   13    570ms  0.11%  43.8ms
   machine-code generation       13    103ms  0.02%  7.94ms
   preparation                   13   53.2ms  0.01%  4.09ms
 CUDA object generation          13    292ms  0.06%  22.5ms
   linking                       13    160ms  0.03%  12.3ms
   compilation                   13    132ms  0.03%  10.1ms
 ──────────────────────────────────────────────────────────

@maleadt

maleadt commented Mar 29, 2019

Processing module_13.bc took 105.59s

Yup, those are some good modules.

@ali-ramadhan
Collaborator Author

@vchuravy just tried running with the latest Julia#master, which includes JuliaLang/julia#31531. The good news is that it doesn't stack overflow anymore, but it's been compiling for 90 minutes and counting...

Sounds like @maleadt has a fix though! CliMA/Oceananigans.jl#66 (comment)

@maleadt

maleadt commented Mar 29, 2019

Please try JuliaGPU/CUDAnative.jl#369 and report new timings; it should have fixed some inadvertent quadratic behavior and made it linear.

Note that there are probably more inefficiencies; I haven't been working with exceptionally large modules. Don't hesitate to send me these; I'm all for optimizing the compiler (if it matters).

@ali-ramadhan
Collaborator Author

Awesome, thanks so much for the lightning fast fix! Will give it a try.

@maleadt

maleadt commented Mar 29, 2019

No problem. Pushed an additional optimization; let me know which commit you tested on.

@ali-ramadhan
Collaborator Author

Testing with your later commit (JuliaGPU/CUDAnative.jl@8801976) but it's still busy compiling (40 minutes and counting...).

When I commented out the call to the problematic kernel above (#46 (comment)), everything took ~8 minutes. And when I tested on Julia#master with JuliaLang/julia#31531, it was still compiling after 2 hours, so I don't know what the upper bound might be. It must be a pretty bad kernel, though.

I'll let it keep compiling for as long as I can to see what the CUDAnative.timings() look like.

@maleadt

maleadt commented Mar 29, 2019

Strange, the speedup I noticed on the modules you sent was pretty large (400s -> 5s).
If the culprit is still the initial IR rewrite, the patch I sent before should have dumped the problematic module already so you should be able to find and upload it again.

@ali-ramadhan
Collaborator Author

Interesting, I'll repatch and reproduce the modules.

The modules I shared before may not have included the massive problematic kernel, as it was causing stack overflows until JuliaLang/julia@5875d8c.

@ali-ramadhan
Collaborator Author

I've shared the new modules in a new Google Drive folder: https://goo.gl/AeMDfP

There are now 2 more modules, for a total of 26. The new ones (25, 26) are huge compared to the others, which are tiny, so hopefully that's the problematic kernel.

Sorry they weren't included earlier.

4.0K /tmp/module_10.bc  4.0K /tmp/module_14.bc  4.0K /tmp/module_18.bc   12K /tmp/module_21.bc  1.5M /tmp/module_25.bc  4.0K /tmp/module_4.bc  4.0K /tmp/module_8.bc
4.0K /tmp/module_11.bc  4.0K /tmp/module_15.bc  4.0K /tmp/module_19.bc   12K /tmp/module_22.bc  924K /tmp/module_26.bc  4.0K /tmp/module_5.bc  4.0K /tmp/module_9.bc
4.0K /tmp/module_12.bc  4.0K /tmp/module_16.bc  4.0K /tmp/module_1.bc    16K /tmp/module_23.bc  4.0K /tmp/module_2.bc   4.0K /tmp/module_6.bc
4.0K /tmp/module_13.bc  4.0K /tmp/module_17.bc  4.0K /tmp/module_20.bc   12K /tmp/module_24.bc  4.0K /tmp/module_3.bc   4.0K /tmp/module_7.bc

@maleadt

maleadt commented Mar 29, 2019

> There are now 2 more modules, for a total of 26. The new ones (25, 26) are huge compared to the others, which are tiny, so hopefully that's the problematic kernel.

No, they are smaller and not as problematic as the previous dump of modules you sent.
On CUDAnative master, processing module 25 takes 20-30 s; on the optimizations branch it finishes within 0.5 s. The old dump had modules that took over 100 s, and those now also process almost instantly.

Anyway, I won't be doing much development this weekend, so take your time to figure out what the impact of those optimizations is and which modules remain problematic (for what part of the compiler -- the timings should be helpful there).

EDIT: also note that many of the modules are probably from the CUDAnative runtime library, which explains the many tiny kernels. You could get rid of those by warming up and making sure the runtime is compiled (or by commenting out the global rm(...) line in CUDAnative/src/compiler/rtlib.jl, which invalidates the existing runtime libraries whenever CUDAnative changes and the precompilation cache is invalidated).
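For example, something like this before the timed run should force the runtime library to be built (a sketch, not from the thread):

using CUDAnative

# Trivial kernel: compiling and launching it once builds the CUDAnative runtime
# library, so later timings mostly measure the real kernels.
warmup_kernel() = (threadIdx().x; return)
@cuda threads=1 warmup_kernel()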

@ali-ramadhan
Collaborator Author

ali-ramadhan commented Mar 29, 2019

Ah, I see. I thought the relative file sizes meant something. I tried again without the kernel I think is problematic, and indeed compilation is much faster (500 s -> 100 s) and the IR rewrite time is only 7 seconds.

The problem might be on my end at this point. I'll play around with that one kernel and see if I can isolate the problem and get it to compile, but everything looks great without it.

Thanks again for your help!

   _       _ _(_)_     |  Documentation: https://docs.julialang.org
  (_)     | (_) (_)    |
   _ _   _| |_  __ _   |  Type "?" for help, "]?" for Pkg help.
  | | | | | | |/ _` |  |
  | | |_| | | | (_| |  |  Version 1.2.0-DEV.572 (2019-03-29)
 _/ |\__'_|_|_|\__'_|  |  Commit 4b1230277c (0 days old master)
|__/                   |

julia> using CUDAnative
julia> using Oceananigans
julia> model = Model(N=(128, 128, 128), L=(100, 100, 100), arch=GPU(), float_type=Float32);
julia> time_step!(model, 1, 1)
julia> CUDAnative.timings()
 ────────────────────────────────────────────────────────────────────────────────────
                                             Time                   Allocations      
                                     ──────────────────────   ───────────────────────
          Tot / % measured:                203s / 53.2%           7.51GiB / 43.0%    

 Section                     ncalls     time   %tot     avg     alloc   %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────
 LLVM middle-end                 13    99.2s  91.9%   7.63s   1.65GiB  51.0%   130MiB
   IR generation                 13    91.4s  84.7%   7.03s   1.58GiB  48.8%   124MiB
     emission                    13    49.5s  45.9%   3.81s   1.24GiB  38.4%  97.6MiB
     linking                     13    34.3s  31.8%   2.64s      208B  0.00%    16.0B
     rewrite                     13    6.81s  6.31%   524ms    312MiB  9.44%  24.0MiB
       hide unreachable       32.0k    4.43s  4.10%   139μs    168MiB  5.07%  5.37KiB
         find                 32.0k    2.62s  2.43%  82.0μs   8.94MiB  0.27%     293B
         predecessors         32.0k    1.24s  1.15%  38.8μs   36.2MiB  1.09%  1.16KiB
         replace              32.0k    372ms  0.34%  11.6μs   1.90MiB  0.06%    62.3B
       lower throw               13    730ms  0.68%  56.2ms   52.1MiB  1.58%  4.01MiB
       hide trap                 13    219ms  0.20%  16.9ms   9.53MiB  0.29%   751KiB
     clean-up                    13    130ms  0.12%  10.0ms   32.6MiB  0.98%  2.51MiB
   optimization                  13    7.46s  6.91%   574ms   71.9MiB  2.17%  5.53MiB
   device library                13    262ms  0.24%  20.2ms    139KiB  0.00%  10.7KiB
   runtime library               13   8.15ms  0.01%   627μs   7.69KiB  0.00%     606B
   verification                  13   5.00ms  0.00%   385μs     0.00B  0.00%    0.00B
 Julia front-end                 13    7.34s  6.80%   564ms   1.49GiB  46.0%   117MiB
 CUDA object generation          13    773ms  0.72%  59.5ms   29.7MiB  0.90%  2.29MiB
   linking                       13    652ms  0.60%  50.2ms   13.0MiB  0.39%  1.00MiB
   compilation                   13    121ms  0.11%  9.29ms   16.7MiB  0.51%  1.29MiB
 LLVM back-end                   13    667ms  0.62%  51.3ms   70.2MiB  2.12%  5.40MiB
   machine-code generation       13    101ms  0.09%  7.77ms    137KiB  0.00%  10.5KiB
   preparation                   13   49.6ms  0.05%  3.82ms   2.79MiB  0.08%   220KiB
 ────────────────────────────────────────────────────────────────────────────────────

@maleadt

maleadt commented Mar 29, 2019

IR emission and linking are both pretty direct calls into Julia and LLVM respectively, so there's not much to optimize there, although the allocations are suspicious... To debug that phase of the compiler, it would be useful to have a single kernel invocation (or even better, a function + tuple type for use with e.g. CUDAnative.code_llvm) that shows bad performance.

@ali-ramadhan
Collaborator Author

Turns out the "problematic kernel" does compile; it just takes 2 hours, most of which is spent in IR emission and linking.

If there's not much to optimize there, then I wonder if this one kernel could be launched the way it was done in GPUifyLoops 0.1.0 (where it compiled relatively quickly) and all the others launched with Cassette and contextualize.

I'll try modifying the kernel to see if it's just the large number of @inline functions keeping the compiler busy or if it's some bad statements.

────────────────────────────────────────────────────────────────────────────────────
                                             Time                   Allocations      
                                     ──────────────────────   ───────────────────────
          Tot / % measured:              43636s / 16.3%           16.5GiB / 73.9%    
 Section                     ncalls     time   %tot     avg     alloc   %tot      avg
 ────────────────────────────────────────────────────────────────────────────────────
 LLVM middle-end                 14    7100s   100%    507s   6.18GiB  50.8%   452MiB
   IR generation                 14    7064s  99.1%    505s   6.09GiB  50.0%   446MiB
     linking                     14    4344s  60.9%    310s      224B  0.00%    16.0B
     emission                    14    2682s  37.6%    192s   5.12GiB  42.1%   375MiB
     rewrite                     14    33.8s  0.47%   2.41s    856MiB  6.87%  61.2MiB
       hide unreachable        129k    19.8s  0.28%   153μs    660MiB  5.29%  5.22KiB
         find                  129k    11.4s  0.16%  88.0μs   38.7MiB  0.31%     314B
         predecessors          129k    5.43s  0.08%  42.0μs    112MiB  0.90%     908B
         replace               129k    1.70s  0.02%  13.1μs   8.68MiB  0.07%    70.3B
       lower throw               14    2.03s  0.03%   145ms   77.8MiB  0.62%  5.56MiB
       hide trap                 14    902ms  0.01%  64.4ms   32.0MiB  0.26%  2.28MiB
     clean-up                    14    603ms  0.01%  43.1ms    133MiB  1.07%  9.52MiB
   optimization                  14    35.4s  0.50%   2.53s   95.3MiB  0.76%  6.80MiB
   device library                14    406ms  0.01%  29.0ms    140KiB  0.00%  10.0KiB
   verification                  14   62.7ms  0.00%  4.48ms     0.00B  0.00%    0.00B
   runtime library               14   9.71ms  0.00%   694μs   7.69KiB  0.00%     562B
 Julia front-end                 14    28.3s  0.40%   2.02s   5.89GiB  48.4%   431MiB
 CUDA object generation          14    836ms  0.01%  59.7ms   31.7MiB  0.25%  2.27MiB
   linking                       14    695ms  0.01%  49.6ms   14.0MiB  0.11%  1.00MiB
   compilation                   14    141ms  0.00%  10.1ms   17.7MiB  0.14%  1.27MiB
 LLVM back-end                   14    813ms  0.01%  58.1ms   73.0MiB  0.59%  5.22MiB
   machine-code generation       14    172ms  0.00%  12.3ms    200KiB  0.00%  14.3KiB
   preparation                   14   52.6ms  0.00%  3.76ms   2.79MiB  0.02%   204KiB
 ────────────────────────────────────────────────────────────────────────────────────

The new modules that were spit out are much larger, including a 38 MiB one.

total 51M
 20K module_10.bc  628K module_13.bc  4.0K module_16.bc  4.0K module_19.bc   12K module_21.bc   12K module_24.bc   12K module_2.bc  1.5M module_5.bc  2.2M module_8.bc
 16K module_11.bc  3.0M module_14.bc  4.0K module_17.bc   12K module_1.bc    12K module_22.bc  1.5M module_25.bc   16K module_3.bc  924K module_6.bc  2.4M module_9.bc
444K module_12.bc  4.0K module_15.bc  4.0K module_18.bc  4.0K module_20.bc   16K module_23.bc  924K module_26.bc   12K module_4.bc   38M module_7.bc

@ali-ramadhan
Collaborator Author

ali-ramadhan commented Mar 30, 2019

Not sure if this is helpful or what you were suggesting, but running CUDAnative.code_llvm on the problematic kernel that took 2 hours to compile (it executes quickly as expected, <5 ms per call on 128^3 CuArrays) shows this: https://gist.github.com/ali-ramadhan/04eec486a05551433a1c52feebb2b889

@vchuravy
Owner

> Not sure if this is helpful

Not quite, since you are using Any and thus we only see the generic calls. What are the actual types that are used instead of the Any in CUDAnative.code_llvm(Oceananigans.calculate_interior_source_terms!, (Grid, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any))?
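One way to recover them is from the call site itself (a sketch reusing the launch arguments from earlier in the thread; cudaconvert maps host CuArrays to their device-side counterparts):

# Build the concrete, device-side argument tuple type from the actual launch
# arguments, then reflect on the kernel with it.
args = (grid, constants, eos, cfg, uvw..., TS..., pr.pHY′.data, Gⁿ..., forcing)
tt   = Tuple{map(Core.Typeof, map(CUDAnative.cudaconvert, args))...}
CUDAnative.code_llvm(Oceananigans.calculate_interior_source_terms!, tt)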

@vchuravy vchuravy changed the title Good idea to disable contextualize on some CUDA kernels to avoid stack overflows and speed up compilation? Compile-time regression Mar 30, 2019
@ali-ramadhan
Collaborator Author

ali-ramadhan commented Apr 1, 2019

Ah, I thought I had to match the signature as defined. I just tried again with the actual types (without type parameters), which looks kind of similar, so I'm not sure if it's helpful: https://gist.github.com/ali-ramadhan/1d82f1414642fa9b5ba3b1300c51f66e

I also tried with the explicit types plus parameterized types, e.g. CuArray{Float32,3}, but I ran into https://github.com/JuliaGPU/CUDAnative.jl/issues/368:

julia> CUDAnative.code_llvm(Oceananigans.calculate_interior_source_terms!, (RegularCartesianGrid{Float32,StepRangeLen{Float32,Float64,Float64}}, PlanetaryConstants{Float64}, LinearEquationOfState{Float64}, ModelConfiguration{Float64}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, CuArray{Float32,3}, Forcing{typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func),typeof(Oceananigans.zero_func)}))
ERROR: LLVM error: inlinable function call in a function with debug info must have a !dbg location
  %2 = call fastcc %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16)
inlinable function call in a function with debug info must have a !dbg location
  %2 = call fastcc %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16)
inlinable function call in a function with debug info must have a !dbg location
  %2 = call fastcc %jl_value_t addrspace(10)* @ptx_gc_pool_alloc(i64 16)

... Same error message repeated ~230 times...

Stacktrace:
 [1] verify(::LLVM.Module) at /home/alir_mit_edu/.julia/packages/LLVM/tPWXv/src/analysis.jl:11
 [2] macro expansion at /home/alir_mit_edu/.julia/packages/TimerOutputs/7zSea/src/TimerOutput.jl:216 [inlined]
 [3] macro expansion at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/compiler/driver.jl:98 [inlined]
 [4] macro expansion at /home/alir_mit_edu/.julia/packages/TimerOutputs/7zSea/src/TimerOutput.jl:216 [inlined]
 [5] #codegen#114(::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.codegen), ::Symbol, ::CUDAnative.CompilerJob) at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/compiler/driver.jl:79
 [6] #codegen at ./none:0 [inlined]
 [7] #code_llvm#156(::Bool, ::Bool, ::Bool, ::typeof(CUDAnative.code_llvm), ::Base.TTY, ::CUDAnative.CompilerJob) at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/reflection.jl:40
 [8] #code_llvm at ./none:0 [inlined]
 [9] #code_llvm#155(::Bool, ::VersionNumber, ::Bool, ::Bool, ::Bool, ::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(CUDAnative.code_llvm), ::Base.TTY, ::Function, ::Any) at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/reflection.jl:35
 [10] code_llvm(::Base.TTY, ::Function, ::Any) at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/reflection.jl:33
 [11] #code_llvm#157(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::typeof(CUDAnative.code_llvm), ::Any, ::Any) at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/reflection.jl:47
 [12] code_llvm(::Any, ::Any) at /home/alir_mit_edu/.julia/packages/CUDAnative/s2bPe/src/reflection.jl:47
 [13] top-level scope at REPL[37]:1

@vchuravy
Owner

vchuravy commented Apr 1, 2019

@ali-ramadhan and I spent a bit more time on this issue. The underlying problem is that the 265 fix for Cassette causes a three-orders-of-magnitude blow-up in code size and a lot more work for CUDAnative.jl.

So I will disable that. Together with the CUDAnative improvements, compile times for Ali are now rather reasonable (~10 s).
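(A rough way to check on the Oceananigans side, reusing the launch from earlier in this issue: the first timed launch includes kernel compilation.)

@time @launch CUDA() calculate_interior_source_terms!(grid, constants, eos, cfg, uvw..., TS..., pr.pHY′.data, Gⁿ..., forcing, threads=(Tx, Ty), blocks=(Bx, By, Bz))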

bors bot added a commit that referenced this issue Apr 3, 2019
49: disable 265 fix if not requested during precompile time r=vchuravy a=vchuravy

fixes #46

Co-authored-by: Valentin Churavy <[email protected]>
@bors bors bot closed this as completed in #49 Apr 3, 2019