This repository has been archived by the owner on May 27, 2021. It is now read-only.

Merge pull request #365 from JuliaGPU/tb/timer_outputs
Add some timer outputs.
maleadt committed Mar 22, 2019
2 parents 68f687c + dddecc7 commit 1d310b1
Showing 9 changed files with 125 additions and 92 deletions.
1 change: 1 addition & 0 deletions Project.toml
@@ -12,6 +12,7 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"

[extras]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
1 change: 1 addition & 0 deletions src/CUDAnative.jl
@@ -6,6 +6,7 @@ using LLVM
using LLVM.Interop

using Adapt
using TimerOutputs
using DataStructures

using Pkg
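The new dependency is TimerOutputs.jl, whose API is used throughout this commit: sections opened with `@timeit` nest by dynamic scope and accumulate into a `TimerOutput`. A minimal standalone sketch of the pattern (not part of this diff; `print_timer` is the same call the new `timings()` helper below wraps):

```julia
using TimerOutputs

const to = TimerOutput()

@timeit to "LLVM middle-end" begin
    @timeit to "IR generation" sleep(0.1)   # stand-ins for real compiler work
    @timeit to "optimization"  sleep(0.2)
end

# prints a hierarchical report; allocations=false hides the allocation columns
print_timer(to; allocations=false)
```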
12 changes: 12 additions & 0 deletions src/compiler.jl
@@ -1,5 +1,15 @@
# JIT compilation of Julia code to PTX

const to = Ref{TimerOutput}()

function timings!(new=TimerOutput())
global to
to[] = new
return
end

timings() = (TimerOutputs.print_timer(to[]; allocations=false); println())

include(joinpath("compiler", "common.jl"))
include(joinpath("compiler", "irgen.jl"))
include(joinpath("compiler", "optim.jl"))
@@ -12,4 +22,6 @@ include(joinpath("compiler", "driver.jl"))
function __init_compiler__()
# enable generation of FMA instructions to mimic behavior of nvcc
LLVM.clopts("--nvptx-fma-level=1")

timings!()
end
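With these helpers in place, per-phase timings can be collected around any compilation. A hypothetical usage sketch (the kernel and compute capability are made up, and a configured CUDA setup is assumed; `codegen` and `CompilerJob` are invoked the same way the test suite does below):

```julia
using CUDAnative

CUDAnative.timings!()      # install a fresh TimerOutput

kernel() = return          # trivial device function
CUDAnative.codegen(:ptx, CUDAnative.CompilerJob(kernel, Tuple{}, v"3.5", true))

CUDAnative.timings()       # print the accumulated per-phase report
```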
167 changes: 90 additions & 77 deletions src/compiler/driver.jl
@@ -4,35 +4,29 @@
const compile_hook = Ref{Union{Nothing,Function}}(nothing)

"""
compile(to::Symbol, cap::VersionNumber, f, tt, kernel=true;
kernel=true, optimize=true, strip=false, ...)
compile(target::Symbol, cap::VersionNumber, f, tt, kernel=true;
optimize=true, strip=false, ...)
Compile a function `f` invoked with types `tt` for device capability `cap` to one of the
following formats as specified by the `to` argument: `:julia` for Julia IR, `:llvm` for LLVM
IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. If the `kernel` flag is
set, specialized code generation and optimization for kernel functions is enabled.
following formats as specified by the `target` argument: `:julia` for Julia IR, `:llvm` for
LLVM IR, `:ptx` for PTX assembly and `:cuda` for CUDA driver objects. If the `kernel` flag
is set, specialized code generation and optimization for kernel functions is enabled.
The following keyword arguments are supported:
- `hooks`: enable compiler hooks that drive reflection functions (default: true)
- `libraries`: link auxiliary bitcode libraries that may be required (default: true)
- `optimize`: optimize the code (default: true)
- `strip`: strip non-functional metadata and debug information (default: false)
Other keyword arguments can be found in the documentation of [`cufunction`](@ref).
"""
compile(to::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt),
kernel::Bool=true; hooks::Bool=true, libraries::Bool=true,
optimize::Bool=true, strip::Bool=false,
kwargs...) =
compile(to, CompilerJob(f, tt, cap, kernel; kwargs...);
hooks=hooks, libraries=libraries, optimize=optimize, strip=strip)

function compile(to::Symbol, job::CompilerJob;
hooks::Bool=true, libraries::Bool=true,
optimize::Bool=true, strip::Bool=false)
compile(target::Symbol, cap::VersionNumber, @nospecialize(f::Core.Function), @nospecialize(tt),
kernel::Bool=true; optimize::Bool=true, strip::Bool=false, kwargs...) =
compile(target, CompilerJob(f, tt, cap, kernel; kwargs...); optimize=optimize, strip=strip)

function compile(target::Symbol, job::CompilerJob;
libraries::Bool=true, optimize::Bool=true, strip::Bool=false)
@debug "(Re)compiling function" job

if hooks && compile_hook[] != nothing
if compile_hook[] != nothing
global globalUnique
previous_globalUnique = globalUnique

@@ -41,65 +35,77 @@ function compile(to::Symbol, job::CompilerJob;
globalUnique = previous_globalUnique
end

return codegen(target, job; libraries=libraries, optimize=optimize, strip=strip)
end

function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true,
optimize::Bool=true, strip::Bool=false)
## Julia IR

check_method(job)

# get the method instance
world = typemax(UInt)
meth = which(job.f, job.tt)
sig = Base.signature_type(job.f, job.tt)::Type
(ti, env) = ccall(:jl_type_intersection_with_env, Any,
(Any, Any), sig, meth.sig)::Core.SimpleVector
if VERSION >= v"1.2.0-DEV.320"
meth = Base.func_for_method_checked(meth, ti, env)
else
meth = Base.func_for_method_checked(meth, ti)
@timeit to[] "Julia front-end" begin
check_method(job)

# get the method instance
world = typemax(UInt)
meth = which(job.f, job.tt)
sig = Base.signature_type(job.f, job.tt)::Type
(ti, env) = ccall(:jl_type_intersection_with_env, Any,
(Any, Any), sig, meth.sig)::Core.SimpleVector
if VERSION >= v"1.2.0-DEV.320"
meth = Base.func_for_method_checked(meth, ti, env)
else
meth = Base.func_for_method_checked(meth, ti)
end
linfo = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance},
(Any, Any, Any, UInt), meth, ti, env, world)
end
linfo = ccall(:jl_specializations_get_linfo, Ref{Core.MethodInstance},
(Any, Any, Any, UInt), meth, ti, env, world)

to == :julia && return linfo
target == :julia && return linfo


## LLVM IR

ir, kernel = irgen(job, linfo, world)
# preload libraries
if libraries
libdevice = load_libdevice(job.cap)
runtime = load_runtime(job.cap)
end

need_library(lib) = any(f -> isdeclaration(f) &&
intrinsic_id(f) == 0 &&
haskey(functions(lib), LLVM.name(f)),
functions(ir))

if libraries
libdevice = load_libdevice(job.cap)
if need_library(libdevice)
link_libdevice!(job, ir, libdevice)
@timeit to[] "LLVM middle-end" begin
ir, kernel = @timeit to[] "IR generation" irgen(job, linfo, world)

if libraries
@timeit to[] "device library" if need_library(libdevice)
link_libdevice!(job, ir, libdevice)
end
end
end

if optimize
kernel = optimize!(job, ir, kernel)
end
if optimize
kernel = @timeit to[] "optimization" optimize!(job, ir, kernel)
end

if libraries
runtime = load_runtime(job.cap)
if need_library(runtime)
link_library!(job, ir, runtime)
if libraries
@timeit to[] "runtime library" if need_library(runtime)
link_library!(job, ir, runtime)
end
end
end

verify(ir)
@timeit to[] "verification" verify(ir)

if strip
strip_debuginfo!(ir)
end
if strip
@timeit to[] "strip debug info" strip_debuginfo!(ir)
end

kernel_fn = LLVM.name(kernel)
kernel_ft = eltype(llvmtype(kernel))
kernel_fn = LLVM.name(kernel)
kernel_ft = eltype(llvmtype(kernel))
end

to == :llvm && return ir, kernel
target == :llvm && return ir, kernel


## dynamic parallelism
@@ -132,8 +138,7 @@ function compile(to::Symbol, job::CompilerJob;
for dyn_job in keys(worklist)
# cached compilation
dyn_kernel_fn = get!(kernels, dyn_job) do
dyn_ir, dyn_kernel = compile(:llvm, dyn_job; hooks=false,
optimize=optimize, strip=strip)
dyn_ir, dyn_kernel = codegen(:llvm, dyn_job; optimize=optimize, strip=strip)
dyn_kernel_fn = LLVM.name(dyn_kernel)
dyn_kernel_ft = eltype(llvmtype(dyn_kernel))
link!(ir, dyn_ir)
@@ -163,37 +168,45 @@ function compile(to::Symbol, job::CompilerJob;

## PTX machine code

prepare_execution!(job, ir)
@timeit to[] "LLVM back-end" begin
@timeit to[] "preparation" prepare_execution!(job, ir)

check_invocation(job, kernel)
check_ir(job, ir)
check_invocation(job, kernel)
check_ir(job, ir)

asm = mcgen(job, ir, kernel)
asm = @timeit to[] "machine-code generation" mcgen(job, ir, kernel)
end

to == :ptx && return asm, kernel_fn
target == :ptx && return asm, kernel_fn


## CUDA objects

# enable debug options based on Julia's debug setting
jit_options = Dict{CUDAdrv.CUjit_option,Any}()
if Base.JLOptions().debug_level == 1
jit_options[CUDAdrv.GENERATE_LINE_INFO] = true
elseif Base.JLOptions().debug_level >= 2
jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true
end
@timeit to[] "CUDA object generation" begin
# enable debug options based on Julia's debug setting
jit_options = Dict{CUDAdrv.CUjit_option,Any}()
if Base.JLOptions().debug_level == 1
jit_options[CUDAdrv.GENERATE_LINE_INFO] = true
elseif Base.JLOptions().debug_level >= 2
jit_options[CUDAdrv.GENERATE_DEBUG_INFO] = true
end

# link the CUDA device library
linker = CUDAdrv.CuLink(jit_options)
CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY)
CUDAdrv.add_data!(linker, kernel_fn, asm)
image = CUDAdrv.complete(linker)
# link the CUDA device library
@timeit to[] "linking" begin
linker = CUDAdrv.CuLink(jit_options)
CUDAdrv.add_file!(linker, libcudadevrt, CUDAdrv.LIBRARY)
CUDAdrv.add_data!(linker, kernel_fn, asm)
image = CUDAdrv.complete(linker)
end

cuda_mod = CuModule(image, jit_options)
cuda_fun = CuFunction(cuda_mod, kernel_fn)
@timeit to[] "compilation" begin
cuda_mod = CuModule(image, jit_options)
cuda_fun = CuFunction(cuda_mod, kernel_fn)
end
end

to == :cuda && return cuda_fun, cuda_mod
target == :cuda && return cuda_fun, cuda_mod


error("Unknown compilation target $to")
error("Unknown compilation target $target")
end
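Net effect of the refactor: `compile` stays the hook-aware entry point and now always forwards to `codegen`, which does the actual work and no longer takes a `hooks` keyword. A sketch of the target selection described in the docstring (hypothetical kernel and capability):

```julia
kernel() = return

# :julia stops after the front-end and returns the method instance
linfo = CUDAnative.compile(:julia, v"3.5", kernel, Tuple{})

# :ptx returns the assembly string and the kernel's LLVM function name
asm, kernel_fn = CUDAnative.compile(:ptx, v"3.5", kernel, Tuple{})
```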
15 changes: 8 additions & 7 deletions src/compiler/irgen.jl
@@ -107,16 +107,18 @@ function compile_linfo(job::CompilerJob, linfo::Core.MethodInstance, world)
end

function irgen(job::CompilerJob, linfo::Core.MethodInstance, world)
entry, modules = compile_linfo(job, linfo, world)
entry, modules = @timeit to[] "emission" compile_linfo(job, linfo, world)

# link in dependent modules
mod = popfirst!(modules)
for dep in modules
link!(mod, dep)
@timeit to[] "linking" begin
mod = popfirst!(modules)
for dep in modules
link!(mod, dep)
end
end

# clean up incompatibilities
for llvmf in functions(mod)
@timeit to[] "clean-up" for llvmf in functions(mod)
llvmfn = LLVM.name(llvmf)

# only occurs in debug builds
@@ -173,15 +175,14 @@ function irgen(job::CompilerJob, linfo::Core.MethodInstance, world)
LLVM.name!(entry, llvmfn)

# minimal required optimization
ModulePassManager() do pm
@timeit to[] "rewrite" ModulePassManager() do pm
global current_job
current_job = job

add!(pm, ModulePass("LowerThrow", lower_throw!))
add!(pm, FunctionPass("HideUnreachable", hide_unreachable!))
add!(pm, ModulePass("HideTrap", hide_trap!))
always_inliner!(pm)
verifier!(pm)
run!(pm, mod)
end

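Note that `@timeit` prefixes arbitrary expressions, which is how the loop and the `do`-block above are timed without restructuring them. A small standalone illustration of the same pattern:

```julia
using TimerOutputs

const to = TimerOutput()

# a whole for loop as one timed section, as in the "clean-up" pass above
@timeit to "clean-up" for i in 1:3
    i^2   # stand-in for per-function clean-up work
end

# a call with a do-block, as in the "rewrite" ModulePassManager section
@timeit to "rewrite" sprint() do io
    print(io, "pass pipeline would run here")
end

print_timer(to; allocations=false)
```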
4 changes: 2 additions & 2 deletions src/compiler/rtlib.jl
@@ -124,8 +124,8 @@ end

function emit_function!(mod, cap, f, types, name)
tt = Base.to_tuple_type(types)
new_mod, entry = compile(:llvm, cap, f, tt, #=kernel=# false;
hooks=false, libraries=false)
new_mod, entry = codegen(:llvm, CompilerJob(f, tt, cap, #=kernel=# false);
libraries=false)
LLVM.name!(entry, name)
link!(mod, new_mod)
end
6 changes: 3 additions & 3 deletions src/reflection.jl
@@ -37,7 +37,7 @@ function code_llvm(io::IO, @nospecialize(func::Core.Function), @nospecialize(typ
end
function code_llvm(io::IO, job::CompilerJob; optimize::Bool=true,
dump_module::Bool=false, strip_ir_metadata::Bool=true)
ir, entry = compile(:llvm, job; hooks=false, optimize=optimize, strip=strip_ir_metadata)
ir, entry = codegen(:llvm, job; optimize=optimize, strip=strip_ir_metadata)
if dump_module
show(io, ir)
else
@@ -67,7 +67,7 @@ function code_ptx(io::IO, @nospecialize(func::Core.Function), @nospecialize(type
code_ptx(io, job; strip_ir_metadata=strip_ir_metadata)
end
function code_ptx(io::IO, job::CompilerJob; strip_ir_metadata::Bool=true)
asm, _ = compile(:ptx, job; hooks=false, strip=strip_ir_metadata)
asm, _ = codegen(:ptx, job; strip=strip_ir_metadata)
print(io, asm)
end
code_ptx(@nospecialize(func), @nospecialize(types); kwargs...) =
@@ -98,7 +98,7 @@ function code_sass(io::IO, job::CompilerJob)
error("Your CUDA installation does not provide ptxas or nvdisasm, both of which are required for code_sass")
end

ptx, _ = compile(:ptx, job; hooks=false)
ptx, _ = codegen(:ptx, job)

fn = tempname()
gpu = "sm_$(job.cap.major)$(job.cap.minor)"
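The public reflection helpers keep their signatures; only their internals switch from `compile(...; hooks=false)` to calling `codegen` directly. For example (hypothetical kernel; a configured CUDA setup is assumed):

```julia
kernel() = return

CUDAnative.code_llvm(stdout, kernel, Tuple{})   # optimized LLVM IR
CUDAnative.code_ptx(stdout, kernel, Tuple{})    # PTX assembly
```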
9 changes: 6 additions & 3 deletions test/codegen.jl
@@ -472,7 +472,8 @@ end
@testset "non-isbits arguments" begin
foobar(i) = (sink(unsafe_trunc(Int,i)); return)

@test_throws_message(CUDAnative.KernelError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{BigInt})) do msg
@test_throws_message(CUDAnative.KernelError,
CUDAnative.codegen(:ptx, CUDAnative.CompilerJob(foobar, Tuple{BigInt}, v"3.5", true))) do msg
occursin("passing and using non-bitstype argument", msg) &&
occursin("BigInt", msg)
end
@@ -481,7 +482,8 @@ end
@testset "invalid LLVM IR" begin
foobar(i) = println(i)

@test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{Int})) do msg
@test_throws_message(CUDAnative.InvalidIRError,
CUDAnative.codegen(:ptx, CUDAnative.CompilerJob(foobar, Tuple{Int}, v"3.5", true))) do msg
occursin("invalid LLVM IR", msg) &&
occursin(CUDAnative.RUNTIME_FUNCTION, msg) &&
occursin("[1] println", msg) &&
@@ -492,7 +494,8 @@ end
@testset "invalid LLVM IR (ccall)" begin
foobar(p) = (unsafe_store!(p, ccall(:time, Cint, ())); nothing)

@test_throws_message(CUDAnative.InvalidIRError, CUDAnative.compile(:ptx, v"3.5", foobar, Tuple{Ptr{Int}})) do msg
@test_throws_message(CUDAnative.InvalidIRError,
CUDAnative.codegen(:ptx, CUDAnative.CompilerJob(foobar, Tuple{Ptr{Int}}, v"3.5", true))) do msg
occursin("invalid LLVM IR", msg) &&
occursin(CUDAnative.POINTER_FUNCTION, msg) &&
occursin(r"\[1\] .+foobar", msg)
2 changes: 2 additions & 0 deletions test/runtests.jl
@@ -74,4 +74,6 @@ else
@warn("CUDAnative.jl has not been configured; skipping on-device tests.")
end

CUDAnative.timings()

end
