From dd0a37f884e83a502b089a1ef924d8c5be22a0ea Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 15:47:14 +0100 Subject: [PATCH 01/19] Allow scalar indexing with unified arrays. --- lib/cudadrv/memory.jl | 3 ++- src/array.jl | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 20897f8f96..75ba4d09c4 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -213,6 +213,7 @@ struct UnifiedBuffer <: AbstractBuffer ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int + dirty::Threads.Atomic{Bool} end UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) @@ -244,7 +245,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(context(), ptr_ref[], bytesize) + return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false), ) end diff --git a/src/array.jl b/src/array.jl index 3708c5ca3a..8b76fec96f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -342,10 +342,39 @@ Base.convert(::Type{T}, x::T) where T <: CuArray = x ## interop with C libraries -Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} = - throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) +function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} + buf = x.data[] + if buf isa Mem.DeviceBuffer + throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) + elseif buf isa Mem.UnifiedBuffer + # TODO: atomics + if buf.dirty[] + synchronize() + buf.dirty[] = false + end + end + convert(Ptr{T}, buf) + x.offset*Base.elsize(x) +end + function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} - convert(CuPtr{T}, x.data[]) + x.offset*Base.elsize(x) + buf = x.data[] + if buf isa Mem.UnifiedBuffer + buf.dirty[] = true + end + convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) +end + + +## indexing + +function Base.getindex(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, I::Int) where T + ptr = Base.unsafe_convert(Ptr{T}, x) + unsafe_load(ptr) +end + +function Base.setindex!(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, v, I::Int) where T + ptr = Base.unsafe_convert(Ptr{T}, x) + unsafe_store!(ptr, v) end From 331573d1c0516956c23c4c4b32c66e4a0032dd3a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 16:35:38 +0100 Subject: [PATCH 02/19] Make the default memory type configurable. --- LocalPreferences.toml | 4 ++++ src/array.jl | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/LocalPreferences.toml b/LocalPreferences.toml index e0e7507033..afa63bf788 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -12,6 +12,10 @@ # making it possible to do use cooperative multitasking. #nonblocking_synchronization = true +# which memory type unspecified allocations should default to. +# possible values: "device", "unified", "host" +#default_memory = "unified" + [CUDA_Driver_jll] # whether to attempt to load a forwards-compatibile userspace driver. 
# only turn this off if you experience issues, e.g., when using a local diff --git a/src/array.jl b/src/array.jl index 8b76fec96f..7e3815f984 100644 --- a/src/array.jl +++ b/src/array.jl @@ -132,10 +132,23 @@ const CuVector{T} = CuArray{T,1} const CuMatrix{T} = CuArray{T,2} const CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} -# default to non-unified memory +# unspecified memory allocation +const default_memory = let str = Preferences.@load_preference("default_memory", "unified") + if str == "device" + Mem.DeviceBuffer + elseif str == "unified" + Mem.UnifiedBuffer + elseif str == "host" + Mem.HostBuffer + else + error("unknown default memory type: $default_memory") + end +end CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} = - CuArray{T,N,Mem.DeviceBuffer}(undef, dims) + CuArray{T,N,default_memory}(undef, dims) +is_device(a::CuArray) = isa(a.data[], Mem.DeviceBuffer) is_unified(a::CuArray) = isa(a.data[], Mem.UnifiedBuffer) +is_host(a::CuArray) = isa(a.data[], Mem.HostBuffer) # buffer, type and dimensionality specified CuArray{T,N,B}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N,B} = @@ -650,7 +663,21 @@ julia> CuArray(1:3) 3 ``` """ -@inline cu(xs; unified::Bool=false) = adapt(CuArrayAdaptor{unified ? Mem.UnifiedBuffer : Mem.DeviceBuffer}(), xs) +@inline function cu(xs; device::Bool=false, unified::Bool=false, host::Bool=false) + if device + unified + host > 1 + throw(ArgumentError("Can only specify one of `device`, `unified`, or `host`")) + end + memory = if device + Mem.DeviceBuffer + elseif unified + Mem.UnifiedBuffer + elseif host + Mem.HostBuffer + else + default_memory + end + adapt(CuArrayAdaptor{memory}(), xs) +end Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) From f3383df1b01939dd186326a80bef02bd10f5fe53 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 17:05:10 +0100 Subject: [PATCH 03/19] Fix null ctor. --- lib/cudadrv/memory.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 75ba4d09c4..5ff22d0257 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -216,7 +216,7 @@ struct UnifiedBuffer <: AbstractBuffer dirty::Threads.Atomic{Bool} end -UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) +UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0, Threads.Atomic{Bool}(false)) Base.pointer(buf::UnifiedBuffer) = buf.ptr Base.sizeof(buf::UnifiedBuffer) = buf.bytesize @@ -245,7 +245,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false), ) + return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false)) end From 085fb0c1bcb4e93284d83fd0d4fb668cb174f80d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 17:12:04 +0100 Subject: [PATCH 04/19] Fix unsafe_wrap. --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 7e3815f984..72fe68a00d 100644 --- a/src/array.jl +++ b/src/array.jl @@ -243,7 +243,7 @@ function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; buf = try typ = memory_type(ptr) if is_managed(ptr) - Mem.UnifiedBuffer(ctx, ptr, sz) + Mem.UnifiedBuffer(ctx, ptr, sz, Threads.Atomic{Bool}(false)) elseif typ == CU_MEMORYTYPE_DEVICE # TODO: can we identify whether this pointer was allocated asynchronously? 
Mem.DeviceBuffer(ctx, ptr, sz, false) From d5e6f1f9fbe2c9d7d76b62b3b40fa05e5ef79b61 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 17:20:38 +0100 Subject: [PATCH 05/19] Fix some tests. --- test/base/array.jl | 53 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/test/base/array.jl b/test/base/array.jl index f7c3c77bef..2b75967299 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -3,33 +3,38 @@ import Adapt using ChainRulesCore: add!!, is_inplaceable_destination @testset "constructors" begin - xs = CuArray{Int}(undef, 2, 3) - @test device(xs) == device() - @test context(xs) == context() - @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] - @test collect(cu[1, 2, 3]) == [1, 2, 3] - @test collect(cu([1, 2, 3])) == [1, 2, 3] - @test testf(vec, rand(5,3)) - @test cu(1:3) === 1:3 - @test Base.elsize(xs) == sizeof(Int) - @test pointer(CuArray{Int, 2}(xs)) != pointer(xs) - - # test aggressive conversion to Float32, but only for floats, and only with `cu` - @test cu([1]) isa CuArray{Int} - @test cu(Float64[1]) isa CuArray{Float32} - @test cu(ComplexF64[1+1im]) isa CuArray{ComplexF32} - @test Adapt.adapt(CuArray, Float64[1]) isa CuArray{Float64} - @test Adapt.adapt(CuArray, ComplexF64[1]) isa CuArray{ComplexF64} - @test Adapt.adapt(CuArray{Float16}, Float64[1]) isa CuArray{Float16} - - @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs) - @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) + let xs = CuArray{Int}(undef, 2, 3) + # basic properties + @test device(xs) == device() + @test context(xs) == context() + @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] + @test collect(cu[1, 2, 3]) == [1, 2, 3] + @test collect(cu([1, 2, 3])) == [1, 2, 3] + @test testf(vec, rand(5,3)) + @test cu(1:3) === 1:3 + @test Base.elsize(xs) == sizeof(Int) + @test pointer(CuArray{Int, 2}(xs)) != pointer(xs) + + # test aggressive conversion to Float32, but only for floats, and only with `cu` + @test cu([1]) isa CuArray{Int} + @test cu(Float64[1]) isa CuArray{Float32} + @test cu(ComplexF64[1+1im]) isa CuArray{ComplexF32} + @test Adapt.adapt(CuArray, Float64[1]) isa CuArray{Float64} + @test Adapt.adapt(CuArray, ComplexF64[1]) isa CuArray{ComplexF64} + @test Adapt.adapt(CuArray{Float16}, Float64[1]) isa CuArray{Float16} + end + + # test pointer conversions + let xs = CuVector{Int,Mem.DeviceBuffer}(undef, 1) + @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs) + @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) + end # unsafe_wrap let - arr = CuArray{Int}(undef, 2) - ptr = pointer(arr) B = Mem.DeviceBuffer + arr = CuVector{Int,B}(undef, 2) + ptr = pointer(arr) ## compare the fields we care about function test_eq(a, T, N, dims) @@ -863,4 +868,4 @@ end c = add!!(a, b) @test c == a′ + b @test c === a -end \ No newline at end of file +end From 0a681104a53ab69da08e7946a4b67bd01ec055fa Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 19:17:05 +0100 Subject: [PATCH 06/19] Fix another default constructor. 
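With this and the preceding patches in place, the buffer type of an allocation can come from the type parameter, from a `cu` keyword, or from the `default_memory` preference. A minimal usage sketch (sizes illustrative; assumes a working GPU session and the `cu` keywords from patch 02):

    using CUDA
    a = CuVector{Float32,CUDA.Mem.UnifiedBuffer}(undef, 16)  # explicit buffer type parameter
    is_unified(a)                              # true
    a[1] = 42f0                                # scalar indexing works for unified buffers
    b = cu(rand(Float64, 16); unified=true)    # converted to Float32, unified memory
    c = cu(rand(Float64, 16))                  # falls back to the default_memory preference
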
--- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 72fe68a00d..e7097d1bc2 100644 --- a/src/array.jl +++ b/src/array.jl @@ -333,7 +333,7 @@ const AnyCuVecOrMat{T} = Union{AnyCuVector{T}, AnyCuMatrix{T}} end @inline CuArray{T,N}(xs::AbstractArray{<:Any,N}) where {T,N} = - CuArray{T,N,Mem.Device}(xs) + CuArray{T,N,default_memory}(xs) @inline CuArray{T,N}(xs::CuArray{<:Any,N,B}) where {T,N,B} = CuArray{T,N,B}(xs) From cce48ccb39553727b031762a9a52f145eeaf82be Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:06:17 +0100 Subject: [PATCH 07/19] Bugfix. --- src/array.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index e7097d1bc2..c5ee22888b 100644 --- a/src/array.jl +++ b/src/array.jl @@ -381,12 +381,12 @@ end ## indexing function Base.getindex(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) unsafe_load(ptr) end function Base.setindex!(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, v, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) unsafe_store!(ptr, v) end From 579309c0ca90ea63274cc58e5fd16455a6451fa7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:14:47 +0100 Subject: [PATCH 08/19] Test fixes. --- test/base/array.jl | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/test/base/array.jl b/test/base/array.jl index 2b75967299..c871938eb0 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -671,7 +671,7 @@ end dev = device() let - a = CuVector{Int}(undef, 1) + a = CuVector{Int,Mem.DeviceBuffer}(undef, 1) @test !is_unified(a) @test !is_managed(pointer(a)) end @@ -694,12 +694,6 @@ end end let - # default ctor: device memory - let a = CUDA.rand(1) - @test !is_unified(a) - @test !is_managed(pointer(a)) - end - for B = [Mem.DeviceBuffer, Mem.UnifiedBuffer] a = CuVector{Float32,B}(rand(Float32, 1)) @test !xor(B == Mem.UnifiedBuffer, is_unified(a)) @@ -740,12 +734,12 @@ end end # cu: supports unified keyword - let a = cu(rand(Float64, 1); unified=true) - @test is_unified(a) + let a = cu(rand(Float64, 1); device=true) + @test !is_unified(a) @test eltype(a) == Float32 end - let a = cu(rand(Float64, 1)) - @test !is_unified(a) + let a = cu(rand(Float64, 1); unified=true) + @test is_unified(a) @test eltype(a) == Float32 end end From a48715f39a0b14632c23be6b1596ac27d740996e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:35:19 +0100 Subject: [PATCH 09/19] Pointer conversion improvements. 
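The idea is that a single `pointer(x, i; type=...)` call can produce either a device or a host pointer for CPU-accessible buffers. A rough sketch of the intended usage, assuming a unified-memory array (values illustrative):

    using CUDA
    x = cu(collect(1.0f0:4.0f0); unified=true)
    dptr = pointer(x, 2)                        # CuPtr{Float32}, for kernels and CUDA APIs
    hptr = pointer(x, 2; type=CUDA.Mem.Host)    # Ptr{Float32}, usable directly from the CPU
    unsafe_load(hptr)                           # 2.0f0; converting to a CPU pointer first
                                                # synchronizes outstanding GPU work on it
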
--- src/array.jl | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/array.jl b/src/array.jl index c5ee22888b..eee24d3885 100644 --- a/src/array.jl +++ b/src/array.jl @@ -313,8 +313,15 @@ const StridedCuMatrix{T} = StridedCuArray{T,2} const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}} Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x) -@inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T - Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i) +@inline function Base.pointer(x::StridedCuArray{T}, i::Integer; type=Mem.Device) where T + PT = if type == Mem.Device + CuPtr{T} + elseif type == Mem.Host + Ptr{T} + else + error("unknown memory type") + end + Base.unsafe_convert(PT, x) + Base._memory_offset(x, i) end # anything that's (secretly) backed by a CuArray @@ -380,15 +387,11 @@ end ## indexing -function Base.getindex(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) - unsafe_load(ptr) -end +Base.getindex(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, I::Int) = + unsafe_load(pointer(x, I; type=Mem.Host)) -function Base.setindex!(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, v, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) - unsafe_store!(ptr, v) -end +Base.setindex!(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, v, I::Int) = + unsafe_store!(pointer(x, I; type=Mem.Host), v) ## interop with device arrays @@ -402,8 +405,16 @@ end ## memory copying typetagdata(a::Array, i=1) = ccall(:jl_array_typetagdata, Ptr{UInt8}, (Any,), a) + i - 1 -typetagdata(a::CuArray, i=1) = - convert(CuPtr{UInt8}, a.data[]) + a.maxsize + a.offset + i - 1 +function typetagdata(a::CuArray, i=1; type=Mem.Device) + PT = if type == Mem.Device + CuPtr{UInt8} + elseif type == Mem.Host + Ptr{UInt8} + else + error("unknown memory type") + end + convert(PT, a.data[]) + a.maxsize + a.offset + i - 1 +end function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, n::Integer) where T @@ -518,11 +529,11 @@ function Base.unsafe_copyto!(dest::DenseCuArray{T,<:Any,<:Union{Mem.UnifiedBuffe synchronize() GC.@preserve src dest begin - cpu_ptr = pointer(src, soffs) - unsafe_copyto!(host_pointer(pointer(dest, doffs)), cpu_ptr, n) + ptr = pointer(src, soffs) + unsafe_copyto!(pointer(dest, doffs; type=Mem.Host), ptr, n) if Base.isbitsunion(T) - cpu_ptr = typetagdata(src, soffs) - unsafe_copyto!(host_pointer(typetagdata(dest, doffs)), cpu_ptr, n) + ptr = typetagdata(src, soffs) + unsafe_copyto!(typetagdata(dest, doffs; type=Mem.Host), ptr, n) end end return dest @@ -534,11 +545,11 @@ function Base.unsafe_copyto!(dest::Array{T}, doffs, synchronize() GC.@preserve src dest begin - cpu_ptr = pointer(dest, doffs) - unsafe_copyto!(cpu_ptr, host_pointer(pointer(src, soffs)), n) + ptr = pointer(dest, doffs) + unsafe_copyto!(ptr, pointer(src, soffs; type=Mem.Host), n) if Base.isbitsunion(T) - cpu_ptr = typetagdata(dest, doffs) - unsafe_copyto!(cpu_ptr, host_pointer(typetagdata(src, soffs)), n) + ptr = typetagdata(dest, doffs) + unsafe_copyto!(ptr, typetagdata(src, soffs; type=Mem.Host), n) end end From 9a498c334131f7046f48a857f1931effb8a31465 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:42:32 +0100 Subject: [PATCH 10/19] Report preferences in the versioninfo output. 
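For reference, the reported `default_memory` preference is a compile-time setting; a sketch of how it can be set (values illustrative, takes effect after restarting Julia):

    # LocalPreferences.toml
    [CUDA]
    default_memory = "unified"

or, equivalently, via Preferences.jl:

    using Preferences, CUDA
    set_preferences!(CUDA, "default_memory" => "unified")
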
--- src/utilities.jl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/utilities.jl b/src/utilities.jl index 9a447fe49f..29972e7741 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -110,6 +110,23 @@ function versioninfo(io::IO=stdout) println(io) end + prefs = [ + "nonblocking_synchronization" => Preferences.load_preference(CUDA, "nonblocking_synchronization"), + "default_memory" => Preferences.load_preference(CUDA, "default_memory"), + "CUDA_Runtime_jll.version" => Preferences.load_preference(CUDA_Runtime_jll, "version"), + "CUDA_Runtime_jll.local" => Preferences.load_preference(CUDA_Runtime_jll, "local"), + "CUDA_Driver_jll.compat" => Preferences.load_preference(CUDA_Driver_jll, "compat"), + ] + if any(x->!isnothing(x[2]), prefs) + println(io, "Preferences:") + for (key, val) in prefs + if !isnothing(val) + println(io, "- $key: $val") + end + end + println(io) + end + devs = devices() if isempty(devs) println(io, "No CUDA-capable devices.") From 427eb3ed6d6a6b83944f5c9d9a63b25500edc31c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:44:56 +0100 Subject: [PATCH 11/19] Keep the default on device memory, but test unified memory support. --- .buildkite/pipeline.yml | 21 ++++++++++++++++++++- LocalPreferences.toml | 2 +- src/array.jl | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index edd315982c..c47cdbea1e 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -241,10 +241,29 @@ steps: env: JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip sanitizer\]/ && !build.pull_request.draft timeout_in_minutes: 10 + - label: "Unified memory" + plugins: + - JuliaCI/julia#v1: + version: 1.9 + - JuliaCI/julia-test#v1: + test_args: "--quickfail" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + commands: | + echo -e "[CUDA]\ndefault_memory = \"unified\"" >LocalPreferences.toml + if: build.message !~ /\[skip tests\]/ && + !build.pull_request.draft + timeout_in_minutes: 120 + # we want to benchmark every commit on the master branch, even if it failed CI - wait: ~ continue_on_failure: true diff --git a/LocalPreferences.toml b/LocalPreferences.toml index afa63bf788..513fc75593 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -14,7 +14,7 @@ # which memory type unspecified allocations should default to. # possible values: "device", "unified", "host" -#default_memory = "unified" +#default_memory = "device" [CUDA_Driver_jll] # whether to attempt to load a forwards-compatibile userspace driver. diff --git a/src/array.jl b/src/array.jl index eee24d3885..76dc320cdb 100644 --- a/src/array.jl +++ b/src/array.jl @@ -133,7 +133,7 @@ const CuMatrix{T} = CuArray{T,2} const CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} # unspecified memory allocation -const default_memory = let str = Preferences.@load_preference("default_memory", "unified") +const default_memory = let str = Preferences.@load_preference("default_memory", "device") if str == "device" Mem.DeviceBuffer elseif str == "unified" From 9ecbdfded5dfab5e662e15f05a9d38525c1c34f0 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 21:42:14 +0100 Subject: [PATCH 12/19] Skip scalar indexing tests when using unified memory. 
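Background for the skip: GPUArrays' scalar-indexing tests assume that indexing a GPU array element-by-element is an error, but with unified memory it is now supported. A small sketch of the difference (assumes a GPU session; the device-buffer case depends on the `allowscalar` setting):

    using CUDA
    CUDA.allowscalar(false)                         # make disallowed scalar indexing throw
    u = CuVector{Int,CUDA.Mem.UnifiedBuffer}([1, 2, 3])
    d = CuVector{Int,CUDA.Mem.DeviceBuffer}([1, 2, 3])
    u[2]                                            # 2: read through a host pointer
    try d[2] catch err; err end                     # errors: device memory has no CPU address
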
--- test/runtests.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index e02a00b0e0..281e609e79 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -58,6 +58,10 @@ const tests = ["core$(path_separator)initialization"] # needs to run first const test_runners = Dict() ## GPUArrays testsuite for name in keys(TestSuite.tests) + if CUDA.default_memory == Mem.Unified && name == "indexing scalar" + # GPUArrays' scalar indexing tests assume that indexing is not supported + continue + end push!(tests, "gpuarrays$(path_separator)$name") test_runners["gpuarrays$(path_separator)$name"] = ()->TestSuite.tests[name](CuArray) end From e945351b7dafe9473ac084148e8a69aa12649085 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 22:16:30 +0100 Subject: [PATCH 13/19] Store the dirty bit in task local storage, for correctness. --- lib/cudadrv/memory.jl | 5 ++--- src/array.jl | 52 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 5ff22d0257..20897f8f96 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -213,10 +213,9 @@ struct UnifiedBuffer <: AbstractBuffer ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int - dirty::Threads.Atomic{Bool} end -UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0, Threads.Atomic{Bool}(false)) +UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) Base.pointer(buf::UnifiedBuffer) = buf.ptr Base.sizeof(buf::UnifiedBuffer) = buf.bytesize @@ -245,7 +244,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false)) + return UnifiedBuffer(context(), ptr_ref[], bytesize) end diff --git a/src/array.jl b/src/array.jl index 76dc320cdb..628f3b8db3 100644 --- a/src/array.jl +++ b/src/array.jl @@ -243,7 +243,7 @@ function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; buf = try typ = memory_type(ptr) if is_managed(ptr) - Mem.UnifiedBuffer(ctx, ptr, sz, Threads.Atomic{Bool}(false)) + Mem.UnifiedBuffer(ctx, ptr, sz) elseif typ == CU_MEMORYTYPE_DEVICE # TODO: can we identify whether this pointer was allocated asynchronously? Mem.DeviceBuffer(ctx, ptr, sz, false) @@ -312,8 +312,7 @@ const StridedCuVector{T} = StridedCuArray{T,1} const StridedCuMatrix{T} = StridedCuArray{T,2} const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}} -Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x) -@inline function Base.pointer(x::StridedCuArray{T}, i::Integer; type=Mem.Device) where T +@inline function Base.pointer(x::StridedCuArray{T}, i::Integer=1; type=Mem.Device) where T PT = if type == Mem.Device CuPtr{T} elseif type == Mem.Host @@ -360,30 +359,51 @@ CuArray{T,N}(xs::CuArray{T,N,B}) where {T,N,B} = copy(xs) Base.convert(::Type{T}, x::T) where T <: CuArray = x -## interop with C libraries +## interop with libraries -function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} - buf = x.data[] - if buf isa Mem.DeviceBuffer - throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) - elseif buf isa Mem.UnifiedBuffer - # TODO: atomics - if buf.dirty[] - synchronize() - buf.dirty[] = false - end +# when a unified buffer is converted to a device pointer, we assume it will be accessed +# asynchronously. 
we keep track of that in the task local storage, and use that information +# to perform additional synchronization when converting the buffer to a host pointer. +# TODO: optimize this! it currently halves the performance of scalar indexing. +function mark_async(buf::Mem.UnifiedBuffer) + tls = task_local_storage() + if haskey(tls, :CUDA_ASYNC_BUFFERS) + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} + in(buf, async_buffers) && return + pushfirst!(async_buffers, buf) + else + tls[:CUDA_ASYNC_BUFFERS] = [buf] end - convert(Ptr{T}, buf) + x.offset*Base.elsize(x) + return +end +function ensure_sync(buf::Mem.UnifiedBuffer) + tls = task_local_storage() + haskey(tls, :CUDA_ASYNC_BUFFERS) || return + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} + in(buf, async_buffers) || return + synchronize() + filter!(!isequal(buf), async_buffers) + return end function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} buf = x.data[] if buf isa Mem.UnifiedBuffer - buf.dirty[] = true + mark_async(buf) end convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) end +function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} + buf = x.data[] + if buf isa Mem.DeviceBuffer + throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) + elseif buf isa Mem.UnifiedBuffer + ensure_sync(buf) + end + convert(Ptr{T}, buf) + x.offset*Base.elsize(x) +end + ## indexing From 4873f560c9a8a1ea2f4a8e1d208a559e7c15b944 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 11:03:44 +0100 Subject: [PATCH 14/19] Prefetch unified memory before kernel launches. --- lib/cusparse/array.jl | 14 +++++++------- src/array.jl | 18 +++++++++--------- src/compiler/execution.jl | 39 ++++++++++++++++++++++++--------------- src/texture.jl | 2 +- test/core/execution.jl | 2 +- 5 files changed, 42 insertions(+), 33 deletions(-) diff --git a/lib/cusparse/array.jl b/lib/cusparse/array.jl index 0a5f2c0bbf..64f79eed82 100644 --- a/lib/cusparse/array.jl +++ b/lib/cusparse/array.jl @@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs) Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs) -Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) = +Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) = adapt(CuArray, xs) -Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) = +Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) = adapt(CuArray{Float32}, xs) Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs) @@ -546,7 +546,7 @@ end # interop with device arrays -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector) return CuSparseDeviceVector( adapt(to, x.iPtr), adapt(to, x.nzVal), @@ -554,7 +554,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR) return CuSparseDeviceMatrixCSR( adapt(to, x.rowPtr), adapt(to, x.colVal), @@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC) +function 
Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC) return CuSparseDeviceMatrixCSC( adapt(to, x.colPtr), adapt(to, x.rowVal), @@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR) return CuSparseDeviceMatrixBSR( adapt(to, x.rowPtr), adapt(to, x.colVal), @@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO) return CuSparseDeviceMatrixCOO( adapt(to, x.rowInd), adapt(to, x.colInd), diff --git a/src/array.jl b/src/array.jl index 628f3b8db3..b229ba5bbd 100644 --- a/src/array.jl +++ b/src/array.jl @@ -388,7 +388,7 @@ end function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} buf = x.data[] - if buf isa Mem.UnifiedBuffer + if is_unified(x) mark_async(buf) end convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) @@ -396,9 +396,9 @@ end function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} buf = x.data[] - if buf isa Mem.DeviceBuffer + if is_device(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) - elseif buf isa Mem.UnifiedBuffer + elseif is_unified(x) ensure_sync(buf) end convert(Ptr{T}, buf) + x.offset*Base.elsize(x) @@ -637,19 +637,19 @@ Adapt.adapt_storage(::Type{<:CuArray{T, N, B}}, xs::AT) where {T, N, B, AT<:Abst # eagerly converts Float64 to Float32, for performance reasons -struct CuArrayAdaptor{B} end +struct CuArrayKernelAdaptor{B} end -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} = isbits(xs) ? xs : CuArray{T,N,B}(xs) -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} = isbits(xs) ? xs : CuArray{Float32,N,B}(xs) -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} = isbits(xs) ? xs : CuArray{ComplexF32,N,B}(xs) # not for Float16 -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} = isbits(xs) ? xs : CuArray{T,N,B}(xs) """ @@ -707,7 +707,7 @@ julia> CuArray(1:3) else default_memory end - adapt(CuArrayAdaptor{memory}(), xs) + adapt(CuArrayKernelAdaptor{memory}(), xs) end Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 9276876026..177da0efa2 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -121,35 +121,44 @@ end ## host to device value conversion -struct Adaptor end +struct KernelAdaptor end # convert CUDA host pointers to device pointers # TODO: use ordinary ptr? 
-Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = reinterpret(LLVMPtr{T,AS.Generic}, p) +Adapt.adapt_storage(to::KernelAdaptor, p::CuPtr{T}) where {T} = + reinterpret(LLVMPtr{T,AS.Generic}, p) + +# convert CUDA host arrays to device arrays +function Adapt.adapt_storage(::KernelAdaptor, xs::DenseCuArray{T,N}) where {T,N} + # prefetch unified memory as we're likely to use it on the GPU + # TODO: make this configurable? + if is_unified(xs) && sizeof(xs) > 0 && !is_capturing() + buf = xs.data[] + subbuf = Mem.UnifiedBuffer(buf.ctx, pointer(xs), sizeof(xs)) + Mem.prefetch(subbuf) + end + + Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs) +end # Base.RefValue isn't GPU compatible, so provide a compatible alternative struct CuRefValue{T} <: Ref{T} x::T end Base.getindex(r::CuRefValue{T}) where T = r.x -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) # broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box. # avoid that by using a special kind of ref that knows about the boxed type. struct CuRefType{T} <: Ref{DataType} end Base.getindex(r::CuRefType{T}) where T = T -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue{<:Union{DataType,Type}}) = CuRefType{r[]}() +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) = + CuRefType{r[]}() # case where type is the function being broadcasted -Adapt.adapt_structure(to::Adaptor, bc::Base.Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} = - Base.Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes) - -Adapt.adapt_storage(::Adaptor, xs::CuArray{T,N}) where {T,N} = - Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs) - -# we materialize ReshapedArray/ReinterpretArray/SubArray/... directly as a device array -Adapt.adapt_structure(::Adaptor, xs::DenseCuArray{T,N}) where {T,N} = - Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs) +Adapt.adapt_structure(to::KernelAdaptor, + bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} = + Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes) """ cudaconvert(x) @@ -159,9 +168,9 @@ converted to a GPU-friendly format. By default, the function does nothing and re input object `x` as-is. Do not add methods to this function, but instead extend the underlying Adapt.jl package and -register methods for the the `CUDA.Adaptor` type. +register methods for the the `CUDA.KernelAdaptor` type. 
""" -cudaconvert(arg) = adapt(Adaptor(), arg) +cudaconvert(arg) = adapt(KernelAdaptor(), arg) ## abstract kernel functionality diff --git a/src/texture.jl b/src/texture.jl index 24f9da6fc9..4f1a9444eb 100644 --- a/src/texture.jl +++ b/src/texture.jl @@ -319,6 +319,6 @@ memory_source(::Any) = error("Unknown texture source $(typeof(t))") memory_source(::CuArray) = LinearMemory() memory_source(::CuTextureArray) = ArrayMemory() -Adapt.adapt_storage(::Adaptor, t::CuTexture{T,N}) where {T,N} = +Adapt.adapt_storage(::KernelAdaptor, t::CuTexture{T,N}) where {T,N} = CuDeviceTexture{T,N,typeof(memory_source(parent(t))), t.normalized_coordinates, typeof(t.interpolation)}(size(t), t.handle) diff --git a/test/core/execution.jl b/test/core/execution.jl index cacdea8e6e..e6e13e91a1 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -470,7 +470,7 @@ end @eval struct Host end @eval struct Device end - Adapt.adapt_storage(::CUDA.Adaptor, a::Host) = Device() + Adapt.adapt_storage(::CUDA.KernelAdaptor, a::Host) = Device() Base.convert(::Type{Int}, ::Host) = 1 Base.convert(::Type{Int}, ::Device) = 2 From 6f9c4addbeacbb790dd6fdc873969e8eb7ed20b4 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 15:17:34 +0100 Subject: [PATCH 15/19] Rework unsafe_wrap. --- src/array.jl | 117 ++++++++++++++++++++++++++++++++++++--------- test/base/array.jl | 104 ++++++++++++++++++++++++---------------- 2 files changed, 157 insertions(+), 64 deletions(-) diff --git a/src/array.jl b/src/array.jl index b229ba5bbd..4363bf41b1 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,4 +1,4 @@ -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, is_unified +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, is_device, is_unified, is_host ## array type @@ -207,36 +207,47 @@ function Base.deepcopy_internal(x::CuArray, dict::IdDict) end +## unsafe_wrap + """ unsafe_wrap(CuArray, ptr::CuPtr{T}, dims; own=false, ctx=context()) -Wrap a `CuArray` object around the data at the address given by `ptr`. The pointer -element type `T` determines the array element type. `dims` is either an integer (for a 1d -array) or a tuple of the array dimensions. `own` optionally specified whether Julia should -take ownership of the memory, calling `cudaFree` when the array is no longer referenced. The -`ctx` argument determines the CUDA context where the data is allocated in. + # requires + unsafe_wrap(Array, a::CuArray) + + # requires HMM + unsafe_wrap(CuArray, ptr::ptr{T}, dims) + unsafe_wrap(CuArray, a::Array) + +Wrap a `CuArray` object around the data at the address given by the CUDA-managed pointer +`ptr`. The element type `T` determines the array element type. `dims` is either an integer +(for a 1d array) or a tuple of the array dimensions. `own` optionally specified whether +Julia should take ownership of the memory, calling `cudaFree` when the array is no longer +referenced. The `ctx` argument determines the CUDA context where the data is allocated in. """ +unsafe_wrap + +# managed pointer to CuArray function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, ptr::CuPtr{T}, dims::NTuple{N,Int}; own::Bool=false, ctx::CuContext=context()) where {T,N} - buf = _unsafe_wrap(T, ptr, dims; own, ctx) + buf = _unsafe_wrap_managed(T, ptr, dims; own, ctx) data = DataRef(own ? _free_buffer : (args...) 
-> (#= do nothing =#), buf) - CuArray{T, length(dims)}(data, dims) + CuArray{T,N}(data, dims) end function Base.unsafe_wrap(::Type{CuArray{T,N,B}}, ptr::CuPtr{T}, dims::NTuple{N,Int}; own::Bool=false, ctx::CuContext=context()) where {T,N,B} - buf = _unsafe_wrap(T, ptr, dims; own, ctx) + buf = _unsafe_wrap_managed(T, ptr, dims; own, ctx) if typeof(buf) !== B - error("Declared buffer type does not match inferred buffer type.") + throw(ArgumentError("Declared buffer type does not match inferred buffer type.")) end data = DataRef(own ? _free_buffer : (args...) -> (#= do nothing =#), buf) - CuArray{T, length(dims)}(data, dims) + CuArray{T,N}(data, dims) end - -function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; +function _unsafe_wrap_managed(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; own::Bool=false, ctx::CuContext=context()) where {T,N} - isbitstype(T) || error("Can only unsafe_wrap a pointer to a bits type") + isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type")) sz = prod(dims) * sizeof(T) # identify the buffer @@ -253,24 +264,84 @@ function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; error("Unknown memory type; please file an issue.") end catch err - error("Could not identify the buffer type; are you passing a valid CUDA pointer to unsafe_wrap?") + throw(ArgumentError("Could not identify the buffer type; are you passing a valid CUDA pointer to unsafe_wrap?")) end return buf end - -function Base.unsafe_wrap(Atype::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, +# integer size input +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, p::CuPtr{T}, dim::Int; own::Bool=false, ctx::CuContext=context()) where {T} - unsafe_wrap(Atype, p, (dim,); own, ctx) + unsafe_wrap(CuArray{T,1}, p, (dim,); own, ctx) end -function Base.unsafe_wrap(Atype::Type{CuArray{T,1,B}}, - p::CuPtr{T}, dim::Int; +function Base.unsafe_wrap(::Type{CuArray{T,1,B}}, p::CuPtr{T}, dim::Int; own::Bool=false, ctx::CuContext=context()) where {T,B} - unsafe_wrap(Atype, p, (dim,); own, ctx) + unsafe_wrap(CuArray{T,1,B}, p, (dim,); own, ctx) end -Base.unsafe_wrap(T::Type{<:CuArray}, ::Ptr, dims::NTuple{N,Int}; kwargs...) 
where {N} = - throw(ArgumentError("cannot wrap a CPU pointer with a $T")) +# managed pointer to Array +function Base.unsafe_wrap(::Union{Type{Array},Type{Array{T}},Type{Array{T,N}}}, + p::CuPtr{T}, dims::NTuple{N,Int}; + own::Bool=false) where {T,N} + if !is_managed(p) && memory_type(p) != CU_MEMORYTYPE_HOST + throw(ArgumentError("Can only create a CPU array object from a unified or host CUDA array")) + end + unsafe_wrap(Array{T,N}, reinterpret(Ptr{T}, p), dims; own) +end +# integer size input +function Base.unsafe_wrap(::Union{Type{Array},Type{Array{T}},Type{Array{T,1}}}, + p::CuPtr{T}, dim::Int; own::Bool=false) where {T} + unsafe_wrap(Array{T,1}, p, (dim,); own) +end +# array input +function Base.unsafe_wrap(::Union{Type{Array},Type{Array{T}},Type{Array{T,N}}}, + a::CuArray{T,N}) where {T,N} + p = pointer(a; type=Mem.Host) + unsafe_wrap(Array, p, size(a)) +end + +# unmanaged pointer to CuArray +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, + p::Ptr{T}, dims::NTuple{N,Int}; ctx::CuContext=context()) where {T,N} + isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type")) + sz = prod(dims) * sizeof(T) + + if driver_version() < v"12.2" + error("Accessing host memory requires HMM support, which is only available in CUDA 12.2+ using the open-source driver.") + end + if attribute(device(), DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) != 1 + error("Accessing host memory requires HMM support, which is not provided by your $(name(device())).") + end + + buf = Mem.UnifiedBuffer(ctx, reinterpret(CuPtr{Nothing}, p), sz) + data = DataRef((args...) -> (#= do nothing =#), buf) + CuArray{T,N}(data, dims) +end +function Base.unsafe_wrap(::Type{CuArray{T,N,B}}, p::Ptr{T}, dims::NTuple{N,Int}; + ctx::CuContext=context()) where {T,N,B} + if B !== Mem.UnifiedBuffer + throw(ArgumentError("Can only wrap an unmanaged pointer to a CuArray with a UnifiedBuffer")) + end + unsafe_wrap(CuArray{T,N}, p, dims; ctx) +end +# integer size input +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, + p::Ptr{T}, dim::Int) where {T} + unsafe_wrap(CuArray{T,1}, p, (dim,)) +end +function Base.unsafe_wrap(::Type{CuArray{T,1,B}}, p::Ptr{T}, dim::Int) where {T,B} + unsafe_wrap(CuArray{T,1,B}, p, (dim,)) +end +# array input +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, + a::Array{T,N}) where {T,N} + p = pointer(a) + unsafe_wrap(CuArray{T,N}, p, size(a)) +end +function Base.unsafe_wrap(::Type{CuArray{T,1,B}}, a::Array{T,1}) where {T,B} + p = pointer(a) + unsafe_wrap(CuArray{T,1,B}, p, size(a)) +end ## array interface diff --git a/test/base/array.jl b/test/base/array.jl index c871938eb0..f244cfa8e0 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -30,47 +30,6 @@ using ChainRulesCore: add!!, is_inplaceable_destination @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) end - # unsafe_wrap - let - B = Mem.DeviceBuffer - arr = CuVector{Int,B}(undef, 2) - ptr = pointer(arr) - - ## compare the fields we care about - function test_eq(a, T, N, dims) - @test eltype(a) == T - @test ndims(a) == N - @test a.data[].ptr == ptr - @test a.data[].ctx == context() - @test a.maxsize == arr.maxsize - @test a.offset == arr.offset - @test a.dims == dims - end - - test_eq(unsafe_wrap(CuArray, ptr, 2), Int, 1, (2,)) - test_eq(unsafe_wrap(CuArray{Int}, ptr, 2), Int, 1, (2,)) - test_eq(unsafe_wrap(CuArray{Int,1}, ptr, 2), Int, 1, (2,)) - test_eq(unsafe_wrap(CuArray{Int,1,B}, ptr, 2), Int, 1, 
(2,)) - test_eq(unsafe_wrap(CuArray, ptr, (1,2)), Int, 2, (1,2)) - test_eq(unsafe_wrap(CuArray{Int}, ptr, (1,2)), Int, 2, (1,2)) - test_eq(unsafe_wrap(CuArray{Int,2}, ptr, (1,2)), Int, 2, (1,2)) - test_eq(unsafe_wrap(CuArray{Int,2,B}, ptr, (1,2)), Int, 2, (1,2)) - - @test_throws ErrorException unsafe_wrap(CuArray{Int,1,Mem.HostBuffer}, ptr, 2) - @test_throws ErrorException unsafe_wrap(CuArray{Int,2,Mem.HostBuffer}, ptr, (1,2)) - end - let buf = Mem.alloc(Mem.Host, sizeof(Int), Mem.HOSTALLOC_DEVICEMAP) - gpu_ptr = convert(CuPtr{Int}, buf) - gpu_arr = unsafe_wrap(CuArray, gpu_ptr, 1) - gpu_arr .= 42 - - synchronize() - - cpu_ptr = convert(Ptr{Int}, buf) - cpu_arr = unsafe_wrap(Array, cpu_ptr, 1) - @test cpu_arr == [42] - end - @test collect(CUDA.zeros(2, 2)) == zeros(Float32, 2, 2) @test collect(CUDA.ones(2, 2)) == ones(Float32, 2, 2) @@ -92,6 +51,69 @@ using ChainRulesCore: add!!, is_inplaceable_destination end end +@testset "unsafe_wrap" begin + hmm = CUDA.driver_version() >= v"12.2" && + attribute(device(), CUDA.DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) == 1 + + # managed memory -> CuArray + for a in [cu([1]; device=true), cu([1]; unified=true)] + p = pointer(a) + for AT in [CuArray, CuArray{Int}, CuArray{Int,1}, typeof(a)], + b in [unsafe_wrap(AT, p, 1), unsafe_wrap(AT, p, (1,))] + @test typeof(b) == typeof(a) + @test pointer(b) == p + @test size(b) == (1,) + end + end + + # managed memory -> Array + let a = cu([1]; unified=true) + p = pointer(a) + for AT in [Array, Array{Int}, Array{Int,1}], + b in [unsafe_wrap(AT, p, 1), unsafe_wrap(AT, p, (1,)), unsafe_wrap(AT, a)] + @test typeof(b) == Array{Int,1} + @test pointer(b) == reinterpret(Ptr{Int}, p) + @test size(b) == (1,) + end + end + + # unmanaged memory -> CuArray + if hmm + a = [1] + p = pointer(a) + for AT in [CuArray, CuArray{Int}, CuArray{Int,1}, CuArray{Int,1,Mem.UnifiedBuffer}], + b in [unsafe_wrap(AT, p, 1), unsafe_wrap(AT, p, (1,)), unsafe_wrap(AT, a)] + @test typeof(b) == CuArray{Int,1,Mem.UnifiedBuffer} + @test pointer(b) == reinterpret(CuPtr{Int}, p) + @test size(b) == (1,) + end + end + + # errors + let a = cu([1]; device=true) + @test_throws ArgumentError unsafe_wrap(Array, a) + @test_throws ArgumentError unsafe_wrap(CuArray{Int,1,Mem.UnifiedBuffer}, pointer(a), 1) + end + if hmm + let a = [1] + @test_throws ArgumentError unsafe_wrap(CuArray{Int,1,Mem.DeviceBuffer}, a) + end + end + + # some actual operations + let buf = Mem.alloc(Mem.Host, sizeof(Int), Mem.HOSTALLOC_DEVICEMAP) + gpu_ptr = convert(CuPtr{Int}, buf) + gpu_arr = unsafe_wrap(CuArray, gpu_ptr, 1) + gpu_arr .= 42 + + synchronize() + + cpu_ptr = convert(Ptr{Int}, buf) + cpu_arr = unsafe_wrap(Array, cpu_ptr, 1) + @test cpu_arr == [42] + end +end + @testset "adapt" begin A = rand(Float32, 3, 3) dA = CuArray(A) From fc638056cd67de1d62f00763f1220052e1ffb20b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 16:06:34 +0100 Subject: [PATCH 16/19] Better support for host memory. --- src/array.jl | 36 +++++++++++++++++++----------------- src/pool.jl | 6 ++++++ test/runtests.jl | 2 +- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/array.jl b/src/array.jl index 4363bf41b1..5f6cd712b7 100644 --- a/src/array.jl +++ b/src/array.jl @@ -432,34 +432,36 @@ Base.convert(::Type{T}, x::T) where T <: CuArray = x ## interop with libraries -# when a unified buffer is converted to a device pointer, we assume it will be accessed -# asynchronously. 
we keep track of that in the task local storage, and use that information -# to perform additional synchronization when converting the buffer to a host pointer. -# TODO: optimize this! it currently halves the performance of scalar indexing. -function mark_async(buf::Mem.UnifiedBuffer) +# when CPU-accessible buffers are converted to a device pointer, we assume it will be +# accessed asynchronously. we keep track of that in the task local storage, and use that +# information to perform additional synchronization when converting the buffer to a host +# pointer. TODO: optimize this! it currently halves the performance of scalar indexing. +function mark_async(buf::Union{Mem.HostBuffer,Mem.UnifiedBuffer}) + ptr = convert(Ptr{Nothing}, buf) tls = task_local_storage() if haskey(tls, :CUDA_ASYNC_BUFFERS) - async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} - in(buf, async_buffers) && return - pushfirst!(async_buffers, buf) + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Ptr{Nothing}} + in(ptr, async_buffers) && return + pushfirst!(async_buffers, ptr) else - tls[:CUDA_ASYNC_BUFFERS] = [buf] + tls[:CUDA_ASYNC_BUFFERS] = [ptr] end return end -function ensure_sync(buf::Mem.UnifiedBuffer) +function ensure_sync(buf::Union{Mem.HostBuffer,Mem.UnifiedBuffer}) tls = task_local_storage() haskey(tls, :CUDA_ASYNC_BUFFERS) || return - async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} - in(buf, async_buffers) || return + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Ptr{Nothing}} + ptr = convert(Ptr{Nothing}, buf) + in(ptr, async_buffers) || return synchronize() - filter!(!isequal(buf), async_buffers) + filter!(!isequal(ptr), async_buffers) return end function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} buf = x.data[] - if is_unified(x) + if is_unified(x) || is_host(x) mark_async(buf) end convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) @@ -469,7 +471,7 @@ function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} buf = x.data[] if is_device(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) - elseif is_unified(x) + elseif is_unified(x) || is_host(x) ensure_sync(buf) end convert(Ptr{T}, buf) + x.offset*Base.elsize(x) @@ -478,10 +480,10 @@ end ## indexing -Base.getindex(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, I::Int) = +Base.getindex(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, I::Int) = unsafe_load(pointer(x, I; type=Mem.Host)) -Base.setindex!(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, v, I::Int) = +Base.setindex!(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, v, I::Int) = unsafe_store!(pointer(x, I; type=Mem.Host), v) diff --git a/src/pool.jl b/src/pool.jl index 11b9aee724..9d476d591f 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -451,6 +451,12 @@ end end buf, time end +@inline function _alloc(::Type{Mem.HostBuffer}, sz; stream::Union{Nothing,CuStream}) + time = Base.@elapsed begin + buf = Mem.alloc(Mem.Host, sz) + end + buf, time +end """ free(buf) diff --git a/test/runtests.jl b/test/runtests.jl index 281e609e79..b744bea2cc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -58,7 +58,7 @@ const tests = ["core$(path_separator)initialization"] # needs to run first const test_runners = Dict() ## GPUArrays testsuite for name in keys(TestSuite.tests) - if CUDA.default_memory == Mem.Unified && name == "indexing scalar" + if CUDA.default_memory != Mem.Device && name == "indexing scalar" # GPUArrays' scalar indexing tests assume that indexing is not supported continue end From 
5460bfb28f8eb78ced7e2cb39250857fb2044698 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 16:06:59 +0100 Subject: [PATCH 17/19] Run CI with host memory too. --- .buildkite/pipeline.yml | 76 ++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index c47cdbea1e..6a80f416ce 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,8 +19,7 @@ steps: cuda: "*" commands: | echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip julia\]/ + if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 120 matrix: setup: @@ -44,7 +43,7 @@ steps: - JuliaCI/julia#v1: version: 1.9 - JuliaCI/julia-test#v1: - test_args: "core base libraries" + test_args: "--quickfail core base libraries" - JuliaCI/julia-coverage#v1: dirs: - src @@ -53,9 +52,7 @@ steps: agents: queue: "juliagpu" cuda: "*" - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip cuda\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 120 matrix: setup: @@ -73,6 +70,34 @@ steps: echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml + - group: "Memory" + key: "memory" + depends_on: "julia" + steps: + - label: "CuArray with {{matrix.memory}} memory" + plugins: + - JuliaCI/julia#v1: + version: 1.9 + - JuliaCI/julia-test#v1: + test_args: "--quickfail core base" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 + matrix: + setup: + memory: + - "unified" + - "host" + commands: | + echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml + - group: ":nesting_dolls: Subpackages" depends_on: "cuda" steps: @@ -104,9 +129,7 @@ steps: agents: queue: "juliagpu" cuda: "*" - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip subpackages\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 120 commands: | julia --project -e ' @@ -165,9 +188,7 @@ steps: agents: queue: "juliagpu" cuda: "*" - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip downstream\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 60 soft_fail: - exit_status: 3 @@ -240,30 +261,9 @@ steps: cuda: "*" env: JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda - if: build.message !~ /\[skip tests\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 10 - - label: "Unified memory" - plugins: - - JuliaCI/julia#v1: - version: 1.9 - - JuliaCI/julia-test#v1: - test_args: "--quickfail" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - echo -e "[CUDA]\ndefault_memory = \"unified\"" >LocalPreferences.toml - if: build.message !~ /\[skip tests\]/ && - !build.pull_request.draft - timeout_in_minutes: 120 - # we want to benchmark every commit on the master branch, even if it failed CI - wait: ~ continue_on_failure: true @@ -293,9 +293,8 @@ steps: agents: queue: "juliagpu" cuda: "*" 
- if: build.message !~ /\[skip benchmarks\]/ && - build.branch !~ /^master$$/ && - !build.pull_request.draft + if: build.message !~ /\[skip benchmarks\]/ && !build.pull_request.draft && + build.branch !~ /^master$$/ timeout_in_minutes: 30 # if we will submit results, use the benchmark queue so that we will @@ -329,8 +328,7 @@ steps: queue: "benchmark" gpu: "rtx2070" cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch =~ /^master$$/ + if: build.message !~ /\[skip benchmarks\]/ && build.branch =~ /^master$$/ matrix: setup: julia: From 7bacf91641bd122d5628fe63403319a62180ed14 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 16:58:51 +0100 Subject: [PATCH 18/19] Add bounds checking to optimized scalar indexing functions. --- src/array.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index 5f6cd712b7..dffd661e5f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -480,11 +480,15 @@ end ## indexing -Base.getindex(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, I::Int) = +function Base.getindex(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, I::Int) + @boundscheck checkbounds(x, I) unsafe_load(pointer(x, I; type=Mem.Host)) +end -Base.setindex!(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, v, I::Int) = +function Base.setindex!(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, v, I::Int) + @boundscheck checkbounds(x, I) unsafe_store!(pointer(x, I; type=Mem.Host), v) +end ## interop with device arrays From f377f812868d3b527f80af6b998d89c5bebeed0c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 17:04:06 +0100 Subject: [PATCH 19/19] Try testing libraries with host memory. --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6a80f416ce..a23e01d9d0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -79,7 +79,7 @@ steps: - JuliaCI/julia#v1: version: 1.9 - JuliaCI/julia-test#v1: - test_args: "--quickfail core base" + test_args: "--quickfail core base libraries" - JuliaCI/julia-coverage#v1: dirs: - src
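
Taken together, the `unsafe_wrap` changes from patch 15 allow viewing memory from both sides without copying. A minimal end-to-end sketch (the Array-to-CuArray direction assumes HMM support, i.e. CUDA 12.2+ with the open-source kernel driver):

    using CUDA

    # view a unified CuArray as a plain Array
    g = cu(zeros(Float32, 4); unified=true)
    c = unsafe_wrap(Array, g)
    c[1] = 1f0
    Array(g)[1]          # 1.0f0

    # view a plain Array as a CuArray (requires HMM)
    h = zeros(Float32, 4)
    if CUDA.driver_version() >= v"12.2" &&
       attribute(device(), CUDA.DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) == 1
        gh = unsafe_wrap(CuArray, h)
        gh .= 2f0
        synchronize()
        h[1]             # 2.0f0
    end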