From dd0a37f884e83a502b089a1ef924d8c5be22a0ea Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 15:47:14 +0100 Subject: [PATCH 01/19] Allow scalar indexing with unified arrays. --- lib/cudadrv/memory.jl | 3 ++- src/array.jl | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 20897f8f96..75ba4d09c4 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -213,6 +213,7 @@ struct UnifiedBuffer <: AbstractBuffer ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int + dirty::Threads.Atomic{Bool} end UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) @@ -244,7 +245,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(context(), ptr_ref[], bytesize) + return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false), ) end diff --git a/src/array.jl b/src/array.jl index 3708c5ca3a..8b76fec96f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -342,10 +342,39 @@ Base.convert(::Type{T}, x::T) where T <: CuArray = x ## interop with C libraries -Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} = - throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) +function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} + buf = x.data[] + if buf isa Mem.DeviceBuffer + throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) + elseif buf isa Mem.UnifiedBuffer + # TODO: atomics + if buf.dirty[] + synchronize() + buf.dirty[] = false + end + end + convert(Ptr{T}, buf) + x.offset*Base.elsize(x) +end + function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} - convert(CuPtr{T}, x.data[]) + x.offset*Base.elsize(x) + buf = x.data[] + if buf isa Mem.UnifiedBuffer + buf.dirty[] = true + end + convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) +end + + +## indexing + +function Base.getindex(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, I::Int) where T + ptr = Base.unsafe_convert(Ptr{T}, x) + unsafe_load(ptr) +end + +function Base.setindex!(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, v, I::Int) where T + ptr = Base.unsafe_convert(Ptr{T}, x) + unsafe_store!(ptr, v) end From 331573d1c0516956c23c4c4b32c66e4a0032dd3a Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 16:35:38 +0100 Subject: [PATCH 02/19] Make the default memory type configurable. --- LocalPreferences.toml | 4 ++++ src/array.jl | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/LocalPreferences.toml b/LocalPreferences.toml index e0e7507033..afa63bf788 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -12,6 +12,10 @@ # making it possible to do use cooperative multitasking. #nonblocking_synchronization = true +# which memory type unspecified allocations should default to. +# possible values: "device", "unified", "host" +#default_memory = "unified" + [CUDA_Driver_jll] # whether to attempt to load a forwards-compatibile userspace driver. 
# only turn this off if you experience issues, e.g., when using a local diff --git a/src/array.jl b/src/array.jl index 8b76fec96f..7e3815f984 100644 --- a/src/array.jl +++ b/src/array.jl @@ -132,10 +132,23 @@ const CuVector{T} = CuArray{T,1} const CuMatrix{T} = CuArray{T,2} const CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} -# default to non-unified memory +# unspecified memory allocation +const default_memory = let str = Preferences.@load_preference("default_memory", "unified") + if str == "device" + Mem.DeviceBuffer + elseif str == "unified" + Mem.UnifiedBuffer + elseif str == "host" + Mem.HostBuffer + else + error("unknown default memory type: $default_memory") + end +end CuArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} = - CuArray{T,N,Mem.DeviceBuffer}(undef, dims) + CuArray{T,N,default_memory}(undef, dims) +is_device(a::CuArray) = isa(a.data[], Mem.DeviceBuffer) is_unified(a::CuArray) = isa(a.data[], Mem.UnifiedBuffer) +is_host(a::CuArray) = isa(a.data[], Mem.HostBuffer) # buffer, type and dimensionality specified CuArray{T,N,B}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N,B} = @@ -650,7 +663,21 @@ julia> CuArray(1:3) 3 ``` """ -@inline cu(xs; unified::Bool=false) = adapt(CuArrayAdaptor{unified ? Mem.UnifiedBuffer : Mem.DeviceBuffer}(), xs) +@inline function cu(xs; device::Bool=false, unified::Bool=false, host::Bool=false) + if device + unified + host > 1 + throw(ArgumentError("Can only specify one of `device`, `unified`, or `host`")) + end + memory = if device + Mem.DeviceBuffer + elseif unified + Mem.UnifiedBuffer + elseif host + Mem.HostBuffer + else + default_memory + end + adapt(CuArrayAdaptor{memory}(), xs) +end Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) From f3383df1b01939dd186326a80bef02bd10f5fe53 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 17:05:10 +0100 Subject: [PATCH 03/19] Fix null ctor. --- lib/cudadrv/memory.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 75ba4d09c4..5ff22d0257 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -216,7 +216,7 @@ struct UnifiedBuffer <: AbstractBuffer dirty::Threads.Atomic{Bool} end -UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) +UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0, Threads.Atomic{Bool}(false)) Base.pointer(buf::UnifiedBuffer) = buf.ptr Base.sizeof(buf::UnifiedBuffer) = buf.bytesize @@ -245,7 +245,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false), ) + return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false)) end From 085fb0c1bcb4e93284d83fd0d4fb668cb174f80d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 17:12:04 +0100 Subject: [PATCH 04/19] Fix unsafe_wrap. --- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 7e3815f984..72fe68a00d 100644 --- a/src/array.jl +++ b/src/array.jl @@ -243,7 +243,7 @@ function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; buf = try typ = memory_type(ptr) if is_managed(ptr) - Mem.UnifiedBuffer(ctx, ptr, sz) + Mem.UnifiedBuffer(ctx, ptr, sz, Threads.Atomic{Bool}(false)) elseif typ == CU_MEMORYTYPE_DEVICE # TODO: can we identify whether this pointer was allocated asynchronously? 
Mem.DeviceBuffer(ctx, ptr, sz, false) From d5e6f1f9fbe2c9d7d76b62b3b40fa05e5ef79b61 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 17:20:38 +0100 Subject: [PATCH 05/19] Fix some tests. --- test/base/array.jl | 53 +++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/test/base/array.jl b/test/base/array.jl index f7c3c77bef..2b75967299 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -3,33 +3,38 @@ import Adapt using ChainRulesCore: add!!, is_inplaceable_destination @testset "constructors" begin - xs = CuArray{Int}(undef, 2, 3) - @test device(xs) == device() - @test context(xs) == context() - @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] - @test collect(cu[1, 2, 3]) == [1, 2, 3] - @test collect(cu([1, 2, 3])) == [1, 2, 3] - @test testf(vec, rand(5,3)) - @test cu(1:3) === 1:3 - @test Base.elsize(xs) == sizeof(Int) - @test pointer(CuArray{Int, 2}(xs)) != pointer(xs) - - # test aggressive conversion to Float32, but only for floats, and only with `cu` - @test cu([1]) isa CuArray{Int} - @test cu(Float64[1]) isa CuArray{Float32} - @test cu(ComplexF64[1+1im]) isa CuArray{ComplexF32} - @test Adapt.adapt(CuArray, Float64[1]) isa CuArray{Float64} - @test Adapt.adapt(CuArray, ComplexF64[1]) isa CuArray{ComplexF64} - @test Adapt.adapt(CuArray{Float16}, Float64[1]) isa CuArray{Float16} - - @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs) - @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) + let xs = CuArray{Int}(undef, 2, 3) + # basic properties + @test device(xs) == device() + @test context(xs) == context() + @test collect(CuArray([1 2; 3 4])) == [1 2; 3 4] + @test collect(cu[1, 2, 3]) == [1, 2, 3] + @test collect(cu([1, 2, 3])) == [1, 2, 3] + @test testf(vec, rand(5,3)) + @test cu(1:3) === 1:3 + @test Base.elsize(xs) == sizeof(Int) + @test pointer(CuArray{Int, 2}(xs)) != pointer(xs) + + # test aggressive conversion to Float32, but only for floats, and only with `cu` + @test cu([1]) isa CuArray{Int} + @test cu(Float64[1]) isa CuArray{Float32} + @test cu(ComplexF64[1+1im]) isa CuArray{ComplexF32} + @test Adapt.adapt(CuArray, Float64[1]) isa CuArray{Float64} + @test Adapt.adapt(CuArray, ComplexF64[1]) isa CuArray{ComplexF64} + @test Adapt.adapt(CuArray{Float16}, Float64[1]) isa CuArray{Float16} + end + + # test pointer conversions + let xs = CuVector{Int,Mem.DeviceBuffer}(undef, 1) + @test_throws ArgumentError Base.unsafe_convert(Ptr{Int}, xs) + @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) + end # unsafe_wrap let - arr = CuArray{Int}(undef, 2) - ptr = pointer(arr) B = Mem.DeviceBuffer + arr = CuVector{Int,B}(undef, 2) + ptr = pointer(arr) ## compare the fields we care about function test_eq(a, T, N, dims) @@ -863,4 +868,4 @@ end c = add!!(a, b) @test c == a′ + b @test c === a -end \ No newline at end of file +end From 0a681104a53ab69da08e7946a4b67bd01ec055fa Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 19:17:05 +0100 Subject: [PATCH 06/19] Fix another default constructor. 
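With this and the preceding patches in place, the buffer type of an allocation can come from the type parameter, from a `cu` keyword, or from the `default_memory` preference. A minimal usage sketch (sizes illustrative; assumes a working GPU session and the `cu` keywords from patch 02):

    using CUDA
    a = CuVector{Float32,CUDA.Mem.UnifiedBuffer}(undef, 16)  # explicit buffer type parameter
    is_unified(a)                              # true
    a[1] = 42f0                                # scalar indexing works for unified buffers
    b = cu(rand(Float64, 16); unified=true)    # converted to Float32, unified memory
    c = cu(rand(Float64, 16))                  # falls back to the default_memory preference
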
--- src/array.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array.jl b/src/array.jl index 72fe68a00d..e7097d1bc2 100644 --- a/src/array.jl +++ b/src/array.jl @@ -333,7 +333,7 @@ const AnyCuVecOrMat{T} = Union{AnyCuVector{T}, AnyCuMatrix{T}} end @inline CuArray{T,N}(xs::AbstractArray{<:Any,N}) where {T,N} = - CuArray{T,N,Mem.Device}(xs) + CuArray{T,N,default_memory}(xs) @inline CuArray{T,N}(xs::CuArray{<:Any,N,B}) where {T,N,B} = CuArray{T,N,B}(xs) From cce48ccb39553727b031762a9a52f145eeaf82be Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:06:17 +0100 Subject: [PATCH 07/19] Bugfix. --- src/array.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index e7097d1bc2..c5ee22888b 100644 --- a/src/array.jl +++ b/src/array.jl @@ -381,12 +381,12 @@ end ## indexing function Base.getindex(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) unsafe_load(ptr) end function Base.setindex!(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, v, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) unsafe_store!(ptr, v) end From 579309c0ca90ea63274cc58e5fd16455a6451fa7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:14:47 +0100 Subject: [PATCH 08/19] Test fixes. --- test/base/array.jl | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/test/base/array.jl b/test/base/array.jl index 2b75967299..c871938eb0 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -671,7 +671,7 @@ end dev = device() let - a = CuVector{Int}(undef, 1) + a = CuVector{Int,Mem.DeviceBuffer}(undef, 1) @test !is_unified(a) @test !is_managed(pointer(a)) end @@ -694,12 +694,6 @@ end end let - # default ctor: device memory - let a = CUDA.rand(1) - @test !is_unified(a) - @test !is_managed(pointer(a)) - end - for B = [Mem.DeviceBuffer, Mem.UnifiedBuffer] a = CuVector{Float32,B}(rand(Float32, 1)) @test !xor(B == Mem.UnifiedBuffer, is_unified(a)) @@ -740,12 +734,12 @@ end end # cu: supports unified keyword - let a = cu(rand(Float64, 1); unified=true) - @test is_unified(a) + let a = cu(rand(Float64, 1); device=true) + @test !is_unified(a) @test eltype(a) == Float32 end - let a = cu(rand(Float64, 1)) - @test !is_unified(a) + let a = cu(rand(Float64, 1); unified=true) + @test is_unified(a) @test eltype(a) == Float32 end end From a48715f39a0b14632c23be6b1596ac27d740996e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:35:19 +0100 Subject: [PATCH 09/19] Pointer conversion improvements. 
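The idea is that a single `pointer(x, i; type=...)` call can produce either a device or a host pointer for CPU-accessible buffers. A rough sketch of the intended usage, assuming a unified-memory array (values illustrative):

    using CUDA
    x = cu(collect(1.0f0:4.0f0); unified=true)
    dptr = pointer(x, 2)                        # CuPtr{Float32}, for kernels and CUDA APIs
    hptr = pointer(x, 2; type=CUDA.Mem.Host)    # Ptr{Float32}, usable directly from the CPU
    unsafe_load(hptr)                           # 2.0f0; converting to a CPU pointer first
                                                # synchronizes outstanding GPU work on it
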
--- src/array.jl | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/array.jl b/src/array.jl index c5ee22888b..eee24d3885 100644 --- a/src/array.jl +++ b/src/array.jl @@ -313,8 +313,15 @@ const StridedCuMatrix{T} = StridedCuArray{T,2} const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}} Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x) -@inline function Base.pointer(x::StridedCuArray{T}, i::Integer) where T - Base.unsafe_convert(CuPtr{T}, x) + Base._memory_offset(x, i) +@inline function Base.pointer(x::StridedCuArray{T}, i::Integer; type=Mem.Device) where T + PT = if type == Mem.Device + CuPtr{T} + elseif type == Mem.Host + Ptr{T} + else + error("unknown memory type") + end + Base.unsafe_convert(PT, x) + Base._memory_offset(x, i) end # anything that's (secretly) backed by a CuArray @@ -380,15 +387,11 @@ end ## indexing -function Base.getindex(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) - unsafe_load(ptr) -end +Base.getindex(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, I::Int) = + unsafe_load(pointer(x, I; type=Mem.Host)) -function Base.setindex!(x::CuArray{T, <:Any, Mem.UnifiedBuffer}, v, I::Int) where T - ptr = Base.unsafe_convert(Ptr{T}, x) + Base._memory_offset(x, I) - unsafe_store!(ptr, v) -end +Base.setindex!(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, v, I::Int) = + unsafe_store!(pointer(x, I; type=Mem.Host), v) ## interop with device arrays @@ -402,8 +405,16 @@ end ## memory copying typetagdata(a::Array, i=1) = ccall(:jl_array_typetagdata, Ptr{UInt8}, (Any,), a) + i - 1 -typetagdata(a::CuArray, i=1) = - convert(CuPtr{UInt8}, a.data[]) + a.maxsize + a.offset + i - 1 +function typetagdata(a::CuArray, i=1; type=Mem.Device) + PT = if type == Mem.Device + CuPtr{UInt8} + elseif type == Mem.Host + Ptr{UInt8} + else + error("unknown memory type") + end + convert(PT, a.data[]) + a.maxsize + a.offset + i - 1 +end function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, n::Integer) where T @@ -518,11 +529,11 @@ function Base.unsafe_copyto!(dest::DenseCuArray{T,<:Any,<:Union{Mem.UnifiedBuffe synchronize() GC.@preserve src dest begin - cpu_ptr = pointer(src, soffs) - unsafe_copyto!(host_pointer(pointer(dest, doffs)), cpu_ptr, n) + ptr = pointer(src, soffs) + unsafe_copyto!(pointer(dest, doffs; type=Mem.Host), ptr, n) if Base.isbitsunion(T) - cpu_ptr = typetagdata(src, soffs) - unsafe_copyto!(host_pointer(typetagdata(dest, doffs)), cpu_ptr, n) + ptr = typetagdata(src, soffs) + unsafe_copyto!(typetagdata(dest, doffs; type=Mem.Host), ptr, n) end end return dest @@ -534,11 +545,11 @@ function Base.unsafe_copyto!(dest::Array{T}, doffs, synchronize() GC.@preserve src dest begin - cpu_ptr = pointer(dest, doffs) - unsafe_copyto!(cpu_ptr, host_pointer(pointer(src, soffs)), n) + ptr = pointer(dest, doffs) + unsafe_copyto!(ptr, pointer(src, soffs; type=Mem.Host), n) if Base.isbitsunion(T) - cpu_ptr = typetagdata(dest, doffs) - unsafe_copyto!(cpu_ptr, host_pointer(typetagdata(src, soffs)), n) + ptr = typetagdata(dest, doffs) + unsafe_copyto!(ptr, typetagdata(src, soffs; type=Mem.Host), n) end end From 9a498c334131f7046f48a857f1931effb8a31465 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:42:32 +0100 Subject: [PATCH 10/19] Report preferences in the versioninfo output. 
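For reference, the reported `default_memory` preference is a compile-time setting; a sketch of how it can be set (values illustrative, takes effect after restarting Julia):

    # LocalPreferences.toml
    [CUDA]
    default_memory = "unified"

or, equivalently, via Preferences.jl:

    using Preferences, CUDA
    set_preferences!(CUDA, "default_memory" => "unified")
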
--- src/utilities.jl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/utilities.jl b/src/utilities.jl index 9a447fe49f..29972e7741 100644 --- a/src/utilities.jl +++ b/src/utilities.jl @@ -110,6 +110,23 @@ function versioninfo(io::IO=stdout) println(io) end + prefs = [ + "nonblocking_synchronization" => Preferences.load_preference(CUDA, "nonblocking_synchronization"), + "default_memory" => Preferences.load_preference(CUDA, "default_memory"), + "CUDA_Runtime_jll.version" => Preferences.load_preference(CUDA_Runtime_jll, "version"), + "CUDA_Runtime_jll.local" => Preferences.load_preference(CUDA_Runtime_jll, "local"), + "CUDA_Driver_jll.compat" => Preferences.load_preference(CUDA_Driver_jll, "compat"), + ] + if any(x->!isnothing(x[2]), prefs) + println(io, "Preferences:") + for (key, val) in prefs + if !isnothing(val) + println(io, "- $key: $val") + end + end + println(io) + end + devs = devices() if isempty(devs) println(io, "No CUDA-capable devices.") From 427eb3ed6d6a6b83944f5c9d9a63b25500edc31c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 20:44:56 +0100 Subject: [PATCH 11/19] Keep the default on device memory, but test unified memory support. --- .buildkite/pipeline.yml | 21 ++++++++++++++++++++- LocalPreferences.toml | 2 +- src/array.jl | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index edd315982c..c47cdbea1e 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -241,10 +241,29 @@ steps: env: JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip sanitizer\]/ && !build.pull_request.draft timeout_in_minutes: 10 + - label: "Unified memory" + plugins: + - JuliaCI/julia#v1: + version: 1.9 + - JuliaCI/julia-test#v1: + test_args: "--quickfail" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + commands: | + echo -e "[CUDA]\ndefault_memory = \"unified\"" >LocalPreferences.toml + if: build.message !~ /\[skip tests\]/ && + !build.pull_request.draft + timeout_in_minutes: 120 + # we want to benchmark every commit on the master branch, even if it failed CI - wait: ~ continue_on_failure: true diff --git a/LocalPreferences.toml b/LocalPreferences.toml index afa63bf788..513fc75593 100644 --- a/LocalPreferences.toml +++ b/LocalPreferences.toml @@ -14,7 +14,7 @@ # which memory type unspecified allocations should default to. # possible values: "device", "unified", "host" -#default_memory = "unified" +#default_memory = "device" [CUDA_Driver_jll] # whether to attempt to load a forwards-compatibile userspace driver. diff --git a/src/array.jl b/src/array.jl index eee24d3885..76dc320cdb 100644 --- a/src/array.jl +++ b/src/array.jl @@ -133,7 +133,7 @@ const CuMatrix{T} = CuArray{T,2} const CuVecOrMat{T} = Union{CuVector{T},CuMatrix{T}} # unspecified memory allocation -const default_memory = let str = Preferences.@load_preference("default_memory", "unified") +const default_memory = let str = Preferences.@load_preference("default_memory", "device") if str == "device" Mem.DeviceBuffer elseif str == "unified" From 9ecbdfded5dfab5e662e15f05a9d38525c1c34f0 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 21:42:14 +0100 Subject: [PATCH 12/19] Skip scalar indexing tests when using unified memory. 
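Background for the skip: GPUArrays' scalar-indexing tests assume that indexing a GPU array element-by-element is an error, but with unified memory it is now supported. A small sketch of the difference (assumes a GPU session; the device-buffer case depends on the `allowscalar` setting):

    using CUDA
    CUDA.allowscalar(false)                         # make disallowed scalar indexing throw
    u = CuVector{Int,CUDA.Mem.UnifiedBuffer}([1, 2, 3])
    d = CuVector{Int,CUDA.Mem.DeviceBuffer}([1, 2, 3])
    u[2]                                            # 2: read through a host pointer
    try d[2] catch err; err end                     # errors: device memory has no CPU address
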
--- test/runtests.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index e02a00b0e0..281e609e79 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -58,6 +58,10 @@ const tests = ["core$(path_separator)initialization"] # needs to run first const test_runners = Dict() ## GPUArrays testsuite for name in keys(TestSuite.tests) + if CUDA.default_memory == Mem.Unified && name == "indexing scalar" + # GPUArrays' scalar indexing tests assume that indexing is not supported + continue + end push!(tests, "gpuarrays$(path_separator)$name") test_runners["gpuarrays$(path_separator)$name"] = ()->TestSuite.tests[name](CuArray) end From e945351b7dafe9473ac084148e8a69aa12649085 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 31 Oct 2023 22:16:30 +0100 Subject: [PATCH 13/19] Store the dirty bit in task local storage, for correctness. --- lib/cudadrv/memory.jl | 5 ++--- src/array.jl | 52 ++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/lib/cudadrv/memory.jl b/lib/cudadrv/memory.jl index 5ff22d0257..20897f8f96 100644 --- a/lib/cudadrv/memory.jl +++ b/lib/cudadrv/memory.jl @@ -213,10 +213,9 @@ struct UnifiedBuffer <: AbstractBuffer ctx::CuContext ptr::CuPtr{Cvoid} bytesize::Int - dirty::Threads.Atomic{Bool} end -UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0, Threads.Atomic{Bool}(false)) +UnifiedBuffer() = UnifiedBuffer(context(), CU_NULL, 0) Base.pointer(buf::UnifiedBuffer) = buf.ptr Base.sizeof(buf::UnifiedBuffer) = buf.bytesize @@ -245,7 +244,7 @@ function alloc(::Type{UnifiedBuffer}, bytesize::Integer, ptr_ref = Ref{CuPtr{Cvoid}}() CUDA.cuMemAllocManaged(ptr_ref, bytesize, flags) - return UnifiedBuffer(context(), ptr_ref[], bytesize, Threads.Atomic{Bool}(false)) + return UnifiedBuffer(context(), ptr_ref[], bytesize) end diff --git a/src/array.jl b/src/array.jl index 76dc320cdb..628f3b8db3 100644 --- a/src/array.jl +++ b/src/array.jl @@ -243,7 +243,7 @@ function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; buf = try typ = memory_type(ptr) if is_managed(ptr) - Mem.UnifiedBuffer(ctx, ptr, sz, Threads.Atomic{Bool}(false)) + Mem.UnifiedBuffer(ctx, ptr, sz) elseif typ == CU_MEMORYTYPE_DEVICE # TODO: can we identify whether this pointer was allocated asynchronously? Mem.DeviceBuffer(ctx, ptr, sz, false) @@ -312,8 +312,7 @@ const StridedCuVector{T} = StridedCuArray{T,1} const StridedCuMatrix{T} = StridedCuArray{T,2} const StridedCuVecOrMat{T} = Union{StridedCuVector{T}, StridedCuMatrix{T}} -Base.pointer(x::StridedCuArray{T}) where {T} = Base.unsafe_convert(CuPtr{T}, x) -@inline function Base.pointer(x::StridedCuArray{T}, i::Integer; type=Mem.Device) where T +@inline function Base.pointer(x::StridedCuArray{T}, i::Integer=1; type=Mem.Device) where T PT = if type == Mem.Device CuPtr{T} elseif type == Mem.Host @@ -360,30 +359,51 @@ CuArray{T,N}(xs::CuArray{T,N,B}) where {T,N,B} = copy(xs) Base.convert(::Type{T}, x::T) where T <: CuArray = x -## interop with C libraries +## interop with libraries -function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} - buf = x.data[] - if buf isa Mem.DeviceBuffer - throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) - elseif buf isa Mem.UnifiedBuffer - # TODO: atomics - if buf.dirty[] - synchronize() - buf.dirty[] = false - end +# when a unified buffer is converted to a device pointer, we assume it will be accessed +# asynchronously. 
we keep track of that in the task local storage, and use that information +# to perform additional synchronization when converting the buffer to a host pointer. +# TODO: optimize this! it currently halves the performance of scalar indexing. +function mark_async(buf::Mem.UnifiedBuffer) + tls = task_local_storage() + if haskey(tls, :CUDA_ASYNC_BUFFERS) + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} + in(buf, async_buffers) && return + pushfirst!(async_buffers, buf) + else + tls[:CUDA_ASYNC_BUFFERS] = [buf] end - convert(Ptr{T}, buf) + x.offset*Base.elsize(x) + return +end +function ensure_sync(buf::Mem.UnifiedBuffer) + tls = task_local_storage() + haskey(tls, :CUDA_ASYNC_BUFFERS) || return + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} + in(buf, async_buffers) || return + synchronize() + filter!(!isequal(buf), async_buffers) + return end function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} buf = x.data[] if buf isa Mem.UnifiedBuffer - buf.dirty[] = true + mark_async(buf) end convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) end +function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} + buf = x.data[] + if buf isa Mem.DeviceBuffer + throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) + elseif buf isa Mem.UnifiedBuffer + ensure_sync(buf) + end + convert(Ptr{T}, buf) + x.offset*Base.elsize(x) +end + ## indexing From 4873f560c9a8a1ea2f4a8e1d208a559e7c15b944 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 11:03:44 +0100 Subject: [PATCH 14/19] Prefetch unified memory before kernel launches. --- lib/cusparse/array.jl | 14 +++++++------- src/array.jl | 18 +++++++++--------- src/compiler/execution.jl | 39 ++++++++++++++++++++++++--------------- src/texture.jl | 2 +- test/core/execution.jl | 2 +- 5 files changed, 42 insertions(+), 33 deletions(-) diff --git a/lib/cusparse/array.jl b/lib/cusparse/array.jl index 0a5f2c0bbf..64f79eed82 100644 --- a/lib/cusparse/array.jl +++ b/lib/cusparse/array.jl @@ -417,9 +417,9 @@ Adapt.adapt_storage(::Type{CuArray}, xs::SparseMatrixCSC) = CuSparseMatrixCSC(xs Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseVector) where {T} = CuSparseVector{T}(xs) Adapt.adapt_storage(::Type{CuArray{T}}, xs::SparseMatrixCSC) where {T} = CuSparseMatrixCSC{T}(xs) -Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray) = +Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray) = adapt(CuArray, xs) -Adapt.adapt_storage(::CUDA.CuArrayAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) = +Adapt.adapt_storage(::CUDA.CuArrayKernelAdaptor, xs::AbstractSparseArray{<:AbstractFloat}) = adapt(CuArray{Float32}, xs) Adapt.adapt_storage(::Type{Array}, xs::CuSparseVector) = SparseVector(xs) @@ -546,7 +546,7 @@ end # interop with device arrays -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseVector) return CuSparseDeviceVector( adapt(to, x.iPtr), adapt(to, x.nzVal), @@ -554,7 +554,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseVector) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSR) return CuSparseDeviceMatrixCSR( adapt(to, x.rowPtr), adapt(to, x.colVal), @@ -563,7 +563,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSR) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC) +function 
Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCSC) return CuSparseDeviceMatrixCSC( adapt(to, x.colPtr), adapt(to, x.rowVal), @@ -572,7 +572,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCSC) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixBSR) return CuSparseDeviceMatrixBSR( adapt(to, x.rowPtr), adapt(to, x.colVal), @@ -582,7 +582,7 @@ function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixBSR) ) end -function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuSparseMatrixCOO) +function Adapt.adapt_structure(to::CUDA.KernelAdaptor, x::CuSparseMatrixCOO) return CuSparseDeviceMatrixCOO( adapt(to, x.rowInd), adapt(to, x.colInd), diff --git a/src/array.jl b/src/array.jl index 628f3b8db3..b229ba5bbd 100644 --- a/src/array.jl +++ b/src/array.jl @@ -388,7 +388,7 @@ end function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} buf = x.data[] - if buf isa Mem.UnifiedBuffer + if is_unified(x) mark_async(buf) end convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) @@ -396,9 +396,9 @@ end function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} buf = x.data[] - if buf isa Mem.DeviceBuffer + if is_device(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) - elseif buf isa Mem.UnifiedBuffer + elseif is_unified(x) ensure_sync(buf) end convert(Ptr{T}, buf) + x.offset*Base.elsize(x) @@ -637,19 +637,19 @@ Adapt.adapt_storage(::Type{<:CuArray{T, N, B}}, xs::AT) where {T, N, B, AT<:Abst # eagerly converts Float64 to Float32, for performance reasons -struct CuArrayAdaptor{B} end +struct CuArrayKernelAdaptor{B} end -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T,N,B} = isbits(xs) ? xs : CuArray{T,N,B}(xs) -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:AbstractFloat,N,B} = isbits(xs) ? xs : CuArray{Float32,N,B}(xs) -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Complex{<:AbstractFloat},N,B} = isbits(xs) ? xs : CuArray{ComplexF32,N,B}(xs) # not for Float16 -Adapt.adapt_storage(::CuArrayAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} = +Adapt.adapt_storage(::CuArrayKernelAdaptor{B}, xs::AbstractArray{T,N}) where {T<:Union{Float16,BFloat16},N,B} = isbits(xs) ? xs : CuArray{T,N,B}(xs) """ @@ -707,7 +707,7 @@ julia> CuArray(1:3) else default_memory end - adapt(CuArrayAdaptor{memory}(), xs) + adapt(CuArrayKernelAdaptor{memory}(), xs) end Base.getindex(::typeof(cu), xs...) = CuArray([xs...]) diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl index 9276876026..177da0efa2 100644 --- a/src/compiler/execution.jl +++ b/src/compiler/execution.jl @@ -121,35 +121,44 @@ end ## host to device value conversion -struct Adaptor end +struct KernelAdaptor end # convert CUDA host pointers to device pointers # TODO: use ordinary ptr? 
-Adapt.adapt_storage(to::Adaptor, p::CuPtr{T}) where {T} = reinterpret(LLVMPtr{T,AS.Generic}, p) +Adapt.adapt_storage(to::KernelAdaptor, p::CuPtr{T}) where {T} = + reinterpret(LLVMPtr{T,AS.Generic}, p) + +# convert CUDA host arrays to device arrays +function Adapt.adapt_storage(::KernelAdaptor, xs::DenseCuArray{T,N}) where {T,N} + # prefetch unified memory as we're likely to use it on the GPU + # TODO: make this configurable? + if is_unified(xs) && sizeof(xs) > 0 && !is_capturing() + buf = xs.data[] + subbuf = Mem.UnifiedBuffer(buf.ctx, pointer(xs), sizeof(xs)) + Mem.prefetch(subbuf) + end + + Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs) +end # Base.RefValue isn't GPU compatible, so provide a compatible alternative struct CuRefValue{T} <: Ref{T} x::T end Base.getindex(r::CuRefValue{T}) where T = r.x -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue) = CuRefValue(adapt(to, r[])) # broadcast sometimes passes a ref(type), resulting in a GPU-incompatible DataType box. # avoid that by using a special kind of ref that knows about the boxed type. struct CuRefType{T} <: Ref{DataType} end Base.getindex(r::CuRefType{T}) where T = T -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue{<:Union{DataType,Type}}) = CuRefType{r[]}() +Adapt.adapt_structure(to::KernelAdaptor, r::Base.RefValue{<:Union{DataType,Type}}) = + CuRefType{r[]}() # case where type is the function being broadcasted -Adapt.adapt_structure(to::Adaptor, bc::Base.Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} = - Base.Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes) - -Adapt.adapt_storage(::Adaptor, xs::CuArray{T,N}) where {T,N} = - Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs) - -# we materialize ReshapedArray/ReinterpretArray/SubArray/... directly as a device array -Adapt.adapt_structure(::Adaptor, xs::DenseCuArray{T,N}) where {T,N} = - Base.unsafe_convert(CuDeviceArray{T,N,AS.Global}, xs) +Adapt.adapt_structure(to::KernelAdaptor, + bc::Broadcast.Broadcasted{Style, <:Any, Type{T}}) where {Style, T} = + Broadcast.Broadcasted{Style}((x...) -> T(x...), adapt(to, bc.args), bc.axes) """ cudaconvert(x) @@ -159,9 +168,9 @@ converted to a GPU-friendly format. By default, the function does nothing and re input object `x` as-is. Do not add methods to this function, but instead extend the underlying Adapt.jl package and -register methods for the the `CUDA.Adaptor` type. +register methods for the the `CUDA.KernelAdaptor` type. 
""" -cudaconvert(arg) = adapt(Adaptor(), arg) +cudaconvert(arg) = adapt(KernelAdaptor(), arg) ## abstract kernel functionality diff --git a/src/texture.jl b/src/texture.jl index 24f9da6fc9..4f1a9444eb 100644 --- a/src/texture.jl +++ b/src/texture.jl @@ -319,6 +319,6 @@ memory_source(::Any) = error("Unknown texture source $(typeof(t))") memory_source(::CuArray) = LinearMemory() memory_source(::CuTextureArray) = ArrayMemory() -Adapt.adapt_storage(::Adaptor, t::CuTexture{T,N}) where {T,N} = +Adapt.adapt_storage(::KernelAdaptor, t::CuTexture{T,N}) where {T,N} = CuDeviceTexture{T,N,typeof(memory_source(parent(t))), t.normalized_coordinates, typeof(t.interpolation)}(size(t), t.handle) diff --git a/test/core/execution.jl b/test/core/execution.jl index cacdea8e6e..e6e13e91a1 100644 --- a/test/core/execution.jl +++ b/test/core/execution.jl @@ -470,7 +470,7 @@ end @eval struct Host end @eval struct Device end - Adapt.adapt_storage(::CUDA.Adaptor, a::Host) = Device() + Adapt.adapt_storage(::CUDA.KernelAdaptor, a::Host) = Device() Base.convert(::Type{Int}, ::Host) = 1 Base.convert(::Type{Int}, ::Device) = 2 From 6f9c4addbeacbb790dd6fdc873969e8eb7ed20b4 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 15:17:34 +0100 Subject: [PATCH 15/19] Rework unsafe_wrap. --- src/array.jl | 117 ++++++++++++++++++++++++++++++++++++--------- test/base/array.jl | 104 ++++++++++++++++++++++++---------------- 2 files changed, 157 insertions(+), 64 deletions(-) diff --git a/src/array.jl b/src/array.jl index b229ba5bbd..4363bf41b1 100644 --- a/src/array.jl +++ b/src/array.jl @@ -1,4 +1,4 @@ -export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, is_unified +export CuArray, CuVector, CuMatrix, CuVecOrMat, cu, is_device, is_unified, is_host ## array type @@ -207,36 +207,47 @@ function Base.deepcopy_internal(x::CuArray, dict::IdDict) end +## unsafe_wrap + """ unsafe_wrap(CuArray, ptr::CuPtr{T}, dims; own=false, ctx=context()) -Wrap a `CuArray` object around the data at the address given by `ptr`. The pointer -element type `T` determines the array element type. `dims` is either an integer (for a 1d -array) or a tuple of the array dimensions. `own` optionally specified whether Julia should -take ownership of the memory, calling `cudaFree` when the array is no longer referenced. The -`ctx` argument determines the CUDA context where the data is allocated in. + # requires + unsafe_wrap(Array, a::CuArray) + + # requires HMM + unsafe_wrap(CuArray, ptr::ptr{T}, dims) + unsafe_wrap(CuArray, a::Array) + +Wrap a `CuArray` object around the data at the address given by the CUDA-managed pointer +`ptr`. The element type `T` determines the array element type. `dims` is either an integer +(for a 1d array) or a tuple of the array dimensions. `own` optionally specified whether +Julia should take ownership of the memory, calling `cudaFree` when the array is no longer +referenced. The `ctx` argument determines the CUDA context where the data is allocated in. """ +unsafe_wrap + +# managed pointer to CuArray function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, ptr::CuPtr{T}, dims::NTuple{N,Int}; own::Bool=false, ctx::CuContext=context()) where {T,N} - buf = _unsafe_wrap(T, ptr, dims; own, ctx) + buf = _unsafe_wrap_managed(T, ptr, dims; own, ctx) data = DataRef(own ? _free_buffer : (args...) 
-> (#= do nothing =#), buf) - CuArray{T, length(dims)}(data, dims) + CuArray{T,N}(data, dims) end function Base.unsafe_wrap(::Type{CuArray{T,N,B}}, ptr::CuPtr{T}, dims::NTuple{N,Int}; own::Bool=false, ctx::CuContext=context()) where {T,N,B} - buf = _unsafe_wrap(T, ptr, dims; own, ctx) + buf = _unsafe_wrap_managed(T, ptr, dims; own, ctx) if typeof(buf) !== B - error("Declared buffer type does not match inferred buffer type.") + throw(ArgumentError("Declared buffer type does not match inferred buffer type.")) end data = DataRef(own ? _free_buffer : (args...) -> (#= do nothing =#), buf) - CuArray{T, length(dims)}(data, dims) + CuArray{T,N}(data, dims) end - -function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; +function _unsafe_wrap_managed(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; own::Bool=false, ctx::CuContext=context()) where {T,N} - isbitstype(T) || error("Can only unsafe_wrap a pointer to a bits type") + isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type")) sz = prod(dims) * sizeof(T) # identify the buffer @@ -253,24 +264,84 @@ function _unsafe_wrap(::Type{T}, ptr::CuPtr{T}, dims::NTuple{N,Int}; error("Unknown memory type; please file an issue.") end catch err - error("Could not identify the buffer type; are you passing a valid CUDA pointer to unsafe_wrap?") + throw(ArgumentError("Could not identify the buffer type; are you passing a valid CUDA pointer to unsafe_wrap?")) end return buf end - -function Base.unsafe_wrap(Atype::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, +# integer size input +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, p::CuPtr{T}, dim::Int; own::Bool=false, ctx::CuContext=context()) where {T} - unsafe_wrap(Atype, p, (dim,); own, ctx) + unsafe_wrap(CuArray{T,1}, p, (dim,); own, ctx) end -function Base.unsafe_wrap(Atype::Type{CuArray{T,1,B}}, - p::CuPtr{T}, dim::Int; +function Base.unsafe_wrap(::Type{CuArray{T,1,B}}, p::CuPtr{T}, dim::Int; own::Bool=false, ctx::CuContext=context()) where {T,B} - unsafe_wrap(Atype, p, (dim,); own, ctx) + unsafe_wrap(CuArray{T,1,B}, p, (dim,); own, ctx) end -Base.unsafe_wrap(T::Type{<:CuArray}, ::Ptr, dims::NTuple{N,Int}; kwargs...) 
where {N} = - throw(ArgumentError("cannot wrap a CPU pointer with a $T")) +# managed pointer to Array +function Base.unsafe_wrap(::Union{Type{Array},Type{Array{T}},Type{Array{T,N}}}, + p::CuPtr{T}, dims::NTuple{N,Int}; + own::Bool=false) where {T,N} + if !is_managed(p) && memory_type(p) != CU_MEMORYTYPE_HOST + throw(ArgumentError("Can only create a CPU array object from a unified or host CUDA array")) + end + unsafe_wrap(Array{T,N}, reinterpret(Ptr{T}, p), dims; own) +end +# integer size input +function Base.unsafe_wrap(::Union{Type{Array},Type{Array{T}},Type{Array{T,1}}}, + p::CuPtr{T}, dim::Int; own::Bool=false) where {T} + unsafe_wrap(Array{T,1}, p, (dim,); own) +end +# array input +function Base.unsafe_wrap(::Union{Type{Array},Type{Array{T}},Type{Array{T,N}}}, + a::CuArray{T,N}) where {T,N} + p = pointer(a; type=Mem.Host) + unsafe_wrap(Array, p, size(a)) +end + +# unmanaged pointer to CuArray +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, + p::Ptr{T}, dims::NTuple{N,Int}; ctx::CuContext=context()) where {T,N} + isbitstype(T) || throw(ArgumentError("Can only unsafe_wrap a pointer to a bits type")) + sz = prod(dims) * sizeof(T) + + if driver_version() < v"12.2" + error("Accessing host memory requires HMM support, which is only available in CUDA 12.2+ using the open-source driver.") + end + if attribute(device(), DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) != 1 + error("Accessing host memory requires HMM support, which is not provided by your $(name(device())).") + end + + buf = Mem.UnifiedBuffer(ctx, reinterpret(CuPtr{Nothing}, p), sz) + data = DataRef((args...) -> (#= do nothing =#), buf) + CuArray{T,N}(data, dims) +end +function Base.unsafe_wrap(::Type{CuArray{T,N,B}}, p::Ptr{T}, dims::NTuple{N,Int}; + ctx::CuContext=context()) where {T,N,B} + if B !== Mem.UnifiedBuffer + throw(ArgumentError("Can only wrap an unmanaged pointer to a CuArray with a UnifiedBuffer")) + end + unsafe_wrap(CuArray{T,N}, p, dims; ctx) +end +# integer size input +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,1}}}, + p::Ptr{T}, dim::Int) where {T} + unsafe_wrap(CuArray{T,1}, p, (dim,)) +end +function Base.unsafe_wrap(::Type{CuArray{T,1,B}}, p::Ptr{T}, dim::Int) where {T,B} + unsafe_wrap(CuArray{T,1,B}, p, (dim,)) +end +# array input +function Base.unsafe_wrap(::Union{Type{CuArray},Type{CuArray{T}},Type{CuArray{T,N}}}, + a::Array{T,N}) where {T,N} + p = pointer(a) + unsafe_wrap(CuArray{T,N}, p, size(a)) +end +function Base.unsafe_wrap(::Type{CuArray{T,1,B}}, a::Array{T,1}) where {T,B} + p = pointer(a) + unsafe_wrap(CuArray{T,1,B}, p, size(a)) +end ## array interface diff --git a/test/base/array.jl b/test/base/array.jl index c871938eb0..f244cfa8e0 100644 --- a/test/base/array.jl +++ b/test/base/array.jl @@ -30,47 +30,6 @@ using ChainRulesCore: add!!, is_inplaceable_destination @test_throws ArgumentError Base.unsafe_convert(Ptr{Float32}, xs) end - # unsafe_wrap - let - B = Mem.DeviceBuffer - arr = CuVector{Int,B}(undef, 2) - ptr = pointer(arr) - - ## compare the fields we care about - function test_eq(a, T, N, dims) - @test eltype(a) == T - @test ndims(a) == N - @test a.data[].ptr == ptr - @test a.data[].ctx == context() - @test a.maxsize == arr.maxsize - @test a.offset == arr.offset - @test a.dims == dims - end - - test_eq(unsafe_wrap(CuArray, ptr, 2), Int, 1, (2,)) - test_eq(unsafe_wrap(CuArray{Int}, ptr, 2), Int, 1, (2,)) - test_eq(unsafe_wrap(CuArray{Int,1}, ptr, 2), Int, 1, (2,)) - test_eq(unsafe_wrap(CuArray{Int,1,B}, ptr, 2), Int, 1, 
(2,)) - test_eq(unsafe_wrap(CuArray, ptr, (1,2)), Int, 2, (1,2)) - test_eq(unsafe_wrap(CuArray{Int}, ptr, (1,2)), Int, 2, (1,2)) - test_eq(unsafe_wrap(CuArray{Int,2}, ptr, (1,2)), Int, 2, (1,2)) - test_eq(unsafe_wrap(CuArray{Int,2,B}, ptr, (1,2)), Int, 2, (1,2)) - - @test_throws ErrorException unsafe_wrap(CuArray{Int,1,Mem.HostBuffer}, ptr, 2) - @test_throws ErrorException unsafe_wrap(CuArray{Int,2,Mem.HostBuffer}, ptr, (1,2)) - end - let buf = Mem.alloc(Mem.Host, sizeof(Int), Mem.HOSTALLOC_DEVICEMAP) - gpu_ptr = convert(CuPtr{Int}, buf) - gpu_arr = unsafe_wrap(CuArray, gpu_ptr, 1) - gpu_arr .= 42 - - synchronize() - - cpu_ptr = convert(Ptr{Int}, buf) - cpu_arr = unsafe_wrap(Array, cpu_ptr, 1) - @test cpu_arr == [42] - end - @test collect(CUDA.zeros(2, 2)) == zeros(Float32, 2, 2) @test collect(CUDA.ones(2, 2)) == ones(Float32, 2, 2) @@ -92,6 +51,69 @@ using ChainRulesCore: add!!, is_inplaceable_destination end end +@testset "unsafe_wrap" begin + hmm = CUDA.driver_version() >= v"12.2" && + attribute(device(), CUDA.DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) == 1 + + # managed memory -> CuArray + for a in [cu([1]; device=true), cu([1]; unified=true)] + p = pointer(a) + for AT in [CuArray, CuArray{Int}, CuArray{Int,1}, typeof(a)], + b in [unsafe_wrap(AT, p, 1), unsafe_wrap(AT, p, (1,))] + @test typeof(b) == typeof(a) + @test pointer(b) == p + @test size(b) == (1,) + end + end + + # managed memory -> Array + let a = cu([1]; unified=true) + p = pointer(a) + for AT in [Array, Array{Int}, Array{Int,1}], + b in [unsafe_wrap(AT, p, 1), unsafe_wrap(AT, p, (1,)), unsafe_wrap(AT, a)] + @test typeof(b) == Array{Int,1} + @test pointer(b) == reinterpret(Ptr{Int}, p) + @test size(b) == (1,) + end + end + + # unmanaged memory -> CuArray + if hmm + a = [1] + p = pointer(a) + for AT in [CuArray, CuArray{Int}, CuArray{Int,1}, CuArray{Int,1,Mem.UnifiedBuffer}], + b in [unsafe_wrap(AT, p, 1), unsafe_wrap(AT, p, (1,)), unsafe_wrap(AT, a)] + @test typeof(b) == CuArray{Int,1,Mem.UnifiedBuffer} + @test pointer(b) == reinterpret(CuPtr{Int}, p) + @test size(b) == (1,) + end + end + + # errors + let a = cu([1]; device=true) + @test_throws ArgumentError unsafe_wrap(Array, a) + @test_throws ArgumentError unsafe_wrap(CuArray{Int,1,Mem.UnifiedBuffer}, pointer(a), 1) + end + if hmm + let a = [1] + @test_throws ArgumentError unsafe_wrap(CuArray{Int,1,Mem.DeviceBuffer}, a) + end + end + + # some actual operations + let buf = Mem.alloc(Mem.Host, sizeof(Int), Mem.HOSTALLOC_DEVICEMAP) + gpu_ptr = convert(CuPtr{Int}, buf) + gpu_arr = unsafe_wrap(CuArray, gpu_ptr, 1) + gpu_arr .= 42 + + synchronize() + + cpu_ptr = convert(Ptr{Int}, buf) + cpu_arr = unsafe_wrap(Array, cpu_ptr, 1) + @test cpu_arr == [42] + end +end + @testset "adapt" begin A = rand(Float32, 3, 3) dA = CuArray(A) From fc638056cd67de1d62f00763f1220052e1ffb20b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 16:06:34 +0100 Subject: [PATCH 16/19] Better support for host memory. --- src/array.jl | 36 +++++++++++++++++++----------------- src/pool.jl | 6 ++++++ test/runtests.jl | 2 +- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/array.jl b/src/array.jl index 4363bf41b1..5f6cd712b7 100644 --- a/src/array.jl +++ b/src/array.jl @@ -432,34 +432,36 @@ Base.convert(::Type{T}, x::T) where T <: CuArray = x ## interop with libraries -# when a unified buffer is converted to a device pointer, we assume it will be accessed -# asynchronously. 
we keep track of that in the task local storage, and use that information -# to perform additional synchronization when converting the buffer to a host pointer. -# TODO: optimize this! it currently halves the performance of scalar indexing. -function mark_async(buf::Mem.UnifiedBuffer) +# when CPU-accessible buffers are converted to a device pointer, we assume it will be +# accessed asynchronously. we keep track of that in the task local storage, and use that +# information to perform additional synchronization when converting the buffer to a host +# pointer. TODO: optimize this! it currently halves the performance of scalar indexing. +function mark_async(buf::Union{Mem.HostBuffer,Mem.UnifiedBuffer}) + ptr = convert(Ptr{Nothing}, buf) tls = task_local_storage() if haskey(tls, :CUDA_ASYNC_BUFFERS) - async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} - in(buf, async_buffers) && return - pushfirst!(async_buffers, buf) + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Ptr{Nothing}} + in(ptr, async_buffers) && return + pushfirst!(async_buffers, ptr) else - tls[:CUDA_ASYNC_BUFFERS] = [buf] + tls[:CUDA_ASYNC_BUFFERS] = [ptr] end return end -function ensure_sync(buf::Mem.UnifiedBuffer) +function ensure_sync(buf::Union{Mem.HostBuffer,Mem.UnifiedBuffer}) tls = task_local_storage() haskey(tls, :CUDA_ASYNC_BUFFERS) || return - async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Mem.UnifiedBuffer} - in(buf, async_buffers) || return + async_buffers = tls[:CUDA_ASYNC_BUFFERS]::Vector{Ptr{Nothing}} + ptr = convert(Ptr{Nothing}, buf) + in(ptr, async_buffers) || return synchronize() - filter!(!isequal(buf), async_buffers) + filter!(!isequal(ptr), async_buffers) return end function Base.unsafe_convert(::Type{CuPtr{T}}, x::CuArray{T}) where {T} buf = x.data[] - if is_unified(x) + if is_unified(x) || is_host(x) mark_async(buf) end convert(CuPtr{T}, buf) + x.offset*Base.elsize(x) @@ -469,7 +471,7 @@ function Base.unsafe_convert(::Type{Ptr{T}}, x::CuArray{T}) where {T} buf = x.data[] if is_device(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) - elseif is_unified(x) + elseif is_unified(x) || is_host(x) ensure_sync(buf) end convert(Ptr{T}, buf) + x.offset*Base.elsize(x) @@ -478,10 +480,10 @@ end ## indexing -Base.getindex(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, I::Int) = +Base.getindex(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, I::Int) = unsafe_load(pointer(x, I; type=Mem.Host)) -Base.setindex!(x::CuArray{<:Any, <:Any, Mem.UnifiedBuffer}, v, I::Int) = +Base.setindex!(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, v, I::Int) = unsafe_store!(pointer(x, I; type=Mem.Host), v) diff --git a/src/pool.jl b/src/pool.jl index 11b9aee724..9d476d591f 100644 --- a/src/pool.jl +++ b/src/pool.jl @@ -451,6 +451,12 @@ end end buf, time end +@inline function _alloc(::Type{Mem.HostBuffer}, sz; stream::Union{Nothing,CuStream}) + time = Base.@elapsed begin + buf = Mem.alloc(Mem.Host, sz) + end + buf, time +end """ free(buf) diff --git a/test/runtests.jl b/test/runtests.jl index 281e609e79..b744bea2cc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -58,7 +58,7 @@ const tests = ["core$(path_separator)initialization"] # needs to run first const test_runners = Dict() ## GPUArrays testsuite for name in keys(TestSuite.tests) - if CUDA.default_memory == Mem.Unified && name == "indexing scalar" + if CUDA.default_memory != Mem.Device && name == "indexing scalar" # GPUArrays' scalar indexing tests assume that indexing is not supported continue end From 
5460bfb28f8eb78ced7e2cb39250857fb2044698 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 16:06:59 +0100 Subject: [PATCH 17/19] Run CI with host memory too. --- .buildkite/pipeline.yml | 76 ++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index c47cdbea1e..6a80f416ce 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -19,8 +19,7 @@ steps: cuda: "*" commands: | echo -e "[CUDA_Runtime_jll]\nlocal = \"true\"" >LocalPreferences.toml - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip julia\]/ + if: build.message !~ /\[skip tests\]/ timeout_in_minutes: 120 matrix: setup: @@ -44,7 +43,7 @@ steps: - JuliaCI/julia#v1: version: 1.9 - JuliaCI/julia-test#v1: - test_args: "core base libraries" + test_args: "--quickfail core base libraries" - JuliaCI/julia-coverage#v1: dirs: - src @@ -53,9 +52,7 @@ steps: agents: queue: "juliagpu" cuda: "*" - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip cuda\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 120 matrix: setup: @@ -73,6 +70,34 @@ steps: echo -e "[CUDA_Runtime_jll]\nversion = \"{{matrix.cuda}}\"" >LocalPreferences.toml echo -e "[CUDA_Driver_jll]\ncompat = \"false\"" >>LocalPreferences.toml + - group: "Memory" + key: "memory" + depends_on: "julia" + steps: + - label: "CuArray with {{matrix.memory}} memory" + plugins: + - JuliaCI/julia#v1: + version: 1.9 + - JuliaCI/julia-test#v1: + test_args: "--quickfail core base" + - JuliaCI/julia-coverage#v1: + dirs: + - src + - lib + - examples + agents: + queue: "juliagpu" + cuda: "*" + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft + timeout_in_minutes: 120 + matrix: + setup: + memory: + - "unified" + - "host" + commands: | + echo -e "[CUDA]\ndefault_memory = \"{{matrix.memory}}\"" >LocalPreferences.toml + - group: ":nesting_dolls: Subpackages" depends_on: "cuda" steps: @@ -104,9 +129,7 @@ steps: agents: queue: "juliagpu" cuda: "*" - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip subpackages\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 120 commands: | julia --project -e ' @@ -165,9 +188,7 @@ steps: agents: queue: "juliagpu" cuda: "*" - if: build.message !~ /\[skip tests\]/ && - build.message !~ /\[skip downstream\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 60 soft_fail: - exit_status: 3 @@ -240,30 +261,9 @@ steps: cuda: "*" env: JULIA_CUDA_USE_COMPAT: 'false' # NVIDIA bug #3418723: injection tools prevent probing libcuda - if: build.message !~ /\[skip tests\]/ && - !build.pull_request.draft + if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft timeout_in_minutes: 10 - - label: "Unified memory" - plugins: - - JuliaCI/julia#v1: - version: 1.9 - - JuliaCI/julia-test#v1: - test_args: "--quickfail" - - JuliaCI/julia-coverage#v1: - dirs: - - src - - lib - - examples - agents: - queue: "juliagpu" - cuda: "*" - commands: | - echo -e "[CUDA]\ndefault_memory = \"unified\"" >LocalPreferences.toml - if: build.message !~ /\[skip tests\]/ && - !build.pull_request.draft - timeout_in_minutes: 120 - # we want to benchmark every commit on the master branch, even if it failed CI - wait: ~ continue_on_failure: true @@ -293,9 +293,8 @@ steps: agents: queue: "juliagpu" cuda: "*" 
- if: build.message !~ /\[skip benchmarks\]/ && - build.branch !~ /^master$$/ && - !build.pull_request.draft + if: build.message !~ /\[skip benchmarks\]/ && !build.pull_request.draft && + build.branch !~ /^master$$/ timeout_in_minutes: 30 # if we will submit results, use the benchmark queue so that we will @@ -329,8 +328,7 @@ steps: queue: "benchmark" gpu: "rtx2070" cuda: "*" - if: build.message !~ /\[skip benchmarks\]/ && - build.branch =~ /^master$$/ + if: build.message !~ /\[skip benchmarks\]/ && build.branch =~ /^master$$/ matrix: setup: julia: From 7bacf91641bd122d5628fe63403319a62180ed14 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 16:58:51 +0100 Subject: [PATCH 18/19] Add bounds checking to optimized scalar indexing functions. --- src/array.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/array.jl b/src/array.jl index 5f6cd712b7..dffd661e5f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -480,11 +480,15 @@ end ## indexing -Base.getindex(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, I::Int) = +function Base.getindex(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, I::Int) + @boundscheck checkbounds(x, I) unsafe_load(pointer(x, I; type=Mem.Host)) +end -Base.setindex!(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, v, I::Int) = +function Base.setindex!(x::CuArray{<:Any, <:Any, <:Union{Mem.Host,Mem.Unified}}, v, I::Int) + @boundscheck checkbounds(x, I) unsafe_store!(pointer(x, I; type=Mem.Host), v) +end ## interop with device arrays From f377f812868d3b527f80af6b998d89c5bebeed0c Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 1 Nov 2023 17:04:06 +0100 Subject: [PATCH 19/19] Try testing libraries with host memory. --- .buildkite/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 6a80f416ce..a23e01d9d0 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -79,7 +79,7 @@ steps: - JuliaCI/julia#v1: version: 1.9 - JuliaCI/julia-test#v1: - test_args: "--quickfail core base" + test_args: "--quickfail core base libraries" - JuliaCI/julia-coverage#v1: dirs: - src
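
Taken together, the `unsafe_wrap` changes from patch 15 allow viewing memory from both sides without copying. A minimal end-to-end sketch (the Array-to-CuArray direction assumes HMM support, i.e. CUDA 12.2+ with the open-source kernel driver):

    using CUDA

    # view a unified CuArray as a plain Array
    g = cu(zeros(Float32, 4); unified=true)
    c = unsafe_wrap(Array, g)
    c[1] = 1f0
    Array(g)[1]          # 1.0f0

    # view a plain Array as a CuArray (requires HMM)
    h = zeros(Float32, 4)
    if CUDA.driver_version() >= v"12.2" &&
       attribute(device(), CUDA.DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) == 1
        gh = unsafe_wrap(CuArray, h)
        gh .= 2f0
        synchronize()
        h[1]             # 2.0f0
    end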