diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 0000000000..cf6109c7b7 --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,92 @@ +# This file is machine-generated - editing it directly is not advised + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.1" + +[[Documenter]] +deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "885467cebde4639a3d81953652cc53ff5a73cb87" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.24.3" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.10" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/make.jl b/docs/make.jl index f5ce353f2d..dd173a2a33 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,17 +1,28 @@ using Documenter, CUDAdrv -makedocs( - modules = [CUDAdrv], - format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), - sitename = "CUDAdrv.jl", - pages = [ - "Home" => "index.md", - "Manual" => [ - "man/usage.md" - ], - "Library" => [ - "lib/api.md", +const src = "https://github.com/JuliaGPU/CUDAdrv.jl" +const dst = "https://juliagpu.gitlab.io/CUDAdrv.jl/" + +function main() + makedocs( + sitename = "CUDAdrv.jl", + authors = "Tim Besard", + repo = "$src/blob/{commit}{path}#{line}", + format = Documenter.HTML( + # Use clean URLs on CI + prettyurls = get(ENV, "CI", nothing) == "true", + canonical = dst, + assets = ["assets/favicon.ico"], + analytics = "UA-154489943-5", + ), + doctest = false, + pages = Any[ + "Home" => "index.md", + "APIs" => [ + 
"driver.md", + ] ] - ], - doctest = true -) + ) +end + +isinteractive() || main() diff --git a/docs/src/assets/favicon.ico b/docs/src/assets/favicon.ico new file mode 100644 index 0000000000..9021a68fbb Binary files /dev/null and b/docs/src/assets/favicon.ico differ diff --git a/docs/src/assets/logo.png b/docs/src/assets/logo.png new file mode 100644 index 0000000000..c5610d094d Binary files /dev/null and b/docs/src/assets/logo.png differ diff --git a/docs/src/lib/api.md b/docs/src/driver.md similarity index 66% rename from docs/src/lib/api.md rename to docs/src/driver.md index 8e89e602ce..e00ffef6bb 100644 --- a/docs/src/lib/api.md +++ b/docs/src/driver.md @@ -1,4 +1,4 @@ -# API wrappers +# CUDA driver This section lists the package's public functionality that directly corresponds to functionality of the CUDA driver API. In general, the abstractions stay close to those of @@ -8,20 +8,6 @@ the CUDA driver API, so for more information on certain library calls you can co The documentation is grouped according to the modules of the driver API. -## Installation properties - -```@docs -CUDAdrv.vendor -``` - - -## Initialization - -```@docs -CUDAdrv.init -``` - - ## Error Handling ```@docs @@ -63,7 +49,7 @@ CUDAdrv.CuContext CUDAdrv.destroy!(::CuContext) CUDAdrv.CuCurrentContext CUDAdrv.activate(::CuContext) -CUDAdrv.synchronize(::CuContext) +CUDAdrv.synchronize() CUDAdrv.device(::CuContext) ``` @@ -96,8 +82,8 @@ CUDAdrv.CuFunction ```@docs CUDAdrv.CuGlobal CUDAdrv.eltype(::CuGlobal) -CUDAdrv.get(::CuGlobal) -CUDAdrv.set{T}(::CuGlobal{T}, ::T) +Base.getindex(::CuGlobal) +Base.setindex!(::CuGlobal{T}, ::T) where {T} ``` ### Linker @@ -111,27 +97,42 @@ CUDAdrv.complete CUDAdrv.CuModule(::CUDAdrv.CuLinkImage, args...) ``` + ## Memory Management +Three kinds of memory buffers can be allocated: device memory, host memory, and unified +memory. Each of these buffers can be allocated by calling `alloc` with the type of buffer as +first argument, and freed by calling `free`. Certain buffers have specific methods defined. + +```@docs +CUDAdrv.Mem.DeviceBuffer +CUDAdrv.Mem.alloc(::Type{Mem.DeviceBuffer}, ::Integer) +``` + +```@docs +CUDAdrv.Mem.HostBuffer +CUDAdrv.Mem.alloc(::Type{Mem.HostBuffer}, ::Integer, flags) +CUDAdrv.Mem.register(::Type{Mem.HostBuffer}, ::Ptr, ::Integer, flags) +CUDAdrv.Mem.unregister(::Mem.HostBuffer) +``` + ```@docs -CUDAdrv.Mem.alloc -CUDAdrv.Mem.free -CUDAdrv.Mem.set! -CUDAdrv.Mem.upload -CUDAdrv.Mem.upload! -CUDAdrv.Mem.download -CUDAdrv.Mem.download! -CUDAdrv.Mem.transfer -CUDAdrv.Mem.transfer! +CUDAdrv.Mem.UnifiedBuffer +CUDAdrv.Mem.alloc(::Type{Mem.UnifiedBuffer}, ::Integer, ::CUDAdrv.CUmemAttach_flags) +CUDAdrv.Mem.prefetch(::Mem.UnifiedBuffer, bytes::Integer; device, stream) +CUDAdrv.Mem.advise(::Mem.UnifiedBuffer, ::CUDAdrv.CUmem_advise, ::Integer; device) ``` +To work with these buffers, you need to `convert` them to a `Ptr` or `CuPtr`. Several +methods then work with these raw pointers: + + + ### Memory info ```@docs -CUDAdrv.Mem.info -CUDAdrv.Mem.total -CUDAdrv.Mem.used -CUDAdrv.Mem.free() +CUDAdrv.available_memory +CUDAdrv.total_memory ``` diff --git a/docs/src/index.md b/docs/src/index.md index 5cb884c387..cd665fdba7 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,56 +1,10 @@ # CUDAdrv.jl -*A Julia wrapper for the CUDA driver API.* - -This package aims to provide high-level wrappers for the functionality exposed by the CUDA -driver API, and is meant for users who need high- or low-level access to the CUDA toolkit or -the underlying hardware. 
- -The package is built upon the [low-level CUDA driver -API](http://docs.nvidia.com/cuda/cuda-driver-api/), but that shouldn't make the Julia -wrapper any harder to use. That said, it is a work-in-progress and does not offer the same -functionality or convenience as the more popular -[CUDArt](https://github.com/JuliaGPU/CUDArt.jl) package, which is built upon the -[higher-level CUDA runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/). - - -## Installation - -Requirements: - -* Julia 0.5 or higher (use - [v0.1.0](https://github.com/JuliaGPU/CUDAdrv.jl/releases/tag/v0.1.0) of this package - for compatibility with Julia 0.4) -* NVIDIA driver, providing `libcuda.so` (the full CUDA toolkit is not required) -* CUDA hardware - -At the Julia REPL: - -```julia -Pkg.add("CUDAdrv") -using CUDAdrv - -# optionally -Pkg.test("CUDAdrv") -``` - -Loading CUDAdrv might display error messages, indicating issues with your set-up. These -messages can be cryptic as they happen too early for decent error handling to be loaded. -However, please pay close attention to them as they might prevent CUDAdrv.jl from working -properly! Some common issues: - -* unknown error (code 999): this often indicates that your set-up is broken, eg. because you - didn't load the correct, or any, kernel module. Please verify your set-up, on Linux by - executing `nvidia-smi` or on other platforms by compiling and running CUDA C code using - `nvcc`. -* no device (code 100): CUDA didn't detect your device, because it is not supported by CUDA - or because you loaded the wrong kernel driver (eg. legacy when you need regular, or - vice-versa). CUDAdrv.jl cannot work in this case, because CUDA does not allow us to query - the driver version without a valid device, something we need in order to version the API - calls. -* using library stubs (code -1): if any API call returns -1, you're probably using the CUDA - driver library stubs which return this value for every function call. This is not - supported by CUDAdrv.jl, and is only intended to be used when compiling C or C++ code to - be linked with `libcuda.so` at a time when that library isn't available yet. Unless you - purposefully added the stub libraries to the search path, please run the build script with - `JULIA_DEBUG=CUDAdrv` and file a bug report. +Welcome to the API reference documentation of CUDAdrv.jl. This documentation is a work in +progress. For general usage instructions of CUDAdrv.jl and the rest of the Julia CUDA +toolchain, please refer to the [CUDA.jl documentation](https://juliagpu.gitlab.io/CUDA.jl/). + +Even though this package is built on the [low-level CUDA driver +API](http://docs.nvidia.com/cuda/cuda-driver-api/), that should not scare you: The wrappers +provided by CUDAdrv.jl are high level, and generally work similar to the higher-level CUDA +runtime API. diff --git a/docs/src/man/usage.md b/docs/src/man/usage.md deleted file mode 100644 index 11daf9c670..0000000000 --- a/docs/src/man/usage.md +++ /dev/null @@ -1,189 +0,0 @@ -# Usage - -Quick start: - -```jldoctest -using CUDAdrv - -dev = CuDevice(0) -ctx = CuContext(dev) - -# code that does GPU computations - -destroy!(ctx) - -# output - -``` - -To enable debug logging, launch Julia with the `JULIA_DEBUG` environment -variable set to `CUDAdrv`. - -```@meta -DocTestSetup = quote - using CUDAdrv - - dev = CuDevice(0) - ctx = CuContext(dev) -end -``` - - -## Automatic memory management - -Except for the encapsulating context, `destroy` or `unload` calls are never needed. 
Objects -are registered with the Julia garbage collector, and are automatically finalized when they -go out of scope. - -However, many CUDA API functions implicitly depend on global state, such as the current -active context. The wrapper needs to model those dependencies in order for objects not to -get destroyed before any dependent object is. If we fail to model these dependency -relations, API calls might randomly fail, eg. in the case of a missing context dependency -with a `INVALID_CONTEXT` or `CONTEXT_IS_DESTROYED` error message. File a bug report if -that happens. - - -## Device memory - -Device memory is represented as `Buffer` objects, which can be allocated or -initialized from existing arrays: - -```jldoctest -A = zeros(Float32,3,4) -d_A = Mem.upload(A); -typeof(d_A) - -# output - -CUDAdrv.Mem.Buffer -``` - -A variety of methods are defined to work with buffers, however, these are all -low-level methods. Use the CuArrays.jl package for a higher-level array -abstraction. - - -## Modules and custom kernels - -This will not teach you about CUDA programming; for that, please refer to the CUDA -documentation and other online sources. - -### Compiling your own modules - -You can write and use your own custom kernels, first writing a `.cu` file and compiling it -as a `ptx` module. On Linux, compilation would look something like this: - -``` -nvcc -ptx mycudamodule.cu -``` - -You can specify that the code should be compiled for compute capability 2.0 devices or -higher using: - -``` -nvcc -ptx -gencode=arch=compute_20,code=sm_20 mycudamodule.cu -``` - -If you want to write code that will support multiple datatypes (e.g., `Float32` and -`Float64`), it's recommended that you use C++ and write your code using templates. Then use -`extern C` to instantiate bindings for each datatype. For example: - -```cpp -template -__device__ void kernel_function1(T *data) { - // Code goes here -} -template -__device__ void kernel_function2(T1 *data1, T2 *data2) { - // Code goes here -} - -extern "C" -{ - void __global__ kernel_function1_float(float *data) {kernel_function1(data);} - void __global__ kernel_function1_double(double *data) {kernel_function1(data);} - void __global__ kernel_function2_int_float(int *data1, float *data2) {kernel_function2(data1,data2);} -} -``` - -#### Initializing and freeing PTX modules - -To easily make your kernels available, the recommended approach is to define something -analogous to the following for each `ptx` module (this example uses the kernels described in -the previous section): - -```julia -module MyCudaModule - -import CUDAdrv -import CUDAdrv: CuModule, CuModuleFile, CuFunction, cudacall - -export function1 - -const ptxdict = Dict() -const mdlist = Array{CuModule}(0) - -function mdinit(devlist) - global ptxdict - global mdlist - isempty(mdlist) || error("mdlist is not empty") - for dev in devlist - CuDevice(dev) - md = CuModuleFile("mycudamodule.ptx") - - ptxdict[("function1", Float32)] = CuFunction(md, "kernel_function1_float") - ptxdict[("function1", Float64)] = CuFunction(md, "kernel_function1_double") - ptxdict[("function2", Int32, Float32)] = CuFunction(md, "kernel_function2_int_float") - - push!(mdlist, md) - end -end - -mdclose() = (empty!(mdlist); empty!(ptxdict)) - -function finit() - mdclose() -end - -function init(devlist) - mdinit(devlist) -end - -function function1(griddim::CuDim, blockdim::CuDim, data::CuArray{T}) where T - cufunction1 = ptxdict[("function1", T)] - cudacall(cufunction1, griddim, blockdim, (Ptr{T},), data) -end - -... 
- -end # MyCudaModule -``` - -Usage will look something like the following: - -```julia -gpuid = 0 -dev = CuDevice(gpuid) # Or the ID of the GPU you want, if you have many of them -ctx = CuContext(dev) - -MyCudaModule.init(gpuid) -# Code that uses functions from your MyCudaModule -MyCudaModule.finit() - -destroy!(ctx) -``` - - - -# Other notes - -## Memory storage order - -Julia convention is that matrices are stored in column-major order, whereas C (and CUDA) use -row-major. For efficiency this wrapper avoids reordering memory, so that the linear sequence -of addresses is the same between main memory and the GPU. For most usages, this is probably -what you want. - -However, for the purposes of linear algebra, this effectively means that one is storing the -transpose of matrices on the GPU. Keep this in mind when manipulating code on your GPU -kernels. diff --git a/src/CUDAdrv.jl b/src/CUDAdrv.jl index de5673dde9..67115eb857 100644 --- a/src/CUDAdrv.jl +++ b/src/CUDAdrv.jl @@ -44,9 +44,9 @@ const __initialized__ = Ref(false) functional() = __initialized__[] function __init__() - silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) - verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling + verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) try if haskey(ENV, "_") && basename(ENV["_"]) == "rr" @@ -56,13 +56,13 @@ function __init__() cuInit(0) if version() <= v"9" - @warn "CUDAdrv.jl only supports NVIDIA drivers for CUDA 9.0 or higher (yours is for CUDA $(version()))" + silent || @warn "CUDAdrv.jl only supports NVIDIA drivers for CUDA 9.0 or higher (yours is for CUDA $(version()))" end __initialized__[] = true catch ex # don't actually fail to keep the package loadable - if !silent && !precompiling + if !silent if verbose @error "CUDAdrv.jl failed to initialize" exception=(ex, catch_backtrace()) else diff --git a/src/memory.jl b/src/memory.jl index 9ec77eede8..069cd5a9f1 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -21,6 +21,8 @@ abstract type Buffer end # - ptr, bytesize and ctx fields # - convert() to certain pointers +CUDAdrv.device(buf::Buffer) = device(buf.ctx) + Base.sizeof(buf::Buffer) = buf.bytesize # ccall integration @@ -31,9 +33,13 @@ Base.unsafe_convert(T::Type{<:Union{Ptr,CuPtr}}, buf::Buffer) = convert(T, buf) ## device buffer -## -## residing on the GPU +""" + Mem.DeviceBuffer + Mem.Device + +A buffer of device memory residing on the GPU. +""" struct DeviceBuffer <: Buffer ptr::CuPtr{Cvoid} bytesize::Int @@ -52,7 +58,7 @@ Base.convert(::Type{CuPtr{T}}, buf::DeviceBuffer) where {T} = """ - alloc(DeviceBuffer, bytesize::Integer) + Mem.alloc(DeviceBuffer, bytesize::Integer) Allocate `bytesize` bytes of memory on the device. This memory is only accessible on the GPU, and requires explicit calls to `upload` and `download` for access on the CPU. 
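The `Mem.alloc` docstring above, together with the reworked `driver.md`, documents a buffer-centric workflow: allocate with `Mem.alloc(BufferType, bytes)`, convert to a raw pointer, and `Mem.free` when done. A minimal sketch of that cycle for device memory (sizes and element types are arbitrary illustrations):

```julia
using CUDAdrv
using CUDAdrv: Mem

dev = CuDevice(0)
ctx = CuContext(dev)

# allocate room for 1024 Float32 values on the device
buf = Mem.alloc(Mem.DeviceBuffer, 1024 * sizeof(Float32))

# obtain a raw device pointer, e.g. for use with `cudacall`
ptr = convert(CuPtr{Float32}, buf)

# release the allocation once it is no longer needed
Mem.free(buf)
destroy!(ctx)
```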
@@ -66,11 +72,6 @@ function alloc(::Type{DeviceBuffer}, bytesize::Integer) return DeviceBuffer(reinterpret(CuPtr{Cvoid}, ptr_ref[]), bytesize, CuCurrentContext()) end -@deprecate_binding HOSTALLOC_DEFAULT 0 false -const HOSTALLOC_PORTABLE = CUDAdrv.CU_MEMHOSTALLOC_PORTABLE -const HOSTALLOC_DEVICEMAP = CUDAdrv.CU_MEMHOSTALLOC_DEVICEMAP -const HOSTALLOC_WRITECOMBINED = CUDAdrv.CU_MEMHOSTALLOC_WRITECOMBINED - function free(buf::DeviceBuffer) if buf.ptr != CU_NULL @@ -80,9 +81,13 @@ end ## host buffer -## -## pinned memory on the CPU, possibly accessible on the GPU +""" + Mem.HostBuffer + Mem.Host + +A buffer of pinned memory on the CPU, possibly accessible on the GPU. +""" struct HostBuffer <: Buffer ptr::Ptr{Cvoid} bytesize::Int @@ -110,8 +115,13 @@ function Base.convert(::Type{CuPtr{T}}, buf::HostBuffer) where {T} end +@deprecate_binding HOSTALLOC_DEFAULT 0 false +const HOSTALLOC_PORTABLE = CUDAdrv.CU_MEMHOSTALLOC_PORTABLE +const HOSTALLOC_DEVICEMAP = CUDAdrv.CU_MEMHOSTALLOC_DEVICEMAP +const HOSTALLOC_WRITECOMBINED = CUDAdrv.CU_MEMHOSTALLOC_WRITECOMBINED + """ - alloc(HostBuffer, bytesize::Integer, [flags]) + Mem.alloc(HostBuffer, bytesize::Integer, [flags]) Allocate `bytesize` bytes of page-locked memory on the host. This memory is accessible from the CPU, and makes it possible to perform faster memory copies to the GPU. Furthermore, if @@ -128,11 +138,13 @@ function alloc(::Type{HostBuffer}, bytesize::Integer, flags=0) return HostBuffer(ptr_ref[], bytesize, CuCurrentContext(), mapped) end -@enum_without_prefix CUDAdrv.CUmemAttach_flags CU_MEM_ +const HOSTREGISTER_PORTABLE = CUDAdrv.CU_MEMHOSTREGISTER_PORTABLE +const HOSTREGISTER_DEVICEMAP = CUDAdrv.CU_MEMHOSTREGISTER_DEVICEMAP +const HOSTREGISTER_IOMEMORY = CUDAdrv.CU_MEMHOSTREGISTER_IOMEMORY """ - register(HostBuffer, ptr::Ptr, bytesize::Integer, [flags]) + Mem.register(HostBuffer, ptr::Ptr, bytesize::Integer, [flags]) Page-lock the host memory pointed to by `ptr`. Subsequent transfers to and from devices will be faster, and can be executed asynchronously. If the `MEMHOSTREGISTER_DEVICEMAP` flag is @@ -148,6 +160,11 @@ function register(::Type{HostBuffer}, ptr::Ptr, bytesize::Integer, flags=0) return HostBuffer(ptr, bytesize, CuCurrentContext(), mapped) end +""" + Mem.unregister(HostBuffer) + +Unregisters a memory range that was registered with [`Mem.register`](@ref). +""" function unregister(buf::HostBuffer) CUDAdrv.cuMemHostUnregister(buf) end @@ -161,9 +178,13 @@ end ## unified buffer -## -## managed buffer that is accessible on both the CPU and GPU +""" + Mem.UnifiedBuffer + Mem.Unified + +A managed buffer that is accessible on both the CPU and GPU. +""" struct UnifiedBuffer <: Buffer ptr::CuPtr{Cvoid} bytesize::Int @@ -180,8 +201,10 @@ Base.convert(::Type{Ptr{T}}, buf::UnifiedBuffer) where {T} = Base.convert(::Type{CuPtr{T}}, buf::UnifiedBuffer) where {T} = convert(CuPtr{T}, buf.ptr) +@enum_without_prefix CUDAdrv.CUmemAttach_flags CU_MEM_ + """ - alloc(UnifiedBuffer, bytesize::Integer, [flags::CUmemAttach_flags]) + Mem.alloc(UnifiedBuffer, bytesize::Integer, [flags::CUmemAttach_flags]) Allocate `bytesize` bytes of unified memory. This memory is accessible from both the CPU and GPU, with the CUDA driver automatically copying upon first access. 
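The new `HostBuffer` and `UnifiedBuffer` docstrings above cover pinned and managed memory. A hedged sketch of how those allocations are typically used — the flag constants are the ones (re)introduced by this patch, `Mem.free` on a host buffer is assumed to behave like its device counterpart, and all sizes are arbitrary:

```julia
using CUDAdrv
using CUDAdrv: Mem

dev = CuDevice(0)
ctx = CuContext(dev)

# page-locked host memory, mapped into the device address space
hbuf = Mem.alloc(Mem.HostBuffer, 1024, Mem.HOSTALLOC_DEVICEMAP)
host_ptr   = convert(Ptr{UInt8}, hbuf)    # CPU-side pointer
device_ptr = convert(CuPtr{UInt8}, hbuf)  # GPU-side pointer, valid thanks to DEVICEMAP
Mem.free(hbuf)

# page-lock an existing Julia array instead of allocating new memory
arr  = zeros(Float32, 1024)
rbuf = Mem.register(Mem.HostBuffer, pointer(arr), sizeof(arr), Mem.HOSTREGISTER_PORTABLE)
Mem.unregister(rbuf)

# managed memory, accessible from both the CPU and the GPU
ubuf = Mem.alloc(Mem.UnifiedBuffer, 1024)  # default attach flags assumed
Mem.free(ubuf)

destroy!(ctx)
```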
@@ -204,12 +227,13 @@ function free(buf::UnifiedBuffer) end - -const HOSTREGISTER_PORTABLE = CUDAdrv.CU_MEMHOSTREGISTER_PORTABLE -const HOSTREGISTER_DEVICEMAP = CUDAdrv.CU_MEMHOSTREGISTER_DEVICEMAP -const HOSTREGISTER_IOMEMORY = CUDAdrv.CU_MEMHOSTREGISTER_IOMEMORY +""" + prefetch(::UnifiedBuffer, [bytes::Integer]; [device::CuDevice], [stream::CuStream]) -function prefetch(buf::UnifiedBuffer, bytes=sizeof(buf); - device::CuDevice=device(), stream::CuStream=CuDefaultStream()) +Prefetches memory to the specified destination device. +""" +function prefetch(buf::UnifiedBuffer, bytes::Integer=sizeof(buf); + device::CuDevice=device(buf), stream::CuStream=CuDefaultStream()) bytes > sizeof(buf) && throw(BoundsError(buf, bytes)) CUDAdrv.cuMemPrefetchAsync(buf, bytes, device, stream) end @@ -217,8 +241,13 @@ end @enum_without_prefix CUDAdrv.CUmem_advise CU_MEM_ -function advise(buf::UnifiedBuffer, advice::CUDAdrv.CUmem_advise, bytes=sizeof(buf), - device=device(buf.ctx)) +""" + advise(::UnifiedBuffer, advice::CUDAdrv.CUmem_advise, [bytes::Integer]; [device::CuDevice]) + +Advise about the usage of a given memory range. +""" +function advise(buf::UnifiedBuffer, advice::CUDAdrv.CUmem_advise, bytes::Integer=sizeof(buf); + device::CuDevice=device(buf)) bytes > sizeof(buf) && throw(BoundsError(buf, bytes)) CUDAdrv.cuMemAdvise(buf, bytes, advice, device) end @@ -239,8 +268,8 @@ const Unified = UnifiedBuffer ## initialization """ - set!(buf::CuPtr, value::Union{UInt8,UInt16,UInt32}, len::Integer; - async::Bool=false, stream::CuStream) + Mem.set!(buf::CuPtr, value::Union{UInt8,UInt16,UInt32}, len::Integer; + async::Bool=false, stream::CuStream) Initialize device memory by copying `val` for `len` times. Executed asynchronously if `async` is true, in which case a valid `stream` is required.
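To round off the unified-memory changes, here is a sketch of the reworked `prefetch`/`advise` helpers and `Mem.set!` working together. The `ADVISE_SET_READ_MOSTLY` name assumes the `@enum_without_prefix CUDAdrv.CUmem_advise CU_MEM_` rename above; treat it as illustrative rather than authoritative:

```julia
using CUDAdrv
using CUDAdrv: Mem

dev = CuDevice(0)
ctx = CuContext(dev)

buf = Mem.alloc(Mem.UnifiedBuffer, 1024 * sizeof(UInt32))

# hint that this range is mostly read, then migrate it to the GPU up front
Mem.advise(buf, Mem.ADVISE_SET_READ_MOSTLY; device=dev)
Mem.prefetch(buf; device=dev)

# fill the first 256 32-bit words with zeros
Mem.set!(convert(CuPtr{UInt32}, buf), UInt32(0), 256)

Mem.free(buf)
destroy!(ctx)
```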
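Another user-visible change documented earlier in this patch (`docs/src/driver.md`) is that `CuGlobal` values are now read and written through `Base.getindex`/`Base.setindex!` rather than `CUDAdrv.get`/`CUDAdrv.set`. A sketch of the indexing syntax, with a hypothetical PTX module and global name:

```julia
using CUDAdrv

dev = CuDevice(0)
ctx = CuContext(dev)

md      = CuModuleFile("kernels.ptx")     # hypothetical module exporting an Int32 global
counter = CuGlobal{Int32}(md, "counter")  # hypothetical global variable name

counter[] = Int32(42)                     # replaces the old set(counter, Int32(42))
@assert counter[] == Int32(42)            # replaces the old get(counter)

destroy!(ctx)
```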