diff --git a/docs/Manifest.toml b/docs/Manifest.toml new file mode 100644 index 0000000000..cf6109c7b7 --- /dev/null +++ b/docs/Manifest.toml @@ -0,0 +1,92 @@ +# This file is machine-generated - editing it directly is not advised + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "88bb0edb352b16608036faadcc071adda068582a" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.1" + +[[Documenter]] +deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "885467cebde4639a3d81953652cc53ff5a73cb87" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.24.3" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.0" + +[[LibGit2]] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Parsers]] +deps = ["Dates", "Test"] +git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556" +uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "0.3.10" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/make.jl b/docs/make.jl index f5ce353f2d..dd173a2a33 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,17 +1,28 @@ using Documenter, CUDAdrv -makedocs( - modules = [CUDAdrv], - format = Documenter.HTML(prettyurls = get(ENV, "CI", nothing) == "true"), - sitename = "CUDAdrv.jl", - pages = [ - "Home" => "index.md", - "Manual" => [ - "man/usage.md" - ], - "Library" => [ - "lib/api.md", +const src = "https://github.com/JuliaGPU/CUDAdrv.jl" +const dst = "https://juliagpu.gitlab.io/CUDAdrv.jl/" + +function main() + makedocs( + sitename = "CUDAdrv.jl", + authors = "Tim Besard", + repo = "$src/blob/{commit}{path}#{line}", + format = Documenter.HTML( + # Use clean URLs on CI + prettyurls = get(ENV, "CI", nothing) == "true", + canonical = dst, + assets = ["assets/favicon.ico"], + analytics = "UA-154489943-5", + ), + doctest = false, + pages = Any[ + "Home" => "index.md", + "APIs" => [ + 
"driver.md", + ] ] - ], - doctest = true -) + ) +end + +isinteractive() || main() diff --git a/docs/src/assets/favicon.ico b/docs/src/assets/favicon.ico new file mode 100644 index 0000000000..9021a68fbb Binary files /dev/null and b/docs/src/assets/favicon.ico differ diff --git a/docs/src/assets/logo.png b/docs/src/assets/logo.png new file mode 100644 index 0000000000..c5610d094d Binary files /dev/null and b/docs/src/assets/logo.png differ diff --git a/docs/src/lib/api.md b/docs/src/driver.md similarity index 66% rename from docs/src/lib/api.md rename to docs/src/driver.md index 8e89e602ce..e00ffef6bb 100644 --- a/docs/src/lib/api.md +++ b/docs/src/driver.md @@ -1,4 +1,4 @@ -# API wrappers +# CUDA driver This section lists the package's public functionality that directly corresponds to functionality of the CUDA driver API. In general, the abstractions stay close to those of @@ -8,20 +8,6 @@ the CUDA driver API, so for more information on certain library calls you can co The documentation is grouped according to the modules of the driver API. -## Installation properties - -```@docs -CUDAdrv.vendor -``` - - -## Initialization - -```@docs -CUDAdrv.init -``` - - ## Error Handling ```@docs @@ -63,7 +49,7 @@ CUDAdrv.CuContext CUDAdrv.destroy!(::CuContext) CUDAdrv.CuCurrentContext CUDAdrv.activate(::CuContext) -CUDAdrv.synchronize(::CuContext) +CUDAdrv.synchronize() CUDAdrv.device(::CuContext) ``` @@ -96,8 +82,8 @@ CUDAdrv.CuFunction ```@docs CUDAdrv.CuGlobal CUDAdrv.eltype(::CuGlobal) -CUDAdrv.get(::CuGlobal) -CUDAdrv.set{T}(::CuGlobal{T}, ::T) +Base.getindex(::CuGlobal) +Base.setindex!(::CuGlobal{T}, ::T) where {T} ``` ### Linker @@ -111,27 +97,42 @@ CUDAdrv.complete CUDAdrv.CuModule(::CUDAdrv.CuLinkImage, args...) ``` + ## Memory Management +Three kinds of memory buffers can be allocated: device memory, host memory, and unified +memory. Each of these buffers can be allocated by calling `alloc` with the type of buffer as +first argument, and freed by calling `free`. Certain buffers have specific methods defined. + +```@docs +CUDAdrv.Mem.DeviceBuffer +CUDAdrv.Mem.alloc(::Type{Mem.DeviceBuffer}, ::Integer) +``` + +```@docs +CUDAdrv.Mem.HostBuffer +CUDAdrv.Mem.alloc(::Type{Mem.HostBuffer}, ::Integer, flags) +CUDAdrv.Mem.register(::Type{Mem.HostBuffer}, ::Ptr, ::Integer, flags) +CUDAdrv.Mem.unregister(::Mem.HostBuffer) +``` + ```@docs -CUDAdrv.Mem.alloc -CUDAdrv.Mem.free -CUDAdrv.Mem.set! -CUDAdrv.Mem.upload -CUDAdrv.Mem.upload! -CUDAdrv.Mem.download -CUDAdrv.Mem.download! -CUDAdrv.Mem.transfer -CUDAdrv.Mem.transfer! +CUDAdrv.Mem.UnifiedBuffer +CUDAdrv.Mem.alloc(::Type{Mem.UnifiedBuffer}, ::Integer, ::CUDAdrv.CUmemAttach_flags) +CUDAdrv.Mem.prefetch(::Mem.UnifiedBuffer, bytes::Integer; device, stream) +CUDAdrv.Mem.advise(::Mem.UnifiedBuffer, ::CUDAdrv.CUmem_advise, ::Integer; device) ``` +To work with these buffers, you need to `convert` them to a `Ptr` or `CuPtr`. Several +methods then work with these raw pointers: + + + ### Memory info ```@docs -CUDAdrv.Mem.info -CUDAdrv.Mem.total -CUDAdrv.Mem.used -CUDAdrv.Mem.free() +CUDAdrv.available_memory +CUDAdrv.total_memory ``` diff --git a/docs/src/index.md b/docs/src/index.md index 5cb884c387..cd665fdba7 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,56 +1,10 @@ # CUDAdrv.jl -*A Julia wrapper for the CUDA driver API.* - -This package aims to provide high-level wrappers for the functionality exposed by the CUDA -driver API, and is meant for users who need high- or low-level access to the CUDA toolkit or -the underlying hardware. 
- -The package is built upon the [low-level CUDA driver -API](http://docs.nvidia.com/cuda/cuda-driver-api/), but that shouldn't make the Julia -wrapper any harder to use. That said, it is a work-in-progress and does not offer the same -functionality or convenience as the more popular -[CUDArt](https://github.com/JuliaGPU/CUDArt.jl) package, which is built upon the -[higher-level CUDA runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/). - - -## Installation - -Requirements: - -* Julia 0.5 or higher (use - [v0.1.0](https://github.com/JuliaGPU/CUDAdrv.jl/releases/tag/v0.1.0) of this package - for compatibility with Julia 0.4) -* NVIDIA driver, providing `libcuda.so` (the full CUDA toolkit is not required) -* CUDA hardware - -At the Julia REPL: - -```julia -Pkg.add("CUDAdrv") -using CUDAdrv - -# optionally -Pkg.test("CUDAdrv") -``` - -Loading CUDAdrv might display error messages, indicating issues with your set-up. These -messages can be cryptic as they happen too early for decent error handling to be loaded. -However, please pay close attention to them as they might prevent CUDAdrv.jl from working -properly! Some common issues: - -* unknown error (code 999): this often indicates that your set-up is broken, eg. because you - didn't load the correct, or any, kernel module. Please verify your set-up, on Linux by - executing `nvidia-smi` or on other platforms by compiling and running CUDA C code using - `nvcc`. -* no device (code 100): CUDA didn't detect your device, because it is not supported by CUDA - or because you loaded the wrong kernel driver (eg. legacy when you need regular, or - vice-versa). CUDAdrv.jl cannot work in this case, because CUDA does not allow us to query - the driver version without a valid device, something we need in order to version the API - calls. -* using library stubs (code -1): if any API call returns -1, you're probably using the CUDA - driver library stubs which return this value for every function call. This is not - supported by CUDAdrv.jl, and is only intended to be used when compiling C or C++ code to - be linked with `libcuda.so` at a time when that library isn't available yet. Unless you - purposefully added the stub libraries to the search path, please run the build script with - `JULIA_DEBUG=CUDAdrv` and file a bug report. +Welcome to the API reference documentation of CUDAdrv.jl. This documentation is a work in +progress. For general usage instructions of CUDAdrv.jl and the rest of the Julia CUDA +toolchain, please refer to the [CUDA.jl documentation](https://juliagpu.gitlab.io/CUDA.jl/). + +Even though this package is built on the [low-level CUDA driver +API](http://docs.nvidia.com/cuda/cuda-driver-api/), that should not scare you: The wrappers +provided by CUDAdrv.jl are high level, and generally work similar to the higher-level CUDA +runtime API. diff --git a/docs/src/man/usage.md b/docs/src/man/usage.md deleted file mode 100644 index 11daf9c670..0000000000 --- a/docs/src/man/usage.md +++ /dev/null @@ -1,189 +0,0 @@ -# Usage - -Quick start: - -```jldoctest -using CUDAdrv - -dev = CuDevice(0) -ctx = CuContext(dev) - -# code that does GPU computations - -destroy!(ctx) - -# output - -``` - -To enable debug logging, launch Julia with the `JULIA_DEBUG` environment -variable set to `CUDAdrv`. - -```@meta -DocTestSetup = quote - using CUDAdrv - - dev = CuDevice(0) - ctx = CuContext(dev) -end -``` - - -## Automatic memory management - -Except for the encapsulating context, `destroy` or `unload` calls are never needed. 
Objects -are registered with the Julia garbage collector, and are automatically finalized when they -go out of scope. - -However, many CUDA API functions implicitly depend on global state, such as the current -active context. The wrapper needs to model those dependencies in order for objects not to -get destroyed before any dependent object is. If we fail to model these dependency -relations, API calls might randomly fail, eg. in the case of a missing context dependency -with a `INVALID_CONTEXT` or `CONTEXT_IS_DESTROYED` error message. File a bug report if -that happens. - - -## Device memory - -Device memory is represented as `Buffer` objects, which can be allocated or -initialized from existing arrays: - -```jldoctest -A = zeros(Float32,3,4) -d_A = Mem.upload(A); -typeof(d_A) - -# output - -CUDAdrv.Mem.Buffer -``` - -A variety of methods are defined to work with buffers, however, these are all -low-level methods. Use the CuArrays.jl package for a higher-level array -abstraction. - - -## Modules and custom kernels - -This will not teach you about CUDA programming; for that, please refer to the CUDA -documentation and other online sources. - -### Compiling your own modules - -You can write and use your own custom kernels, first writing a `.cu` file and compiling it -as a `ptx` module. On Linux, compilation would look something like this: - -``` -nvcc -ptx mycudamodule.cu -``` - -You can specify that the code should be compiled for compute capability 2.0 devices or -higher using: - -``` -nvcc -ptx -gencode=arch=compute_20,code=sm_20 mycudamodule.cu -``` - -If you want to write code that will support multiple datatypes (e.g., `Float32` and -`Float64`), it's recommended that you use C++ and write your code using templates. Then use -`extern C` to instantiate bindings for each datatype. For example: - -```cpp -template -__device__ void kernel_function1(T *data) { - // Code goes here -} -template -__device__ void kernel_function2(T1 *data1, T2 *data2) { - // Code goes here -} - -extern "C" -{ - void __global__ kernel_function1_float(float *data) {kernel_function1(data);} - void __global__ kernel_function1_double(double *data) {kernel_function1(data);} - void __global__ kernel_function2_int_float(int *data1, float *data2) {kernel_function2(data1,data2);} -} -``` - -#### Initializing and freeing PTX modules - -To easily make your kernels available, the recommended approach is to define something -analogous to the following for each `ptx` module (this example uses the kernels described in -the previous section): - -```julia -module MyCudaModule - -import CUDAdrv -import CUDAdrv: CuModule, CuModuleFile, CuFunction, cudacall - -export function1 - -const ptxdict = Dict() -const mdlist = Array{CuModule}(0) - -function mdinit(devlist) - global ptxdict - global mdlist - isempty(mdlist) || error("mdlist is not empty") - for dev in devlist - CuDevice(dev) - md = CuModuleFile("mycudamodule.ptx") - - ptxdict[("function1", Float32)] = CuFunction(md, "kernel_function1_float") - ptxdict[("function1", Float64)] = CuFunction(md, "kernel_function1_double") - ptxdict[("function2", Int32, Float32)] = CuFunction(md, "kernel_function2_int_float") - - push!(mdlist, md) - end -end - -mdclose() = (empty!(mdlist); empty!(ptxdict)) - -function finit() - mdclose() -end - -function init(devlist) - mdinit(devlist) -end - -function function1(griddim::CuDim, blockdim::CuDim, data::CuArray{T}) where T - cufunction1 = ptxdict[("function1", T)] - cudacall(cufunction1, griddim, blockdim, (Ptr{T},), data) -end - -... 
- -end # MyCudaModule -``` - -Usage will look something like the following: - -```julia -gpuid = 0 -dev = CuDevice(gpuid) # Or the ID of the GPU you want, if you have many of them -ctx = CuContext(dev) - -MyCudaModule.init(gpuid) -# Code that uses functions from your MyCudaModule -MyCudaModule.finit() - -destroy!(ctx) -``` - - - -# Other notes - -## Memory storage order - -Julia convention is that matrices are stored in column-major order, whereas C (and CUDA) use -row-major. For efficiency this wrapper avoids reordering memory, so that the linear sequence -of addresses is the same between main memory and the GPU. For most usages, this is probably -what you want. - -However, for the purposes of linear algebra, this effectively means that one is storing the -transpose of matrices on the GPU. Keep this in mind when manipulating code on your GPU -kernels. diff --git a/src/CUDAdrv.jl b/src/CUDAdrv.jl index de5673dde9..67115eb857 100644 --- a/src/CUDAdrv.jl +++ b/src/CUDAdrv.jl @@ -44,9 +44,9 @@ const __initialized__ = Ref(false) functional() = __initialized__[] function __init__() - silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) - verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) precompiling = ccall(:jl_generating_output, Cint, ()) != 0 + silent = parse(Bool, get(ENV, "JULIA_CUDA_SILENT", "false")) || precompiling + verbose = parse(Bool, get(ENV, "JULIA_CUDA_VERBOSE", "false")) try if haskey(ENV, "_") && basename(ENV["_"]) == "rr" @@ -56,13 +56,13 @@ function __init__() cuInit(0) if version() <= v"9" - @warn "CUDAdrv.jl only supports NVIDIA drivers for CUDA 9.0 or higher (yours is for CUDA $(version()))" + silent || @warn "CUDAdrv.jl only supports NVIDIA drivers for CUDA 9.0 or higher (yours is for CUDA $(version()))" end __initialized__[] = true catch ex # don't actually fail to keep the package loadable - if !silent && !precompiling + if !silent if verbose @error "CUDAdrv.jl failed to initialize" exception=(ex, catch_backtrace()) else diff --git a/src/memory.jl b/src/memory.jl index 9ec77eede8..069cd5a9f1 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -21,6 +21,8 @@ abstract type Buffer end # - ptr, bytesize and ctx fields # - convert() to certain pointers +CUDAdrv.device(buf::Buffer) = device(buf.ctx) + Base.sizeof(buf::Buffer) = buf.bytesize # ccall integration @@ -31,9 +33,13 @@ Base.unsafe_convert(T::Type{<:Union{Ptr,CuPtr}}, buf::Buffer) = convert(T, buf) ## device buffer -## -## residing on the GPU +""" + Mem.DeviceBuffer + Mem.Device + +A buffer of device memory residing on the GPU. +""" struct DeviceBuffer <: Buffer ptr::CuPtr{Cvoid} bytesize::Int @@ -52,7 +58,7 @@ Base.convert(::Type{CuPtr{T}}, buf::DeviceBuffer) where {T} = """ - alloc(DeviceBuffer, bytesize::Integer) + Mem.alloc(DeviceBuffer, bytesize::Integer) Allocate `bytesize` bytes of memory on the device. This memory is only accessible on the GPU, and requires explicit calls to `upload` and `download` for access on the CPU. 
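The `Mem.alloc` docstring above, together with the reworked `driver.md`, documents a buffer-centric workflow: allocate with `Mem.alloc(BufferType, bytes)`, convert to a raw pointer, and `Mem.free` when done. A minimal sketch of that cycle for device memory (sizes and element types are arbitrary illustrations):

```julia
using CUDAdrv
using CUDAdrv: Mem

dev = CuDevice(0)
ctx = CuContext(dev)

# allocate room for 1024 Float32 values on the device
buf = Mem.alloc(Mem.DeviceBuffer, 1024 * sizeof(Float32))

# obtain a raw device pointer, e.g. for use with `cudacall`
ptr = convert(CuPtr{Float32}, buf)

# release the allocation once it is no longer needed
Mem.free(buf)
destroy!(ctx)
```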
@@ -66,11 +72,6 @@ function alloc(::Type{DeviceBuffer}, bytesize::Integer) return DeviceBuffer(reinterpret(CuPtr{Cvoid}, ptr_ref[]), bytesize, CuCurrentContext()) end -@deprecate_binding HOSTALLOC_DEFAULT 0 false -const HOSTALLOC_PORTABLE = CUDAdrv.CU_MEMHOSTALLOC_PORTABLE -const HOSTALLOC_DEVICEMAP = CUDAdrv.CU_MEMHOSTALLOC_DEVICEMAP -const HOSTALLOC_WRITECOMBINED = CUDAdrv.CU_MEMHOSTALLOC_WRITECOMBINED - function free(buf::DeviceBuffer) if buf.ptr != CU_NULL @@ -80,9 +81,13 @@ end ## host buffer -## -## pinned memory on the CPU, possibly accessible on the GPU +""" + Mem.HostBuffer + Mem.Host + +A buffer of pinned memory on the CPU, possibly accessible on the GPU. +""" struct HostBuffer <: Buffer ptr::Ptr{Cvoid} bytesize::Int @@ -110,8 +115,13 @@ function Base.convert(::Type{CuPtr{T}}, buf::HostBuffer) where {T} end +@deprecate_binding HOSTALLOC_DEFAULT 0 false +const HOSTALLOC_PORTABLE = CUDAdrv.CU_MEMHOSTALLOC_PORTABLE +const HOSTALLOC_DEVICEMAP = CUDAdrv.CU_MEMHOSTALLOC_DEVICEMAP +const HOSTALLOC_WRITECOMBINED = CUDAdrv.CU_MEMHOSTALLOC_WRITECOMBINED + """ - alloc(HostBuffer, bytesize::Integer, [flags]) + Mem.alloc(HostBuffer, bytesize::Integer, [flags]) Allocate `bytesize` bytes of page-locked memory on the host. This memory is accessible from the CPU, and makes it possible to perform faster memory copies to the GPU. Furthermore, if @@ -128,11 +138,13 @@ function alloc(::Type{HostBuffer}, bytesize::Integer, flags=0) return HostBuffer(ptr_ref[], bytesize, CuCurrentContext(), mapped) end -@enum_without_prefix CUDAdrv.CUmemAttach_flags CU_MEM_ +const HOSTREGISTER_PORTABLE = CUDAdrv.CU_MEMHOSTREGISTER_PORTABLE +const HOSTREGISTER_DEVICEMAP = CUDAdrv.CU_MEMHOSTREGISTER_DEVICEMAP +const HOSTREGISTER_IOMEMORY = CUDAdrv.CU_MEMHOSTREGISTER_IOMEMORY """ - register(HostBuffer, ptr::Ptr, bytesize::Integer, [flags]) + Mem.register(HostBuffer, ptr::Ptr, bytesize::Integer, [flags]) Page-lock the host memory pointed to by `ptr`. Subsequent transfers to and from devices will be faster, and can be executed asynchronously. If the `MEMHOSTREGISTER_DEVICEMAP` flag is @@ -148,6 +160,11 @@ function register(::Type{HostBuffer}, ptr::Ptr, bytesize::Integer, flags=0) return HostBuffer(ptr, bytesize, CuCurrentContext(), mapped) end +""" + Mem.unregister(HostBuffer) + +Unregisters a memory range that was registered with [`Mem.register`](@ref). +""" function unregister(buf::HostBuffer) CUDAdrv.cuMemHostUnregister(buf) end @@ -161,9 +178,13 @@ end ## unified buffer -## -## managed buffer that is accessible on both the CPU and GPU +""" + Mem.UnifiedBuffer + Mem.Unified + +A managed buffer that is accessible on both the CPU and GPU. +""" struct UnifiedBuffer <: Buffer ptr::CuPtr{Cvoid} bytesize::Int @@ -180,8 +201,10 @@ Base.convert(::Type{Ptr{T}}, buf::UnifiedBuffer) where {T} = Base.convert(::Type{CuPtr{T}}, buf::UnifiedBuffer) where {T} = convert(CuPtr{T}, buf.ptr) +@enum_without_prefix CUDAdrv.CUmemAttach_flags CU_MEM_ + """ - alloc(UnifiedBuffer, bytesize::Integer, [flags::CUmemAttach_flags]) + Mem.alloc(UnifiedBuffer, bytesize::Integer, [flags::CUmemAttach_flags]) Allocate `bytesize` bytes of unified memory. This memory is accessible from both the CPU and GPU, with the CUDA driver automatically copying upon first access. 
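The new `HostBuffer` and `UnifiedBuffer` docstrings above cover pinned and managed memory. A hedged sketch of how those allocations are typically used — the flag constants are the ones (re)introduced by this patch, `Mem.free` on a host buffer is assumed to behave like its device counterpart, and all sizes are arbitrary:

```julia
using CUDAdrv
using CUDAdrv: Mem

dev = CuDevice(0)
ctx = CuContext(dev)

# page-locked host memory, mapped into the device address space
hbuf = Mem.alloc(Mem.HostBuffer, 1024, Mem.HOSTALLOC_DEVICEMAP)
host_ptr   = convert(Ptr{UInt8}, hbuf)    # CPU-side pointer
device_ptr = convert(CuPtr{UInt8}, hbuf)  # GPU-side pointer, valid thanks to DEVICEMAP
Mem.free(hbuf)

# page-lock an existing Julia array instead of allocating new memory
arr  = zeros(Float32, 1024)
rbuf = Mem.register(Mem.HostBuffer, pointer(arr), sizeof(arr), Mem.HOSTREGISTER_PORTABLE)
Mem.unregister(rbuf)

# managed memory, accessible from both the CPU and the GPU
ubuf = Mem.alloc(Mem.UnifiedBuffer, 1024)  # default attach flags assumed
Mem.free(ubuf)

destroy!(ctx)
```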
@@ -204,12 +227,13 @@ function free(buf::UnifiedBuffer) end - -const HOSTREGISTER_PORTABLE = CUDAdrv.CU_MEMHOSTREGISTER_PORTABLE -const HOSTREGISTER_DEVICEMAP = CUDAdrv.CU_MEMHOSTREGISTER_DEVICEMAP -const HOSTREGISTER_IOMEMORY = CUDAdrv.CU_MEMHOSTREGISTER_IOMEMORY +""" + prefetch(::UnifiedBuffer, [bytes::Integer]; [device::CuDevice], [stream::CuStream]) -function prefetch(buf::UnifiedBuffer, bytes=sizeof(buf); - device::CuDevice=device(), stream::CuStream=CuDefaultStream()) +Prefetches memory to the specified destination device. +""" +function prefetch(buf::UnifiedBuffer, bytes::Integer=sizeof(buf); + device::CuDevice=device(buf), stream::CuStream=CuDefaultStream()) bytes > sizeof(buf) && throw(BoundsError(buf, bytes)) CUDAdrv.cuMemPrefetchAsync(buf, bytes, device, stream) end @@ -217,8 +241,13 @@ end @enum_without_prefix CUDAdrv.CUmem_advise CU_MEM_ -function advise(buf::UnifiedBuffer, advice::CUDAdrv.CUmem_advise, bytes=sizeof(buf), - device=device(buf.ctx)) +""" + advise(::UnifiedBuffer, advice::CUDAdrv.CUmem_advise, [bytes::Integer]; [device::CuDevice]) + +Advise about the usage of a given memory range. +""" +function advise(buf::UnifiedBuffer, advice::CUDAdrv.CUmem_advise, bytes::Integer=sizeof(buf); + device::CuDevice=device(buf)) bytes > sizeof(buf) && throw(BoundsError(buf, bytes)) CUDAdrv.cuMemAdvise(buf, bytes, advice, device) end @@ -239,8 +268,8 @@ const Unified = UnifiedBuffer ## initialization """ - set!(buf::CuPtr, value::Union{UInt8,UInt16,UInt32}, len::Integer; - async::Bool=false, stream::CuStream) + Mem.set!(buf::CuPtr, value::Union{UInt8,UInt16,UInt32}, len::Integer; + async::Bool=false, stream::CuStream) Initialize device memory by copying `val` for `len` times. Executed asynchronously if `async` is true, in which case a valid `stream` is required.
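To round off the unified-memory changes, here is a sketch of the reworked `prefetch`/`advise` helpers and `Mem.set!` working together. The `ADVISE_SET_READ_MOSTLY` name assumes the `@enum_without_prefix CUDAdrv.CUmem_advise CU_MEM_` rename above; treat it as illustrative rather than authoritative:

```julia
using CUDAdrv
using CUDAdrv: Mem

dev = CuDevice(0)
ctx = CuContext(dev)

buf = Mem.alloc(Mem.UnifiedBuffer, 1024 * sizeof(UInt32))

# hint that this range is mostly read, then migrate it to the GPU up front
Mem.advise(buf, Mem.ADVISE_SET_READ_MOSTLY; device=dev)
Mem.prefetch(buf; device=dev)

# fill the first 256 32-bit words with zeros
Mem.set!(convert(CuPtr{UInt32}, buf), UInt32(0), 256)

Mem.free(buf)
destroy!(ctx)
```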
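Another user-visible change documented earlier in this patch (`docs/src/driver.md`) is that `CuGlobal` values are now read and written through `Base.getindex`/`Base.setindex!` rather than `CUDAdrv.get`/`CUDAdrv.set`. A sketch of the indexing syntax, with a hypothetical PTX module and global name:

```julia
using CUDAdrv

dev = CuDevice(0)
ctx = CuContext(dev)

md      = CuModuleFile("kernels.ptx")     # hypothetical module exporting an Int32 global
counter = CuGlobal{Int32}(md, "counter")  # hypothetical global variable name

counter[] = Int32(42)                     # replaces the old set(counter, Int32(42))
@assert counter[] == Int32(42)            # replaces the old get(counter)

destroy!(ctx)
```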