Skip to content

Commit

Permalink
Have separate free! method for hostcalls (#539)
Browse files Browse the repository at this point in the history
  • Loading branch information
pxl-th authored Nov 18, 2023
1 parent 8ac70bd commit f96eb14
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 111 deletions.
6 changes: 6 additions & 0 deletions docs/src/hostcall.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ end
y = ROCArray(Float32[0f0])
@roc kernel!(y, hc)
AMDGPU.synchronize(; stop_hostcalls=true) # Stop hostcall.
AMDGPU.Device.free!(hc) # Free hostcall buffers.

@assert Array(y)[1] ≈ 42f0
```
Expand Down Expand Up @@ -68,3 +69,8 @@ hostcall one more time before exiting, while `finish!` will exit immediately.

`finish!` can be used on any `HostCallHolder` to force-exit the running
hostcall task.

## Free hostcall buffers

For custom hostcalls, it is important to call `AMDGPU.Device.free!`
once the kernel has finished, to free the buffers that the hostcall used in the process.
17 changes: 8 additions & 9 deletions src/device/gcn/hostcall.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ function HostCall(
HostCall{RT, AT}(signal_handle, buf_ptr, buf_len)
end

mutable struct HostCallHolder
struct HostCallHolder
hc::HostCall
ret_bufs::Vector{Mem.HostBuffer}
task::Task
Expand Down Expand Up @@ -186,16 +186,15 @@ function HostCallHolder(
end
return
end
HostCallHolder(hc, ret_bufs, tsk, finish_ref, continuous_ref)
end

holder = HostCallHolder(hc, ret_bufs, tsk, finish_ref, continuous_ref)
finalizer(holder) do holder
if !Runtime.RT_EXITING[]
buf_ptr = reinterpret(Ptr{Cvoid}, holder.hc.buf_ptr)
HIP.hipHostFree(buf_ptr) |> HIP.check
Mem.free.(holder.ret_bufs)
end
"""
    free!(holder::HostCallHolder)

Release the buffers referenced by `holder`: the hostcall buffer pointed to by
`holder.hc.buf_ptr` (through `HIP.hipHostFree`, with the result passed to
`HIP.check`) and every element of `holder.ret_bufs` (through `Mem.free`).

Nothing is released when `Runtime.RT_EXITING[]` is set.

Returns `holder`.
"""
function free!(holder::HostCallHolder)
    # Skip every free while the runtime flags that it is exiting.
    Runtime.RT_EXITING[] && return holder

    # NOTE(review): no guard against repeated calls is visible here; presumably
    # callers must invoke this at most once per holder — confirm.
    raw_ptr = reinterpret(Ptr{Cvoid}, holder.hc.buf_ptr)
    HIP.check(HIP.hipHostFree(raw_ptr))
    foreach(Mem.free, holder.ret_bufs)
    return holder
end

Adapt.adapt(to::Runtime.Adaptor, hc::HostCallHolder) = hc.hc
Expand Down
4 changes: 2 additions & 2 deletions src/device/gcn/hostcall_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ const NAMED_PERDEVICE_HOSTCALLS = Dict{
function named_perdevice_hostcall(func, dev::HIP.HIPDevice, name::Symbol)
Base.@lock Runtime.RT_LOCK begin
hcs = get!(
() -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(),
() -> Dict{Symbol, Tuple{HostCallHolder, Mem.HostBuffer}}(),
NAMED_PERDEVICE_HOSTCALLS, dev)
get!(func, hcs, name)
end
Expand All @@ -202,7 +202,7 @@ end
function get_named_perdevice_hostcall(dev::HIP.HIPDevice, name::Symbol)
Base.@lock Runtime.RT_LOCK begin
hcs = get(
() -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(),
() -> Dict{Symbol, Tuple{HostCallHolder, Mem.HostBuffer}}(),
NAMED_PERDEVICE_HOSTCALLS, dev)
get(hcs, name, nothing)
end
Expand Down
2 changes: 2 additions & 0 deletions src/highlevel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ function synchronize(stm::HIPStream = stream();
AMDGPU.Device.finish!(hc[1])
# Remove it from global hostcalls, so that a new one is created.
AMDGPU.Device.remove_perdevice_hostcall!(stm.device, gbl)
# Free buffers.
AMDGPU.Device.free!(hc[1])
end
return
end
Expand Down
12 changes: 12 additions & 0 deletions test/device/hostcall.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 1f0
@test dref[] == true
AMDGPU.Device.free!(hc)
end

@testset "Call: Error" begin
Expand Down Expand Up @@ -48,6 +49,8 @@ end
@test dref[] == false
sleep(1) # Give time for the task to shut down.
@test Base.istaskfailed(hc.task)

AMDGPU.Device.free!(hc)
end
end

Expand All @@ -67,6 +70,7 @@ end
@roc kernel(RA, RB, hc)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 2f0
AMDGPU.Device.free!(hc)
end

@testset "Call: (1 arg)" begin
Expand All @@ -85,6 +89,7 @@ end
@roc kernel(RA, RB, hc)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 44f0
AMDGPU.Device.free!(hc)
end

@testset "Call: (2 homogeneous args)" begin
Expand All @@ -103,6 +108,7 @@ end
@roc kernel(RA, RB, hc)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 47f0
AMDGPU.Device.free!(hc)
end

@testset "Call: (2 heterogeneous args)" begin
Expand All @@ -121,6 +127,7 @@ end
@roc kernel(RA, RB, hc)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 47f0
AMDGPU.Device.free!(hc)
end

@testset "Call: (2 heterogeneous args, return homogeneous tuple)" begin
Expand All @@ -139,6 +146,7 @@ end
@roc kernel(RA, RB, hc)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 48f0
AMDGPU.Device.free!(hc)
end

@testset "Call: (2 heterogeneous args, return heterogeneous tuple)" begin
Expand All @@ -157,6 +165,7 @@ end
@roc kernel(RA, RB, hc)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 48f0
AMDGPU.Device.free!(hc)
end

@testset "Call: (2 hostcalls, 1 kernel)" begin
Expand All @@ -179,6 +188,8 @@ end
@roc kernel(RA, RB, hc1, hc2)
AMDGPU.synchronize(; stop_hostcalls=true)
@test Array(RB)[1] == 11f0
AMDGPU.Device.free!(hc1)
AMDGPU.Device.free!(hc2)
end

@testset "Call: (1 hostcall, 2 kernels)" begin
Expand Down Expand Up @@ -209,6 +220,7 @@ end
# Give HostCall task time to exit.
sleep(2)
@test istaskdone(hc)
AMDGPU.Device.free!(hc)
end

end
202 changes: 102 additions & 100 deletions test/rocarray/base.jl
Original file line number Diff line number Diff line change
Expand Up @@ -144,106 +144,108 @@ end
finalize(A)
end

@testset "unsafe_copy3d!" begin
@testset "Full copy" begin
T = Int32
src = ROCArray(ones(T, 4, 4, 4))
dst = ROCArray(zeros(T, 4, 4, 4))
Mem.unsafe_copy3d!(
pointer(dst), typeof(dst.buf[]),
pointer(src), typeof(src.buf[]),
length(src))
@test Array(src) == Array(dst)
end

@testset "3D Copy middle part of y-z planes, each dimension is different in size" begin
nx, ny, nz = 4, 6, 8
src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz)))
dst = ROCArray(zeros(Int, nx, ny, nz))
Mem.unsafe_copy3d!(
pointer(dst), typeof(dst.buf[]),
pointer(src), typeof(src.buf[]),
1, 4, 4;
dstPos=(1, 2, 3), srcPos=(1, 2, 3),
dstPitch=nx * sizeof(Int), dstHeight=ny,
srcPitch=nx * sizeof(Int), srcHeight=ny)
@test Array(src)[1, 2:5, 3:6] == Array(dst)[1, 2:5, 3:6]
end

@testset "3D Copy middle part of x-y-z planes, each dimension is different in size" begin
nx, ny, nz = 4, 6, 8
src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz)))
dst = ROCArray(zeros(Int, nx, ny, nz))
Mem.unsafe_copy3d!(
pointer(dst), typeof(dst.buf[]),
pointer(src), typeof(src.buf[]),
2, 4, 4;
dstPos=(2, 2, 3), srcPos=(2, 2, 3),
dstPitch=nx * sizeof(Int), dstHeight=ny,
srcPitch=nx * sizeof(Int), srcHeight=ny)
@test Array(src)[2:3, 2:5, 3:6] == Array(dst)[2:3, 2:5, 3:6]
end

@testset "3D -> 2D -> 3D copy" begin
nx, ny, nz = 2, 3, 4
T = Int
P = ROCArray(reshape(1:(2 * 3 * 4), nx, ny, nz))

for dim in 1:3
if dim == 1
ranges = [2:2, 1:size(P,2), 1:size(P,3)]
buf = zeros(T, size(P,2), size(P,3))
buf_view_shape = (1, size(P,2), size(P,3))
elseif dim == 2
ranges = [1:size(P,1), 3:3, 1:size(P,3)]
buf = zeros(T, size(P,1), size(P,3))
buf_view_shape = (size(P,1), 1, size(P,3))
elseif dim == 3
ranges = [1:size(P,1), 1:size(P,2), 3:3]
buf = zeros(T, size(P,1), size(P,2))
buf_view_shape = (size(P,1), size(P,2), 1)
end

# Reshape 2D to 3D for simplicity.
buf_view = reshape(buf, buf_view_shape)

AMDGPU.Mem.unsafe_copy3d!(
pointer(buf), AMDGPU.Mem.HostBuffer,
pointer(P), typeof(P.buf[]),
length(ranges[1]), length(ranges[2]), length(ranges[3]);
srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]),
dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2),
srcPitch=sizeof(T) * size(P, 1), srcHeight=size(P, 2))

if dim == 1
@assert buf == Array(P)[2, :, :]
elseif dim == 2
@assert buf == Array(P)[:, 3, :]
elseif dim == 3
@assert buf == Array(P)[:, :, 3]
end

# host to device
P2 = similar(P)

AMDGPU.Mem.unsafe_copy3d!(
pointer(P2), typeof(P2.buf[]),
pointer(buf), AMDGPU.Mem.HostBuffer,
length(ranges[1]), length(ranges[2]), length(ranges[3]);
dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]),
dstPitch=sizeof(T) * size(P2,1), dstHeight=size(P2, 2),
srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2))

if dim == 1
@assert Array(P2)[2, :, :] == Array(P)[2, :, :]
elseif dim == 2
@assert Array(P2)[:, 3, :] == Array(P)[:, 3, :]
elseif dim == 3
@assert Array(P2)[:, :, 3] == Array(P)[:, :, 3]
end
end
end
end
# TODO unsafe_copy3d! is broken in HIP:
# https://github.com/ROCm-Developer-Tools/HIP/issues/3289#issuecomment-1651195870
# @testset "unsafe_copy3d!" begin
# @testset "Full copy" begin
# T = Int32
# src = ROCArray(ones(T, 4, 4, 4))
# dst = ROCArray(zeros(T, 4, 4, 4))
# Mem.unsafe_copy3d!(
# pointer(dst), typeof(dst.buf[]),
# pointer(src), typeof(src.buf[]),
# length(src))
# @test Array(src) == Array(dst)
# end

# @testset "3D Copy middle part of y-z planes, each dimension is different in size" begin
# nx, ny, nz = 4, 6, 8
# src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz)))
# dst = ROCArray(zeros(Int, nx, ny, nz))
# Mem.unsafe_copy3d!(
# pointer(dst), typeof(dst.buf[]),
# pointer(src), typeof(src.buf[]),
# 1, 4, 4;
# dstPos=(1, 2, 3), srcPos=(1, 2, 3),
# dstPitch=nx * sizeof(Int), dstHeight=ny,
# srcPitch=nx * sizeof(Int), srcHeight=ny)
# @test Array(src)[1, 2:5, 3:6] == Array(dst)[1, 2:5, 3:6]
# end

# @testset "3D Copy middle part of x-y-z planes, each dimension is different in size" begin
# nx, ny, nz = 4, 6, 8
# src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz)))
# dst = ROCArray(zeros(Int, nx, ny, nz))
# Mem.unsafe_copy3d!(
# pointer(dst), typeof(dst.buf[]),
# pointer(src), typeof(src.buf[]),
# 2, 4, 4;
# dstPos=(2, 2, 3), srcPos=(2, 2, 3),
# dstPitch=nx * sizeof(Int), dstHeight=ny,
# srcPitch=nx * sizeof(Int), srcHeight=ny)
# @test Array(src)[2:3, 2:5, 3:6] == Array(dst)[2:3, 2:5, 3:6]
# end

# @testset "3D -> 2D -> 3D copy" begin
# nx, ny, nz = 2, 3, 4
# T = Int
# P = ROCArray(reshape(1:(2 * 3 * 4), nx, ny, nz))

# for dim in 1:3
# if dim == 1
# ranges = [2:2, 1:size(P,2), 1:size(P,3)]
# buf = zeros(T, size(P,2), size(P,3))
# buf_view_shape = (1, size(P,2), size(P,3))
# elseif dim == 2
# ranges = [1:size(P,1), 3:3, 1:size(P,3)]
# buf = zeros(T, size(P,1), size(P,3))
# buf_view_shape = (size(P,1), 1, size(P,3))
# elseif dim == 3
# ranges = [1:size(P,1), 1:size(P,2), 3:3]
# buf = zeros(T, size(P,1), size(P,2))
# buf_view_shape = (size(P,1), size(P,2), 1)
# end

# # Reshape 2D to 3D for simplicity.
# buf_view = reshape(buf, buf_view_shape)

# AMDGPU.Mem.unsafe_copy3d!(
# pointer(buf), AMDGPU.Mem.HostBuffer,
# pointer(P), typeof(P.buf[]),
# length(ranges[1]), length(ranges[2]), length(ranges[3]);
# srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]),
# dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2),
# srcPitch=sizeof(T) * size(P, 1), srcHeight=size(P, 2))

# if dim == 1
# @assert buf == Array(P)[2, :, :]
# elseif dim == 2
# @assert buf == Array(P)[:, 3, :]
# elseif dim == 3
# @assert buf == Array(P)[:, :, 3]
# end

# # host to device
# P2 = similar(P)

# AMDGPU.Mem.unsafe_copy3d!(
# pointer(P2), typeof(P2.buf[]),
# pointer(buf), AMDGPU.Mem.HostBuffer,
# length(ranges[1]), length(ranges[2]), length(ranges[3]);
# dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]),
# dstPitch=sizeof(T) * size(P2,1), dstHeight=size(P2, 2),
# srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2))

# if dim == 1
# @assert Array(P2)[2, :, :] == Array(P)[2, :, :]
# elseif dim == 2
# @assert Array(P2)[:, 3, :] == Array(P)[:, 3, :]
# elseif dim == 3
# @assert Array(P2)[:, :, 3] == Array(P)[:, :, 3]
# end
# end
# end
# end

@testset "accumulate" begin
for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384 + 1)
Expand Down

0 comments on commit f96eb14

Please sign in to comment.