diff --git a/docs/src/hostcall.md b/docs/src/hostcall.md index 8b4275ae2..3d3769916 100644 --- a/docs/src/hostcall.md +++ b/docs/src/hostcall.md @@ -33,6 +33,7 @@ end y = ROCArray(Float32[0f0]) @roc kernel!(y, hc) AMDGPU.synchronize(; stop_hostcalls=true) # Stop hostcall. +AMDGPU.Device.free!(hc) # Free hostcall buffers. @assert Array(y)[1] ≈ 42f0 ``` @@ -68,3 +69,8 @@ hostcall one more time before exiting, while `finish!` will exit immediately. `finish!` can be used on any `HostCallHolder` to force-exit the running hostcall task. + +## Free hostcall buffers + +For custom hostcalls it is important to call `AMDGPU.Device.free!` +once the kernel has finished, to free the buffers that the hostcall used. diff --git a/src/device/gcn/hostcall.jl b/src/device/gcn/hostcall.jl index f01d547b1..27eec81fe 100644 --- a/src/device/gcn/hostcall.jl +++ b/src/device/gcn/hostcall.jl @@ -59,7 +59,7 @@ function HostCall( HostCall{RT, AT}(signal_handle, buf_ptr, buf_len) end -mutable struct HostCallHolder +struct HostCallHolder hc::HostCall ret_bufs::Vector{Mem.HostBuffer} task::Task @@ -186,16 +186,15 @@ function HostCallHolder( end return end + HostCallHolder(hc, ret_bufs, tsk, finish_ref, continuous_ref) +end - holder = HostCallHolder(hc, ret_bufs, tsk, finish_ref, continuous_ref) - finalizer(holder) do holder - if !Runtime.RT_EXITING[] - buf_ptr = reinterpret(Ptr{Cvoid}, holder.hc.buf_ptr) - HIP.hipHostFree(buf_ptr) |> HIP.check - Mem.free.(holder.ret_bufs) - end +function free!(holder::HostCallHolder) + if !Runtime.RT_EXITING[] + buf_ptr = reinterpret(Ptr{Cvoid}, holder.hc.buf_ptr) + HIP.hipHostFree(buf_ptr) |> HIP.check + Mem.free.(holder.ret_bufs) end - return holder end Adapt.adapt(to::Runtime.Adaptor, hc::HostCallHolder) = hc.hc diff --git a/src/device/gcn/hostcall_utils.jl b/src/device/gcn/hostcall_utils.jl index e2a95f207..36b59f7b1 100644 --- a/src/device/gcn/hostcall_utils.jl +++ b/src/device/gcn/hostcall_utils.jl @@ -192,7 +192,7 @@ const NAMED_PERDEVICE_HOSTCALLS =
Dict{ function named_perdevice_hostcall(func, dev::HIP.HIPDevice, name::Symbol) Base.@lock Runtime.RT_LOCK begin hcs = get!( - () -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(), + () -> Dict{Symbol, Tuple{HostCallHolder, Mem.HostBuffer}}(), NAMED_PERDEVICE_HOSTCALLS, dev) get!(func, hcs, name) end @@ -202,7 +202,7 @@ end function get_named_perdevice_hostcall(dev::HIP.HIPDevice, name::Symbol) Base.@lock Runtime.RT_LOCK begin hcs = get( - () -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(), + () -> Dict{Symbol, Tuple{HostCallHolder, Mem.HostBuffer}}(), NAMED_PERDEVICE_HOSTCALLS, dev) get(hcs, name, nothing) end diff --git a/src/highlevel.jl b/src/highlevel.jl index 1d0d19ad1..bec308e84 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -65,6 +65,8 @@ function synchronize(stm::HIPStream = stream(); AMDGPU.Device.finish!(hc[1]) # Remove it from global hostcalls, so that new one is created. AMDGPU.Device.remove_perdevice_hostcall!(stm.device, gbl) + # Free buffers. + AMDGPU.Device.free!(hc[1]) end return end diff --git a/test/device/hostcall.jl b/test/device/hostcall.jl index bfde5a364..75d196652 100644 --- a/test/device/hostcall.jl +++ b/test/device/hostcall.jl @@ -20,6 +20,7 @@ AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 1f0 @test dref[] == true + AMDGPU.Device.free!(hc) end @testset "Call: Error" begin @@ -48,6 +49,8 @@ end @test dref[] == false sleep(1) # Give time for the task to shut down. 
@test Base.istaskfailed(hc.task) + + AMDGPU.Device.free!(hc) end end @@ -67,6 +70,7 @@ end @roc kernel(RA, RB, hc) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 2f0 + AMDGPU.Device.free!(hc) end @testset "Call: (1 arg)" begin @@ -85,6 +89,7 @@ end @roc kernel(RA, RB, hc) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 44f0 + AMDGPU.Device.free!(hc) end @testset "Call: (2 homogeneous args)" begin @@ -103,6 +108,7 @@ end @roc kernel(RA, RB, hc) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 47f0 + AMDGPU.Device.free!(hc) end @testset "Call: (2 heterogeneous args)" begin @@ -121,6 +127,7 @@ end @roc kernel(RA, RB, hc) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 47f0 + AMDGPU.Device.free!(hc) end @testset "Call: (2 heterogeneous args, return homogeneous tuple)" begin @@ -139,6 +146,7 @@ end @roc kernel(RA, RB, hc) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 48f0 + AMDGPU.Device.free!(hc) end @testset "Call: (2 heterogeneous args, return heterogeneous tuple)" begin @@ -157,6 +165,7 @@ end @roc kernel(RA, RB, hc) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 48f0 + AMDGPU.Device.free!(hc) end @testset "Call: (2 hostcalls, 1 kernel)" begin @@ -179,6 +188,8 @@ end @roc kernel(RA, RB, hc1, hc2) AMDGPU.synchronize(; stop_hostcalls=true) @test Array(RB)[1] == 11f0 + AMDGPU.Device.free!(hc1) + AMDGPU.Device.free!(hc2) end @testset "Call: (1 hostcall, 2 kernels)" begin @@ -209,6 +220,7 @@ end # Give HostCall task time to exit. sleep(2) @test istaskdone(hc) + AMDGPU.Device.free!(hc) end end diff --git a/test/rocarray/base.jl b/test/rocarray/base.jl index fe20f45ec..254399210 100644 --- a/test/rocarray/base.jl +++ b/test/rocarray/base.jl @@ -144,106 +144,108 @@ end finalize(A) end -@testset "unsafe_copy3d!" 
begin - @testset "Full copy" begin - T = Int32 - src = ROCArray(ones(T, 4, 4, 4)) - dst = ROCArray(zeros(T, 4, 4, 4)) - Mem.unsafe_copy3d!( - pointer(dst), typeof(dst.buf[]), - pointer(src), typeof(src.buf[]), - length(src)) - @test Array(src) == Array(dst) - end - - @testset "3D Copy middle part of y-z planes, each dimension is different in size" begin - nx, ny, nz = 4, 6, 8 - src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz))) - dst = ROCArray(zeros(Int, nx, ny, nz)) - Mem.unsafe_copy3d!( - pointer(dst), typeof(dst.buf[]), - pointer(src), typeof(src.buf[]), - 1, 4, 4; - dstPos=(1, 2, 3), srcPos=(1, 2, 3), - dstPitch=nx * sizeof(Int), dstHeight=ny, - srcPitch=nx * sizeof(Int), srcHeight=ny) - @test Array(src)[1, 2:5, 3:6] == Array(dst)[1, 2:5, 3:6] - end - - @testset "3D Copy middle part of x-y-z planes, each dimension is different in size" begin - nx, ny, nz = 4, 6, 8 - src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz))) - dst = ROCArray(zeros(Int, nx, ny, nz)) - Mem.unsafe_copy3d!( - pointer(dst), typeof(dst.buf[]), - pointer(src), typeof(src.buf[]), - 2, 4, 4; - dstPos=(2, 2, 3), srcPos=(2, 2, 3), - dstPitch=nx * sizeof(Int), dstHeight=ny, - srcPitch=nx * sizeof(Int), srcHeight=ny) - @test Array(src)[2:3, 2:5, 3:6] == Array(dst)[2:3, 2:5, 3:6] - end - - @testset "3D -> 2D -> 3D copy" begin - nx, ny, nz = 2, 3, 4 - T = Int - P = ROCArray(reshape(1:(2 * 3 * 4), nx, ny, nz)) - - for dim in 1:3 - if dim == 1 - ranges = [2:2, 1:size(P,2), 1:size(P,3)] - buf = zeros(T, size(P,2), size(P,3)) - buf_view_shape = (1, size(P,2), size(P,3)) - elseif dim == 2 - ranges = [1:size(P,1), 3:3, 1:size(P,3)] - buf = zeros(T, size(P,1), size(P,3)) - buf_view_shape = (size(P,1), 1, size(P,3)) - elseif dim == 3 - ranges = [1:size(P,1), 1:size(P,2), 3:3] - buf = zeros(T, size(P,1), size(P,2)) - buf_view_shape = (size(P,1), size(P,2), 1) - end - - # Reshape 2D to 3D for simplicity. 
- buf_view = reshape(buf, buf_view_shape) - - AMDGPU.Mem.unsafe_copy3d!( - pointer(buf), AMDGPU.Mem.HostBuffer, - pointer(P), typeof(P.buf[]), - length(ranges[1]), length(ranges[2]), length(ranges[3]); - srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]), - dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), - srcPitch=sizeof(T) * size(P, 1), srcHeight=size(P, 2)) - - if dim == 1 - @assert buf == Array(P)[2, :, :] - elseif dim == 2 - @assert buf == Array(P)[:, 3, :] - elseif dim == 3 - @assert buf == Array(P)[:, :, 3] - end - - # host to device - P2 = similar(P) - - AMDGPU.Mem.unsafe_copy3d!( - pointer(P2), typeof(P2.buf[]), - pointer(buf), AMDGPU.Mem.HostBuffer, - length(ranges[1]), length(ranges[2]), length(ranges[3]); - dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]), - dstPitch=sizeof(T) * size(P2,1), dstHeight=size(P2, 2), - srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2)) - - if dim == 1 - @assert Array(P2)[2, :, :] == Array(P)[2, :, :] - elseif dim == 2 - @assert Array(P2)[:, 3, :] == Array(P)[:, 3, :] - elseif dim == 3 - @assert Array(P2)[:, :, 3] == Array(P)[:, :, 3] - end - end - end -end +# TODO unsafe_copy3d! is broken in HIP: +# https://github.com/ROCm-Developer-Tools/HIP/issues/3289#issuecomment-1651195870 +# @testset "unsafe_copy3d!" 
begin +# @testset "Full copy" begin +# T = Int32 +# src = ROCArray(ones(T, 4, 4, 4)) +# dst = ROCArray(zeros(T, 4, 4, 4)) +# Mem.unsafe_copy3d!( +# pointer(dst), typeof(dst.buf[]), +# pointer(src), typeof(src.buf[]), +# length(src)) +# @test Array(src) == Array(dst) +# end + +# @testset "3D Copy middle part of y-z planes, each dimension is different in size" begin +# nx, ny, nz = 4, 6, 8 +# src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz))) +# dst = ROCArray(zeros(Int, nx, ny, nz)) +# Mem.unsafe_copy3d!( +# pointer(dst), typeof(dst.buf[]), +# pointer(src), typeof(src.buf[]), +# 1, 4, 4; +# dstPos=(1, 2, 3), srcPos=(1, 2, 3), +# dstPitch=nx * sizeof(Int), dstHeight=ny, +# srcPitch=nx * sizeof(Int), srcHeight=ny) +# @test Array(src)[1, 2:5, 3:6] == Array(dst)[1, 2:5, 3:6] +# end + +# @testset "3D Copy middle part of x-y-z planes, each dimension is different in size" begin +# nx, ny, nz = 4, 6, 8 +# src = ROCArray(collect(reshape(1:(nx * ny * nz), nx, ny, nz))) +# dst = ROCArray(zeros(Int, nx, ny, nz)) +# Mem.unsafe_copy3d!( +# pointer(dst), typeof(dst.buf[]), +# pointer(src), typeof(src.buf[]), +# 2, 4, 4; +# dstPos=(2, 2, 3), srcPos=(2, 2, 3), +# dstPitch=nx * sizeof(Int), dstHeight=ny, +# srcPitch=nx * sizeof(Int), srcHeight=ny) +# @test Array(src)[2:3, 2:5, 3:6] == Array(dst)[2:3, 2:5, 3:6] +# end + +# @testset "3D -> 2D -> 3D copy" begin +# nx, ny, nz = 2, 3, 4 +# T = Int +# P = ROCArray(reshape(1:(2 * 3 * 4), nx, ny, nz)) + +# for dim in 1:3 +# if dim == 1 +# ranges = [2:2, 1:size(P,2), 1:size(P,3)] +# buf = zeros(T, size(P,2), size(P,3)) +# buf_view_shape = (1, size(P,2), size(P,3)) +# elseif dim == 2 +# ranges = [1:size(P,1), 3:3, 1:size(P,3)] +# buf = zeros(T, size(P,1), size(P,3)) +# buf_view_shape = (size(P,1), 1, size(P,3)) +# elseif dim == 3 +# ranges = [1:size(P,1), 1:size(P,2), 3:3] +# buf = zeros(T, size(P,1), size(P,2)) +# buf_view_shape = (size(P,1), size(P,2), 1) +# end + +# # Reshape 2D to 3D for simplicity. 
+# buf_view = reshape(buf, buf_view_shape) + +# AMDGPU.Mem.unsafe_copy3d!( +# pointer(buf), AMDGPU.Mem.HostBuffer, +# pointer(P), typeof(P.buf[]), +# length(ranges[1]), length(ranges[2]), length(ranges[3]); +# srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]), +# dstPitch=sizeof(T) * size(buf_view, 1), dstHeight=size(buf_view, 2), +# srcPitch=sizeof(T) * size(P, 1), srcHeight=size(P, 2)) + +# if dim == 1 +# @assert buf == Array(P)[2, :, :] +# elseif dim == 2 +# @assert buf == Array(P)[:, 3, :] +# elseif dim == 3 +# @assert buf == Array(P)[:, :, 3] +# end + +# # host to device +# P2 = similar(P) + +# AMDGPU.Mem.unsafe_copy3d!( +# pointer(P2), typeof(P2.buf[]), +# pointer(buf), AMDGPU.Mem.HostBuffer, +# length(ranges[1]), length(ranges[2]), length(ranges[3]); +# dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]), +# dstPitch=sizeof(T) * size(P2,1), dstHeight=size(P2, 2), +# srcPitch=sizeof(T) * size(buf_view, 1), srcHeight=size(buf_view, 2)) + +# if dim == 1 +# @assert Array(P2)[2, :, :] == Array(P)[2, :, :] +# elseif dim == 2 +# @assert Array(P2)[:, 3, :] == Array(P)[:, 3, :] +# elseif dim == 3 +# @assert Array(P2)[:, :, 3] == Array(P)[:, :, 3] +# end +# end +# end +# end @testset "accumulate" begin for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384 + 1)