From 490677aef6604ab4f865d661c824e122ffdb457d Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Mon, 20 Dec 2021 11:03:46 +0800
Subject: [PATCH] [lang] Fix ndarray cuda dealloc when using preallocated memory (#3829)

* Identify preallocate case

* Add test for caching behavior

* Auto Format

* Add a note

* Auto Format

Co-authored-by: Taichi Gardener
---
 taichi/backends/cuda/cuda_device.cpp |  4 +++-
 taichi/backends/cuda/cuda_device.h   | 11 +++++++++++
 tests/python/test_ndarray.py         | 12 ++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/taichi/backends/cuda/cuda_device.cpp b/taichi/backends/cuda/cuda_device.cpp
index c3c813879..bac1fbde8 100644
--- a/taichi/backends/cuda/cuda_device.cpp
+++ b/taichi/backends/cuda/cuda_device.cpp
@@ -23,6 +23,7 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
   info.size = params.size;
   info.is_imported = false;
   info.use_cached = false;
+  info.use_preallocated = false;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -48,6 +49,7 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   info.is_imported = false;
   info.use_cached = params.use_cached;
+  info.use_preallocated = true;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -69,7 +71,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
       TI_ERROR("the CudaCachingAllocator is not initialized");
     }
     caching_allocator_->release(info.size, (uint64_t *)info.ptr);
-  } else {
+  } else if (!info.use_preallocated) {
     CUDADriver::get_instance().mem_free(info.ptr);
     info.ptr = nullptr;
   }
diff --git a/taichi/backends/cuda/cuda_device.h b/taichi/backends/cuda/cuda_device.h
index 0e804c0a8..5a320c6a2 100644
--- a/taichi/backends/cuda/cuda_device.h
+++ b/taichi/backends/cuda/cuda_device.h
@@ -81,6 +81,17 @@ class CudaDevice : public Device {
     void *ptr{nullptr};
     size_t size{0};
     bool is_imported{false};
+    /* Note: Memory allocation in CUDA device.
+     * CudaDevice can use either its own cuda malloc mechanism via
+     * `allocate_memory` or the preallocated memory managed by Llvmprogramimpl
+     * via `allocate_memory_runtime`. The `use_preallocated` is used to track
+     * this option. For now, we keep both options and the preallocated method is
+     * used by default for CUDA backend. The `use_cached` is to enable/disable
+     * the caching behavior in `allocate_memory_runtime`. Later it should be
+     * always enabled, for now we keep both options to allow a scenario when
+     * using preallocated memory while disabling the caching behavior.
+     * */
+    bool use_preallocated{true};
     bool use_cached{false};
   };
 
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index f3934f1ad..cb6145c16 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -309,6 +309,18 @@ def _test_ndarray_deepcopy():
     assert y[4][1, 0] == 9
 
 
+def test_ndarray_cuda_caching_allocator():
+    ti.init(arch=ti.cuda,
+            ndarray_use_torch=False,
+            ndarray_use_cached_allocator=True)
+    n = 8
+    a = ti.ndarray(ti.i32, shape=(n))
+    a.fill(2)
+    a = 1
+    b = ti.ndarray(ti.i32, shape=(n))
+    b.fill(2)
+
+
 @ti.test(arch=supported_archs_taichi_ndarray, ndarray_use_torch=False)
 def test_ndarray_rw_cache():
     a = ti.Vector.ndarray(3, ti.f32, ())
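
For context, the branching this patch leaves in CudaDevice::dealloc_memory can be summarized with the short standalone C++ sketch below. The AllocInfo struct and the two stub functions are simplified stand-ins for Taichi's internals (CudaCachingAllocator::release and CUDADriver::mem_free), not the actual implementation; they only model which code path owns the freeing of each kind of allocation.

// Standalone model of the dealloc decision introduced by this patch.
// AllocInfo and the two stub functions are simplified stand-ins for
// CudaDevice::AllocInfo, CudaCachingAllocator::release and
// CUDADriver::mem_free; they are assumptions for illustration only.
#include <cstdint>
#include <cstdio>

struct AllocInfo {
  void *ptr{nullptr};
  uint64_t size{0};
  bool use_cached{false};       // block came from the caching allocator
  bool use_preallocated{true};  // block lives in the runtime-preallocated pool
};

void caching_release(uint64_t size, void *ptr) {
  std::printf("return %llu bytes at %p to the cache\n",
              (unsigned long long)size, ptr);
}

void driver_mem_free(void *ptr) {
  std::printf("cudaFree(%p)\n", ptr);
}

// Mirrors CudaDevice::dealloc_memory after the fix: cached blocks go back to
// the caching allocator, preallocated (non-cached) blocks are left alone so
// the runtime can reclaim them, and only directly allocated blocks are freed
// through the driver.
void dealloc(AllocInfo &info) {
  if (info.use_cached) {
    caching_release(info.size, info.ptr);
  } else if (!info.use_preallocated) {
    driver_mem_free(info.ptr);
    info.ptr = nullptr;
  }
  // else: preallocated and not cached -> nothing to free here.
}

int main() {
  int dummy = 0;
  AllocInfo cached{&dummy, 4096, true, true};     // allocate_memory_runtime, use_cached=true
  AllocInfo prealloc{&dummy, 4096, false, true};  // allocate_memory_runtime, use_cached=false
  AllocInfo direct{&dummy, 4096, false, false};   // allocate_memory (plain cuda malloc)
  dealloc(cached);
  dealloc(prealloc);  // no-op: previously this case still reached mem_free
  dealloc(direct);
  return 0;
}

The middle case is the one the patch fixes: before this change, a non-cached allocation served from the preallocated pool fell into the plain else branch and was handed to CUDADriver mem_free, even though that memory belongs to the pool managed by Llvmprogramimpl rather than to a standalone cuda malloc.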