From 490677aef6604ab4f865d661c824e122ffdb457d Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Mon, 20 Dec 2021 11:03:46 +0800
Subject: [PATCH] [lang] Fix ndarray cuda dealloc when using preallocated memory (#3829)

* Identify preallocate case

* Add test for caching behavior

* Auto Format

* Add a note

* Auto Format

Co-authored-by: Taichi Gardener
---
 taichi/backends/cuda/cuda_device.cpp |  4 +++-
 taichi/backends/cuda/cuda_device.h   | 11 +++++++++++
 tests/python/test_ndarray.py         | 12 ++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/taichi/backends/cuda/cuda_device.cpp b/taichi/backends/cuda/cuda_device.cpp
index c3c813879..bac1fbde8 100644
--- a/taichi/backends/cuda/cuda_device.cpp
+++ b/taichi/backends/cuda/cuda_device.cpp
@@ -23,6 +23,7 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
   info.size = params.size;
   info.is_imported = false;
   info.use_cached = false;
+  info.use_preallocated = false;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -48,6 +49,7 @@ DeviceAllocation CudaDevice::allocate_memory_runtime(
   info.size = taichi::iroundup(params.size, taichi_page_size);
   info.is_imported = false;
   info.use_cached = params.use_cached;
+  info.use_preallocated = true;
 
   DeviceAllocation alloc;
   alloc.alloc_id = allocations_.size();
@@ -69,7 +71,7 @@ void CudaDevice::dealloc_memory(DeviceAllocation handle) {
       TI_ERROR("the CudaCachingAllocator is not initialized");
     }
     caching_allocator_->release(info.size, (uint64_t *)info.ptr);
-  } else {
+  } else if (!info.use_preallocated) {
     CUDADriver::get_instance().mem_free(info.ptr);
     info.ptr = nullptr;
   }
diff --git a/taichi/backends/cuda/cuda_device.h b/taichi/backends/cuda/cuda_device.h
index 0e804c0a8..5a320c6a2 100644
--- a/taichi/backends/cuda/cuda_device.h
+++ b/taichi/backends/cuda/cuda_device.h
@@ -81,6 +81,17 @@ class CudaDevice : public Device {
     void *ptr{nullptr};
     size_t size{0};
     bool is_imported{false};
+    /* Note: Memory allocation in CUDA device.
+     * CudaDevice can use either its own cuda malloc mechanism via
+     * `allocate_memory` or the preallocated memory managed by Llvmprogramimpl
+     * via `allocate_memory_runtime`. The `use_preallocated` is used to track
+     * this option. For now, we keep both options and the preallocated method is
+     * used by default for CUDA backend. The `use_cached` is to enable/disable
+     * the caching behavior in `allocate_memory_runtime`. Later it should be
+     * always enabled, for now we keep both options to allow a scenario when
+     * using preallocated memory while disabling the caching behavior.
+     * */
+    bool use_preallocated{true};
     bool use_cached{false};
   };
 
diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py
index f3934f1ad..cb6145c16 100644
--- a/tests/python/test_ndarray.py
+++ b/tests/python/test_ndarray.py
@@ -309,6 +309,18 @@ def _test_ndarray_deepcopy():
     assert y[4][1, 0] == 9
 
 
+def test_ndarray_cuda_caching_allocator():
+    ti.init(arch=ti.cuda,
+            ndarray_use_torch=False,
+            ndarray_use_cached_allocator=True)
+    n = 8
+    a = ti.ndarray(ti.i32, shape=(n))
+    a.fill(2)
+    a = 1
+    b = ti.ndarray(ti.i32, shape=(n))
+    b.fill(2)
+
+
 @ti.test(arch=supported_archs_taichi_ndarray, ndarray_use_torch=False)
 def test_ndarray_rw_cache():
     a = ti.Vector.ndarray(3, ti.f32, ())
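
For context, the branching this patch leaves in CudaDevice::dealloc_memory can be summarized with the short standalone C++ sketch below. The AllocInfo struct and the two stub functions are simplified stand-ins for Taichi's internals (CudaCachingAllocator::release and CUDADriver::mem_free), not the actual implementation; they only model which code path owns the freeing of each kind of allocation.

// Standalone model of the dealloc decision introduced by this patch.
// AllocInfo and the two stub functions are simplified stand-ins for
// CudaDevice::AllocInfo, CudaCachingAllocator::release and
// CUDADriver::mem_free; they are assumptions for illustration only.
#include <cstdint>
#include <cstdio>

struct AllocInfo {
  void *ptr{nullptr};
  uint64_t size{0};
  bool use_cached{false};       // block came from the caching allocator
  bool use_preallocated{true};  // block lives in the runtime-preallocated pool
};

void caching_release(uint64_t size, void *ptr) {
  std::printf("return %llu bytes at %p to the cache\n",
              (unsigned long long)size, ptr);
}

void driver_mem_free(void *ptr) {
  std::printf("cudaFree(%p)\n", ptr);
}

// Mirrors CudaDevice::dealloc_memory after the fix: cached blocks go back to
// the caching allocator, preallocated (non-cached) blocks are left alone so
// the runtime can reclaim them, and only directly allocated blocks are freed
// through the driver.
void dealloc(AllocInfo &info) {
  if (info.use_cached) {
    caching_release(info.size, info.ptr);
  } else if (!info.use_preallocated) {
    driver_mem_free(info.ptr);
    info.ptr = nullptr;
  }
  // else: preallocated and not cached -> nothing to free here.
}

int main() {
  int dummy = 0;
  AllocInfo cached{&dummy, 4096, true, true};     // allocate_memory_runtime, use_cached=true
  AllocInfo prealloc{&dummy, 4096, false, true};  // allocate_memory_runtime, use_cached=false
  AllocInfo direct{&dummy, 4096, false, false};   // allocate_memory (plain cuda malloc)
  dealloc(cached);
  dealloc(prealloc);  // no-op: previously this case still reached mem_free
  dealloc(direct);
  return 0;
}

The middle case is the one the patch fixes: before this change, a non-cached allocation served from the preallocated pool fell into the plain else branch and was handed to CUDADriver mem_free, even though that memory belongs to the pool managed by Llvmprogramimpl rather than to a standalone cuda malloc.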