Skip to content

Commit

Permalink
Revert "cacheInputs propagates allocation only for matmul schedulers." (#3706)

Browse files Browse the repository at this point in the history

Reverts #3621 to fix #3701
  • Loading branch information
wujingyue authored Jan 14, 2025
1 parent de04f12 commit 5e08e1d
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 17 deletions.
3 changes: 1 addition & 2 deletions csrc/scheduler/ampere_multi_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,8 +489,7 @@ void AmpereMultipleMatmulScheduler::cacheInputsAndOutputs() {
scheduler_utils::clearMemorySpace(fusion_);

// Cache inputs
scheduler_utils::cacheInputs(
fusion_, /*unroll=*/true, /*propagate_allocation=*/true);
scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);

// Cache and fork outputs
cached_outputs_ =
Expand Down
3 changes: 1 addition & 2 deletions csrc/scheduler/hopper_multi_matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,7 @@ void HopperMultipleMatmulScheduler::cacheInputsAndOutputs() {
scheduler_utils::clearMemorySpace(fusion_);

// Cache inputs
scheduler_utils::cacheInputs(
fusion_, /*unroll=*/true, /*propagate_allocation=*/true);
scheduler_utils::cacheInputs(fusion_, /*unroll=*/true);

// Cache and fork outputs
scheduler_utils::cacheAndForkOutputs(fusion_, /*unroll=*/true);
Expand Down
13 changes: 5 additions & 8 deletions csrc/scheduler/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1187,10 +1187,7 @@ void clearMemorySpace(Fusion* fusion) {

// Returns cached after tensors of the fusion inputs if unrolled. Otherwise
// return empty vector.
std::vector<TensorView*> cacheInputs(
Fusion* fusion,
bool unroll,
bool propagate_allocation) {
std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll) {
if (!unroll) {
return {};
}
Expand Down Expand Up @@ -1227,10 +1224,10 @@ std::vector<TensorView*> cacheInputs(
}

auto cached_tv = tv->cacheAfter(
LoadStoreOpType::Set,
CacheOp::Unspecified,
propagate_allocation,
cached_uses);
/*op_type=*/LoadStoreOpType::Set,
/*cache_op=*/CacheOp::Unspecified,
/*propagate_allocation_domain=*/true,
/*cached_uses=*/cached_uses);
cached_inputs.emplace_back(cached_tv);
}
return cached_inputs;
Expand Down
5 changes: 1 addition & 4 deletions csrc/scheduler/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,10 +334,7 @@ void clearMemorySpace(Fusion* fusion);

// Returns cached after tensors of the fusion inputs if unrolled. Otherwise
// return empty vector.
std::vector<TensorView*> cacheInputs(
Fusion* fusion,
bool unroll,
bool propagate_allocation = false);
std::vector<TensorView*> cacheInputs(Fusion* fusion, bool unroll);

// Returns the pairs of <cache of each fusion output, corresponding output> for
// all outputs.
Expand Down
4 changes: 3 additions & 1 deletion tests/cpp/test_allocation_domain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1426,9 +1426,11 @@ TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) {
fusion->addInput(in);
fusion->addOutput(out);

// Ideally, loop should stay the same as logical because a fusion input comes
// from outside and isn't generated by a loop in the containing kernel (cf.
// #3479).
in->split(0, 2);
in->setAllocationDomain(in->getLoopDomain(), true);
in->setLoopDomain(in->getLogicalDomain());

FusionExecutorCache executor_cache(std::move(fusion));
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);
Expand Down

0 comments on commit 5e08e1d

Please sign in to comment.