From 5e08e1d7bdc5d690222546da95560d1ac16ceadb Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 14 Jan 2025 12:27:24 -0800 Subject: [PATCH] Revert "cacheInputs propagates allocation only for matmul schedulers." (#3706) Reverts NVIDIA/Fuser#3621 to fix #3701 --- csrc/scheduler/ampere_multi_matmul.cpp | 3 +-- csrc/scheduler/hopper_multi_matmul.cpp | 3 +-- csrc/scheduler/utils.cpp | 13 +++++-------- csrc/scheduler/utils.h | 5 +---- tests/cpp/test_allocation_domain.cpp | 4 +++- 5 files changed, 11 insertions(+), 17 deletions(-) diff --git a/csrc/scheduler/ampere_multi_matmul.cpp b/csrc/scheduler/ampere_multi_matmul.cpp index e051607257a..598482b76c9 100644 --- a/csrc/scheduler/ampere_multi_matmul.cpp +++ b/csrc/scheduler/ampere_multi_matmul.cpp @@ -489,8 +489,7 @@ void AmpereMultipleMatmulScheduler::cacheInputsAndOutputs() { scheduler_utils::clearMemorySpace(fusion_); // Cache inputs - scheduler_utils::cacheInputs( - fusion_, /*unroll=*/true, /*propagate_allocation=*/true); + scheduler_utils::cacheInputs(fusion_, /*unroll=*/true); // Cache and fork outputs cached_outputs_ = diff --git a/csrc/scheduler/hopper_multi_matmul.cpp b/csrc/scheduler/hopper_multi_matmul.cpp index ba84a2c896c..b0e4b751c8a 100644 --- a/csrc/scheduler/hopper_multi_matmul.cpp +++ b/csrc/scheduler/hopper_multi_matmul.cpp @@ -101,8 +101,7 @@ void HopperMultipleMatmulScheduler::cacheInputsAndOutputs() { scheduler_utils::clearMemorySpace(fusion_); // Cache inputs - scheduler_utils::cacheInputs( - fusion_, /*unroll=*/true, /*propagate_allocation=*/true); + scheduler_utils::cacheInputs(fusion_, /*unroll=*/true); // Cache and fork outputs scheduler_utils::cacheAndForkOutputs(fusion_, /*unroll=*/true); diff --git a/csrc/scheduler/utils.cpp b/csrc/scheduler/utils.cpp index 6b840c96b8c..cd22d935a52 100644 --- a/csrc/scheduler/utils.cpp +++ b/csrc/scheduler/utils.cpp @@ -1187,10 +1187,7 @@ void clearMemorySpace(Fusion* fusion) { // Returns cached after tensors of the fusion inputs if unrolled. Otherwise // return empty vector. -std::vector cacheInputs( - Fusion* fusion, - bool unroll, - bool propagate_allocation) { +std::vector cacheInputs(Fusion* fusion, bool unroll) { if (!unroll) { return {}; } @@ -1227,10 +1224,10 @@ std::vector cacheInputs( } auto cached_tv = tv->cacheAfter( - LoadStoreOpType::Set, - CacheOp::Unspecified, - propagate_allocation, - cached_uses); + /*op_type=*/LoadStoreOpType::Set, + /*cache_op=*/CacheOp::Unspecified, + /*propagate_allocation_domain=*/true, + /*cached_uses=*/cached_uses); cached_inputs.emplace_back(cached_tv); } return cached_inputs; diff --git a/csrc/scheduler/utils.h b/csrc/scheduler/utils.h index dbad708f003..62a359816d2 100644 --- a/csrc/scheduler/utils.h +++ b/csrc/scheduler/utils.h @@ -334,10 +334,7 @@ void clearMemorySpace(Fusion* fusion); // Returns cached after tensors of the fusion inputs if unrolled. Otherwise // return empty vector. -std::vector cacheInputs( - Fusion* fusion, - bool unroll, - bool propagate_allocation = false); +std::vector cacheInputs(Fusion* fusion, bool unroll); // Returns the pairs of for // all outputs. diff --git a/tests/cpp/test_allocation_domain.cpp b/tests/cpp/test_allocation_domain.cpp index a726dd6b262..bff62bb98e1 100644 --- a/tests/cpp/test_allocation_domain.cpp +++ b/tests/cpp/test_allocation_domain.cpp @@ -1426,9 +1426,11 @@ TEST_F(AllocationDomainTest, InputAllocationIsSplit_Concrete) { fusion->addInput(in); fusion->addOutput(out); + // Ideally, loop should stay the same as logical because a fusion input comes + // from outside and isn't generated by a loop in the containing kernel (cf. + // #3479). in->split(0, 2); in->setAllocationDomain(in->getLoopDomain(), true); - in->setLoopDomain(in->getLogicalDomain()); FusionExecutorCache executor_cache(std::move(fusion)); auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA);