From 04e06a80b7a6341c5037a0b294ccb11594fe7242 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 18 Nov 2024 11:15:02 -0800
Subject: [PATCH 01/17] Add a repro for #3282

---
 tests/cpp/test_multidevice_sharding.cpp | 32 +++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index 1e1ff2eab9e..873cbd3e8ca 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -340,6 +340,38 @@ TEST_F(MultiDeviceTest, Transpose) {
       UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose)));
 }
 
+TEST_F(MultiDeviceTest, ParallelizeLoopSplit) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto num_devices = communicator_->size();
+  auto mesh = DeviceMesh::createForNumDevices(num_devices);
+
+  TensorView* in = makeContigConcreteTensor({num_devices * 3});
+  in->setDeviceMesh(mesh);
+  fusion->addInput(in);
+  TensorView* out = set(in);
+  fusion->addOutput(out);
+
+  for (auto* tv : {in, out}) {
+    tv->split(0, num_devices, /*inner_split=*/false);
+    tv->axis(0)->parallelize(ParallelType::DIDx);
+    tv->setAllocationDomain(tv->getLoopDomain(), true);
+  }
+
+  at::Tensor in_tensor = at::randn({3}, tensor_options);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+
+  testValidate(
+      executor_cache.fusion(),
+      {out_tensor},
+      {in_tensor},
+      {in_tensor},
+      __LINE__,
+      __FILE__);
+}
+
 class MultiDeviceBroadcastTest
     : public MultiDeviceTest,
       public testing::WithParamInterface<bool> {};
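Note on what the repro above exercises: with D devices, `in` has logical shape [D*3], the outer split makes the loop/allocation domain [iDIDx{D}, 3], and each rank therefore allocates only a [3] tensor, which is exactly what the test feeds in. Binding that rank-local tensor against the logical domain is the crux of #3282. The arithmetic is small enough to state as a standalone sketch; this is an illustration only, using plain integers rather than nvFuser's TensorView/DeviceMesh API:

#include <cassert>
#include <cstdint>

// An outer split by D of a logical extent N produces [iDIDx{D}, N/D]; each
// device allocates only the inner extent.
int64_t localExtent(int64_t logical_extent, int64_t mesh_size) {
  assert(logical_extent % mesh_size == 0); // the loop split must be even
  return logical_extent / mesh_size;
}

int main() {
  const int64_t D = 4; // stand-in for communicator_->size()
  assert(localExtent(D * 3, D) == 3); // matches at::randn({3}, ...) above
  return 0;
}
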
From 416f1d0df4c239f92d848a7d8b63c46539d70788 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 18 Nov 2024 11:58:07 -0800
Subject: [PATCH 02/17] Remove an assumption in the transpose scheduler.

---
 csrc/scheduler/transpose.cpp | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp
index e5023f4e25c..553ba9d773e 100644
--- a/csrc/scheduler/transpose.cpp
+++ b/csrc/scheduler/transpose.cpp
@@ -227,34 +227,31 @@ class DomainMap : public pointwise_utils::DomainMap {
         root_dim,
         " in tensor ",
         tv);
-    auto replay_exprs = StmtSort::getExprsBetween(
+    std::vector<Expr*> replay_exprs = StmtSort::getExprsBetween(
         {mapped_id}, {tv->getLoopDomain().begin(), tv->getLoopDomain().end()});
     // Project the root id to loop id. Similar to projectIdToRFactor.
-    for (auto expr : replay_exprs) {
-      if (expr->isA<Split>()) {
-        // Split with factor one is not supposed to be here, reshape would map
-        // this to a broadcast. This is a conservative assert, we can relaxed it
-        // and support with mapping it to outer.
-        NVF_ERROR(
-            !expr->as<Split>()->factor()->isOneInt(),
-            "split with factor one is supposed to be translated to broadcast by reshape");
-        if (expr->as<Split>()->in() == mapped_id) {
-          mapped_id = expr->as<Split>()->inner();
+    for (auto* expr : replay_exprs) {
+      if (auto* split = dynamic_cast<Split*>(expr)) {
+        if (split->in() == mapped_id) {
+          mapped_id = split->inner();
         }
-      } else if (expr->isA<Merge>()) {
+      } else if (auto* merge = dynamic_cast<Merge*>(expr)) {
         // Merge with size-1 dimension is not supposed to be here, reshape would
         // map this to a squeeze. This is a conservative assert, we can relaxed
         // it and support with mapping it to out.
         NVF_ERROR(
-            !expr->as<Merge>()->inner()->extent()->isOneInt(),
+            !merge->inner()->extent()->isOneInt(),
             "merge with size-1 dimension is supposed to be translated to squeeze by reshape");
-        if (expr->as<Merge>()->inner() == mapped_id) {
-          mapped_id = expr->as<Merge>()->out();
+        if (merge->inner() == mapped_id) {
+          mapped_id = merge->out();
+        }
+      } else if (auto* resize = dynamic_cast<Resize*>(expr)) {
+        if (resize->in() == mapped_id) {
+          mapped_id = resize->out();
         }
-      } else if (expr->isA<Resize>() && expr->as<Resize>()->in() == mapped_id) {
-        mapped_id = expr->as<Resize>()->out();
       }
     }
+
     // Find the position of the loop id
     const auto& dom = tv->getLoopDomain();
     for (auto i : c10::irange(dom.size())) {
From 2c984c8dd9918726255c4062b0f2170033f18722 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 18 Nov 2024 17:37:50 -0800
Subject: [PATCH 03/17] Reimplement unshardedSizesAndStrides.

---
 csrc/expr_evaluator.cpp    | 91 ++++++++++++++++++++++---------------
 csrc/fusion_segmenter.cpp  |  4 +-
 csrc/multidevice/utils.cpp |  5 ++-
 csrc/tensor_metadata.cpp   | 92 ++++++++++++++++++++++++--------------
 4 files changed, 118 insertions(+), 74 deletions(-)

diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index d4ca6daa022..567447871e4 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -6,6 +6,9 @@
  */
 // clang-format on
 
+#include
+#include
+
 #include
 #include
 #include
@@ -14,11 +17,9 @@
 #include
 #include
 #include
+#include
 #include
-#include
-#include
-
 namespace nvfuser {
 
 namespace {
@@ -127,6 +128,44 @@ void validateValWithConcreteValue(
   }
 }
 
+std::vector<int64_t> unshardedSizes(
+    const TensorView* tv,
+    c10::IntArrayRef sizes) {
+  std::vector<int64_t> unsharded_sizes = sizes.vec();
+
+  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
+    const ParallelType parallel_type = alloc_id->getParallelType();
+    if (!isParallelTypeDeviceDim(parallel_type)) {
+      continue;
+    }
+
+    const auto inputs = IterVisitor::getInputsTo(
+        {alloc_id},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    if (inputs.empty()) {
+      // FIXME: is this even possible? Logical ought to dominate allocation.
+      continue;
+    }
+    NVF_ERROR(inputs.size() == 1);
+
+    const auto iter = std::find(
+        tv->getLogicalDomain().begin(),
+        tv->getLogicalDomain().end(),
+        inputs[0]);
+    if (iter == tv->getLogicalDomain().end()) {
+      // FIXME: is this even possible? Logical ought to dominate allocation.
+      continue;
+    }
+    const auto index = std::count_if(
+        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
+          return !id->isReduction();
+        });
+    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
+  }
+
+  return unsharded_sizes;
+}
+
 } // namespace
 
 void ExpressionEvaluator::bindTensorDomain(
@@ -143,6 +182,14 @@ void ExpressionEvaluator::bindTensorDomain(
       logical_domain.size(),
       ", but got a tensor of rank ",
       t.dim());
+
+  std::vector<int64_t> sizes;
+  if (isSharded(tv)) {
+    sizes = unshardedSizes(tv, t.sizes());
+  } else {
+    sizes = t.sizes().vec();
+  }
+
   for (auto i : c10::irange(t.dim())) {
     auto id = logical_domain[i];
     if (id->isBroadcast()) {
@@ -151,7 +198,7 @@ void ExpressionEvaluator::bindTensorDomain(
       if (id->hasExpandedExtent()) {
         // Verify that t is also expanded
         NVF_ERROR(
-            t.size(i) == 1 || t.stride(i) == 0,
+            sizes[i] == 1 || t.stride(i) == 0,
             "IterDomain ",
             id->toString(),
             " in ",
@@ -159,45 +206,15 @@ void ExpressionEvaluator::bindTensorDomain(
             "TensorView ",
             tv->toString(),
             " has expanded extent but input tensor has size ",
-            t.size(i),
+            sizes[i],
             " and stride ",
             t.stride(i),
             " in dimension ",
             i);
-        bind_(
-            logical_domain[i]->expandedExtent(), t.size(i), evaluate_validate);
+        bind_(logical_domain[i]->expandedExtent(), sizes[i], evaluate_validate);
       }
     } else {
-      if (logical_domain[i]->isDeviceDim()) {
-        // Currently we have the restrictions:
-        // (1) Devices parallelized axis extent == DeviceMesh's extent
-        // (2) Device parallelized axis cannot be split or merged
-        // Therefore, the device parallelized extents will always be allocated
-        // with size 1, but the symbolic axis extent is binded with the extent
-        // of the DeviceMesh
-        NVF_CHECK(
-            1 == t.size(i),
-            "TensorView ",
-            tv->toString(),
-            getInputPosString(tv),
-            " IterDomain ",
-            id->toString(),
-            "is sharded and must have size 1, but input tensor has size ",
-            t.size(i));
-        NVF_CHECK(
-            tv->hasDeviceMesh(),
-            "TV ",
-            tv->toString(),
-            getInputPosString(tv),
-            " has an empty DeviceMesh with DID parallelization")
-        bind_(
-            logical_domain[i]->extent(),
-            static_cast<int64_t>(
-                tv->getDeviceMesh().size(logical_domain[i]->getParallelType())),
-            evaluate_validate);
-      } else {
-        bind_(logical_domain[i]->extent(), t.size(i), evaluate_validate);
-      }
+      bind_(logical_domain[i]->extent(), sizes[i], evaluate_validate);
     }
   }
 }
diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 506a2e81987..fa8b50d6fa2 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1883,7 +1883,9 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       std::vector<IterDomain*> new_alloc;
       new_alloc.reserve(tv->getAllocationDomain().size());
       for (IterDomain* alloc_id : tv->getAllocationDomain()) {
-        new_alloc.push_back(replay.getReplay().at(alloc_id));
+        IterDomain* new_alloc_id = replay.getReplay().at(alloc_id);
+        new_alloc_id->parallelize(alloc_id->getParallelType());
+        new_alloc.push_back(new_alloc_id);
       }
 
       std::vector<IterDomain*> new_loop;
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 1c40ffc5c2b..65cb76b0da7 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -106,8 +106,9 @@ std::pair<std::vector<IterDomain*>, std::vector<IterDomain*>> getShardingChanges
 
 bool isSharded(const TensorView* tv) {
   bool is_sharded = false;
-  for (IterDomain* id : TensorDomain::noReductions(tv->getLoopDomain())) {
-    if (!id->isDeviceDim()) {
+  for (IterDomain* alloc_id :
+       TensorDomain::noReductions(tv->getMaybeAllocationDomain())) {
+    if (!alloc_id->isDeviceDim()) {
       continue;
     }
 
diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index 32fdee2de42..e35d235b709 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -272,6 +272,46 @@ void validateAllocationSizesAndStrides(
   }
 }
 
+// FIXME: strides are never changed
+std::pair<std::vector<int64_t>, std::vector<int64_t>> unshardedSizesAndStrides(
+    TensorView* tv,
+    c10::IntArrayRef sizes,
+    c10::IntArrayRef strides) {
+  std::vector<int64_t> unsharded_sizes = sizes.vec();
+  std::vector<int64_t> unsharded_strides = strides.vec();
+
+  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
+    const ParallelType parallel_type = alloc_id->getParallelType();
+    if (!isParallelTypeDeviceDim(parallel_type)) {
+      continue;
+    }
+
+    const auto inputs = IterVisitor::getInputsTo(
+        {alloc_id},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    if (inputs.empty()) {
+      // FIXME: is this even possible? Logical ought to dominate loop.
+      continue;
+    }
+    NVF_ERROR(inputs.size() == 1);
+
+    const auto iter = std::find(
+        tv->getLogicalDomain().begin(),
+        tv->getLogicalDomain().end(),
+        inputs[0]);
+    if (iter == tv->getLogicalDomain().end()) {
+      // FIXME: is this even possible? Logical ought to dominate loop.
+      continue;
+    }
+    const auto index = std::count_if(
+        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
+          return !id->isReduction();
+        });
+    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
+  }
+
+  return {unsharded_sizes, unsharded_strides};
+}
 } // namespace
 
 std::pair<std::vector<int64_t>, std::vector<int64_t>>
@@ -282,11 +322,21 @@ inferAndValidateAllocationSizesAndStrides(
   const auto& logical = tv->getLogicalDomain();
   const auto& alloc = tv->getMaybeAllocationDomain();
 
+  std::vector<int64_t> logical_sizes;
+  std::vector<int64_t> logical_strides;
+  if (isSharded(tv)) {
+    std::tie(logical_sizes, logical_strides) =
+        unshardedSizesAndStrides(tv, tensor.sizes(), tensor.strides());
+  } else {
+    logical_sizes = tensor.sizes().vec();
+    logical_strides = tensor.strides().vec();
+  }
+
   // active IDs and their shape and stride
   std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids;
   int64_t dim_index = 0;
   for (IterDomain* id : TensorDomain::noReductions(logical)) {
-    active_ids[id] = {tensor.size(dim_index), tensor.stride(dim_index)};
+    active_ids[id] = {logical_sizes[dim_index], logical_strides[dim_index]};
     dim_index++;
   }
   NVF_ERROR(dim_index == tensor.dim());
@@ -296,50 +346,24 @@ inferAndValidateAllocationSizesAndStrides(
 
   // Now active_ids should contain the final sizes and strides, unordered. We
   // need to put them to the correct order.
-  std::vector<int64_t> sizes;
-  std::vector<int64_t> strides;
-  sizes.reserve(alloc.size());
-  strides.reserve(alloc.size());
+  std::vector<int64_t> allocation_sizes;
+  std::vector<int64_t> allocation_strides;
   for (IterDomain* id : TensorDomain::noReductions(alloc)) {
     if (id->isDeviceDim()) {
-      sizes.push_back(1);
+      allocation_sizes.push_back(1);
    } else {
-      sizes.push_back(active_ids.at(id).first);
+      allocation_sizes.push_back(active_ids.at(id).first);
    }
-    strides.push_back(active_ids.at(id).second);
+    allocation_strides.push_back(active_ids.at(id).second);
  }
 
   // Only validate final sizes and strides when we have a non-empty tensor.
   if (tensor.numel() != 0) {
     validateAllocationSizesAndStrides(
-        alloc, tv->getContiguity(), sizes, strides);
-  }
-  return {std::move(sizes), std::move(strides)};
-}
-
-namespace {
-std::pair<std::vector<int64_t>, std::vector<int64_t>> unshardedSizesAndStrides(
-    TensorView* tv,
-    c10::IntArrayRef sizes,
-    c10::IntArrayRef strides) {
-  std::vector<int64_t> unsharded_sizes(sizes.size());
-  std::vector<int64_t> unsharded_strides(strides.size());
-  for (const auto i : c10::irange(sizes.size())) {
-    IterDomain* id = tv->getLogicalDomain()[i];
-    if (id->isDeviceDim()) {
-      unsharded_sizes[i] = tv->getDeviceMesh().size(id->getParallelType());
-      // This probably doesn't matter in practice unless a kernel accidentally
-      // tries to access the data on another rank. To be safe, set the stride
-      // to zero, analogous to an expanded broadcast dimension.
-      unsharded_strides[i] = 0;
-    } else {
-      unsharded_sizes[i] = sizes[i];
-      unsharded_strides[i] = strides[i];
-    }
+        alloc, tv->getContiguity(), allocation_sizes, allocation_strides);
   }
-  return {unsharded_sizes, unsharded_strides};
+  return {std::move(allocation_sizes), std::move(allocation_strides)};
 }
-} // namespace
 
 std::vector<PolymorphicValue> GetMetaData::evaluate(
     const ExpressionEvaluator& ee,
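One subtlety in the reimplementation above: when translating the logical position of a DID-sharded IterDomain into an at::Tensor dimension, the `std::count_if` skips reduction IterDomains, because they appear in the logical domain but are not materialized in the tensor's shape. A minimal sketch of that index computation, as an illustration only (the vector of flags stands in for querying IterDomain::isReduction()):

#include <cassert>
#include <cstddef>
#include <vector>

// logical_is_reduction[i] says whether logical dimension i is a reduction.
// Returns the at::Tensor dimension index corresponding to logical position
// `pos`, skipping reduction dimensions.
std::size_t tensorDimIndex(
    const std::vector<bool>& logical_is_reduction, std::size_t pos) {
  std::size_t index = 0;
  for (std::size_t i = 0; i < pos; i++) {
    if (!logical_is_reduction[i]) {
      index++;
    }
  }
  return index;
}

int main() {
  // logical: [iM, rK, iN]; rK is a reduction, so iN maps to tensor dim 1.
  assert(tensorDimIndex({false, true, false}, 2) == 1);
  return 0;
}
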
From 44b50912bffb6280aab21738f9b45a1d1fd23ae1 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 21 Nov 2024 14:29:06 -0800
Subject: [PATCH 04/17] Inherit parallel type for new allocation IDs

---
 csrc/fusion_segmenter.cpp | 1 +
 csrc/transform_replay.cpp | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index fa8b50d6fa2..4d7cb4e693b 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1884,6 +1884,7 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       new_alloc.reserve(tv->getAllocationDomain().size());
       for (IterDomain* alloc_id : tv->getAllocationDomain()) {
         IterDomain* new_alloc_id = replay.getReplay().at(alloc_id);
+        // FIXME: should this be taken care of by ReplayTransformations?
         new_alloc_id->parallelize(alloc_id->getParallelType());
         new_alloc.push_back(new_alloc_id);
       }
diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp
index 06e15929aa9..0ac2a9e97a4 100644
--- a/csrc/transform_replay.cpp
+++ b/csrc/transform_replay.cpp
@@ -770,11 +770,14 @@ std::pair<TensorDomain*, int64_t> TransformReplay::replayCasP(
       new_contiguity.reserve(producer_rank);
 
       for (auto i : c10::irange(producer_rank)) {
-        IterDomain* id = producer->getAllocationDomain()[i];
+        IterDomain* alloc_id = producer->getAllocationDomain()[i];
         // We won't find reduction IterDomains in the map. See
         // AllocationDomainTest.CacheBefore.
-        if (auto it = p2c_map.find(id); it != p2c_map.end()) {
-          new_allocation_domain.push_back(it->second);
+        if (auto it = p2c_map.find(alloc_id); it != p2c_map.end()) {
+          IterDomain* new_alloc_id = it->second;
+          // FIXME: should this be taken care of by ReplayTransformations?
+          new_alloc_id->parallelize(alloc_id->getParallelType());
+          new_allocation_domain.push_back(new_alloc_id);
           new_contiguity.push_back(producer->getContiguity()[i]);
         }
       }
From 521c783a6e1a7d7ad5c0dc568ff6e2af79924501 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 21 Nov 2024 15:07:07 -0800
Subject: [PATCH 05/17] Fix broadcast tests

---
 csrc/expr_evaluator.cpp                 |  7 +++----
 tests/cpp/test_multidevice_sharding.cpp | 18 +++++++++++++-----
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index 567447871e4..7e04c3bcd4d 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -193,8 +193,7 @@ void ExpressionEvaluator::bindTensorDomain(
   for (auto i : c10::irange(t.dim())) {
     auto id = logical_domain[i];
     if (id->isBroadcast()) {
-      // DIDs are ignored for broadcast.
-      bind_(logical_domain[i]->extent(), 1, evaluate_validate);
+      bind_(id->extent(), 1, evaluate_validate);
       if (id->hasExpandedExtent()) {
         // Verify that t is also expanded
         NVF_ERROR(
@@ -211,10 +210,10 @@ void ExpressionEvaluator::bindTensorDomain(
             t.stride(i),
             " in dimension ",
             i);
-        bind_(logical_domain[i]->expandedExtent(), sizes[i], evaluate_validate);
+        bind_(id->expandedExtent(), sizes[i], evaluate_validate);
       }
     } else {
-      bind_(logical_domain[i]->extent(), sizes[i], evaluate_validate);
+      bind_(id->extent(), sizes[i], evaluate_validate);
     }
   }
 }
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index 873cbd3e8ca..4b1f605313d 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -424,20 +424,28 @@ TEST_P(MultiDeviceBroadcastTest, Expanded) {
   TensorView* in = TensorViewBuilder()
                        .dtype(DataType::Float)
                        .contiguity({std::nullopt, true})
-                       .shape({3, -1})
+                       .shape({num_devices * 3, -1})
                        .expanded({true, false})
                        .build();
   in->setDeviceMesh(mesh);
-  if (parallelizes_broadcast) {
-    in->axis(0)->parallelize(ParallelType::DIDx);
-  }
   TensorView* out = set(in);
   fusion->addInput(in);
   fusion->addOutput(out);
 
+  if (parallelizes_broadcast) {
+    for (auto* tv : {in, out}) {
+      tv->split(0, num_devices, /*inner_split=*/false);
+      tv->axis(0)->parallelize(ParallelType::DIDx);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+  }
+
   FusionExecutorCache executor_cache(std::move(fusion));
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor in_tensor = at::randn({8}, options).as_strided({3, 8}, {0, 1});
+  at::Tensor in_tensor =
+      at::randn({8}, options)
+          .as_strided(
+              {parallelizes_broadcast ? 3 : num_devices * 3, 8}, {0, 1});
   at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
   testValidate(
       executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
From 9ff10cf8281b78d89c640080d79777b0d9f2513d Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 22 Nov 2024 09:59:00 -0800
Subject: [PATCH 06/17] Unify unshardedSizes.

---
 csrc/expr_evaluator.cpp    | 54 +++---------------------
 csrc/multidevice/utils.cpp | 45 +++++++++++++++++++++++
 csrc/multidevice/utils.h   | 29 ++++++++++++++
 csrc/tensor_metadata.cpp   | 68 +++++---------------------------
 4 files changed, 88 insertions(+), 108 deletions(-)

diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index 7e04c3bcd4d..a2ebccfb7b3 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -128,44 +128,6 @@ void validateValWithConcreteValue(
   }
 }
 
-std::vector<int64_t> unshardedSizes(
-    const TensorView* tv,
-    c10::IntArrayRef sizes) {
-  std::vector<int64_t> unsharded_sizes = sizes.vec();
-
-  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
-    const ParallelType parallel_type = alloc_id->getParallelType();
-    if (!isParallelTypeDeviceDim(parallel_type)) {
-      continue;
-    }
-
-    const auto inputs = IterVisitor::getInputsTo(
-        {alloc_id},
-        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
-    if (inputs.empty()) {
-      // FIXME: is this even possible? Logical ought to dominate allocation.
-      continue;
-    }
-    NVF_ERROR(inputs.size() == 1);
-
-    const auto iter = std::find(
-        tv->getLogicalDomain().begin(),
-        tv->getLogicalDomain().end(),
-        inputs[0]);
-    if (iter == tv->getLogicalDomain().end()) {
-      // FIXME: is this even possible? Logical ought to dominate allocation.
-      continue;
-    }
-    const auto index = std::count_if(
-        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
-          return !id->isReduction();
-        });
-    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
-  }
-
-  return unsharded_sizes;
-}
-
 } // namespace
 
 void ExpressionEvaluator::bindTensorDomain(
@@ -183,13 +145,7 @@ void ExpressionEvaluator::bindTensorDomain(
       ", but got a tensor of rank ",
       t.dim());
 
-  std::vector<int64_t> sizes;
-  if (isSharded(tv)) {
-    sizes = unshardedSizes(tv, t.sizes());
-  } else {
-    sizes = t.sizes().vec();
-  }
-
+  std::vector<int64_t> logical_sizes = unshardedSizes(tv, t.sizes());
   for (auto i : c10::irange(t.dim())) {
     auto id = logical_domain[i];
     if (id->isBroadcast()) {
@@ -197,7 +153,7 @@ void ExpressionEvaluator::bindTensorDomain(
       if (id->hasExpandedExtent()) {
         // Verify that t is also expanded
         NVF_ERROR(
-            sizes[i] == 1 || t.stride(i) == 0,
+            logical_sizes[i] == 1 || t.stride(i) == 0,
             "IterDomain ",
             id->toString(),
             " in ",
@@ -205,15 +161,15 @@ void ExpressionEvaluator::bindTensorDomain(
             "TensorView ",
             tv->toString(),
             " has expanded extent but input tensor has size ",
-            sizes[i],
+            logical_sizes[i],
             " and stride ",
             t.stride(i),
             " in dimension ",
             i);
-        bind_(id->expandedExtent(), sizes[i], evaluate_validate);
+        bind_(id->expandedExtent(), logical_sizes[i], evaluate_validate);
       }
     } else {
-      bind_(id->extent(), sizes[i], evaluate_validate);
+      bind_(id->extent(), logical_sizes[i], evaluate_validate);
     }
   }
 }
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 65cb76b0da7..f17e9a42c0a 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -122,6 +122,51 @@ bool isSharded(const TensorView* tv) {
   return is_sharded;
 }
 
+std::vector<int64_t> unshardedSizes(
+    const TensorView* tv,
+    c10::IntArrayRef sizes) {
+  std::vector<int64_t> unsharded_sizes = sizes.vec();
+
+  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
+    const ParallelType parallel_type = alloc_id->getParallelType();
+    if (!isParallelTypeDeviceDim(parallel_type)) {
+      continue;
+    }
+
+    const auto inputs = IterVisitor::getInputsTo(
+        {alloc_id},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    NVF_ERROR(
+        !inputs.empty(),
+        "IterVisitor::getInputsTo shouldn't return empty unless `of` is empty.");
+    NVF_ERROR(
+        inputs.size() == 1,
+        "Failed to find the single logical input to ",
+        alloc_id,
+        ". This is likely because there's a Merge expression from logical to allocation, which isn't supported. Inputs are: ",
+        toDelimitedString(inputs));
+
+    const auto iter = std::find(
+        tv->getLogicalDomain().begin(),
+        tv->getLogicalDomain().end(),
+        inputs[0]);
+    NVF_ERROR(
+        iter != tv->getLogicalDomain().end(),
+        "The found input IterDomain isn't logical. This is likely because logical doesn't dominate allocation: ",
+        inputs[0]);
+
+    // Count the number of non-reduction IterDomains before `iter`. Reduction
+    // IterDomains are not materialized in the at::Tensor's shape.
+    const auto index = std::count_if(
+        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
+          return !id->isReduction();
+        });
+    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
+  }
+
+  return unsharded_sizes;
+}
+
 int64_t numDeviceDims(const TensorView* tv) {
   return std::count_if(
       tv->getLoopDomain().begin(),
diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h
index 12013e918b4..b4f25f0df6c 100644
--- a/csrc/multidevice/utils.h
+++ b/csrc/multidevice/utils.h
@@ -7,6 +7,8 @@
 // clang-format on
 #pragma once
 
+#include
+
 #include
 #include
 #include
@@ -127,4 +129,31 @@ int64_t getShardedAxis(TensorView*);
 
 // Reorders a TensorView so that the DID parallelized axis are in front.
 void reorderDIDToFront(TensorView*);
+
+// Given a TensorView and the shape of a sharded tensor of which certain
+// dimensions are partially alloated, returns the global shape that'll be used
+// to bind to the TensorView's logical domain. This is to solve #3282 so we can
+// bind a sharded tensor to a TensorView that has a DID-parallel loop domain.
+//
+// For example, when `tv` is
+//   logical: iM, iN
+//   allocation: iDIDx{D}, iN/D, iM
+// and `sizes` is [2, 3], the returned shape will be [2, 3D]. This is because,
+// according to the allocation domain, iM is fully allocated and iN is sharded
+// and thus partially allocated.
+//
+// As a degenerate case, it's fine to call this function with a non-sharded
+// TensorView and tensor.
+//
+// Limitations:
+// - The function assumes that there are no Merges from logical to the
+// DID-parallel IterDomains in allocation. Otherwise, it's unclear which logical
+// dimension this DID-parallelization should be attributed to.
+// - The function assumes that all Splits from logical to the DID-parallel
+// IterDomains in allocation are even. This is because there are currently no
+// ways to pass in the global shape without an API overhaul.
+std::vector<int64_t> unshardedSizes(
+    const TensorView* tv,
+    c10::IntArrayRef sizes);
+
 } // namespace nvfuser
diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index e35d235b709..5bb67cf1b9d 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -272,46 +272,6 @@ void validateAllocationSizesAndStrides(
   }
 }
 
-// FIXME: strides are never changed
-std::pair<std::vector<int64_t>, std::vector<int64_t>> unshardedSizesAndStrides(
-    TensorView* tv,
-    c10::IntArrayRef sizes,
-    c10::IntArrayRef strides) {
-  std::vector<int64_t> unsharded_sizes = sizes.vec();
-  std::vector<int64_t> unsharded_strides = strides.vec();
-
-  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
-    const ParallelType parallel_type = alloc_id->getParallelType();
-    if (!isParallelTypeDeviceDim(parallel_type)) {
-      continue;
-    }
-
-    const auto inputs = IterVisitor::getInputsTo(
-        {alloc_id},
-        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
-    if (inputs.empty()) {
-      // FIXME: is this even possible? Logical ought to dominate loop.
-      continue;
-    }
-    NVF_ERROR(inputs.size() == 1);
-
-    const auto iter = std::find(
-        tv->getLogicalDomain().begin(),
-        tv->getLogicalDomain().end(),
-        inputs[0]);
-    if (iter == tv->getLogicalDomain().end()) {
-      // FIXME: is this even possible? Logical ought to dominate loop.
-      continue;
-    }
-    const auto index = std::count_if(
-        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
-          return !id->isReduction();
-        });
-    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
-  }
-
-  return {unsharded_sizes, unsharded_strides};
-}
 } // namespace
 
 std::pair<std::vector<int64_t>, std::vector<int64_t>>
@@ -322,21 +282,12 @@ inferAndValidateAllocationSizesAndStrides(
   const auto& logical = tv->getLogicalDomain();
   const auto& alloc = tv->getMaybeAllocationDomain();
 
-  std::vector<int64_t> logical_sizes;
-  std::vector<int64_t> logical_strides;
-  if (isSharded(tv)) {
-    std::tie(logical_sizes, logical_strides) =
-        unshardedSizesAndStrides(tv, tensor.sizes(), tensor.strides());
-  } else {
-    logical_sizes = tensor.sizes().vec();
-    logical_strides = tensor.strides().vec();
-  }
-
   // active IDs and their shape and stride
+  std::vector<int64_t> logical_sizes = unshardedSizes(tv, tensor.sizes());
   std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids;
   int64_t dim_index = 0;
   for (IterDomain* id : TensorDomain::noReductions(logical)) {
-    active_ids[id] = {logical_sizes[dim_index], logical_strides[dim_index]};
+    active_ids[id] = {logical_sizes[dim_index], tensor.stride(dim_index)};
     dim_index++;
   }
   NVF_ERROR(dim_index == tensor.dim());
@@ -346,8 +297,10 @@ inferAndValidateAllocationSizesAndStrides(
 
   // Now active_ids should contain the final sizes and strides, unordered. We
   // need to put them to the correct order.
   std::vector<int64_t> allocation_sizes;
   std::vector<int64_t> allocation_strides;
+  allocation_sizes.reserve(alloc.size());
+  allocation_strides.reserve(alloc.size());
   for (IterDomain* id : TensorDomain::noReductions(alloc)) {
     if (id->isDeviceDim()) {
       allocation_sizes.push_back(1);
@@ -388,22 +341,19 @@ std::vector<PolymorphicValue> GetMetaData::evaluate(
   metadata->data = input.data_ptr();
 
   if (isSharded(tv)) {
-    auto [unsharded_sizes, unsharded_strides] =
-        unshardedSizesAndStrides(tv, input.sizes(), input.strides());
+    std::vector<int64_t> unsharded_sizes = unshardedSizes(tv, input.sizes());
     metadata->logical_size_data = std::move(unsharded_sizes);
     metadata->logical_size = c10::makeArrayRef(metadata->logical_size_data);
-    metadata->logical_stride_data = std::move(unsharded_strides);
-    metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data);
   } else {
     metadata->logical_size = input.sizes();
-    metadata->logical_stride = input.strides();
   }
+  metadata->logical_stride = input.strides();
 
-  auto [sizes, strides] =
+  auto [allocation_sizes, allocation_strides] =
       inferAndValidateAllocationSizesAndStrides(input, tv, ee);
-  metadata->alloc_size_data = std::move(sizes);
+  metadata->alloc_size_data = std::move(allocation_sizes);
   metadata->alloc_size = c10::makeArrayRef(metadata->alloc_size_data);
-  metadata->alloc_stride_data = std::move(strides);
+  metadata->alloc_stride_data = std::move(allocation_strides);
   metadata->alloc_stride = c10::makeArrayRef(metadata->alloc_stride_data);
   return {PolymorphicValue(std::move(struct_))};
 }
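The contract documented in csrc/multidevice/utils.h above can be restated as runnable arithmetic. The sketch below is an illustration only; `ShardedDim` and the free function are stand-ins invented here, not nvFuser's types. It checks the comment's example, where logical is [iM, iN], allocation is [iDIDx{D}, iN/D, iM], D is 5, and a per-device tensor of sizes [2, 3] binds as the global shape [2, 15]:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct ShardedDim {
  std::size_t logical_index; // logical dimension split by the DID-parallel id
  int64_t mesh_size;         // D, the extent of the device-parallel axis
};

// sizes[i] is the allocated (per-device) extent of logical dimension i;
// multiplying by the mesh size recovers the global extent, assuming all
// logical-to-allocation splits are even.
std::vector<int64_t> unshardedSizes(
    std::vector<int64_t> sizes, const std::vector<ShardedDim>& sharded) {
  for (const ShardedDim& dim : sharded) {
    sizes.at(dim.logical_index) *= dim.mesh_size;
  }
  return sizes;
}

int main() {
  // iM is fully allocated; iN is sharded and thus partially allocated.
  const std::vector<int64_t> global = unshardedSizes({2, 3}, {{1, 5}});
  assert(global == (std::vector<int64_t>{2, 15}));
  return 0;
}
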
From 8bd9486caa36265c81ca715e454ed7164b38abc1 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 25 Nov 2024 16:52:35 -0800
Subject: [PATCH 07/17] Fix a test

---
 csrc/ir/nodes.cpp           | 23 ++++++++----------
 tests/cpp/test_sharding.cpp | 47 ++++++++++++++++++++++++-------------
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index c93c4980e85..6906861814e 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -3238,24 +3238,21 @@ bool TensorDomain::sameAs(
 std::string TensorDomain::toString(const int indent_size, const bool loop_only)
     const {
   std::stringstream ss;
-  if (nDims() == 0) {
-    indent(ss, indent_size) << "[ ]";
-    return ss.str();
-  }
-  indent(ss, indent_size) << "[ " << toDelimitedString(loop()) << " ]";
-  if (!loop_only) {
+  if (loop_only) {
+    indent(ss, indent_size) << "[" << toDelimitedString(loop()) << "]";
+  } else {
+    indent(ss, indent_size)
+        << "logical=[" << toDelimitedString(logical()) << "]" << std::endl;
     if (hasRoot()) {
-      ss << "," << std::endl;
       indent(ss, indent_size + 1)
-          << "root=[ " << toDelimitedString(root()) << " ]";
+          << "root=[" << toDelimitedString(root()) << "]" << std::endl;
     }
-    ss << "," << std::endl;
     indent(ss, indent_size + 1)
-        << "logical=[ " << toDelimitedString(logical()) << " ]";
-    if (!allocation_domain_.empty()) {
-      ss << "," << std::endl;
+        << "loop=[" << toDelimitedString(loop()) << "]" << std::endl;
+    if (hasAllocation()) {
       indent(ss, indent_size + 1)
-          << "allocation=[ " << toDelimitedString(allocation()) << " ]";
+          << "allocation=[" << toDelimitedString(allocation()) << "]"
+          << std::endl;
     }
   }
   return ss.str();
diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp
index a0c643e95b4..9c9ca068689 100644
--- a/tests/cpp/test_sharding.cpp
+++ b/tests/cpp/test_sharding.cpp
@@ -23,26 +23,41 @@ namespace nvfuser {
 
 using ShardingTest = NVFuserFixtureParamTest<bool>;
 
-// TODO: This test checks that isSharded generates an error when a split/merged
-// axis is parallelized with DIDx. Update when this restriction is lifted.
-TEST_F(ShardingTest, IsSharded) {
+TEST_F(ShardingTest, LogicalIsSharded) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
-  TensorView* a = makeSymbolicTensor(3);
-  a->axis(2)->parallelize(ParallelType::DIDx);
-  a->split(0, 4);
-  EXPECT_TRUE(isSharded(a)) << "DIDx on logical domain";
+  TensorView* x = makeSymbolicTensor(3);
+  x->axis(2)->parallelize(ParallelType::DIDx);
+  x->split(0, 4);
 
-  TensorView* b = makeSymbolicTensor(3);
-  b->split(1, 4);
-  b->axis(1)->parallelize(ParallelType::DIDx);
-  EXPECT_TRUE(isSharded(b)) << "DIDx on loop domain";
+  EXPECT_TRUE(isSharded(x)) << "DIDx on logical domain:" << std::endl
+                            << x->domain()->toString(0, /*loop_only=*/false);
+}
+
+TEST_F(ShardingTest, AllocationIsSharded) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* x = makeSymbolicTensor(3);
+  x->split(1, 4);
+  x->axis(1)->parallelize(ParallelType::DIDx);
+  x->setAllocationDomain(x->getLoopDomain(), true);
+
+  EXPECT_TRUE(isSharded(x)) << "DIDx on allocation domain:" << std::endl
+                            << x->domain()->toString(0, /*loop_only=*/false);
+}
+
+TEST_F(ShardingTest, MultipleDIDx) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
 
-  TensorView* c = makeSymbolicTensor(3);
-  c->axis(0)->parallelize(ParallelType::DIDx);
-  c->axis(1)->parallelize(ParallelType::DIDx);
-  EXPECT_ANY_THROW(isSharded(c)) << "Multiple DIDx";
+  TensorView* x = makeSymbolicTensor(3);
+  x->axis(0)->parallelize(ParallelType::DIDx);
+  x->axis(1)->parallelize(ParallelType::DIDx);
+  EXPECT_ANY_THROW(isSharded(x))
+      << "Multiple DIDx:" << std::endl
+      << x->domain()->toString(0, /*loop_only=*/false);
 }
 
 TEST_F(ShardingTest, PropagateSharding) {
"IterVisitor::getInputsTo shouldn't return empty unless `of` is empty."); + NVF_ERROR( + inputs.size() == 1, + "Failed to find the single logical input to ", + alloc_id, + ". This is likely because there's a Merge expression from logical to allocation, which isn't supported. Inputs are: ", + toDelimitedString(inputs)); + + const auto iter = std::find( + tv->getLogicalDomain().begin(), + tv->getLogicalDomain().end(), + inputs[0]); + NVF_ERROR( + iter != tv->getLogicalDomain().end(), + "The found input IterDomain isn't logical. This is likely because logical doesn't dominate allocation: ", + inputs[0]); + + // Count the number of non-reduction IterDomains before `iter`. Reduction + // IterDomains are not materialized in the at::Tensor's shape. + const auto index = std::count_if( + tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool { + return !id->isReduction(); + }); + unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type); + } + + return unsharded_sizes; +} + int64_t numDeviceDims(const TensorView* tv) { return std::count_if( tv->getLoopDomain().begin(), diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 12013e918b4..b4f25f0df6c 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -7,6 +7,8 @@ // clang-format on #pragma once +#include + #include #include #include @@ -127,4 +129,31 @@ int64_t getShardedAxis(TensorView*); // Reorders a TensorView so that the DID parallelized axis are in front. void reorderDIDToFront(TensorView*); + +// Given a TensorView and the shape of a sharded tensor of which certain +// dimensions are partially alloated, returns the global shape that'll be used +// to bind to the TensorView's logical domain. This is to solve #3282 so we can +// bind a sharded tensor to a TensorView that has a DID-parallel loop domain. +// +// For example, when `tv` is +// logical: iM, iN +// allocation: iDIDx{D}, iN/D, iM +// and `sizes` is [2, 3], the returned shape will be [2, 3D]. This is because, +// according to the allocation domain, iM is fully allocated and iN is sharded +// and thus partially allocated. +// +// As a degenerate case, it's fine to call this function with a non-sharded +// TensorView and tensor. +// +// Limitations: +// - The function assumes that there are no Merges from logical to the +// DID-parallel IterDomains in allocation. Otherwise, it's unclear which logical +// dimension this DID-parallelization should be attributed to. +// - The function assumes that all Splits from logical to the DID-parallel +// IterDomains in allocation are even. This is because there are currently no +// ways to pass in the global shape without an API overhaul. 
From 27689708c7e15f91c9ac1383f434284464bf1990 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Wed, 27 Nov 2024 11:17:17 -0800
Subject: [PATCH 09/17] Comment

---
 csrc/multidevice/utils.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h
index b4f25f0df6c..50f0223a034 100644
--- a/csrc/multidevice/utils.h
+++ b/csrc/multidevice/utils.h
@@ -151,7 +151,20 @@ void reorderDIDToFront(TensorView*);
 // dimension this DID-parallelization should be attributed to.
 // - The function assumes that all Splits from logical to the DID-parallel
 // IterDomains in allocation are even. This is because there are currently no
-// ways to pass in the global shape without an API overhaul.
+// ways to pass in the global shape.
+//
+// Despite these limitations, I took this approach as a shortcut to fix #3282,
+// which blocked many other tasks. I'm however open to other better, long-term
+// solutions. Some alternatives considered in #3282 are:
+// - Try to bind `at::Tensor`s to allocation domains instead of logical. Many
+// `*Op::evaluate` methods (e.g.
+// https://github.com/NVIDIA/Fuser/blob/2415d904d1e9a5da7ca6fb1a55d3045bbd510341/csrc/ir/nodes.cpp#L4321-L4329)
+// assume the input/output `at::Tensor`s have the same dimension order as the
+// logical domain. Doing so would have to change them all.
+// - Try to pass into FusionExecutorCache both logical (global) shapes and
+// allocated (local) tensors for sharded TensorViews. The logical shapes would
+// have to be passed through FusionKernelRuntime, FusionExecutor,
+// ExpressionEvaluator, and so on, which is an API overhaul.
 std::vector<int64_t> unshardedSizes(
     const TensorView* tv,
     c10::IntArrayRef sizes);
From 400684ea2dc7294f18b4e3609764c16e28f3945a Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Wed, 27 Nov 2024 11:32:15 -0800
Subject: [PATCH 10/17] Resolve two fixmes

---
 csrc/fusion_segmenter.cpp | 4 +++-
 csrc/transform_replay.cpp | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 4d7cb4e693b..c98543a179a 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1884,7 +1884,9 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       new_alloc.reserve(tv->getAllocationDomain().size());
       for (IterDomain* alloc_id : tv->getAllocationDomain()) {
         IterDomain* new_alloc_id = replay.getReplay().at(alloc_id);
-        // FIXME: should this be taken care of by ReplayTransformations?
+        // ReplayTransformations replays transforms but not parallelization, so
+        // we have to manually parallelize the new allocation ID. In other
+        // places, parallelization is usually done through parallelizeAllLike.
         new_alloc_id->parallelize(alloc_id->getParallelType());
         new_alloc.push_back(new_alloc_id);
       }
diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp
index 0ac2a9e97a4..6c58d83528a 100644
--- a/csrc/transform_replay.cpp
+++ b/csrc/transform_replay.cpp
@@ -775,7 +775,6 @@ std::pair<TensorDomain*, int64_t> TransformReplay::replayCasP(
         // AllocationDomainTest.CacheBefore.
         if (auto it = p2c_map.find(alloc_id); it != p2c_map.end()) {
           IterDomain* new_alloc_id = it->second;
-          // FIXME: should this be taken care of by ReplayTransformations?
           new_alloc_id->parallelize(alloc_id->getParallelType());
           new_allocation_domain.push_back(new_alloc_id);
           new_contiguity.push_back(producer->getContiguity()[i]);
From 261b831b1b0e6ecdfefa2bb49142418e846678b3 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 28 Nov 2024 21:49:38 -0800
Subject: [PATCH 11/17] Harden a test

---
 tests/cpp/test_sharding.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp
index e4ce3f67ebd..62d14cf43d5 100644
--- a/tests/cpp/test_sharding.cpp
+++ b/tests/cpp/test_sharding.cpp
@@ -52,9 +52,12 @@ TEST_F(ShardingTest, MultipleDIDx) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
-  TensorView* x = makeSymbolicTensor(3);
+  TensorView* x = makeSymbolicTensor(1);
+  x->split(0, 2);
   x->axis(0)->parallelize(ParallelType::DIDx);
   x->axis(1)->parallelize(ParallelType::DIDx);
+  x->setAllocationDomain(x->getLoopDomain(), true);
+
   EXPECT_ANY_THROW(isSharded(x))
       << "Multiple DIDx:" << std::endl
       << x->domain()->toString(0, /*loop_only=*/false);
From c9d748e34b2a2b46f7f3f1a654f96322291e927e Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 28 Nov 2024 22:14:28 -0800
Subject: [PATCH 12/17] Apply suggestions from code review

Co-authored-by: samnordmann
---
 csrc/multidevice/utils.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h
index 50f0223a034..8e240e6d5a2 100644
--- a/csrc/multidevice/utils.h
+++ b/csrc/multidevice/utils.h
@@ -131,7 +131,7 @@ void reorderDIDToFront(TensorView*);
 
 // Given a TensorView and the shape of a sharded tensor of which certain
-// dimensions are partially alloated, returns the global shape that'll be used
+// dimensions are partially allocated, returns the global shape that'll be used
 // to bind to the TensorView's logical domain. This is to solve #3282 so we can
 // bind a sharded tensor to a TensorView that has a DID-parallel loop domain.
@@ -142,8 +142,7 @@ void reorderDIDToFront(TensorView*);
 // according to the allocation domain, iM is fully allocated and iN is sharded
 // and thus partially allocated.
 //
-// As a degenerate case, it's fine to call this function with a non-sharded
-// TensorView and tensor.
+// If the TensorView is not sharded, this function returns `sizes`
 //
 // Limitations:
 // - The function assumes that there are no Merges from logical to the
new_alloc_id->parallelize(alloc_id->getParallelType()); new_allocation_domain.push_back(new_alloc_id); new_contiguity.push_back(producer->getContiguity()[i]); From 261b831b1b0e6ecdfefa2bb49142418e846678b3 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 28 Nov 2024 21:49:38 -0800 Subject: [PATCH 11/17] Harden a test --- tests/cpp/test_sharding.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp index e4ce3f67ebd..62d14cf43d5 100644 --- a/tests/cpp/test_sharding.cpp +++ b/tests/cpp/test_sharding.cpp @@ -52,9 +52,12 @@ TEST_F(ShardingTest, MultipleDIDx) { Fusion fusion; FusionGuard fg(&fusion); - TensorView* x = makeSymbolicTensor(3); + TensorView* x = makeSymbolicTensor(1); + x->split(0, 2); x->axis(0)->parallelize(ParallelType::DIDx); x->axis(1)->parallelize(ParallelType::DIDx); + x->setAllocationDomain(x->getLoopDomain(), true); + EXPECT_ANY_THROW(isSharded(x)) << "Multiple DIDx:" << std::endl << x->domain()->toString(0, /*loop_only=*/false); From c9d748e34b2a2b46f7f3f1a654f96322291e927e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 28 Nov 2024 22:14:28 -0800 Subject: [PATCH 12/17] Apply suggestions from code review Co-authored-by: samnordmann --- csrc/multidevice/utils.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 50f0223a034..8e240e6d5a2 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -131,7 +131,7 @@ int64_t getShardedAxis(TensorView*); void reorderDIDToFront(TensorView*); // Given a TensorView and the shape of a sharded tensor of which certain -// dimensions are partially alloated, returns the global shape that'll be used +// dimensions are partially allocated, returns the global shape that'll be used // to bind to the TensorView's logical domain. This is to solve #3282 so we can // bind a sharded tensor to a TensorView that has a DID-parallel loop domain. // @@ -142,8 +142,7 @@ void reorderDIDToFront(TensorView*); // according to the allocation domain, iM is fully allocated and iN is sharded // and thus partially allocated. // -// As a degenerate case, it's fine to call this function with a non-sharded -// TensorView and tensor. +// If the TensorView is not sharded, this function returns `sizes` // // Limitations: // - The function assumes that there are no Merges from logical to the From b6e3f40cabbd50a260562b93159f7995be6c923e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 28 Nov 2024 22:22:58 -0800 Subject: [PATCH 13/17] Lint --- csrc/multidevice/utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 8e240e6d5a2..5be2e11bd15 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -142,7 +142,7 @@ void reorderDIDToFront(TensorView*); // according to the allocation domain, iM is fully allocated and iN is sharded // and thus partially allocated. // -// If the TensorView is not sharded, this function returns `sizes` +// If the TensorView is not sharded, this function returns `sizes`. 
From 61260cb1dcface672ebb6164e6b53f9a77f04b66 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 29 Nov 2024 17:37:08 -0800
Subject: [PATCH 14/17] Add a test with reordering

---
 csrc/tensor_metadata.cpp                | 28 ++++++++++++-----
 tests/cpp/test_multidevice_sharding.cpp | 40 ++++++++++++++++++++++++-
 2 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index 5bb67cf1b9d..5a64fe5cae7 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -37,6 +37,7 @@ class ForwardTraverseFromLogicalToAlloc {
       // TODO: see [Allocation domain on both side of logical]
       return;
     }
+
     auto [in_size, in_stride] = in_it->second;
     auto factor = ee_.evaluate(split->factor()).as<int64_t>();
     NVF_ERROR(
@@ -44,17 +45,24 @@ class ForwardTraverseFromLogicalToAlloc {
         "The logical domain and allocation domain of fusion input/output ",
         "tensors must be a one-to-one map, therefore, ",
         "non-divisible split is not allowed in allocation domain");
+
+    int64_t inner_size;
+    int64_t outer_size;
+    if (split->innerSplit()) {
+      inner_size = factor;
+      outer_size = in_size / factor;
+    } else {
+      outer_size = factor;
+      inner_size = in_size / factor;
+    }
+
     NVF_ERROR(active_ids_.erase(in) == 1);
+    NVF_ERROR(active_ids_.emplace(inner, std::make_pair(inner_size, in_stride))
+                  .second);
     NVF_ERROR(
         active_ids_
-            .emplace(inner, std::pair<int64_t, int64_t>{factor, in_stride})
+            .emplace(outer, std::make_pair(outer_size, in_stride * inner_size))
             .second);
-    NVF_ERROR(active_ids_
-                  .emplace(
-                      outer,
-                      std::pair<int64_t, int64_t>{
-                          in_size / factor, in_stride * factor})
-                  .second);
   }
 
 void handle(Merge* merge) {
@@ -259,6 +267,10 @@ void validateAllocationSizesAndStrides(
         "Stride mismatch with contiguity info. ",
         " allocation domain: ",
         ir_utils::toString(alloc_dom),
+        ": sizes: ",
+        sizes,
+        ": strides: ",
+        strides,
         "; contiguity: ",
         toDelimitedString(contiguity),
         "; dim: ",
@@ -287,7 +299,7 @@ inferAndValidateAllocationSizesAndStrides(
   std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids;
   int64_t dim_index = 0;
   for (IterDomain* id : TensorDomain::noReductions(logical)) {
-    active_ids[id] = {logical_sizes[dim_index], tensor.stride(dim_index)};
+    active_ids[id] = {logical_sizes.at(dim_index), tensor.stride(dim_index)};
     dim_index++;
   }
   NVF_ERROR(dim_index == tensor.dim());
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index 4b1f605313d..3adac90bc5e 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -340,7 +340,7 @@ TEST_F(MultiDeviceTest, Transpose) {
       UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose)));
 }
 
-TEST_F(MultiDeviceTest, ParallelizeLoopSplit) {
+TEST_F(MultiDeviceTest, LoopSplit) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -372,6 +372,44 @@ TEST_F(MultiDeviceTest, LoopSplit) {
       __FILE__);
 }
 
+TEST_F(MultiDeviceTest, LoopSplitWithReorder) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto num_devices = communicator_->size();
+  auto mesh = DeviceMesh::createForNumDevices(num_devices);
+
+  TensorView* in = makeContigConcreteTensor({2, num_devices * 3});
+  in->setDeviceMesh(mesh);
+  fusion->addInput(in);
+
+  TensorView* out = set(in);
+  fusion->addOutput(out);
+
+  // logical: i{2}, i{3D}
+  // allocation: iDIDx{D}, i{3}, i{2}
+  in->split(1, num_devices, /*inner_split=*/false);
+  in->reorder({{0, -1}});
+  in->axis(0)->parallelize(ParallelType::DIDx);
+  in->setAllocationDomain(in->getLoopDomain(), true);
+
+  out->split(1, num_devices, /*inner_split=*/false);
+  out->axis(1)->parallelize(ParallelType::DIDx);
+  out->setAllocationDomain(out->getLoopDomain(), true);
+
+  at::Tensor in_tensor = at::randn({3, 2}, tensor_options).t();
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+
+  testValidate(
+      executor_cache.fusion(),
+      {out_tensor},
+      {in_tensor},
+      {in_tensor},
+      __LINE__,
+      __FILE__);
+}
+
 class MultiDeviceBroadcastTest
     : public MultiDeviceTest,
       public testing::WithParamInterface<bool> {};
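The tensor_metadata.cpp change above generalizes the (size, stride) propagation through a Split to outer splits: the old code always treated `factor` as the inner extent, which is wrong when `inner_split` is false. A standalone sketch of the corrected rule follows (the names below are this sketch's own, not nvFuser's):

#include <cassert>
#include <cstdint>
#include <utility>

using SizeStride = std::pair<int64_t, int64_t>;

// Propagates the (size, stride) of a dimension through a split. Returns
// {outer, inner}: the inner output stays contiguous within the input (keeps
// in_stride); the outer output strides over a whole inner extent.
std::pair<SizeStride, SizeStride> splitSizeStride(
    int64_t in_size, int64_t in_stride, int64_t factor, bool inner_split) {
  assert(in_size % factor == 0); // only even (divisible) splits are allowed
  const int64_t inner_size = inner_split ? factor : in_size / factor;
  const int64_t outer_size = inner_split ? in_size / factor : factor;
  return {{outer_size, in_stride * inner_size}, {inner_size, in_stride}};
}

int main() {
  // split(0, D, /*inner_split=*/false) with D=4 on a size-12, stride-1
  // dimension: the outer (DIDx) output is {4, 3}, the inner one {3, 1}.
  const auto [outer, inner] = splitSizeStride(12, 1, /*factor=*/4, false);
  assert(outer.first == 4 && outer.second == 3);
  assert(inner.first == 3 && inner.second == 1);
  return 0;
}
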
", " allocation domain: ", ir_utils::toString(alloc_dom), + ": sizes: ", + sizes, + ": strides: ", + strides, "; contiguity: ", toDelimitedString(contiguity), "; dim: ", @@ -287,7 +299,7 @@ inferAndValidateAllocationSizesAndStrides( std::unordered_map> active_ids; int64_t dim_index = 0; for (IterDomain* id : TensorDomain::noReductions(logical)) { - active_ids[id] = {logical_sizes[dim_index], tensor.stride(dim_index)}; + active_ids[id] = {logical_sizes.at(dim_index), tensor.stride(dim_index)}; dim_index++; } NVF_ERROR(dim_index == tensor.dim()); diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index 4b1f605313d..3adac90bc5e 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -340,7 +340,7 @@ TEST_F(MultiDeviceTest, Transpose) { UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose))); } -TEST_F(MultiDeviceTest, ParallelizeLoopSplit) { +TEST_F(MultiDeviceTest, LoopSplit) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -372,6 +372,44 @@ TEST_F(MultiDeviceTest, ParallelizeLoopSplit) { __FILE__); } +TEST_F(MultiDeviceTest, LoopSplitWithReorder) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto num_devices = communicator_->size(); + auto mesh = DeviceMesh::createForNumDevices(num_devices); + + TensorView* in = makeContigConcreteTensor({2, num_devices * 3}); + in->setDeviceMesh(mesh); + fusion->addInput(in); + + TensorView* out = set(in); + fusion->addOutput(out); + + // logical: i{2}, i{3D} + // allocation: iDIDx{D}, i{3}, i{2} + in->split(1, num_devices, /*inner_split=*/false); + in->reorder({{0, -1}}); + in->axis(0)->parallelize(ParallelType::DIDx); + in->setAllocationDomain(in->getLoopDomain(), true); + + out->split(1, num_devices, /*inner_split=*/false); + out->axis(1)->parallelize(ParallelType::DIDx); + out->setAllocationDomain(out->getLoopDomain(), true); + + at::Tensor in_tensor = at::randn({3, 2}, tensor_options).t(); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + + testValidate( + executor_cache.fusion(), + {out_tensor}, + {in_tensor}, + {in_tensor}, + __LINE__, + __FILE__); +} + class MultiDeviceBroadcastTest : public MultiDeviceTest, public testing::WithParamInterface {}; From 2daf5b2d28e55430a712a57edef73dd9ed0e3b0c Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Fri, 29 Nov 2024 18:48:59 -0800 Subject: [PATCH 15/17] Lint --- csrc/tensor_metadata.cpp | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index 5a64fe5cae7..f60f9842ced 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -39,22 +39,20 @@ class ForwardTraverseFromLogicalToAlloc { } auto [in_size, in_stride] = in_it->second; - auto factor = ee_.evaluate(split->factor()).as(); - NVF_ERROR( - in_size % factor == 0, - "The logical domain and allocation domain of fusion input/output ", - "tensors must be a one-to-one map, therefore, ", - "non-divisible split is not allowed in allocation domain"); - int64_t inner_size; - int64_t outer_size; - if (split->innerSplit()) { - inner_size = factor; - outer_size = in_size / factor; - } else { - outer_size = factor; - inner_size = in_size / factor; - } + auto [outer_size, inner_size] = [&]() { + const auto factor = ee_.evaluate(split->factor()).as(); + NVF_ERROR( + in_size % factor == 0, + "The logical domain and allocation domain 
From d4532e6c93415e2d21ce80c75f8c1407bd8a1410 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 29 Nov 2024 19:06:57 -0800
Subject: [PATCH 16/17] Revert "Lint"

This reverts commit 2daf5b2d28e55430a712a57edef73dd9ed0e3b0c.
---
 csrc/tensor_metadata.cpp | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index f60f9842ced..5a64fe5cae7 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -39,20 +39,22 @@ class ForwardTraverseFromLogicalToAlloc {
     }
 
     auto [in_size, in_stride] = in_it->second;
+    auto factor = ee_.evaluate(split->factor()).as<int64_t>();
+    NVF_ERROR(
+        in_size % factor == 0,
+        "The logical domain and allocation domain of fusion input/output ",
+        "tensors must be a one-to-one map, therefore, ",
+        "non-divisible split is not allowed in allocation domain");
 
-    auto [outer_size, inner_size] = [&]() {
-      const auto factor = ee_.evaluate(split->factor()).as<int64_t>();
-      NVF_ERROR(
-          in_size % factor == 0,
-          "The logical domain and allocation domain of fusion input/output ",
-          "tensors must be a one-to-one map, therefore, ",
-          "non-divisible split is not allowed in allocation domain");
-      if (split->innerSplit()) {
-        return std::make_pair(in_size / factor, factor);
-      } else {
-        return std::make_pair(factor, in_size / factor);
-      }
-    }();
+    int64_t inner_size;
+    int64_t outer_size;
+    if (split->innerSplit()) {
+      inner_size = factor;
+      outer_size = in_size / factor;
+    } else {
+      outer_size = factor;
+      inner_size = in_size / factor;
+    }

From a5f6eb36819d62d03a2f9a363040bc25398037c6 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 29 Nov 2024 19:07:14 -0800
Subject: [PATCH 17/17] Fix lint

---
 csrc/tensor_metadata.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index 5a64fe5cae7..96f1d85fd2d 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -46,11 +46,11 @@ class ForwardTraverseFromLogicalToAlloc {
         "tensors must be a one-to-one map, therefore, ",
         "non-divisible split is not allowed in allocation domain");
 
-    int64_t inner_size;
-    int64_t outer_size;
+    int64_t inner_size = 0;
+    int64_t outer_size = 0;
     if (split->innerSplit()) {
-      inner_size = factor;
       outer_size = in_size / factor;
+      inner_size = factor;
     } else {
       outer_size = factor;
       inner_size = in_size / factor;
     }