From 04e06a80b7a6341c5037a0b294ccb11594fe7242 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 18 Nov 2024 11:15:02 -0800
Subject: [PATCH 01/17] Add a repro for #3282

---
 tests/cpp/test_multidevice_sharding.cpp | 32 +++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index 1e1ff2eab9e..873cbd3e8ca 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -340,6 +340,38 @@ TEST_F(MultiDeviceTest, Transpose) {
       UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose)));
 }
 
+TEST_F(MultiDeviceTest, ParallelizeLoopSplit) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto num_devices = communicator_->size();
+  auto mesh = DeviceMesh::createForNumDevices(num_devices);
+
+  TensorView* in = makeContigConcreteTensor({num_devices * 3});
+  in->setDeviceMesh(mesh);
+  fusion->addInput(in);
+  TensorView* out = set(in);
+  fusion->addOutput(out);
+
+  for (auto* tv : {in, out}) {
+    tv->split(0, num_devices, /*inner_split=*/false);
+    tv->axis(0)->parallelize(ParallelType::DIDx);
+    tv->setAllocationDomain(tv->getLoopDomain(), true);
+  }
+
+  at::Tensor in_tensor = at::randn({3}, tensor_options);
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+
+  testValidate(
+      executor_cache.fusion(),
+      {out_tensor},
+      {in_tensor},
+      {in_tensor},
+      __LINE__,
+      __FILE__);
+}
+
 class MultiDeviceBroadcastTest
     : public MultiDeviceTest,
       public testing::WithParamInterface<bool> {};
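Note on what the repro above exercises: with D devices, `in` has logical shape [D*3], the outer split makes the loop/allocation domain [iDIDx{D}, 3], and each rank therefore allocates only a [3] tensor, which is exactly what the test feeds in. Binding that rank-local tensor against the logical domain is the crux of #3282. The arithmetic is small enough to state as a standalone sketch; this is an illustration only, using plain integers rather than nvFuser's TensorView/DeviceMesh API:

#include <cassert>
#include <cstdint>

// An outer split by D of a logical extent N produces [iDIDx{D}, N/D]; each
// device allocates only the inner extent.
int64_t localExtent(int64_t logical_extent, int64_t mesh_size) {
  assert(logical_extent % mesh_size == 0); // the loop split must be even
  return logical_extent / mesh_size;
}

int main() {
  const int64_t D = 4; // stand-in for communicator_->size()
  assert(localExtent(D * 3, D) == 3); // matches at::randn({3}, ...) above
  return 0;
}
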
From 416f1d0df4c239f92d848a7d8b63c46539d70788 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 18 Nov 2024 11:58:07 -0800
Subject: [PATCH 02/17] Remove an assumption in the transpose scheduler.

---
 csrc/scheduler/transpose.cpp | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/csrc/scheduler/transpose.cpp b/csrc/scheduler/transpose.cpp
index e5023f4e25c..553ba9d773e 100644
--- a/csrc/scheduler/transpose.cpp
+++ b/csrc/scheduler/transpose.cpp
@@ -227,34 +227,31 @@ class DomainMap : public pointwise_utils::DomainMap {
         root_dim,
         " in tensor ",
         tv);
-    auto replay_exprs = StmtSort::getExprsBetween(
+    std::vector<Expr*> replay_exprs = StmtSort::getExprsBetween(
         {mapped_id}, {tv->getLoopDomain().begin(), tv->getLoopDomain().end()});
     // Project the root id to loop id. Similar to projectIdToRFactor.
-    for (auto expr : replay_exprs) {
-      if (expr->isA<Split>()) {
-        // Split with factor one is not supposed to be here, reshape would map
-        // this to a broadcast. This is a conservative assert, we can relaxed it
-        // and support with mapping it to outer.
-        NVF_ERROR(
-            !expr->as<Split>()->factor()->isOneInt(),
-            "split with factor one is supposed to be translated to broadcast by reshape");
-        if (expr->as<Split>()->in() == mapped_id) {
-          mapped_id = expr->as<Split>()->inner();
+    for (auto* expr : replay_exprs) {
+      if (auto* split = dynamic_cast<Split*>(expr)) {
+        if (split->in() == mapped_id) {
+          mapped_id = split->inner();
         }
-      } else if (expr->isA<Merge>()) {
+      } else if (auto* merge = dynamic_cast<Merge*>(expr)) {
         // Merge with size-1 dimension is not supposed to be here, reshape would
         // map this to a squeeze. This is a conservative assert, we can relaxed
         // it and support with mapping it to out.
         NVF_ERROR(
-            !expr->as<Merge>()->inner()->extent()->isOneInt(),
+            !merge->inner()->extent()->isOneInt(),
             "merge with size-1 dimension is supposed to be translated to squeeze by reshape");
-        if (expr->as<Merge>()->inner() == mapped_id) {
-          mapped_id = expr->as<Merge>()->out();
+        if (merge->inner() == mapped_id) {
+          mapped_id = merge->out();
+        }
+      } else if (auto* resize = dynamic_cast<Resize*>(expr)) {
+        if (resize->in() == mapped_id) {
+          mapped_id = resize->out();
         }
-      } else if (expr->isA<Resize>() && expr->as<Resize>()->in() == mapped_id) {
-        mapped_id = expr->as<Resize>()->out();
       }
     }
+
     // Find the position of the loop id
     const auto& dom = tv->getLoopDomain();
     for (auto i : c10::irange(dom.size())) {
From 2c984c8dd9918726255c4062b0f2170033f18722 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 18 Nov 2024 17:37:50 -0800
Subject: [PATCH 03/17] Reimplement unshardedSizesAndStrides.

---
 csrc/expr_evaluator.cpp    | 91 ++++++++++++++++++++++---------------
 csrc/fusion_segmenter.cpp  |  4 +-
 csrc/multidevice/utils.cpp |  5 ++-
 csrc/tensor_metadata.cpp   | 92 ++++++++++++++++++++++++--------------
 4 files changed, 118 insertions(+), 74 deletions(-)

diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index d4ca6daa022..567447871e4 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -6,6 +6,9 @@
  */
 // clang-format on
 
+#include
+#include
+
 #include
 #include
 #include
@@ -14,11 +17,9 @@
 #include
 #include
 #include
+#include
 #include
-#include
-#include
-
 namespace nvfuser {
 
 namespace {
@@ -127,6 +128,44 @@ void validateValWithConcreteValue(
   }
 }
 
+std::vector<int64_t> unshardedSizes(
+    const TensorView* tv,
+    c10::IntArrayRef sizes) {
+  std::vector<int64_t> unsharded_sizes = sizes.vec();
+
+  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
+    const ParallelType parallel_type = alloc_id->getParallelType();
+    if (!isParallelTypeDeviceDim(parallel_type)) {
+      continue;
+    }
+
+    const auto inputs = IterVisitor::getInputsTo(
+        {alloc_id},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    if (inputs.empty()) {
+      // FIXME: is this even possible? Logical ought to dominate allocation.
+      continue;
+    }
+    NVF_ERROR(inputs.size() == 1);
+
+    const auto iter = std::find(
+        tv->getLogicalDomain().begin(),
+        tv->getLogicalDomain().end(),
+        inputs[0]);
+    if (iter == tv->getLogicalDomain().end()) {
+      // FIXME: is this even possible? Logical ought to dominate allocation.
+      continue;
+    }
+    const auto index = std::count_if(
+        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
+          return !id->isReduction();
+        });
+    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
+  }
+
+  return unsharded_sizes;
+}
+
 } // namespace
 
 void ExpressionEvaluator::bindTensorDomain(
@@ -143,6 +182,14 @@ void ExpressionEvaluator::bindTensorDomain(
       logical_domain.size(),
       ", but got a tensor of rank ",
       t.dim());
+
+  std::vector<int64_t> sizes;
+  if (isSharded(tv)) {
+    sizes = unshardedSizes(tv, t.sizes());
+  } else {
+    sizes = t.sizes().vec();
+  }
+
   for (auto i : c10::irange(t.dim())) {
     auto id = logical_domain[i];
     if (id->isBroadcast()) {
@@ -151,7 +198,7 @@ void ExpressionEvaluator::bindTensorDomain(
       if (id->hasExpandedExtent()) {
         // Verify that t is also expanded
         NVF_ERROR(
-            t.size(i) == 1 || t.stride(i) == 0,
+            sizes[i] == 1 || t.stride(i) == 0,
             "IterDomain ",
             id->toString(),
             " in ",
@@ -159,45 +206,15 @@ void ExpressionEvaluator::bindTensorDomain(
             "TensorView ",
             tv->toString(),
             " has expanded extent but input tensor has size ",
-            t.size(i),
+            sizes[i],
             " and stride ",
             t.stride(i),
             " in dimension ",
             i);
-        bind_(
-            logical_domain[i]->expandedExtent(), t.size(i), evaluate_validate);
+        bind_(logical_domain[i]->expandedExtent(), sizes[i], evaluate_validate);
       }
     } else {
-      if (logical_domain[i]->isDeviceDim()) {
-        // Currently we have the restrictions:
-        // (1) Devices parallelized axis extent == DeviceMesh's extent
-        // (2) Device parallelized axis cannot be split or merged
-        // Therefore, the device parallelized extents will always be allocated
-        // with size 1, but the symbolic axis extent is binded with the extent
-        // of the DeviceMesh
-        NVF_CHECK(
-            1 == t.size(i),
-            "TensorView ",
-            tv->toString(),
-            getInputPosString(tv),
-            " IterDomain ",
-            id->toString(),
-            "is sharded and must have size 1, but input tensor has size ",
-            t.size(i));
-        NVF_CHECK(
-            tv->hasDeviceMesh(),
-            "TV ",
-            tv->toString(),
-            getInputPosString(tv),
-            " has an empty DeviceMesh with DID parallelization")
-        bind_(
-            logical_domain[i]->extent(),
-            static_cast<int64_t>(
-                tv->getDeviceMesh().size(logical_domain[i]->getParallelType())),
-            evaluate_validate);
-      } else {
-        bind_(logical_domain[i]->extent(), t.size(i), evaluate_validate);
-      }
+      bind_(logical_domain[i]->extent(), sizes[i], evaluate_validate);
     }
   }
 }
diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 506a2e81987..fa8b50d6fa2 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1883,7 +1883,9 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       std::vector<IterDomain*> new_alloc;
       new_alloc.reserve(tv->getAllocationDomain().size());
       for (IterDomain* alloc_id : tv->getAllocationDomain()) {
-        new_alloc.push_back(replay.getReplay().at(alloc_id));
+        IterDomain* new_alloc_id = replay.getReplay().at(alloc_id);
+        new_alloc_id->parallelize(alloc_id->getParallelType());
+        new_alloc.push_back(new_alloc_id);
       }
 
       std::vector<IterDomain*> new_loop;
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 1c40ffc5c2b..65cb76b0da7 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -106,8 +106,9 @@ std::pair<std::vector<IterDomain*>, std::vector<IterDomain*>> getShardingChanges
 
 bool isSharded(const TensorView* tv) {
   bool is_sharded = false;
-  for (IterDomain* id : TensorDomain::noReductions(tv->getLoopDomain())) {
-    if (!id->isDeviceDim()) {
+  for (IterDomain* alloc_id :
+       TensorDomain::noReductions(tv->getMaybeAllocationDomain())) {
+    if (!alloc_id->isDeviceDim()) {
       continue;
     }
 
diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index 32fdee2de42..e35d235b709 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -272,6 +272,46 @@ void validateAllocationSizesAndStrides(
   }
 }
 
+// FIXME: strides are never changed
+std::pair<std::vector<int64_t>, std::vector<int64_t>> unshardedSizesAndStrides(
+    TensorView* tv,
+    c10::IntArrayRef sizes,
+    c10::IntArrayRef strides) {
+  std::vector<int64_t> unsharded_sizes = sizes.vec();
+  std::vector<int64_t> unsharded_strides = strides.vec();
+
+  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
+    const ParallelType parallel_type = alloc_id->getParallelType();
+    if (!isParallelTypeDeviceDim(parallel_type)) {
+      continue;
+    }
+
+    const auto inputs = IterVisitor::getInputsTo(
+        {alloc_id},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    if (inputs.empty()) {
+      // FIXME: is this even possible? Logical ought to dominate loop.
+      continue;
+    }
+    NVF_ERROR(inputs.size() == 1);
+
+    const auto iter = std::find(
+        tv->getLogicalDomain().begin(),
+        tv->getLogicalDomain().end(),
+        inputs[0]);
+    if (iter == tv->getLogicalDomain().end()) {
+      // FIXME: is this even possible? Logical ought to dominate loop.
+      continue;
+    }
+    const auto index = std::count_if(
+        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
+          return !id->isReduction();
+        });
+    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
+  }
+
+  return {unsharded_sizes, unsharded_strides};
+}
 } // namespace
 
 std::pair<std::vector<int64_t>, std::vector<int64_t>>
@@ -282,11 +322,21 @@ inferAndValidateAllocationSizesAndStrides(
   const auto& logical = tv->getLogicalDomain();
   const auto& alloc = tv->getMaybeAllocationDomain();
 
+  std::vector<int64_t> logical_sizes;
+  std::vector<int64_t> logical_strides;
+  if (isSharded(tv)) {
+    std::tie(logical_sizes, logical_strides) =
+        unshardedSizesAndStrides(tv, tensor.sizes(), tensor.strides());
+  } else {
+    logical_sizes = tensor.sizes().vec();
+    logical_strides = tensor.strides().vec();
+  }
+
   // active IDs and their shape and stride
   std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids;
   int64_t dim_index = 0;
   for (IterDomain* id : TensorDomain::noReductions(logical)) {
-    active_ids[id] = {tensor.size(dim_index), tensor.stride(dim_index)};
+    active_ids[id] = {logical_sizes[dim_index], logical_strides[dim_index]};
     dim_index++;
   }
   NVF_ERROR(dim_index == tensor.dim());
@@ -296,50 +346,24 @@ inferAndValidateAllocationSizesAndStrides(
 
   // Now active_ids should contain the final sizes and strides, unordered. We
   // need to put them to the correct order.
-  std::vector<int64_t> sizes;
-  std::vector<int64_t> strides;
-  sizes.reserve(alloc.size());
-  strides.reserve(alloc.size());
+  std::vector<int64_t> allocation_sizes;
+  std::vector<int64_t> allocation_strides;
   for (IterDomain* id : TensorDomain::noReductions(alloc)) {
     if (id->isDeviceDim()) {
-      sizes.push_back(1);
+      allocation_sizes.push_back(1);
    } else {
-      sizes.push_back(active_ids.at(id).first);
+      allocation_sizes.push_back(active_ids.at(id).first);
    }
-    strides.push_back(active_ids.at(id).second);
+    allocation_strides.push_back(active_ids.at(id).second);
  }
 
   // Only validate final sizes and strides when we have a non-empty tensor.
   if (tensor.numel() != 0) {
     validateAllocationSizesAndStrides(
-        alloc, tv->getContiguity(), sizes, strides);
-  }
-  return {std::move(sizes), std::move(strides)};
-}
-
-namespace {
-std::pair<std::vector<int64_t>, std::vector<int64_t>> unshardedSizesAndStrides(
-    TensorView* tv,
-    c10::IntArrayRef sizes,
-    c10::IntArrayRef strides) {
-  std::vector<int64_t> unsharded_sizes(sizes.size());
-  std::vector<int64_t> unsharded_strides(strides.size());
-  for (const auto i : c10::irange(sizes.size())) {
-    IterDomain* id = tv->getLogicalDomain()[i];
-    if (id->isDeviceDim()) {
-      unsharded_sizes[i] = tv->getDeviceMesh().size(id->getParallelType());
-      // This probably doesn't matter in practice unless a kernel accidentally
-      // tries to access the data on another rank. To be safe, set the stride
-      // to zero, analogous to an expanded broadcast dimension.
-      unsharded_strides[i] = 0;
-    } else {
-      unsharded_sizes[i] = sizes[i];
-      unsharded_strides[i] = strides[i];
-    }
+        alloc, tv->getContiguity(), allocation_sizes, allocation_strides);
   }
-  return {unsharded_sizes, unsharded_strides};
+  return {std::move(allocation_sizes), std::move(allocation_strides)};
 }
-} // namespace
 
 std::vector<PolymorphicValue> GetMetaData::evaluate(
     const ExpressionEvaluator& ee,
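One subtlety in the reimplementation above: when translating the logical position of a DID-sharded IterDomain into an at::Tensor dimension, the `std::count_if` skips reduction IterDomains, because they appear in the logical domain but are not materialized in the tensor's shape. A minimal sketch of that index computation, as an illustration only (the vector of flags stands in for querying IterDomain::isReduction()):

#include <cassert>
#include <cstddef>
#include <vector>

// logical_is_reduction[i] says whether logical dimension i is a reduction.
// Returns the at::Tensor dimension index corresponding to logical position
// `pos`, skipping reduction dimensions.
std::size_t tensorDimIndex(
    const std::vector<bool>& logical_is_reduction, std::size_t pos) {
  std::size_t index = 0;
  for (std::size_t i = 0; i < pos; i++) {
    if (!logical_is_reduction[i]) {
      index++;
    }
  }
  return index;
}

int main() {
  // logical: [iM, rK, iN]; rK is a reduction, so iN maps to tensor dim 1.
  assert(tensorDimIndex({false, true, false}, 2) == 1);
  return 0;
}
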
From 44b50912bffb6280aab21738f9b45a1d1fd23ae1 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 21 Nov 2024 14:29:06 -0800
Subject: [PATCH 04/17] Inherit parallel type for new allocation IDs

---
 csrc/fusion_segmenter.cpp | 1 +
 csrc/transform_replay.cpp | 9 ++++++---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index fa8b50d6fa2..4d7cb4e693b 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1884,6 +1884,7 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       new_alloc.reserve(tv->getAllocationDomain().size());
       for (IterDomain* alloc_id : tv->getAllocationDomain()) {
         IterDomain* new_alloc_id = replay.getReplay().at(alloc_id);
+        // FIXME: should this be taken care of by ReplayTransformations?
         new_alloc_id->parallelize(alloc_id->getParallelType());
         new_alloc.push_back(new_alloc_id);
       }
diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp
index 06e15929aa9..0ac2a9e97a4 100644
--- a/csrc/transform_replay.cpp
+++ b/csrc/transform_replay.cpp
@@ -770,11 +770,14 @@ std::pair<TensorDomain*, int64_t> TransformReplay::replayCasP(
       new_contiguity.reserve(producer_rank);
 
       for (auto i : c10::irange(producer_rank)) {
-        IterDomain* id = producer->getAllocationDomain()[i];
+        IterDomain* alloc_id = producer->getAllocationDomain()[i];
         // We won't find reduction IterDomains in the map. See
         // AllocationDomainTest.CacheBefore.
-        if (auto it = p2c_map.find(id); it != p2c_map.end()) {
-          new_allocation_domain.push_back(it->second);
+        if (auto it = p2c_map.find(alloc_id); it != p2c_map.end()) {
+          IterDomain* new_alloc_id = it->second;
+          // FIXME: should this be taken care of by ReplayTransformations?
+          new_alloc_id->parallelize(alloc_id->getParallelType());
+          new_allocation_domain.push_back(new_alloc_id);
           new_contiguity.push_back(producer->getContiguity()[i]);
         }
       }
From 521c783a6e1a7d7ad5c0dc568ff6e2af79924501 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 21 Nov 2024 15:07:07 -0800
Subject: [PATCH 05/17] Fix broadcast tests

---
 csrc/expr_evaluator.cpp                 |  7 +++----
 tests/cpp/test_multidevice_sharding.cpp | 18 +++++++++++++-----
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index 567447871e4..7e04c3bcd4d 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -193,8 +193,7 @@ void ExpressionEvaluator::bindTensorDomain(
   for (auto i : c10::irange(t.dim())) {
     auto id = logical_domain[i];
     if (id->isBroadcast()) {
-      // DIDs are ignored for broadcast.
-      bind_(logical_domain[i]->extent(), 1, evaluate_validate);
+      bind_(id->extent(), 1, evaluate_validate);
       if (id->hasExpandedExtent()) {
         // Verify that t is also expanded
         NVF_ERROR(
@@ -211,10 +210,10 @@ void ExpressionEvaluator::bindTensorDomain(
             t.stride(i),
             " in dimension ",
             i);
-        bind_(logical_domain[i]->expandedExtent(), sizes[i], evaluate_validate);
+        bind_(id->expandedExtent(), sizes[i], evaluate_validate);
       }
     } else {
-      bind_(logical_domain[i]->extent(), sizes[i], evaluate_validate);
+      bind_(id->extent(), sizes[i], evaluate_validate);
     }
   }
 }
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index 873cbd3e8ca..4b1f605313d 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -424,20 +424,28 @@ TEST_P(MultiDeviceBroadcastTest, Expanded) {
   TensorView* in = TensorViewBuilder()
                        .dtype(DataType::Float)
                        .contiguity({std::nullopt, true})
-                       .shape({3, -1})
+                       .shape({num_devices * 3, -1})
                        .expanded({true, false})
                        .build();
   in->setDeviceMesh(mesh);
-  if (parallelizes_broadcast) {
-    in->axis(0)->parallelize(ParallelType::DIDx);
-  }
   TensorView* out = set(in);
   fusion->addInput(in);
   fusion->addOutput(out);
 
+  if (parallelizes_broadcast) {
+    for (auto* tv : {in, out}) {
+      tv->split(0, num_devices, /*inner_split=*/false);
+      tv->axis(0)->parallelize(ParallelType::DIDx);
+      tv->setAllocationDomain(tv->getLoopDomain(), true);
+    }
+  }
+
   FusionExecutorCache executor_cache(std::move(fusion));
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
-  at::Tensor in_tensor = at::randn({8}, options).as_strided({3, 8}, {0, 1});
+  at::Tensor in_tensor =
+      at::randn({8}, options)
+          .as_strided(
+              {parallelizes_broadcast ? 3 : num_devices * 3, 8}, {0, 1});
   at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
   testValidate(
       executor_cache.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
From 9ff10cf8281b78d89c640080d79777b0d9f2513d Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 22 Nov 2024 09:59:00 -0800
Subject: [PATCH 06/17] Unify unshardedSizes.

---
 csrc/expr_evaluator.cpp    | 54 +++---------------------
 csrc/multidevice/utils.cpp | 45 +++++++++++++++++++++++
 csrc/multidevice/utils.h   | 29 ++++++++++++++
 csrc/tensor_metadata.cpp   | 68 +++++---------------------------
 4 files changed, 88 insertions(+), 108 deletions(-)

diff --git a/csrc/expr_evaluator.cpp b/csrc/expr_evaluator.cpp
index 7e04c3bcd4d..a2ebccfb7b3 100644
--- a/csrc/expr_evaluator.cpp
+++ b/csrc/expr_evaluator.cpp
@@ -128,44 +128,6 @@ void validateValWithConcreteValue(
   }
 }
 
-std::vector<int64_t> unshardedSizes(
-    const TensorView* tv,
-    c10::IntArrayRef sizes) {
-  std::vector<int64_t> unsharded_sizes = sizes.vec();
-
-  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
-    const ParallelType parallel_type = alloc_id->getParallelType();
-    if (!isParallelTypeDeviceDim(parallel_type)) {
-      continue;
-    }
-
-    const auto inputs = IterVisitor::getInputsTo(
-        {alloc_id},
-        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
-    if (inputs.empty()) {
-      // FIXME: is this even possible? Logical ought to dominate allocation.
-      continue;
-    }
-    NVF_ERROR(inputs.size() == 1);
-
-    const auto iter = std::find(
-        tv->getLogicalDomain().begin(),
-        tv->getLogicalDomain().end(),
-        inputs[0]);
-    if (iter == tv->getLogicalDomain().end()) {
-      // FIXME: is this even possible? Logical ought to dominate allocation.
-      continue;
-    }
-    const auto index = std::count_if(
-        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
-          return !id->isReduction();
-        });
-    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
-  }
-
-  return unsharded_sizes;
-}
-
 } // namespace
 
 void ExpressionEvaluator::bindTensorDomain(
@@ -183,13 +145,7 @@ void ExpressionEvaluator::bindTensorDomain(
       ", but got a tensor of rank ",
       t.dim());
 
-  std::vector<int64_t> sizes;
-  if (isSharded(tv)) {
-    sizes = unshardedSizes(tv, t.sizes());
-  } else {
-    sizes = t.sizes().vec();
-  }
-
+  std::vector<int64_t> logical_sizes = unshardedSizes(tv, t.sizes());
   for (auto i : c10::irange(t.dim())) {
     auto id = logical_domain[i];
     if (id->isBroadcast()) {
@@ -197,7 +153,7 @@ void ExpressionEvaluator::bindTensorDomain(
       if (id->hasExpandedExtent()) {
         // Verify that t is also expanded
         NVF_ERROR(
-            sizes[i] == 1 || t.stride(i) == 0,
+            logical_sizes[i] == 1 || t.stride(i) == 0,
             "IterDomain ",
             id->toString(),
             " in ",
@@ -205,15 +161,15 @@ void ExpressionEvaluator::bindTensorDomain(
             "TensorView ",
             tv->toString(),
             " has expanded extent but input tensor has size ",
-            sizes[i],
+            logical_sizes[i],
             " and stride ",
             t.stride(i),
             " in dimension ",
             i);
-        bind_(id->expandedExtent(), sizes[i], evaluate_validate);
+        bind_(id->expandedExtent(), logical_sizes[i], evaluate_validate);
       }
     } else {
-      bind_(id->extent(), sizes[i], evaluate_validate);
+      bind_(id->extent(), logical_sizes[i], evaluate_validate);
     }
   }
 }
diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp
index 65cb76b0da7..f17e9a42c0a 100644
--- a/csrc/multidevice/utils.cpp
+++ b/csrc/multidevice/utils.cpp
@@ -122,6 +122,51 @@ bool isSharded(const TensorView* tv) {
   return is_sharded;
 }
 
+std::vector<int64_t> unshardedSizes(
+    const TensorView* tv,
+    c10::IntArrayRef sizes) {
+  std::vector<int64_t> unsharded_sizes = sizes.vec();
+
+  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
+    const ParallelType parallel_type = alloc_id->getParallelType();
+    if (!isParallelTypeDeviceDim(parallel_type)) {
+      continue;
+    }
+
+    const auto inputs = IterVisitor::getInputsTo(
+        {alloc_id},
+        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
+    NVF_ERROR(
+        !inputs.empty(),
+        "IterVisitor::getInputsTo shouldn't return empty unless `of` is empty.");
+    NVF_ERROR(
+        inputs.size() == 1,
+        "Failed to find the single logical input to ",
+        alloc_id,
+        ". This is likely because there's a Merge expression from logical to allocation, which isn't supported. Inputs are: ",
+        toDelimitedString(inputs));
+
+    const auto iter = std::find(
+        tv->getLogicalDomain().begin(),
+        tv->getLogicalDomain().end(),
+        inputs[0]);
+    NVF_ERROR(
+        iter != tv->getLogicalDomain().end(),
+        "The found input IterDomain isn't logical. This is likely because logical doesn't dominate allocation: ",
+        inputs[0]);
+
+    // Count the number of non-reduction IterDomains before `iter`. Reduction
+    // IterDomains are not materialized in the at::Tensor's shape.
+    const auto index = std::count_if(
+        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
+          return !id->isReduction();
+        });
+    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
+  }
+
+  return unsharded_sizes;
+}
+
 int64_t numDeviceDims(const TensorView* tv) {
   return std::count_if(
       tv->getLoopDomain().begin(),
diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h
index 12013e918b4..b4f25f0df6c 100644
--- a/csrc/multidevice/utils.h
+++ b/csrc/multidevice/utils.h
@@ -7,6 +7,8 @@
 // clang-format on
 #pragma once
 
+#include
+
 #include
 #include
 #include
@@ -127,4 +129,31 @@ int64_t getShardedAxis(TensorView*);
 
 // Reorders a TensorView so that the DID parallelized axis are in front.
 void reorderDIDToFront(TensorView*);
+
+// Given a TensorView and the shape of a sharded tensor of which certain
+// dimensions are partially alloated, returns the global shape that'll be used
+// to bind to the TensorView's logical domain. This is to solve #3282 so we can
+// bind a sharded tensor to a TensorView that has a DID-parallel loop domain.
+//
+// For example, when `tv` is
+//   logical: iM, iN
+//   allocation: iDIDx{D}, iN/D, iM
+// and `sizes` is [2, 3], the returned shape will be [2, 3D]. This is because,
+// according to the allocation domain, iM is fully allocated and iN is sharded
+// and thus partially allocated.
+//
+// As a degenerate case, it's fine to call this function with a non-sharded
+// TensorView and tensor.
+//
+// Limitations:
+// - The function assumes that there are no Merges from logical to the
+// DID-parallel IterDomains in allocation. Otherwise, it's unclear which logical
+// dimension this DID-parallelization should be attributed to.
+// - The function assumes that all Splits from logical to the DID-parallel
+// IterDomains in allocation are even. This is because there are currently no
+// ways to pass in the global shape without an API overhaul.
+std::vector<int64_t> unshardedSizes(
+    const TensorView* tv,
+    c10::IntArrayRef sizes);
+
 } // namespace nvfuser
diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index e35d235b709..5bb67cf1b9d 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -272,46 +272,6 @@ void validateAllocationSizesAndStrides(
   }
 }
 
-// FIXME: strides are never changed
-std::pair<std::vector<int64_t>, std::vector<int64_t>> unshardedSizesAndStrides(
-    TensorView* tv,
-    c10::IntArrayRef sizes,
-    c10::IntArrayRef strides) {
-  std::vector<int64_t> unsharded_sizes = sizes.vec();
-  std::vector<int64_t> unsharded_strides = strides.vec();
-
-  for (IterDomain* alloc_id : tv->getMaybeAllocationDomain()) {
-    const ParallelType parallel_type = alloc_id->getParallelType();
-    if (!isParallelTypeDeviceDim(parallel_type)) {
-      continue;
-    }
-
-    const auto inputs = IterVisitor::getInputsTo(
-        {alloc_id},
-        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
-    if (inputs.empty()) {
-      // FIXME: is this even possible? Logical ought to dominate loop.
-      continue;
-    }
-    NVF_ERROR(inputs.size() == 1);
-
-    const auto iter = std::find(
-        tv->getLogicalDomain().begin(),
-        tv->getLogicalDomain().end(),
-        inputs[0]);
-    if (iter == tv->getLogicalDomain().end()) {
-      // FIXME: is this even possible? Logical ought to dominate loop.
-      continue;
-    }
-    const auto index = std::count_if(
-        tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool {
-          return !id->isReduction();
-        });
-    unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type);
-  }
-
-  return {unsharded_sizes, unsharded_strides};
-}
 } // namespace
 
 std::pair<std::vector<int64_t>, std::vector<int64_t>>
@@ -322,21 +282,12 @@ inferAndValidateAllocationSizesAndStrides(
   const auto& logical = tv->getLogicalDomain();
   const auto& alloc = tv->getMaybeAllocationDomain();
 
-  std::vector<int64_t> logical_sizes;
-  std::vector<int64_t> logical_strides;
-  if (isSharded(tv)) {
-    std::tie(logical_sizes, logical_strides) =
-        unshardedSizesAndStrides(tv, tensor.sizes(), tensor.strides());
-  } else {
-    logical_sizes = tensor.sizes().vec();
-    logical_strides = tensor.strides().vec();
-  }
-
   // active IDs and their shape and stride
+  std::vector<int64_t> logical_sizes = unshardedSizes(tv, tensor.sizes());
   std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids;
   int64_t dim_index = 0;
   for (IterDomain* id : TensorDomain::noReductions(logical)) {
-    active_ids[id] = {logical_sizes[dim_index], logical_strides[dim_index]};
+    active_ids[id] = {logical_sizes[dim_index], tensor.stride(dim_index)};
     dim_index++;
   }
   NVF_ERROR(dim_index == tensor.dim());
@@ -346,8 +297,10 @@ inferAndValidateAllocationSizesAndStrides(
 
   // Now active_ids should contain the final sizes and strides, unordered. We
   // need to put them to the correct order.
   std::vector<int64_t> allocation_sizes;
   std::vector<int64_t> allocation_strides;
+  allocation_sizes.reserve(alloc.size());
+  allocation_strides.reserve(alloc.size());
   for (IterDomain* id : TensorDomain::noReductions(alloc)) {
     if (id->isDeviceDim()) {
       allocation_sizes.push_back(1);
@@ -388,22 +341,19 @@ std::vector<PolymorphicValue> GetMetaData::evaluate(
   metadata->data = input.data_ptr();
 
   if (isSharded(tv)) {
-    auto [unsharded_sizes, unsharded_strides] =
-        unshardedSizesAndStrides(tv, input.sizes(), input.strides());
+    std::vector<int64_t> unsharded_sizes = unshardedSizes(tv, input.sizes());
     metadata->logical_size_data = std::move(unsharded_sizes);
     metadata->logical_size = c10::makeArrayRef(metadata->logical_size_data);
-    metadata->logical_stride_data = std::move(unsharded_strides);
-    metadata->logical_stride = c10::makeArrayRef(metadata->logical_stride_data);
   } else {
     metadata->logical_size = input.sizes();
-    metadata->logical_stride = input.strides();
   }
+  metadata->logical_stride = input.strides();
 
-  auto [sizes, strides] =
+  auto [allocation_sizes, allocation_strides] =
       inferAndValidateAllocationSizesAndStrides(input, tv, ee);
-  metadata->alloc_size_data = std::move(sizes);
+  metadata->alloc_size_data = std::move(allocation_sizes);
   metadata->alloc_size = c10::makeArrayRef(metadata->alloc_size_data);
-  metadata->alloc_stride_data = std::move(strides);
+  metadata->alloc_stride_data = std::move(allocation_strides);
   metadata->alloc_stride = c10::makeArrayRef(metadata->alloc_stride_data);
   return {PolymorphicValue(std::move(struct_))};
 }
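The contract documented in csrc/multidevice/utils.h above can be restated as runnable arithmetic. The sketch below is an illustration only; `ShardedDim` and the free function are stand-ins invented here, not nvFuser's types. It checks the comment's example, where logical is [iM, iN], allocation is [iDIDx{D}, iN/D, iM], D is 5, and a per-device tensor of sizes [2, 3] binds as the global shape [2, 15]:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct ShardedDim {
  std::size_t logical_index; // logical dimension split by the DID-parallel id
  int64_t mesh_size;         // D, the extent of the device-parallel axis
};

// sizes[i] is the allocated (per-device) extent of logical dimension i;
// multiplying by the mesh size recovers the global extent, assuming all
// logical-to-allocation splits are even.
std::vector<int64_t> unshardedSizes(
    std::vector<int64_t> sizes, const std::vector<ShardedDim>& sharded) {
  for (const ShardedDim& dim : sharded) {
    sizes.at(dim.logical_index) *= dim.mesh_size;
  }
  return sizes;
}

int main() {
  // iM is fully allocated; iN is sharded and thus partially allocated.
  const std::vector<int64_t> global = unshardedSizes({2, 3}, {{1, 5}});
  assert(global == (std::vector<int64_t>{2, 15}));
  return 0;
}
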
From 8bd9486caa36265c81ca715e454ed7164b38abc1 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Mon, 25 Nov 2024 16:52:35 -0800
Subject: [PATCH 07/17] Fix a test

---
 csrc/ir/nodes.cpp           | 23 ++++++++----------
 tests/cpp/test_sharding.cpp | 47 ++++++++++++++++++++++++-------------
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/csrc/ir/nodes.cpp b/csrc/ir/nodes.cpp
index c93c4980e85..6906861814e 100644
--- a/csrc/ir/nodes.cpp
+++ b/csrc/ir/nodes.cpp
@@ -3238,24 +3238,21 @@ bool TensorDomain::sameAs(
 std::string TensorDomain::toString(const int indent_size, const bool loop_only)
     const {
   std::stringstream ss;
-  if (nDims() == 0) {
-    indent(ss, indent_size) << "[ ]";
-    return ss.str();
-  }
-  indent(ss, indent_size) << "[ " << toDelimitedString(loop()) << " ]";
-  if (!loop_only) {
+  if (loop_only) {
+    indent(ss, indent_size) << "[" << toDelimitedString(loop()) << "]";
+  } else {
+    indent(ss, indent_size)
+        << "logical=[" << toDelimitedString(logical()) << "]" << std::endl;
     if (hasRoot()) {
-      ss << "," << std::endl;
       indent(ss, indent_size + 1)
-          << "root=[ " << toDelimitedString(root()) << " ]";
+          << "root=[" << toDelimitedString(root()) << "]" << std::endl;
     }
-    ss << "," << std::endl;
     indent(ss, indent_size + 1)
-        << "logical=[ " << toDelimitedString(logical()) << " ]";
-    if (!allocation_domain_.empty()) {
-      ss << "," << std::endl;
+        << "loop=[" << toDelimitedString(loop()) << "]" << std::endl;
+    if (hasAllocation()) {
       indent(ss, indent_size + 1)
-          << "allocation=[ " << toDelimitedString(allocation()) << " ]";
+          << "allocation=[" << toDelimitedString(allocation()) << "]"
+          << std::endl;
     }
   }
   return ss.str();
diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp
index a0c643e95b4..9c9ca068689 100644
--- a/tests/cpp/test_sharding.cpp
+++ b/tests/cpp/test_sharding.cpp
@@ -23,26 +23,41 @@ namespace nvfuser {
 
 using ShardingTest = NVFuserFixtureParamTest<bool>;
 
-// TODO: This test checks that isSharded generates an error when a split/merged
-// axis is parallelized with DIDx. Update when this restriction is lifted.
-TEST_F(ShardingTest, IsSharded) {
+TEST_F(ShardingTest, LogicalIsSharded) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
-  TensorView* a = makeSymbolicTensor(3);
-  a->axis(2)->parallelize(ParallelType::DIDx);
-  a->split(0, 4);
-  EXPECT_TRUE(isSharded(a)) << "DIDx on logical domain";
+  TensorView* x = makeSymbolicTensor(3);
+  x->axis(2)->parallelize(ParallelType::DIDx);
+  x->split(0, 4);
 
-  TensorView* b = makeSymbolicTensor(3);
-  b->split(1, 4);
-  b->axis(1)->parallelize(ParallelType::DIDx);
-  EXPECT_TRUE(isSharded(b)) << "DIDx on loop domain";
+  EXPECT_TRUE(isSharded(x)) << "DIDx on logical domain:" << std::endl
+                            << x->domain()->toString(0, /*loop_only=*/false);
+}
+
+TEST_F(ShardingTest, AllocationIsSharded) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
+
+  TensorView* x = makeSymbolicTensor(3);
+  x->split(1, 4);
+  x->axis(1)->parallelize(ParallelType::DIDx);
+  x->setAllocationDomain(x->getLoopDomain(), true);
+
+  EXPECT_TRUE(isSharded(x)) << "DIDx on allocation domain:" << std::endl
+                            << x->domain()->toString(0, /*loop_only=*/false);
+}
+
+TEST_F(ShardingTest, MultipleDIDx) {
+  Fusion fusion;
+  FusionGuard fg(&fusion);
 
-  TensorView* c = makeSymbolicTensor(3);
-  c->axis(0)->parallelize(ParallelType::DIDx);
-  c->axis(1)->parallelize(ParallelType::DIDx);
-  EXPECT_ANY_THROW(isSharded(c)) << "Multiple DIDx";
+  TensorView* x = makeSymbolicTensor(3);
+  x->axis(0)->parallelize(ParallelType::DIDx);
+  x->axis(1)->parallelize(ParallelType::DIDx);
+  EXPECT_ANY_THROW(isSharded(x))
+      << "Multiple DIDx:" << std::endl
+      << x->domain()->toString(0, /*loop_only=*/false);
 }
 
 TEST_F(ShardingTest, PropagateSharding) {
"IterVisitor::getInputsTo shouldn't return empty unless `of` is empty."); + NVF_ERROR( + inputs.size() == 1, + "Failed to find the single logical input to ", + alloc_id, + ". This is likely because there's a Merge expression from logical to allocation, which isn't supported. Inputs are: ", + toDelimitedString(inputs)); + + const auto iter = std::find( + tv->getLogicalDomain().begin(), + tv->getLogicalDomain().end(), + inputs[0]); + NVF_ERROR( + iter != tv->getLogicalDomain().end(), + "The found input IterDomain isn't logical. This is likely because logical doesn't dominate allocation: ", + inputs[0]); + + // Count the number of non-reduction IterDomains before `iter`. Reduction + // IterDomains are not materialized in the at::Tensor's shape. + const auto index = std::count_if( + tv->getLogicalDomain().begin(), iter, [](IterDomain* id) -> bool { + return !id->isReduction(); + }); + unsharded_sizes.at(index) *= tv->getDeviceMesh().size(parallel_type); + } + + return unsharded_sizes; +} + int64_t numDeviceDims(const TensorView* tv) { return std::count_if( tv->getLoopDomain().begin(), diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 12013e918b4..b4f25f0df6c 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -7,6 +7,8 @@ // clang-format on #pragma once +#include + #include #include #include @@ -127,4 +129,31 @@ int64_t getShardedAxis(TensorView*); // Reorders a TensorView so that the DID parallelized axis are in front. void reorderDIDToFront(TensorView*); + +// Given a TensorView and the shape of a sharded tensor of which certain +// dimensions are partially alloated, returns the global shape that'll be used +// to bind to the TensorView's logical domain. This is to solve #3282 so we can +// bind a sharded tensor to a TensorView that has a DID-parallel loop domain. +// +// For example, when `tv` is +// logical: iM, iN +// allocation: iDIDx{D}, iN/D, iM +// and `sizes` is [2, 3], the returned shape will be [2, 3D]. This is because, +// according to the allocation domain, iM is fully allocated and iN is sharded +// and thus partially allocated. +// +// As a degenerate case, it's fine to call this function with a non-sharded +// TensorView and tensor. +// +// Limitations: +// - The function assumes that there are no Merges from logical to the +// DID-parallel IterDomains in allocation. Otherwise, it's unclear which logical +// dimension this DID-parallelization should be attributed to. +// - The function assumes that all Splits from logical to the DID-parallel +// IterDomains in allocation are even. This is because there are currently no +// ways to pass in the global shape without an API overhaul. 
From 27689708c7e15f91c9ac1383f434284464bf1990 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Wed, 27 Nov 2024 11:17:17 -0800
Subject: [PATCH 09/17] Comment

---
 csrc/multidevice/utils.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h
index b4f25f0df6c..50f0223a034 100644
--- a/csrc/multidevice/utils.h
+++ b/csrc/multidevice/utils.h
@@ -151,7 +151,20 @@ void reorderDIDToFront(TensorView*);
 // dimension this DID-parallelization should be attributed to.
 // - The function assumes that all Splits from logical to the DID-parallel
 // IterDomains in allocation are even. This is because there are currently no
-// ways to pass in the global shape without an API overhaul.
+// ways to pass in the global shape.
+//
+// Despite these limitations, I took this approach as a shortcut to fix #3282,
+// which blocked many other tasks. I'm however open to other better, long-term
+// solutions. Some alternatives considered in #3282 are:
+// - Try to bind `at::Tensor`s to allocation domains instead of logical. Many
+// `*Op::evaluate` methods (e.g.
+// https://github.com/NVIDIA/Fuser/blob/2415d904d1e9a5da7ca6fb1a55d3045bbd510341/csrc/ir/nodes.cpp#L4321-L4329)
+// assume the input/output `at::Tensor`s have the same dimension order as the
+// logical domain. Doing so would have to change them all.
+// - Try to pass into FusionExecutorCache both logical (global) shapes and
+// allocated (local) tensors for sharded TensorViews. The logical shapes would
+// have to be passed through FusionKernelRuntime, FusionExecutor,
+// ExpressionEvaluator, and so on, which is an API overhaul.
 std::vector<int64_t> unshardedSizes(
     const TensorView* tv,
     c10::IntArrayRef sizes);
From 400684ea2dc7294f18b4e3609764c16e28f3945a Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Wed, 27 Nov 2024 11:32:15 -0800
Subject: [PATCH 10/17] Resolve two fixmes

---
 csrc/fusion_segmenter.cpp | 4 +++-
 csrc/transform_replay.cpp | 1 -
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/csrc/fusion_segmenter.cpp b/csrc/fusion_segmenter.cpp
index 4d7cb4e693b..c98543a179a 100644
--- a/csrc/fusion_segmenter.cpp
+++ b/csrc/fusion_segmenter.cpp
@@ -1884,7 +1884,9 @@ void eraseInputDistinctRootDomains(Fusion* fusion) {
       new_alloc.reserve(tv->getAllocationDomain().size());
       for (IterDomain* alloc_id : tv->getAllocationDomain()) {
         IterDomain* new_alloc_id = replay.getReplay().at(alloc_id);
-        // FIXME: should this be taken care of by ReplayTransformations?
+        // ReplayTransformations replays transforms but not parallelization, so
+        // we have to manually parallelize the new allocation ID. In other
+        // places, parallelization is usually done through parallelizeAllLike.
         new_alloc_id->parallelize(alloc_id->getParallelType());
         new_alloc.push_back(new_alloc_id);
       }
diff --git a/csrc/transform_replay.cpp b/csrc/transform_replay.cpp
index 0ac2a9e97a4..6c58d83528a 100644
--- a/csrc/transform_replay.cpp
+++ b/csrc/transform_replay.cpp
@@ -775,7 +775,6 @@ std::pair<TensorDomain*, int64_t> TransformReplay::replayCasP(
         // AllocationDomainTest.CacheBefore.
         if (auto it = p2c_map.find(alloc_id); it != p2c_map.end()) {
           IterDomain* new_alloc_id = it->second;
-          // FIXME: should this be taken care of by ReplayTransformations?
           new_alloc_id->parallelize(alloc_id->getParallelType());
           new_allocation_domain.push_back(new_alloc_id);
           new_contiguity.push_back(producer->getContiguity()[i]);
From 261b831b1b0e6ecdfefa2bb49142418e846678b3 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 28 Nov 2024 21:49:38 -0800
Subject: [PATCH 11/17] Harden a test

---
 tests/cpp/test_sharding.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp
index e4ce3f67ebd..62d14cf43d5 100644
--- a/tests/cpp/test_sharding.cpp
+++ b/tests/cpp/test_sharding.cpp
@@ -52,9 +52,12 @@ TEST_F(ShardingTest, MultipleDIDx) {
   Fusion fusion;
   FusionGuard fg(&fusion);
 
-  TensorView* x = makeSymbolicTensor(3);
+  TensorView* x = makeSymbolicTensor(1);
+  x->split(0, 2);
   x->axis(0)->parallelize(ParallelType::DIDx);
   x->axis(1)->parallelize(ParallelType::DIDx);
+  x->setAllocationDomain(x->getLoopDomain(), true);
+
   EXPECT_ANY_THROW(isSharded(x))
       << "Multiple DIDx:" << std::endl
       << x->domain()->toString(0, /*loop_only=*/false);
From c9d748e34b2a2b46f7f3f1a654f96322291e927e Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Thu, 28 Nov 2024 22:14:28 -0800
Subject: [PATCH 12/17] Apply suggestions from code review

Co-authored-by: samnordmann
---
 csrc/multidevice/utils.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h
index 50f0223a034..8e240e6d5a2 100644
--- a/csrc/multidevice/utils.h
+++ b/csrc/multidevice/utils.h
@@ -131,7 +131,7 @@ void reorderDIDToFront(TensorView*);
 
 // Given a TensorView and the shape of a sharded tensor of which certain
-// dimensions are partially alloated, returns the global shape that'll be used
+// dimensions are partially allocated, returns the global shape that'll be used
 // to bind to the TensorView's logical domain. This is to solve #3282 so we can
 // bind a sharded tensor to a TensorView that has a DID-parallel loop domain.
@@ -142,8 +142,7 @@ void reorderDIDToFront(TensorView*);
 // according to the allocation domain, iM is fully allocated and iN is sharded
 // and thus partially allocated.
 //
-// As a degenerate case, it's fine to call this function with a non-sharded
-// TensorView and tensor.
+// If the TensorView is not sharded, this function returns `sizes`
 //
 // Limitations:
 // - The function assumes that there are no Merges from logical to the
new_alloc_id->parallelize(alloc_id->getParallelType()); new_allocation_domain.push_back(new_alloc_id); new_contiguity.push_back(producer->getContiguity()[i]); From 261b831b1b0e6ecdfefa2bb49142418e846678b3 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 28 Nov 2024 21:49:38 -0800 Subject: [PATCH 11/17] Harden a test --- tests/cpp/test_sharding.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/cpp/test_sharding.cpp b/tests/cpp/test_sharding.cpp index e4ce3f67ebd..62d14cf43d5 100644 --- a/tests/cpp/test_sharding.cpp +++ b/tests/cpp/test_sharding.cpp @@ -52,9 +52,12 @@ TEST_F(ShardingTest, MultipleDIDx) { Fusion fusion; FusionGuard fg(&fusion); - TensorView* x = makeSymbolicTensor(3); + TensorView* x = makeSymbolicTensor(1); + x->split(0, 2); x->axis(0)->parallelize(ParallelType::DIDx); x->axis(1)->parallelize(ParallelType::DIDx); + x->setAllocationDomain(x->getLoopDomain(), true); + EXPECT_ANY_THROW(isSharded(x)) << "Multiple DIDx:" << std::endl << x->domain()->toString(0, /*loop_only=*/false); From c9d748e34b2a2b46f7f3f1a654f96322291e927e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 28 Nov 2024 22:14:28 -0800 Subject: [PATCH 12/17] Apply suggestions from code review Co-authored-by: samnordmann --- csrc/multidevice/utils.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 50f0223a034..8e240e6d5a2 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -131,7 +131,7 @@ int64_t getShardedAxis(TensorView*); void reorderDIDToFront(TensorView*); // Given a TensorView and the shape of a sharded tensor of which certain -// dimensions are partially alloated, returns the global shape that'll be used +// dimensions are partially allocated, returns the global shape that'll be used // to bind to the TensorView's logical domain. This is to solve #3282 so we can // bind a sharded tensor to a TensorView that has a DID-parallel loop domain. // @@ -142,8 +142,7 @@ void reorderDIDToFront(TensorView*); // according to the allocation domain, iM is fully allocated and iN is sharded // and thus partially allocated. // -// As a degenerate case, it's fine to call this function with a non-sharded -// TensorView and tensor. +// If the TensorView is not sharded, this function returns `sizes` // // Limitations: // - The function assumes that there are no Merges from logical to the From b6e3f40cabbd50a260562b93159f7995be6c923e Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Thu, 28 Nov 2024 22:22:58 -0800 Subject: [PATCH 13/17] Lint --- csrc/multidevice/utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/multidevice/utils.h b/csrc/multidevice/utils.h index 8e240e6d5a2..5be2e11bd15 100644 --- a/csrc/multidevice/utils.h +++ b/csrc/multidevice/utils.h @@ -142,7 +142,7 @@ void reorderDIDToFront(TensorView*); // according to the allocation domain, iM is fully allocated and iN is sharded // and thus partially allocated. // -// If the TensorView is not sharded, this function returns `sizes` +// If the TensorView is not sharded, this function returns `sizes`. 
From 61260cb1dcface672ebb6164e6b53f9a77f04b66 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 29 Nov 2024 17:37:08 -0800
Subject: [PATCH 14/17] Add a test with reordering

---
 csrc/tensor_metadata.cpp                | 28 ++++++++++++-----
 tests/cpp/test_multidevice_sharding.cpp | 40 ++++++++++++++++++++++++-
 2 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index 5bb67cf1b9d..5a64fe5cae7 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -37,6 +37,7 @@ class ForwardTraverseFromLogicalToAlloc {
       // TODO: see [Allocation domain on both side of logical]
       return;
     }
+
     auto [in_size, in_stride] = in_it->second;
     auto factor = ee_.evaluate(split->factor()).as<int64_t>();
     NVF_ERROR(
@@ -44,17 +45,24 @@ class ForwardTraverseFromLogicalToAlloc {
         "The logical domain and allocation domain of fusion input/output ",
         "tensors must be a one-to-one map, therefore, ",
         "non-divisible split is not allowed in allocation domain");
+
+    int64_t inner_size;
+    int64_t outer_size;
+    if (split->innerSplit()) {
+      inner_size = factor;
+      outer_size = in_size / factor;
+    } else {
+      outer_size = factor;
+      inner_size = in_size / factor;
+    }
+
     NVF_ERROR(active_ids_.erase(in) == 1);
+    NVF_ERROR(active_ids_.emplace(inner, std::make_pair(inner_size, in_stride))
+                  .second);
     NVF_ERROR(
         active_ids_
-            .emplace(inner, std::pair<int64_t, int64_t>{factor, in_stride})
+            .emplace(outer, std::make_pair(outer_size, in_stride * inner_size))
             .second);
-    NVF_ERROR(active_ids_
-                  .emplace(
-                      outer,
-                      std::pair<int64_t, int64_t>{
-                          in_size / factor, in_stride * factor})
-                  .second);
   }
 
 void handle(Merge* merge) {
@@ -259,6 +267,10 @@ void validateAllocationSizesAndStrides(
         "Stride mismatch with contiguity info. ",
         " allocation domain: ",
         ir_utils::toString(alloc_dom),
+        ": sizes: ",
+        sizes,
+        ": strides: ",
+        strides,
         "; contiguity: ",
         toDelimitedString(contiguity),
         "; dim: ",
@@ -287,7 +299,7 @@ inferAndValidateAllocationSizesAndStrides(
   std::unordered_map<IterDomain*, std::pair<int64_t, int64_t>> active_ids;
   int64_t dim_index = 0;
   for (IterDomain* id : TensorDomain::noReductions(logical)) {
-    active_ids[id] = {logical_sizes[dim_index], tensor.stride(dim_index)};
+    active_ids[id] = {logical_sizes.at(dim_index), tensor.stride(dim_index)};
     dim_index++;
   }
   NVF_ERROR(dim_index == tensor.dim());
diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp
index 4b1f605313d..3adac90bc5e 100644
--- a/tests/cpp/test_multidevice_sharding.cpp
+++ b/tests/cpp/test_multidevice_sharding.cpp
@@ -340,7 +340,7 @@ TEST_F(MultiDeviceTest, Transpose) {
       UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose)));
 }
 
-TEST_F(MultiDeviceTest, ParallelizeLoopSplit) {
+TEST_F(MultiDeviceTest, LoopSplit) {
   auto fusion = std::make_unique<Fusion>();
   FusionGuard fg(fusion.get());
 
@@ -372,6 +372,44 @@ TEST_F(MultiDeviceTest, LoopSplit) {
       __FILE__);
 }
 
+TEST_F(MultiDeviceTest, LoopSplitWithReorder) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  const auto num_devices = communicator_->size();
+  auto mesh = DeviceMesh::createForNumDevices(num_devices);
+
+  TensorView* in = makeContigConcreteTensor({2, num_devices * 3});
+  in->setDeviceMesh(mesh);
+  fusion->addInput(in);
+
+  TensorView* out = set(in);
+  fusion->addOutput(out);
+
+  // logical: i{2}, i{3D}
+  // allocation: iDIDx{D}, i{3}, i{2}
+  in->split(1, num_devices, /*inner_split=*/false);
+  in->reorder({{0, -1}});
+  in->axis(0)->parallelize(ParallelType::DIDx);
+  in->setAllocationDomain(in->getLoopDomain(), true);
+
+  out->split(1, num_devices, /*inner_split=*/false);
+  out->axis(1)->parallelize(ParallelType::DIDx);
+  out->setAllocationDomain(out->getLoopDomain(), true);
+
+  at::Tensor in_tensor = at::randn({3, 2}, tensor_options).t();
+  FusionExecutorCache executor_cache(std::move(fusion));
+  at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0];
+
+  testValidate(
+      executor_cache.fusion(),
+      {out_tensor},
+      {in_tensor},
+      {in_tensor},
+      __LINE__,
+      __FILE__);
+}
+
 class MultiDeviceBroadcastTest
     : public MultiDeviceTest,
       public testing::WithParamInterface<bool> {};
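The tensor_metadata.cpp change above generalizes the (size, stride) propagation through a Split to outer splits: the old code always treated `factor` as the inner extent, which is wrong when `inner_split` is false. A standalone sketch of the corrected rule follows (the names below are this sketch's own, not nvFuser's):

#include <cassert>
#include <cstdint>
#include <utility>

using SizeStride = std::pair<int64_t, int64_t>;

// Propagates the (size, stride) of a dimension through a split. Returns
// {outer, inner}: the inner output stays contiguous within the input (keeps
// in_stride); the outer output strides over a whole inner extent.
std::pair<SizeStride, SizeStride> splitSizeStride(
    int64_t in_size, int64_t in_stride, int64_t factor, bool inner_split) {
  assert(in_size % factor == 0); // only even (divisible) splits are allowed
  const int64_t inner_size = inner_split ? factor : in_size / factor;
  const int64_t outer_size = inner_split ? in_size / factor : factor;
  return {{outer_size, in_stride * inner_size}, {inner_size, in_stride}};
}

int main() {
  // split(0, D, /*inner_split=*/false) with D=4 on a size-12, stride-1
  // dimension: the outer (DIDx) output is {4, 3}, the inner one {3, 1}.
  const auto [outer, inner] = splitSizeStride(12, 1, /*factor=*/4, false);
  assert(outer.first == 4 && outer.second == 3);
  assert(inner.first == 3 && inner.second == 1);
  return 0;
}
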
", " allocation domain: ", ir_utils::toString(alloc_dom), + ": sizes: ", + sizes, + ": strides: ", + strides, "; contiguity: ", toDelimitedString(contiguity), "; dim: ", @@ -287,7 +299,7 @@ inferAndValidateAllocationSizesAndStrides( std::unordered_map> active_ids; int64_t dim_index = 0; for (IterDomain* id : TensorDomain::noReductions(logical)) { - active_ids[id] = {logical_sizes[dim_index], tensor.stride(dim_index)}; + active_ids[id] = {logical_sizes.at(dim_index), tensor.stride(dim_index)}; dim_index++; } NVF_ERROR(dim_index == tensor.dim()); diff --git a/tests/cpp/test_multidevice_sharding.cpp b/tests/cpp/test_multidevice_sharding.cpp index 4b1f605313d..3adac90bc5e 100644 --- a/tests/cpp/test_multidevice_sharding.cpp +++ b/tests/cpp/test_multidevice_sharding.cpp @@ -340,7 +340,7 @@ TEST_F(MultiDeviceTest, Transpose) { UnorderedElementsAre(HeuristicIs(SchedulerType::Transpose))); } -TEST_F(MultiDeviceTest, ParallelizeLoopSplit) { +TEST_F(MultiDeviceTest, LoopSplit) { auto fusion = std::make_unique(); FusionGuard fg(fusion.get()); @@ -372,6 +372,44 @@ TEST_F(MultiDeviceTest, ParallelizeLoopSplit) { __FILE__); } +TEST_F(MultiDeviceTest, LoopSplitWithReorder) { + auto fusion = std::make_unique(); + FusionGuard fg(fusion.get()); + + const auto num_devices = communicator_->size(); + auto mesh = DeviceMesh::createForNumDevices(num_devices); + + TensorView* in = makeContigConcreteTensor({2, num_devices * 3}); + in->setDeviceMesh(mesh); + fusion->addInput(in); + + TensorView* out = set(in); + fusion->addOutput(out); + + // logical: i{2}, i{3D} + // allocation: iDIDx{D}, i{3}, i{2} + in->split(1, num_devices, /*inner_split=*/false); + in->reorder({{0, -1}}); + in->axis(0)->parallelize(ParallelType::DIDx); + in->setAllocationDomain(in->getLoopDomain(), true); + + out->split(1, num_devices, /*inner_split=*/false); + out->axis(1)->parallelize(ParallelType::DIDx); + out->setAllocationDomain(out->getLoopDomain(), true); + + at::Tensor in_tensor = at::randn({3, 2}, tensor_options).t(); + FusionExecutorCache executor_cache(std::move(fusion)); + at::Tensor out_tensor = executor_cache.runFusionWithInputs({in_tensor})[0]; + + testValidate( + executor_cache.fusion(), + {out_tensor}, + {in_tensor}, + {in_tensor}, + __LINE__, + __FILE__); +} + class MultiDeviceBroadcastTest : public MultiDeviceTest, public testing::WithParamInterface {}; From 2daf5b2d28e55430a712a57edef73dd9ed0e3b0c Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Fri, 29 Nov 2024 18:48:59 -0800 Subject: [PATCH 15/17] Lint --- csrc/tensor_metadata.cpp | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp index 5a64fe5cae7..f60f9842ced 100644 --- a/csrc/tensor_metadata.cpp +++ b/csrc/tensor_metadata.cpp @@ -39,22 +39,20 @@ class ForwardTraverseFromLogicalToAlloc { } auto [in_size, in_stride] = in_it->second; - auto factor = ee_.evaluate(split->factor()).as(); - NVF_ERROR( - in_size % factor == 0, - "The logical domain and allocation domain of fusion input/output ", - "tensors must be a one-to-one map, therefore, ", - "non-divisible split is not allowed in allocation domain"); - int64_t inner_size; - int64_t outer_size; - if (split->innerSplit()) { - inner_size = factor; - outer_size = in_size / factor; - } else { - outer_size = factor; - inner_size = in_size / factor; - } + auto [outer_size, inner_size] = [&]() { + const auto factor = ee_.evaluate(split->factor()).as(); + NVF_ERROR( + in_size % factor == 0, + "The logical domain and allocation domain 
From d4532e6c93415e2d21ce80c75f8c1407bd8a1410 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 29 Nov 2024 19:06:57 -0800
Subject: [PATCH 16/17] Revert "Lint"

This reverts commit 2daf5b2d28e55430a712a57edef73dd9ed0e3b0c.
---
 csrc/tensor_metadata.cpp | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index f60f9842ced..5a64fe5cae7 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -39,20 +39,22 @@ class ForwardTraverseFromLogicalToAlloc {
     }
 
     auto [in_size, in_stride] = in_it->second;
+    auto factor = ee_.evaluate(split->factor()).as<int64_t>();
+    NVF_ERROR(
+        in_size % factor == 0,
+        "The logical domain and allocation domain of fusion input/output ",
+        "tensors must be a one-to-one map, therefore, ",
+        "non-divisible split is not allowed in allocation domain");
 
-    auto [outer_size, inner_size] = [&]() {
-      const auto factor = ee_.evaluate(split->factor()).as<int64_t>();
-      NVF_ERROR(
-          in_size % factor == 0,
-          "The logical domain and allocation domain of fusion input/output ",
-          "tensors must be a one-to-one map, therefore, ",
-          "non-divisible split is not allowed in allocation domain");
-      if (split->innerSplit()) {
-        return std::make_pair(in_size / factor, factor);
-      } else {
-        return std::make_pair(factor, in_size / factor);
-      }
-    }();
+    int64_t inner_size;
+    int64_t outer_size;
+    if (split->innerSplit()) {
+      inner_size = factor;
+      outer_size = in_size / factor;
+    } else {
+      outer_size = factor;
+      inner_size = in_size / factor;
+    }

From a5f6eb36819d62d03a2f9a363040bc25398037c6 Mon Sep 17 00:00:00 2001
From: Jingyue Wu
Date: Fri, 29 Nov 2024 19:07:14 -0800
Subject: [PATCH 17/17] Fix lint

---
 csrc/tensor_metadata.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/csrc/tensor_metadata.cpp b/csrc/tensor_metadata.cpp
index 5a64fe5cae7..96f1d85fd2d 100644
--- a/csrc/tensor_metadata.cpp
+++ b/csrc/tensor_metadata.cpp
@@ -46,11 +46,11 @@ class ForwardTraverseFromLogicalToAlloc {
         "tensors must be a one-to-one map, therefore, ",
         "non-divisible split is not allowed in allocation domain");
 
-    int64_t inner_size;
-    int64_t outer_size;
+    int64_t inner_size = 0;
+    int64_t outer_size = 0;
     if (split->innerSplit()) {
-      inner_size = factor;
       outer_size = in_size / factor;
+      inner_size = factor;
     } else {
       outer_size = factor;
       inner_size = in_size / factor;
     }