From 8ac306c05af9129446dc584a9823f85dc0775548 Mon Sep 17 00:00:00 2001
From: Umang Yadav <umayadav@amd.com>
Date: Fri, 27 Sep 2024 13:43:51 +0000
Subject: [PATCH 1/4] move rocdl-attach-target before convert-gpu-to-rocdl,
 read chipset from attached target during lowering

---
 .../Conversion/GPUToROCDL/GPUToROCDLPass.h    |  1 -
 .../mlir/include/mlir/Conversion/Passes.td    |  3 --
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 44 ++++++++++++-------
 mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp | 19 ++++----
 mlir/test/rocmlir-driver/pipelines.mlir       | 12 ++---
 5 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 564778771299..7def4a1dd5e8 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -42,7 +42,6 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
 /// is configurable.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
-    const std::string &chipset = "gfx900",
     unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
     bool useBarePtrCallConv = false,
     gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
index e52c7ff6bd56..cfed275daf4c 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
+++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
@@ -591,9 +591,6 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
     "memref::MemRefDialect",
   ];
   let options = [
-    Option<"chipset", "chipset", "std::string",
-           /*default=*/"\"gfx000\"",
-           "Chipset that these operations will run on">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
            "Bitwidth of the index type, 0 to use size of machine word">,
diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 36fbf80c8156..e8b96227f18f 100644
--- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -14,6 +14,8 @@
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/Location.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
@@ -203,11 +205,8 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
 struct LowerGpuOpsToROCDLOpsPass
     : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
   LowerGpuOpsToROCDLOpsPass() = default;
-  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
-                            bool useBarePtrCallConv,
+  LowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, bool useBarePtrCallConv,
                             gpu::amd::Runtime runtime) {
-    if (this->chipset.getNumOccurrences() == 0)
-      this->chipset = chipset;
     if (this->indexBitwidth.getNumOccurrences() == 0)
       this->indexBitwidth = indexBitwidth;
     if (this->useBarePtrCallConv.getNumOccurrences() == 0)
@@ -219,6 +218,26 @@ struct LowerGpuOpsToROCDLOpsPass
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
+    ArrayAttr targets = m.getTargetsAttr();
+    FailureOr<amdgpu::Chipset> maybeChipset;
+    if (!targets) {
+      emitError(UnknownLoc::get(ctx), "ROCDLTargetAttr is empty on GPU module");
+      return signalPassFailure();
+    }
+    if (targets.size() != 1) {
+      emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified "
+                                      "more than one gpu-arch on GPU module");
+      return signalPassFailure();
+    } else {
+      const ROCDL::ROCDLTargetAttr targetAttr =
+          mlir::dyn_cast<ROCDL::ROCDLTargetAttr>(targets.getValue().front());
+      maybeChipset = amdgpu::Chipset::parse(targetAttr.getChip());
+      if (failed(maybeChipset)) {
+        emitError(UnknownLoc::get(ctx),
+                  "Invalid chipset name: " + targetAttr.getChip());
+        return signalPassFailure();
+      }
+    }
 
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(
         LLVM::LLVMDialect::getDataLayoutAttrName());
@@ -232,12 +251,6 @@ struct LowerGpuOpsToROCDLOpsPass
                     UnitAttr::get(ctx));
     }
 
-    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
-    if (failed(maybeChipset)) {
-      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
-      return signalPassFailure();
-    }
-
     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(
         ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
@@ -337,8 +350,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
                       LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
   // These ops are legal for f32 type.
   target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
-    return any_of(op->getOperandTypes(),
-                  llvm::IsaPred<Float32Type>);
+    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
   });
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
@@ -350,7 +362,8 @@ static void populateOpPatterns(LLVMTypeConverter &converter,
                                RewritePatternSet &patterns, StringRef f32Func,
                                StringRef f64Func, StringRef f16Func) {
   patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func, f16Func);
+  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
+                                           f16Func);
 }
 
 void mlir::populateGpuToROCDLConversionPatterns(
@@ -399,10 +412,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
 }
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
-                                      unsigned indexBitwidth,
+mlir::createLowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth,
                                       bool useBarePtrCallConv,
                                       gpu::amd::Runtime runtime) {
   return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
-      chipset, indexBitwidth, useBarePtrCallConv, runtime);
+      indexBitwidth, useBarePtrCallConv, runtime);
 }
diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
index adfaff9141ed..e08e5478e979 100644
--- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
+++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
@@ -236,23 +236,24 @@ void rock::buildBackendPipeline(OpPassManager &pm,
   // We need to lower affine again, because the expand strided metadata pass
   // adds back affine.apply for memref.subview
   gpuPm.addPass(createLowerAffinePass());
-  gpuPm.addPass(createLowerGpuOpsToROCDLOpsPass(
-      options.chip, /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
+  GpuROCDLAttachTargetOptions opts;
+  opts.triple = options.triple;
+  opts.chip = options.chip;
+  opts.features = options.features;
+  opts.optLevel = options.optLevel;
+  pm.addPass(createGpuROCDLAttachTarget(opts));
+  auto &gpuPm2 = pm.nest<gpu::GPUModuleOp>();
+  gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass(
+      /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
       /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP));
   // Ensure we only run passes on LLVM functions inside GPU modules.
-  auto &llvmFuncPm = gpuPm.nest<LLVM::LLVMFuncOp>();
+  auto &llvmFuncPm = gpuPm2.nest<LLVM::LLVMFuncOp>();
   // -canonicalize -cse so that we don't have to crawl through memref
   // descriptors. (Mainly we want the `extractvalue` fold).
   llvmFuncPm.addPass(createCanonicalizerPass());
   llvmFuncPm.addPass(createCSEPass());
   llvmFuncPm.addPass(rock::createRockPrepareLLVMPass());
   if (options.compile) {
-    GpuROCDLAttachTargetOptions opts;
-    opts.triple = options.triple;
-    opts.chip = options.chip;
-    opts.features = options.features;
-    opts.optLevel = options.optLevel;
-    pm.addPass(createGpuROCDLAttachTarget(opts));
     pm.addPass(createGpuModuleToBinaryPass());
     pm.addPass(createRockCheckResidencyPass());
   }
diff --git a/mlir/test/rocmlir-driver/pipelines.mlir b/mlir/test/rocmlir-driver/pipelines.mlir
index 192932a1624f..9c10440e2705 100644
--- a/mlir/test/rocmlir-driver/pipelines.mlir
+++ b/mlir/test/rocmlir-driver/pipelines.mlir
@@ -50,12 +50,12 @@
 // BINARY-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx90a saturate-fp8-truncf=true},
 // BINARY-NEXT:emulate-fp8-ext-trunc,
 // BINARY-NEXT:expand-strided-metadata,
-// BINARY-NEXT:lower-affine,
-// BINARY-NEXT:convert-gpu-to-rocdl{chipset=gfx90a index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY-NEXT:lower-affine),
+// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY-NEXT:cse,
 // BINARY-NEXT:rock-prepare-llvm)),
-// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
 // BINARY-NEXT:gpu-module-to-binary{format=fatbin  opts= toolkit=},
 // BINARY-NEXT:rock-check-residency,
 // BINARY-NEXT:emulate-fp8-ext-trunc)
@@ -69,12 +69,12 @@
 // BINARY_MI300-NEXT:f8E5M2} target-type=f32},
 // BINARY_MI300-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx940 saturate-fp8-truncf=true},
 // BINARY_MI300-NEXT:expand-strided-metadata,
-// BINARY_MI300-NEXT:lower-affine,
-// BINARY_MI300-NEXT:convert-gpu-to-rocdl{chipset=gfx940 index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY_MI300-NEXT:lower-affine),
+// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY_MI300-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY_MI300-NEXT:cse,
 // BINARY_MI300-NEXT:rock-prepare-llvm)),
-// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
 // BINARY_MI300-NEXT:gpu-module-to-binary{format=fatbin  opts= toolkit=},
 // BINARY_MI300-NEXT:rock-check-residency,
 // BINARY_MI300-NEXT:emulate-fp8-ext-trunc)

From 129a0fa7eda55ff0dc35d5ca9a3aad47a2f77a1d Mon Sep 17 00:00:00 2001
From: Umang Yadav <umayadav@amd.com>
Date: Mon, 30 Sep 2024 20:30:35 +0000
Subject: [PATCH 2/4] add "infer" option for backwards compatibility

---
 .../Conversion/GPUToROCDL/GPUToROCDLPass.h    |  1 +
 .../mlir/include/mlir/Conversion/Passes.td    |  3 ++
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 39 ++++++++++++-------
 mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp |  2 +-
 mlir/test/rocmlir-driver/pipelines.mlir       |  4 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

diff --git a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 7def4a1dd5e8..f1233ad894da 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -42,6 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
 /// is configurable.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
+    const std::string &chipset = "infer",
     unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
     bool useBarePtrCallConv = false,
     gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
index cfed275daf4c..62d6f2954a66 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
+++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
@@ -591,6 +591,9 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
     "memref::MemRefDialect",
   ];
   let options = [
+    Option<"chipset", "chipset", "std::string",
+           /*default=*/"\"infer\"",
+           "Chipset that these operations will run on. By default it will infer target from attached Target Attribute on GPU Module">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
            "Bitwidth of the index type, 0 to use size of machine word">,
diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e8b96227f18f..eb2d87f09f98 100644
--- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -14,8 +14,6 @@
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/IR/Diagnostics.h"
-#include "mlir/IR/Location.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
@@ -57,7 +55,6 @@ namespace mlir {
 } // namespace mlir
 
 #include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
-
 using namespace mlir;
 
 /// Returns true if the given `gpu.func` can be safely called using the bare
@@ -205,8 +202,11 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
 struct LowerGpuOpsToROCDLOpsPass
     : public impl::ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
   LowerGpuOpsToROCDLOpsPass() = default;
-  LowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, bool useBarePtrCallConv,
+  LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth,
+                            bool useBarePtrCallConv,
                             gpu::amd::Runtime runtime) {
+    if (this->chipset.getNumOccurrences() == 0)
+      this->chipset = chipset;
     if (this->indexBitwidth.getNumOccurrences() == 0)
       this->indexBitwidth = indexBitwidth;
     if (this->useBarePtrCallConv.getNumOccurrences() == 0)
@@ -220,15 +220,17 @@ struct LowerGpuOpsToROCDLOpsPass
     MLIRContext *ctx = m.getContext();
     ArrayAttr targets = m.getTargetsAttr();
     FailureOr<amdgpu::Chipset> maybeChipset;
-    if (!targets) {
-      emitError(UnknownLoc::get(ctx), "ROCDLTargetAttr is empty on GPU module");
-      return signalPassFailure();
-    }
-    if (targets.size() != 1) {
-      emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified "
-                                      "more than one gpu-arch on GPU module");
-      return signalPassFailure();
-    } else {
+    if (chipset == "infer") {
+      if (!targets) {
+        emitError(UnknownLoc::get(ctx),
+                  "ROCDLTargetAttr is empty on GPU module");
+        return signalPassFailure();
+      }
+      if (targets.size() != 1) {
+        emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified "
+                                        "more than one gpu-arch on GPU module");
+        return signalPassFailure();
+      }
       const ROCDL::ROCDLTargetAttr targetAttr =
           mlir::dyn_cast<ROCDL::ROCDLTargetAttr>(targets.getValue().front());
       maybeChipset = amdgpu::Chipset::parse(targetAttr.getChip());
@@ -237,6 +239,12 @@ struct LowerGpuOpsToROCDLOpsPass
                   "Invalid chipset name: " + targetAttr.getChip());
         return signalPassFailure();
       }
+    } else {
+      maybeChipset = amdgpu::Chipset::parse(chipset);
+      if (failed(maybeChipset)) {
+        emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+        return signalPassFailure();
+      }
     }
 
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(
@@ -412,9 +420,10 @@ void mlir::populateGpuToROCDLConversionPatterns(
 }
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth,
+mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset,
+                                      unsigned indexBitwidth,
                                       bool useBarePtrCallConv,
                                       gpu::amd::Runtime runtime) {
   return std::make_unique<LowerGpuOpsToROCDLOpsPass>(
-      indexBitwidth, useBarePtrCallConv, runtime);
+      chipset, indexBitwidth, useBarePtrCallConv, runtime);
 }
diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
index e08e5478e979..f3e30e8e6743 100644
--- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
+++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
@@ -244,7 +244,7 @@ void rock::buildBackendPipeline(OpPassManager &pm,
   pm.addPass(createGpuROCDLAttachTarget(opts));
   auto &gpuPm2 = pm.nest<gpu::GPUModuleOp>();
   gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass(
-      /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
+      /*chipset=*/"infer", /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
       /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP));
   // Ensure we only run passes on LLVM functions inside GPU modules.
   auto &llvmFuncPm = gpuPm2.nest<LLVM::LLVMFuncOp>();
diff --git a/mlir/test/rocmlir-driver/pipelines.mlir b/mlir/test/rocmlir-driver/pipelines.mlir
index 9c10440e2705..e4336de1cb80 100644
--- a/mlir/test/rocmlir-driver/pipelines.mlir
+++ b/mlir/test/rocmlir-driver/pipelines.mlir
@@ -52,7 +52,7 @@
 // BINARY-NEXT:expand-strided-metadata,
 // BINARY-NEXT:lower-affine),
 // BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
-// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY-NEXT:cse,
 // BINARY-NEXT:rock-prepare-llvm)),
@@ -71,7 +71,7 @@
 // BINARY_MI300-NEXT:expand-strided-metadata,
 // BINARY_MI300-NEXT:lower-affine),
 // BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
-// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY_MI300-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY_MI300-NEXT:cse,
 // BINARY_MI300-NEXT:rock-prepare-llvm)),

From 23e6204a144684f5b4aeb01f48a3fa71d44e1b14 Mon Sep 17 00:00:00 2001
From: Umang Yadav <umayadav@amd.com>
Date: Tue, 1 Oct 2024 11:33:10 +0000
Subject: [PATCH 3/4] refactor chipset parsing based on Daniel's comments

---
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index eb2d87f09f98..417ec10894b7 100644
--- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -219,7 +219,6 @@ struct LowerGpuOpsToROCDLOpsPass
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
     ArrayAttr targets = m.getTargetsAttr();
-    FailureOr<amdgpu::Chipset> maybeChipset;
     if (chipset == "infer") {
       if (!targets) {
         emitError(UnknownLoc::get(ctx),
@@ -233,18 +232,12 @@ struct LowerGpuOpsToROCDLOpsPass
       }
       const ROCDL::ROCDLTargetAttr targetAttr =
           mlir::dyn_cast<ROCDL::ROCDLTargetAttr>(targets.getValue().front());
-      maybeChipset = amdgpu::Chipset::parse(targetAttr.getChip());
-      if (failed(maybeChipset)) {
-        emitError(UnknownLoc::get(ctx),
-                  "Invalid chipset name: " + targetAttr.getChip());
-        return signalPassFailure();
-      }
-    } else {
-      maybeChipset = amdgpu::Chipset::parse(chipset);
-      if (failed(maybeChipset)) {
-        emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
-        return signalPassFailure();
-      }
+      chipset = targetAttr.getChip().str();
+    }
+    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
+    if (failed(maybeChipset)) {
+      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+      return signalPassFailure();
     }
 
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(

From c966d3bb0a23b968047372d32d8461f79cb928b6 Mon Sep 17 00:00:00 2001
From: Umang Yadav <umayadav@amd.com>
Date: Thu, 10 Oct 2024 14:09:27 +0000
Subject: [PATCH 4/4] pull in upstream changes

---
 .../mlir/include/mlir/Conversion/Passes.td    |  2 +-
 .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp      | 16 +++++-----
 .../GPUCommon/lower-memory-space-attrs.mlir   |  2 +-
 .../GPUCommon/memory-attrbution.mlir          |  2 +-
 .../GPUCommon/memref-arg-attrs.mlir           |  2 +-
 .../GPUCommon/memref-arg-noalias-attrs.mlir   |  2 +-
 .../GPUCommon/memref-arg-noalias-warning.mlir |  2 +-
 .../GPUToROCDL/gpu-to-rocdl-hip.mlir          |  2 +-
 .../GPUToROCDL/gpu-to-rocdl-infer-target.mlir | 30 +++++++++++++++++++
 .../GPUToROCDL/gpu-to-rocdl-opencl.mlir       |  2 +-
 .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir   |  4 +--
 .../test/Conversion/GPUToROCDL/memref.mlir    |  4 +--
 .../Integration/GPU/ROCM/gpu-to-hsaco.mlir    |  2 +-
 .../test/Integration/GPU/ROCM/printf.mlir     |  2 +-
 .../Integration/GPU/ROCM/two-modules.mlir     |  2 +-
 .../test/Integration/GPU/ROCM/vecadd.mlir     |  2 +-
 .../GPU/ROCM/vector-transferops.mlir          |  2 +-
 17 files changed, 55 insertions(+), 25 deletions(-)
 create mode 100644 external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir

diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
index 62d6f2954a66..9123b8ef46e6 100644
--- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
+++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td
@@ -593,7 +593,7 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
   let options = [
     Option<"chipset", "chipset", "std::string",
            /*default=*/"\"infer\"",
-           "Chipset that these operations will run on. By default it will infer target from attached Target Attribute on GPU Module">,
+           "Chipset that these operations will run on. By Default it will infer target from attached target attribute on GPU module on which it operates">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
            "Bitwidth of the index type, 0 to use size of machine word">,
diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 417ec10894b7..8dbae392204c 100644
--- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -218,25 +218,25 @@ struct LowerGpuOpsToROCDLOpsPass
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
-    ArrayAttr targets = m.getTargetsAttr();
+
     if (chipset == "infer") {
+      ArrayAttr targets = m.getTargetsAttr();
       if (!targets) {
-        emitError(UnknownLoc::get(ctx),
-                  "ROCDLTargetAttr is empty on GPU module");
+        m->emitError("there are no target attributes to infer");
         return signalPassFailure();
       }
       if (targets.size() != 1) {
-        emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified "
-                                        "more than one gpu-arch on GPU module");
+        m->emitError("expected a single target attribute");
         return signalPassFailure();
       }
-      const ROCDL::ROCDLTargetAttr targetAttr =
-          mlir::dyn_cast<ROCDL::ROCDLTargetAttr>(targets.getValue().front());
+      ROCDL::ROCDLTargetAttr targetAttr =
+          dyn_cast<ROCDL::ROCDLTargetAttr>(targets[0]);
       chipset = targetAttr.getChip().str();
     }
+
     FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
     if (failed(maybeChipset)) {
-      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+      m->emitError("invalid chipset name: " + chipset);
       return signalPassFailure();
     }
 
diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir
index 771f3185904b..a338d35525eb 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {
diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
index 4fc19b8e9364..b1291e07c060 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
-// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='chipset=gfx900' --split-input-file %s | FileCheck --check-prefix=ROCDL %s
 
 gpu.module @kernel {
   // NVVM-LABEL:  llvm.func @private
diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir
index e7c742067b4e..3c3082c47389 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {
diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir
index 33cdc3348e51..d17214d1f229 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {
diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir
index 793df7380d78..ab98be59a2c8 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' -verify-diagnostics
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' -verify-diagnostics
 
 gpu.module @kernel {
 // expected-warning @+1 {{Cannot copy noalias with non-bare pointers.}}
diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
index 1b904fa142ba..3e3b43c6d4f4 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=HIP' -split-input-file | FileCheck %s
 
 gpu.module @test_module {
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
new file mode 100644
index 000000000000..4ef6fd004b13
--- /dev/null
+++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir
@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics | FileCheck --check-prefix=CHECK_TARGET %s
+
+// CHECK_TARGET: @test_module [#rocdl.target<O = 3, chip = "gfx90a">]  attributes {llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"} {
+gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a">] {
+  // CHECK_TARGET-LABEL: @kernel_func
+  // CHECK_TARGET: attributes
+  // CHECK_TARGET: gpu.kernel
+  // CHECK_TARGET: rocdl.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+// expected-error@below {{there are no target attributes to infer}}
+gpu.module @test_module {
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+// expected-error@below {{invalid chipset name: gfx90a,gfx900}}
+gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a,gfx900">] {
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
index 870f5c5016ec..fa01801972d6 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL' | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=OpenCL' | FileCheck %s
 
 gpu.module @test_module {
   // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00")  {addr_space = 4 : i32}
diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
index da54dc836a90..356d21cbf1ba 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
 
 // CHECK-LABEL: @test_module
 // CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir
index e645481c8923..debf899dd687 100644
--- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir
+++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
 // RUN: mlir-opt %s \
-// RUN:   -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true' \
+// RUN:   -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=true' \
 // RUN:   -split-input-file \
 // RUN: | FileCheck %s --check-prefix=BARE
 
diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
index 3c8f3b1d0cbf..edb75ee81224 100644
--- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
+++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir
index d5e6e3757540..e8feeaa69c29 100644
--- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir
+++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir
index d49d3957abbe..d20f71d16280 100644
--- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir
+++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir
index 986d8239427e..0ac391cd5f8e 100644
--- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir
+++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -convert-scf-to-cf \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}))' \
 // RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \
diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
index 575d967dcc9a..417f67e64669 100644
--- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
+++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -convert-scf-to-cf \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{chipset=%chip index-bitwidth=32}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32}))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \