ROCm · umangyadav · Sep 27, 2024 · Sep 30, 2024 · Oct 1, 2024 · Oct 10, 2024
@@ -42,7 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
 /// is configurable.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
-    const std::string &chipset = "gfx900",
+    const std::string &chipset = "infer",
     unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
     bool useBarePtrCallConv = false,
     gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);

@@ -592,8 +592,8 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
   ];
   let options = [
     Option<"chipset", "chipset", "std::string",
-           /*default=*/"\"gfx000\"",
-           "Chipset that these operations will run on">,
+           /*default=*/"\"infer\"",
+           "Chipset that these operations will run on. By default it is inferred from the target attribute attached to the GPU module the pass operates on">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
            "Bitwidth of the index type, 0 to use size of machine word">,

@@ -55,7 +55,6 @@ namespace mlir {
 } // namespace mlir
 
 #include "mlir/Dialect/LLVMIR/Transforms/Passes.h"
-
 using namespace mlir;
 
 /// Returns true if the given `gpu.func` can be safely called using the bare
@@ -220,6 +219,27 @@ struct LowerGpuOpsToROCDLOpsPass
     gpu::GPUModuleOp m = getOperation();
     MLIRContext *ctx = m.getContext();
 
+    // Resolve the chipset into a local string instead of writing the inferred
+    // value back into the `chipset` pass option. A pass instance can be run on
+    // more than one gpu.module; overwriting the option would make later
+    // modules skip inference and reuse the first module's chip.
+    std::string chipsetName = chipset;
+    if (chipsetName == "infer") {
+      ArrayAttr targets = m.getTargetsAttr();
+      if (!targets) {
+        m->emitError("there are no target attributes to infer");
+        return signalPassFailure();
+      }
+      if (targets.size() != 1) {
+        m->emitError("expected a single target attribute");
+        return signalPassFailure();
+      }
+      // The single attached target is not guaranteed to be a ROCDL target;
+      // dyn_cast yields a null attribute in that case, which must be
+      // diagnosed rather than dereferenced.
+      auto targetAttr = dyn_cast<ROCDL::ROCDLTargetAttr>(targets[0]);
+      if (!targetAttr) {
+        m->emitError("expected a ROCDL target attribute to infer the chipset");
+        return signalPassFailure();
+      }
+      chipsetName = targetAttr.getChip().str();
+    }
+
+    // Validate the (explicit or inferred) chipset name before any lowering
+    // state is set up, and report failures on the module being converted.
+    FailureOr<amdgpu::Chipset> maybeChipset =
+        amdgpu::Chipset::parse(chipsetName);
+    if (failed(maybeChipset)) {
+      m->emitError("invalid chipset name: " + chipsetName);
+      return signalPassFailure();
+    }
+
     auto llvmDataLayout = m->getAttrOfType<StringAttr>(
         LLVM::LLVMDialect::getDataLayoutAttrName());
     if (!llvmDataLayout) {
@@ -232,12 +252,6 @@ struct LowerGpuOpsToROCDLOpsPass
                     UnitAttr::get(ctx));
     }
 
-    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
-    if (failed(maybeChipset)) {
-      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
-      return signalPassFailure();
-    }
-
     /// Customize the bitwidth used for the device side index computations.
     LowerToLLVMOptions options(
         ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
@@ -337,8 +351,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
                       LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
   // These ops are legal for f32 type.
   target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
-    return any_of(op->getOperandTypes(),
-                  llvm::IsaPred<Float32Type>);
+    return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
   });
   // TODO: Remove once we support replacing non-root ops.
   target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
@@ -350,7 +363,8 @@ static void populateOpPatterns(LLVMTypeConverter &converter,
                                RewritePatternSet &patterns, StringRef f32Func,
                                StringRef f64Func, StringRef f16Func) {
   patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
-  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func, f16Func);
+  patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
+                                           f16Func);
 }
 
 void mlir::populateGpuToROCDLConversionPatterns(

@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {

@@ -1,5 +1,5 @@
 // RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
-// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='chipset=gfx900' --split-input-file %s | FileCheck --check-prefix=ROCDL %s
 
 gpu.module @kernel {
   // NVVM-LABEL:  llvm.func @private

@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {

@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
 // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,NVVM
 
 gpu.module @kernel {

@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' -verify-diagnostics
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' -verify-diagnostics
 
 gpu.module @kernel {
 // expected-warning @+1 {{Cannot copy noalias with non-bare pointers.}}

@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=HIP' -split-input-file | FileCheck %s
 
 gpu.module @test_module {
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")

@@ -0,0 +1,30 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics | FileCheck --check-prefix=CHECK_TARGET %s
+
+// CHECK_TARGET: @test_module [#rocdl.target<O = 3, chip = "gfx90a">]  attributes {llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"} {
+gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a">] {
+  // CHECK_TARGET-LABEL: @kernel_func
+  // CHECK_TARGET: attributes
+  // CHECK_TARGET: gpu.kernel
+  // CHECK_TARGET: rocdl.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+// expected-error@below {{there are no target attributes to infer}}
+gpu.module @test_module {
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
+
+// -----
+
+// expected-error@below {{invalid chipset name: gfx90a,gfx900}}
+gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a,gfx900">] {
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL' | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=OpenCL' | FileCheck %s
 
 gpu.module @test_module {
   // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00")  {addr_space = 4 : i32}

@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
 
 // CHECK-LABEL: @test_module
 // CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"

@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
 // RUN: mlir-opt %s \
-// RUN:   -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true' \
+// RUN:   -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=true' \
 // RUN:   -split-input-file \
 // RUN: | FileCheck %s --check-prefix=BARE
 

@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \

@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \

@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \

@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -convert-scf-to-cf \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}))' \
 // RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \

@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -convert-scf-to-cf \
 // RUN: | mlir-opt -gpu-kernel-outlining \
-// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{chipset=%chip index-bitwidth=32}),rocdl-attach-target{chip=%chip})' \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32}))' \
 // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_rocm_runtime \

@@ -236,23 +236,24 @@ void rock::buildBackendPipeline(OpPassManager &pm,
   // We need to lower affine again, because the expand strided metadata pass
   // adds back affine.apply for memref.subview
   gpuPm.addPass(createLowerAffinePass());
-  gpuPm.addPass(createLowerGpuOpsToROCDLOpsPass(
-      options.chip, /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
+  GpuROCDLAttachTargetOptions opts;
+  opts.triple = options.triple;
+  opts.chip = options.chip;
+  opts.features = options.features;
+  opts.optLevel = options.optLevel;
+  pm.addPass(createGpuROCDLAttachTarget(opts));
+  auto &gpuPm2 = pm.nest<gpu::GPUModuleOp>();
+  gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass(
+      /*chipset=*/"infer", /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
       /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP));
   // Ensure we only run passes on LLVM functions inside GPU modules.
-  auto &llvmFuncPm = gpuPm.nest<LLVM::LLVMFuncOp>();
+  auto &llvmFuncPm = gpuPm2.nest<LLVM::LLVMFuncOp>();
   // -canonicalize -cse so that we don't have to crawl through memref
   // descriptors. (Mainly we want the `extractvalue` fold).
   llvmFuncPm.addPass(createCanonicalizerPass());
   llvmFuncPm.addPass(createCSEPass());
   llvmFuncPm.addPass(rock::createRockPrepareLLVMPass());
   if (options.compile) {
-    GpuROCDLAttachTargetOptions opts;
-    opts.triple = options.triple;
-    opts.chip = options.chip;
-    opts.features = options.features;
-    opts.optLevel = options.optLevel;
-    pm.addPass(createGpuROCDLAttachTarget(opts));
     pm.addPass(createGpuModuleToBinaryPass());
     pm.addPass(createRockCheckResidencyPass());
   }

@@ -50,12 +50,12 @@
 // BINARY-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx90a saturate-fp8-truncf=true},
 // BINARY-NEXT:emulate-fp8-ext-trunc,
 // BINARY-NEXT:expand-strided-metadata,
-// BINARY-NEXT:lower-affine,
-// BINARY-NEXT:convert-gpu-to-rocdl{chipset=gfx90a index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY-NEXT:lower-affine),
+// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY-NEXT:cse,
 // BINARY-NEXT:rock-prepare-llvm)),
-// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
 // BINARY-NEXT:gpu-module-to-binary{format=fatbin  opts= toolkit=},
 // BINARY-NEXT:rock-check-residency,
 // BINARY-NEXT:emulate-fp8-ext-trunc)
@@ -69,12 +69,12 @@
 // BINARY_MI300-NEXT:f8E5M2} target-type=f32},
 // BINARY_MI300-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx940 saturate-fp8-truncf=true},
 // BINARY_MI300-NEXT:expand-strided-metadata,
-// BINARY_MI300-NEXT:lower-affine,
-// BINARY_MI300-NEXT:convert-gpu-to-rocdl{chipset=gfx940 index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
+// BINARY_MI300-NEXT:lower-affine),
+// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
+// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
 // BINARY_MI300-NEXT:llvm.func(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
 // BINARY_MI300-NEXT:cse,
 // BINARY_MI300-NEXT:rock-prepare-llvm)),
-// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false  module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
 // BINARY_MI300-NEXT:gpu-module-to-binary{format=fatbin  opts= toolkit=},
 // BINARY_MI300-NEXT:rock-check-residency,
 // BINARY_MI300-NEXT:emulate-fp8-ext-trunc)