Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use target attached on gpu module when lowering #1666

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target);
/// is configurable.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
createLowerGpuOpsToROCDLOpsPass(
const std::string &chipset = "gfx900",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For backwards compatibility (other people are using this pass), keep this parameter, but use a default value like "target" or "infer".

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added "infer" option

const std::string &chipset = "infer",
unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
bool useBarePtrCallConv = false,
gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -592,8 +592,8 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
];
let options = [
Option<"chipset", "chipset", "std::string",
/*default=*/"\"gfx000\"",
"Chipset that these operations will run on">,
/*default=*/"\"infer\"",
"Chipset that these operations will run on. By Default it will infer target from attached target attribute on GPU module on which it operates">,
Option<"indexBitwidth", "index-bitwidth", "unsigned",
/*default=kDeriveIndexBitwidthFromDataLayout*/"0",
"Bitwidth of the index type, 0 to use size of machine word">,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ namespace mlir {
} // namespace mlir

#include "mlir/Dialect/LLVMIR/Transforms/Passes.h"

using namespace mlir;

/// Returns true if the given `gpu.func` can be safely called using the bare
Expand Down Expand Up @@ -220,6 +219,27 @@ struct LowerGpuOpsToROCDLOpsPass
gpu::GPUModuleOp m = getOperation();
MLIRContext *ctx = m.getContext();

if (chipset == "infer") {
ArrayAttr targets = m.getTargetsAttr();
if (!targets) {
m->emitError("there are no target attributes to infer");
return signalPassFailure();
}
if (targets.size() != 1) {
m->emitError("expected a single target attribute");
return signalPassFailure();
}
ROCDL::ROCDLTargetAttr targetAttr =
dyn_cast<ROCDL::ROCDLTargetAttr>(targets[0]);
chipset = targetAttr.getChip().str();
}

FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
if (failed(maybeChipset)) {
m->emitError("invalid chipset name: " + chipset);
return signalPassFailure();
}

auto llvmDataLayout = m->getAttrOfType<StringAttr>(
LLVM::LLVMDialect::getDataLayoutAttrName());
if (!llvmDataLayout) {
Expand All @@ -232,12 +252,6 @@ struct LowerGpuOpsToROCDLOpsPass
UnitAttr::get(ctx));
}

FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
if (failed(maybeChipset)) {
emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
return signalPassFailure();
}

/// Customize the bitwidth used for the device side index computations.
LowerToLLVMOptions options(
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
Expand Down Expand Up @@ -337,8 +351,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>();
// These ops are legal for f32 type.
target.addDynamicallyLegalOp<LLVM::ExpOp, LLVM::LogOp>([](Operation *op) {
return any_of(op->getOperandTypes(),
llvm::IsaPred<Float32Type>);
return any_of(op->getOperandTypes(), llvm::IsaPred<Float32Type>);
});
// TODO: Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
Expand All @@ -350,7 +363,8 @@ static void populateOpPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns, StringRef f32Func,
StringRef f64Func, StringRef f16Func) {
patterns.add<ScalarizeVectorOpLowering<OpTy>>(converter);
patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func, f16Func);
patterns.add<OpToFuncCallLowering<OpTy>>(converter, f32Func, f64Func,
f16Func);
}

void mlir::populateGpuToROCDLConversionPatterns(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900' | FileCheck %s --check-prefixes=CHECK,ROCDL
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM

gpu.module @kernel {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='chipset=gfx900' --split-input-file %s | FileCheck --check-prefix=ROCDL %s

gpu.module @kernel {
// NVVM-LABEL: llvm.func @private
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,NVVM

gpu.module @kernel {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,NVVM

gpu.module @kernel {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' -verify-diagnostics
// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' -verify-diagnostics

gpu.module @kernel {
// expected-warning @+1 {{Cannot copy noalias with non-bare pointers.}}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=HIP' -split-input-file | FileCheck %s

gpu.module @test_module {
// CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics | FileCheck --check-prefix=CHECK_TARGET %s

// CHECK_TARGET: @test_module [#rocdl.target<O = 3, chip = "gfx90a">] attributes {llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"} {
gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a">] {
// CHECK_TARGET-LABEL: @kernel_func
// CHECK_TARGET: attributes
// CHECK_TARGET: gpu.kernel
// CHECK_TARGET: rocdl.kernel
gpu.func @kernel_func() kernel {
gpu.return
}
}

// -----

// expected-error@below {{there are no target attributes to infer}}
gpu.module @test_module {
gpu.func @kernel_func() kernel {
gpu.return
}
}

// -----

// expected-error@below {{invalid chipset name: gfx90a,gfx900}}
gpu.module @test_module [#rocdl.target<O = 3, chip = "gfx90a,gfx900">] {
gpu.func @kernel_func() kernel {
gpu.return
}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL' | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=OpenCL' | FileCheck %s

gpu.module @test_module {
// CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s

// CHECK-LABEL: @test_module
// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s
// RUN: mlir-opt %s \
// RUN: -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true' \
// RUN: -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=true' \
// RUN: -split-input-file \
// RUN: | FileCheck %s --check-prefix=BARE

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_rocm_runtime \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}),rocdl-attach-target{chip=%chip})' \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}))' \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_rocm_runtime \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_rocm_runtime \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -convert-scf-to-cf \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}),rocdl-attach-target{chip=%chip})' \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}))' \
// RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -reconcile-unrealized-casts -gpu-module-to-binary \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_rocm_runtime \
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -convert-scf-to-cf \
// RUN: | mlir-opt -gpu-kernel-outlining \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{chipset=%chip index-bitwidth=32}),rocdl-attach-target{chip=%chip})' \
// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32}))' \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \
// RUN: | mlir-cpu-runner \
// RUN: --shared-libs=%mlir_rocm_runtime \
Expand Down
19 changes: 10 additions & 9 deletions mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,23 +236,24 @@ void rock::buildBackendPipeline(OpPassManager &pm,
// We need to lower affine again, because the expand strided metadata pass
// adds back affine.apply for memref.subview
gpuPm.addPass(createLowerAffinePass());
gpuPm.addPass(createLowerGpuOpsToROCDLOpsPass(
options.chip, /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
GpuROCDLAttachTargetOptions opts;
opts.triple = options.triple;
opts.chip = options.chip;
opts.features = options.features;
opts.optLevel = options.optLevel;
pm.addPass(createGpuROCDLAttachTarget(opts));
auto &gpuPm2 = pm.nest<gpu::GPUModuleOp>();
gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass(
/*chipset=*/"infer", /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout,
/*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP));
// Ensure we only run passes on LLVM functions inside GPU modules.
auto &llvmFuncPm = gpuPm.nest<LLVM::LLVMFuncOp>();
auto &llvmFuncPm = gpuPm2.nest<LLVM::LLVMFuncOp>();
// -canonicalize -cse so that we don't have to crawl through memref
// descriptors. (Mainly we want the `extractvalue` fold).
llvmFuncPm.addPass(createCanonicalizerPass());
llvmFuncPm.addPass(createCSEPass());
llvmFuncPm.addPass(rock::createRockPrepareLLVMPass());
if (options.compile) {
GpuROCDLAttachTargetOptions opts;
opts.triple = options.triple;
opts.chip = options.chip;
opts.features = options.features;
opts.optLevel = options.optLevel;
pm.addPass(createGpuROCDLAttachTarget(opts));
pm.addPass(createGpuModuleToBinaryPass());
pm.addPass(createRockCheckResidencyPass());
}
Expand Down
12 changes: 6 additions & 6 deletions mlir/test/rocmlir-driver/pipelines.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@
// BINARY-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx90a saturate-fp8-truncf=true},
// BINARY-NEXT:emulate-fp8-ext-trunc,
// BINARY-NEXT:expand-strided-metadata,
// BINARY-NEXT:lower-affine,
// BINARY-NEXT:convert-gpu-to-rocdl{chipset=gfx90a index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
// BINARY-NEXT:lower-affine),
// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
// BINARY-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
// BINARY-NEXT:cse,
// BINARY-NEXT:rock-prepare-llvm)),
// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
// BINARY-NEXT:gpu-module-to-binary{format=fatbin opts= toolkit=},
// BINARY-NEXT:rock-check-residency,
// BINARY-NEXT:emulate-fp8-ext-trunc)
Expand All @@ -69,12 +69,12 @@
// BINARY_MI300-NEXT:f8E5M2} target-type=f32},
// BINARY_MI300-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx940 saturate-fp8-truncf=true},
// BINARY_MI300-NEXT:expand-strided-metadata,
// BINARY_MI300-NEXT:lower-affine,
// BINARY_MI300-NEXT:convert-gpu-to-rocdl{chipset=gfx940 index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
// BINARY_MI300-NEXT:lower-affine),
// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true},
// BINARY_MI300-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},
// BINARY_MI300-NEXT:cse,
// BINARY_MI300-NEXT:rock-prepare-llvm)),
// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true},
// BINARY_MI300-NEXT:gpu-module-to-binary{format=fatbin opts= toolkit=},
// BINARY_MI300-NEXT:rock-check-residency,
// BINARY_MI300-NEXT:emulate-fp8-ext-trunc)
Expand Down