From 8ac306c05af9129446dc584a9823f85dc0775548 Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Fri, 27 Sep 2024 13:43:51 +0000 Subject: [PATCH 1/4] move rocdl-attach-target before convert-gpu-to-rocdl, read chipset from attached target during lowering --- .../Conversion/GPUToROCDL/GPUToROCDLPass.h | 1 - .../mlir/include/mlir/Conversion/Passes.td | 3 -- .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 44 ++++++++++++------- mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp | 19 ++++---- mlir/test/rocmlir-driver/pipelines.mlir | 12 ++--- 5 files changed, 44 insertions(+), 35 deletions(-) diff --git a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h index 564778771299..7def4a1dd5e8 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -42,7 +42,6 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target); /// is configurable. std::unique_ptr> createLowerGpuOpsToROCDLOpsPass( - const std::string &chipset = "gfx900", unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv = false, gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown); diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td index e52c7ff6bd56..cfed275daf4c 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td +++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td @@ -591,9 +591,6 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { "memref::MemRefDialect", ]; let options = [ - Option<"chipset", "chipset", "std::string", - /*default=*/"\"gfx000\"", - "Chipset that these operations will run on">, Option<"indexBitwidth", "index-bitwidth", "unsigned", /*default=kDeriveIndexBitwidthFromDataLayout*/"0", "Bitwidth of the index type, 0 to use size of machine word">, diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 36fbf80c8156..e8b96227f18f 100644 --- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -14,6 +14,8 @@ #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Location.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" @@ -203,11 +205,8 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { struct LowerGpuOpsToROCDLOpsPass : public impl::ConvertGpuOpsToROCDLOpsBase { LowerGpuOpsToROCDLOpsPass() = default; - LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth, - bool useBarePtrCallConv, + LowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, bool useBarePtrCallConv, gpu::amd::Runtime runtime) { - if (this->chipset.getNumOccurrences() == 0) - this->chipset = chipset; if (this->indexBitwidth.getNumOccurrences() == 0) this->indexBitwidth = indexBitwidth; if (this->useBarePtrCallConv.getNumOccurrences() == 0) @@ -219,6 +218,26 @@ struct LowerGpuOpsToROCDLOpsPass void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); MLIRContext *ctx = m.getContext(); + ArrayAttr targets = m.getTargetsAttr(); + FailureOr maybeChipset; + if (!targets) { + emitError(UnknownLoc::get(ctx), "ROCDLTargetAttr is empty on GPU module"); + return signalPassFailure(); + } + if (targets.size() != 1) { + emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified " + "more than one gpu-arch on GPU module"); + return signalPassFailure(); + } else { + const ROCDL::ROCDLTargetAttr targetAttr = + mlir::dyn_cast(targets.getValue().front()); + maybeChipset = amdgpu::Chipset::parse(targetAttr.getChip()); + if (failed(maybeChipset)) { + emitError(UnknownLoc::get(ctx), + "Invalid chipset name: " + targetAttr.getChip()); + return signalPassFailure(); + } + } auto llvmDataLayout = m->getAttrOfType( LLVM::LLVMDialect::getDataLayoutAttrName()); @@ -232,12 +251,6 @@ struct LowerGpuOpsToROCDLOpsPass UnitAttr::get(ctx)); } - FailureOr maybeChipset = amdgpu::Chipset::parse(chipset); - if (failed(maybeChipset)) { - emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); - return signalPassFailure(); - } - /// Customize the bitwidth used for the device side index computations. LowerToLLVMOptions options( ctx, DataLayout(cast(m.getOperation()))); @@ -337,8 +350,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp>(); // These ops are legal for f32 type. target.addDynamicallyLegalOp([](Operation *op) { - return any_of(op->getOperandTypes(), - llvm::IsaPred); + return any_of(op->getOperandTypes(), llvm::IsaPred); }); // TODO: Remove once we support replacing non-root ops. target.addLegalOp(); @@ -350,7 +362,8 @@ static void populateOpPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef f32Func, StringRef f64Func, StringRef f16Func) { patterns.add>(converter); - patterns.add>(converter, f32Func, f64Func, f16Func); + patterns.add>(converter, f32Func, f64Func, + f16Func); } void mlir::populateGpuToROCDLConversionPatterns( @@ -399,10 +412,9 @@ void mlir::populateGpuToROCDLConversionPatterns( } std::unique_ptr> -mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset, - unsigned indexBitwidth, +mlir::createLowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, bool useBarePtrCallConv, gpu::amd::Runtime runtime) { return std::make_unique( - chipset, indexBitwidth, useBarePtrCallConv, runtime); + indexBitwidth, useBarePtrCallConv, runtime); } diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp index adfaff9141ed..e08e5478e979 100644 --- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp +++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp @@ -236,23 +236,24 @@ void rock::buildBackendPipeline(OpPassManager &pm, // We need to lower affine again, because the expand strided metadata pass // adds back affine.apply for memref.subview gpuPm.addPass(createLowerAffinePass()); - gpuPm.addPass(createLowerGpuOpsToROCDLOpsPass( - options.chip, /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout, + GpuROCDLAttachTargetOptions opts; + opts.triple = options.triple; + opts.chip = options.chip; + opts.features = options.features; + opts.optLevel = options.optLevel; + pm.addPass(createGpuROCDLAttachTarget(opts)); + auto &gpuPm2 = pm.nest(); + gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass( + /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout, /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP)); // Ensure we only run passes on LLVM functions inside GPU modules. - auto &llvmFuncPm = gpuPm.nest(); + auto &llvmFuncPm = gpuPm2.nest(); // -canonicalize -cse so that we don't have to crawl through memref // descriptors. (Mainly we want the `extractvalue` fold). llvmFuncPm.addPass(createCanonicalizerPass()); llvmFuncPm.addPass(createCSEPass()); llvmFuncPm.addPass(rock::createRockPrepareLLVMPass()); if (options.compile) { - GpuROCDLAttachTargetOptions opts; - opts.triple = options.triple; - opts.chip = options.chip; - opts.features = options.features; - opts.optLevel = options.optLevel; - pm.addPass(createGpuROCDLAttachTarget(opts)); pm.addPass(createGpuModuleToBinaryPass()); pm.addPass(createRockCheckResidencyPass()); } diff --git a/mlir/test/rocmlir-driver/pipelines.mlir b/mlir/test/rocmlir-driver/pipelines.mlir index 192932a1624f..9c10440e2705 100644 --- a/mlir/test/rocmlir-driver/pipelines.mlir +++ b/mlir/test/rocmlir-driver/pipelines.mlir @@ -50,12 +50,12 @@ // BINARY-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx90a saturate-fp8-truncf=true}, // BINARY-NEXT:emulate-fp8-ext-trunc, // BINARY-NEXT:expand-strided-metadata, -// BINARY-NEXT:lower-affine, -// BINARY-NEXT:convert-gpu-to-rocdl{chipset=gfx90a index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, +// BINARY-NEXT:lower-affine), +// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, +// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, // BINARY-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, // BINARY-NEXT:cse, // BINARY-NEXT:rock-prepare-llvm)), -// BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, // BINARY-NEXT:gpu-module-to-binary{format=fatbin opts= toolkit=}, // BINARY-NEXT:rock-check-residency, // BINARY-NEXT:emulate-fp8-ext-trunc) @@ -69,12 +69,12 @@ // BINARY_MI300-NEXT:f8E5M2} target-type=f32}, // BINARY_MI300-NEXT:convert-arith-to-amdgpu{allow-packed-f16-round-to-zero=true chipset=gfx940 saturate-fp8-truncf=true}, // BINARY_MI300-NEXT:expand-strided-metadata, -// BINARY_MI300-NEXT:lower-affine, -// BINARY_MI300-NEXT:convert-gpu-to-rocdl{chipset=gfx940 index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, +// BINARY_MI300-NEXT:lower-affine), +// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, +// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, // BINARY_MI300-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, // BINARY_MI300-NEXT:cse, // BINARY_MI300-NEXT:rock-prepare-llvm)), -// BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, // BINARY_MI300-NEXT:gpu-module-to-binary{format=fatbin opts= toolkit=}, // BINARY_MI300-NEXT:rock-check-residency, // BINARY_MI300-NEXT:emulate-fp8-ext-trunc) From 129a0fa7eda55ff0dc35d5ca9a3aad47a2f77a1d Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Mon, 30 Sep 2024 20:30:35 +0000 Subject: [PATCH 2/4] add "infer" option for backwards compatibility --- .../Conversion/GPUToROCDL/GPUToROCDLPass.h | 1 + .../mlir/include/mlir/Conversion/Passes.td | 3 ++ .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 39 ++++++++++++------- mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp | 2 +- mlir/test/rocmlir-driver/pipelines.mlir | 4 +- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h index 7def4a1dd5e8..f1233ad894da 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/external/llvm-project/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -42,6 +42,7 @@ void configureGpuToROCDLConversionLegality(ConversionTarget &target); /// is configurable. std::unique_ptr> createLowerGpuOpsToROCDLOpsPass( + const std::string &chipset = "infer", unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout, bool useBarePtrCallConv = false, gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown); diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td index cfed275daf4c..62d6f2954a66 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td +++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td @@ -591,6 +591,9 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { "memref::MemRefDialect", ]; let options = [ + Option<"chipset", "chipset", "std::string", + /*default=*/"\"infer\"", + "Chipset that these operations will run on. By default it will infer target from attached Target Attribute on GPU Module">, Option<"indexBitwidth", "index-bitwidth", "unsigned", /*default=kDeriveIndexBitwidthFromDataLayout*/"0", "Bitwidth of the index type, 0 to use size of machine word">, diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index e8b96227f18f..eb2d87f09f98 100644 --- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -14,8 +14,6 @@ #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" -#include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Location.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" @@ -57,7 +55,6 @@ namespace mlir { } // namespace mlir #include "mlir/Dialect/LLVMIR/Transforms/Passes.h" - using namespace mlir; /// Returns true if the given `gpu.func` can be safely called using the bare @@ -205,8 +202,11 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { struct LowerGpuOpsToROCDLOpsPass : public impl::ConvertGpuOpsToROCDLOpsBase { LowerGpuOpsToROCDLOpsPass() = default; - LowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, bool useBarePtrCallConv, + LowerGpuOpsToROCDLOpsPass(const std::string &chipset, unsigned indexBitwidth, + bool useBarePtrCallConv, gpu::amd::Runtime runtime) { + if (this->chipset.getNumOccurrences() == 0) + this->chipset = chipset; if (this->indexBitwidth.getNumOccurrences() == 0) this->indexBitwidth = indexBitwidth; if (this->useBarePtrCallConv.getNumOccurrences() == 0) @@ -220,15 +220,17 @@ struct LowerGpuOpsToROCDLOpsPass MLIRContext *ctx = m.getContext(); ArrayAttr targets = m.getTargetsAttr(); FailureOr maybeChipset; - if (!targets) { - emitError(UnknownLoc::get(ctx), "ROCDLTargetAttr is empty on GPU module"); - return signalPassFailure(); - } - if (targets.size() != 1) { - emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified " - "more than one gpu-arch on GPU module"); - return signalPassFailure(); - } else { + if (chipset == "infer") { + if (!targets) { + emitError(UnknownLoc::get(ctx), + "ROCDLTargetAttr is empty on GPU module"); + return signalPassFailure(); + } + if (targets.size() != 1) { + emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified " + "more than one gpu-arch on GPU module"); + return signalPassFailure(); + } const ROCDL::ROCDLTargetAttr targetAttr = mlir::dyn_cast(targets.getValue().front()); maybeChipset = amdgpu::Chipset::parse(targetAttr.getChip()); @@ -237,6 +239,12 @@ struct LowerGpuOpsToROCDLOpsPass "Invalid chipset name: " + targetAttr.getChip()); return signalPassFailure(); } + } else { + maybeChipset = amdgpu::Chipset::parse(chipset); + if (failed(maybeChipset)) { + emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); + return signalPassFailure(); + } } auto llvmDataLayout = m->getAttrOfType( @@ -412,9 +420,10 @@ void mlir::populateGpuToROCDLConversionPatterns( } std::unique_ptr> -mlir::createLowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, +mlir::createLowerGpuOpsToROCDLOpsPass(const std::string &chipset, + unsigned indexBitwidth, bool useBarePtrCallConv, gpu::amd::Runtime runtime) { return std::make_unique( - indexBitwidth, useBarePtrCallConv, runtime); + chipset, indexBitwidth, useBarePtrCallConv, runtime); } diff --git a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp index e08e5478e979..f3e30e8e6743 100644 --- a/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp +++ b/mlir/lib/Dialect/Rock/Pipelines/Pipelines.cpp @@ -244,7 +244,7 @@ void rock::buildBackendPipeline(OpPassManager &pm, pm.addPass(createGpuROCDLAttachTarget(opts)); auto &gpuPm2 = pm.nest(); gpuPm2.addPass(createLowerGpuOpsToROCDLOpsPass( - /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout, + /*chipset=*/"infer", /*indexBitwidth=*/kDeriveIndexBitwidthFromDataLayout, /*useBarePtrCallConv=*/true, gpu::amd::Runtime::HIP)); // Ensure we only run passes on LLVM functions inside GPU modules. auto &llvmFuncPm = gpuPm2.nest(); diff --git a/mlir/test/rocmlir-driver/pipelines.mlir b/mlir/test/rocmlir-driver/pipelines.mlir index 9c10440e2705..e4336de1cb80 100644 --- a/mlir/test/rocmlir-driver/pipelines.mlir +++ b/mlir/test/rocmlir-driver/pipelines.mlir @@ -52,7 +52,7 @@ // BINARY-NEXT:expand-strided-metadata, // BINARY-NEXT:lower-affine), // BINARY-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx90a correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, -// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, +// BINARY-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, // BINARY-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, // BINARY-NEXT:cse, // BINARY-NEXT:rock-prepare-llvm)), @@ -71,7 +71,7 @@ // BINARY_MI300-NEXT:expand-strided-metadata, // BINARY_MI300-NEXT:lower-affine), // BINARY_MI300-NEXT:rocdl-attach-target{O=3 abi=500 chip=gfx940 correct-sqrt=true daz=false fast=false features= finite-only=false module= triple=amdgcn-amd-amdhsa unsafe-math=false wave64=true}, -// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, +// BINARY_MI300-NEXT:gpu.module(convert-gpu-to-rocdl{chipset=infer index-bitwidth=0 runtime=HIP use-bare-ptr-memref-call-conv=true}, // BINARY_MI300-NEXT:llvm.func(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, // BINARY_MI300-NEXT:cse, // BINARY_MI300-NEXT:rock-prepare-llvm)), From 23e6204a144684f5b4aeb01f48a3fa71d44e1b14 Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Tue, 1 Oct 2024 11:33:10 +0000 Subject: [PATCH 3/4] refactor based on daniel's comments --- .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index eb2d87f09f98..417ec10894b7 100644 --- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -219,7 +219,6 @@ struct LowerGpuOpsToROCDLOpsPass gpu::GPUModuleOp m = getOperation(); MLIRContext *ctx = m.getContext(); ArrayAttr targets = m.getTargetsAttr(); - FailureOr maybeChipset; if (chipset == "infer") { if (!targets) { emitError(UnknownLoc::get(ctx), @@ -233,18 +232,12 @@ struct LowerGpuOpsToROCDLOpsPass } const ROCDL::ROCDLTargetAttr targetAttr = mlir::dyn_cast(targets.getValue().front()); - maybeChipset = amdgpu::Chipset::parse(targetAttr.getChip()); - if (failed(maybeChipset)) { - emitError(UnknownLoc::get(ctx), - "Invalid chipset name: " + targetAttr.getChip()); - return signalPassFailure(); - } - } else { - maybeChipset = amdgpu::Chipset::parse(chipset); - if (failed(maybeChipset)) { - emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); - return signalPassFailure(); - } + chipset = targetAttr.getChip().str(); + } + FailureOr maybeChipset = amdgpu::Chipset::parse(chipset); + if (failed(maybeChipset)) { + emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); + return signalPassFailure(); } auto llvmDataLayout = m->getAttrOfType( From c966d3bb0a23b968047372d32d8461f79cb928b6 Mon Sep 17 00:00:00 2001 From: Umang Yadav Date: Thu, 10 Oct 2024 14:09:27 +0000 Subject: [PATCH 4/4] pull in upstream changes --- .../mlir/include/mlir/Conversion/Passes.td | 2 +- .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 16 +++++----- .../GPUCommon/lower-memory-space-attrs.mlir | 2 +- .../GPUCommon/memory-attrbution.mlir | 2 +- .../GPUCommon/memref-arg-attrs.mlir | 2 +- .../GPUCommon/memref-arg-noalias-attrs.mlir | 2 +- .../GPUCommon/memref-arg-noalias-warning.mlir | 2 +- .../GPUToROCDL/gpu-to-rocdl-hip.mlir | 2 +- .../GPUToROCDL/gpu-to-rocdl-infer-target.mlir | 30 +++++++++++++++++++ .../GPUToROCDL/gpu-to-rocdl-opencl.mlir | 2 +- .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 4 +-- .../test/Conversion/GPUToROCDL/memref.mlir | 4 +-- .../Integration/GPU/ROCM/gpu-to-hsaco.mlir | 2 +- .../test/Integration/GPU/ROCM/printf.mlir | 2 +- .../Integration/GPU/ROCM/two-modules.mlir | 2 +- .../test/Integration/GPU/ROCM/vecadd.mlir | 2 +- .../GPU/ROCM/vector-transferops.mlir | 2 +- 17 files changed, 55 insertions(+), 25 deletions(-) create mode 100644 external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir diff --git a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td index 62d6f2954a66..9123b8ef46e6 100644 --- a/external/llvm-project/mlir/include/mlir/Conversion/Passes.td +++ b/external/llvm-project/mlir/include/mlir/Conversion/Passes.td @@ -593,7 +593,7 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> { let options = [ Option<"chipset", "chipset", "std::string", /*default=*/"\"infer\"", - "Chipset that these operations will run on. By default it will infer target from attached Target Attribute on GPU Module">, + "Chipset that these operations will run on. By Default it will infer target from attached target attribute on GPU module on which it operates">, Option<"indexBitwidth", "index-bitwidth", "unsigned", /*default=kDeriveIndexBitwidthFromDataLayout*/"0", "Bitwidth of the index type, 0 to use size of machine word">, diff --git a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 417ec10894b7..8dbae392204c 100644 --- a/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/external/llvm-project/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -218,25 +218,25 @@ struct LowerGpuOpsToROCDLOpsPass void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); MLIRContext *ctx = m.getContext(); - ArrayAttr targets = m.getTargetsAttr(); + if (chipset == "infer") { + ArrayAttr targets = m.getTargetsAttr(); if (!targets) { - emitError(UnknownLoc::get(ctx), - "ROCDLTargetAttr is empty on GPU module"); + m->emitError("there are no target attributes to infer"); return signalPassFailure(); } if (targets.size() != 1) { - emitError(UnknownLoc::get(ctx), "ROCDLTargetAttrs has more specified " - "more than one gpu-arch on GPU module"); + m->emitError("expected a single target attribute"); return signalPassFailure(); } - const ROCDL::ROCDLTargetAttr targetAttr = - mlir::dyn_cast(targets.getValue().front()); + ROCDL::ROCDLTargetAttr targetAttr = + dyn_cast(targets[0]); chipset = targetAttr.getChip().str(); } + FailureOr maybeChipset = amdgpu::Chipset::parse(chipset); if (failed(maybeChipset)) { - emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset); + m->emitError("invalid chipset name: " + chipset); return signalPassFailure(); } diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir index 771f3185904b..a338d35525eb 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900' | FileCheck %s --check-prefixes=CHECK,ROCDL // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM gpu.module @kernel { diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir index 4fc19b8e9364..b1291e07c060 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s -// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s +// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='chipset=gfx900' --split-input-file %s | FileCheck --check-prefix=ROCDL %s gpu.module @kernel { // NVVM-LABEL: llvm.func @private diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir index e7c742067b4e..3c3082c47389 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-attrs.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,ROCDL // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=0' | FileCheck %s --check-prefixes=CHECK,NVVM gpu.module @kernel { diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir index 33cdc3348e51..d17214d1f229 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-attrs.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,ROCDL // RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-bare-ptr-memref-call-conv=1' | FileCheck %s --check-prefixes=CHECK,NVVM gpu.module @kernel { diff --git a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir index 793df7380d78..ab98be59a2c8 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUCommon/memref-arg-noalias-warning.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=0' -verify-diagnostics +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=0' -verify-diagnostics gpu.module @kernel { // expected-warning @+1 {{Cannot copy noalias with non-bare pointers.}} diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir index 1b904fa142ba..3e3b43c6d4f4 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=HIP' -split-input-file | FileCheck %s gpu.module @test_module { // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00") diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir new file mode 100644 index 000000000000..4ef6fd004b13 --- /dev/null +++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-infer-target.mlir @@ -0,0 +1,30 @@ +// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics | FileCheck --check-prefix=CHECK_TARGET %s + +// CHECK_TARGET: @test_module [#rocdl.target] attributes {llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"} { +gpu.module @test_module [#rocdl.target] { + // CHECK_TARGET-LABEL: @kernel_func + // CHECK_TARGET: attributes + // CHECK_TARGET: gpu.kernel + // CHECK_TARGET: rocdl.kernel + gpu.func @kernel_func() kernel { + gpu.return + } +} + +// ----- + +// expected-error@below {{there are no target attributes to infer}} +gpu.module @test_module { + gpu.func @kernel_func() kernel { + gpu.return + } +} + +// ----- + +// expected-error@below {{invalid chipset name: gfx90a,gfx900}} +gpu.module @test_module [#rocdl.target] { + gpu.func @kernel_func() kernel { + gpu.return + } +} diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir index 870f5c5016ec..fa01801972d6 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL' | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 runtime=OpenCL' | FileCheck %s gpu.module @test_module { // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32} diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index da54dc836a90..356d21cbf1ba 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s -// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s // CHECK-LABEL: @test_module // CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" diff --git a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir index e645481c8923..debf899dd687 100644 --- a/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir +++ b/external/llvm-project/mlir/test/Conversion/GPUToROCDL/memref.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx900' -split-input-file | FileCheck %s // RUN: mlir-opt %s \ -// RUN: -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true' \ +// RUN: -convert-gpu-to-rocdl='chipset=gfx900 use-bare-ptr-memref-call-conv=true' \ // RUN: -split-input-file \ // RUN: | FileCheck %s --check-prefix=BARE diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir index 3c8f3b1d0cbf..edb75ee81224 100644 --- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir +++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir index d5e6e3757540..e8feeaa69c29 100644 --- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir +++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/printf.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}),rocdl-attach-target{chip=%chip})' \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP}))' \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir index d49d3957abbe..d20f71d16280 100644 --- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir +++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/two-modules.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl),rocdl-attach-target{chip=%chip})' \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl))' \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir index 986d8239427e..0ac391cd5f8e 100644 --- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir +++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vecadd.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -convert-scf-to-cf \ // RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}),rocdl-attach-target{chip=%chip})' \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{use-bare-ptr-memref-call-conv=true}))' \ // RUN: | mlir-opt -gpu-to-llvm=use-bare-pointers-for-kernels=true -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \ diff --git a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir index 575d967dcc9a..417f67e64669 100644 --- a/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir +++ b/external/llvm-project/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -convert-scf-to-cf \ // RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-rocdl{chipset=%chip index-bitwidth=32}),rocdl-attach-target{chip=%chip})' \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo),rocdl-attach-target{chip=%chip}, gpu.module(convert-gpu-to-rocdl{index-bitwidth=32}))' \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -gpu-module-to-binary \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_rocm_runtime \