Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ROCm utilities and tests, and CUDA fallbacks #40619

Merged
merged 7 commits into from
Feb 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions HeterogeneousCore/CUDAServices/bin/BuildFile.xml

This file was deleted.

8 changes: 8 additions & 0 deletions HeterogeneousCore/CUDAUtilities/bin/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<iftool name="cuda-gcc-support">
<use name="cuda"/>
<bin name="cudaComputeCapabilities" file="cudaComputeCapabilities.cpp isCudaDeviceSupported.cu"/>
<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp isCudaDeviceSupported.cu"/>
<else/>
<bin name="cudaComputeCapabilities" file="cudaComputeCapabilities_fallback.cpp"/>
<bin name="cudaIsEnabled" file="cudaIsEnabled_fallback.cpp"/>
</iftool>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// C/C++ headers
#include <cstdlib>
#include <iostream>

// always returns EXIT_FAILURE
int main() {
std::cerr << "cudaComputeCapabilities: CUDA is not supported on this architecture" << std::endl;
return EXIT_FAILURE;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// C/C++ headers
#include <cstdlib>

// always returns EXIT_FAILURE
int main() { return EXIT_FAILURE; }
10 changes: 10 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,14 @@
<flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
<flags CXXFLAGS="-g -DGPU_DEBUG"/>
</bin>

<bin file="testCudaCheck.cpp" name="testCudaCheck">
<use name="catch2"/>
<use name="cuda"/>
</bin>

<bin file="testRequireCUDADevices.cpp" name="testRequireCUDADevices">
<use name="catch2"/>
<use name="cuda"/>
</bin>
</iftool>
20 changes: 20 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/testCudaCheck.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Catch2 headers
#define CATCH_CONFIG_MAIN
#include <catch.hpp>

// CUDA headers
#include <cuda_runtime.h>

// CMSSW headers
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

TEST_CASE("HeterogeneousCore/CUDAUtilities testCudaCheck", "[testCudaCheck]") {
SECTION("Test cudaCheck() driver API") {
REQUIRE_NOTHROW(cudaCheck(CUDA_SUCCESS));
REQUIRE_THROWS(cudaCheck(CUDA_ERROR_UNKNOWN));
}
SECTION("Test cudaCheck() runtime API") {
REQUIRE_NOTHROW(cudaCheck(cudaSuccess));
REQUIRE_THROWS(cudaCheck(cudaErrorUnknown));
}
}
21 changes: 21 additions & 0 deletions HeterogeneousCore/CUDAUtilities/test/testRequireCUDADevices.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Catch2 headers
#define CATCH_CONFIG_MAIN
#include <catch.hpp>

// CUDA headers
#include <cuda_runtime.h>

// CMSSW headers
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"

TEST_CASE("HeterogeneousCore/CUDAUtilities testRequireCUDADevices", "[testRequireCUDADevices]") {
SECTION("Test requireDevices()") {
cms::cudatest::requireDevices();

int devices = 0;
cudaCheck(cudaGetDeviceCount(&devices));

REQUIRE(devices > 0);
}
}
7 changes: 7 additions & 0 deletions HeterogeneousCore/ROCmUtilities/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<iftool name="rocm">
<use name="rocm"/>
<use name="FWCore/Utilities" source_only="true"/>
<export>
<lib name="1"/>
</export>
</iftool>
8 changes: 8 additions & 0 deletions HeterogeneousCore/ROCmUtilities/bin/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<iftool name="rocm">
<use name="rocm"/>
<bin name="rocmComputeCapabilities" file="rocmComputeCapabilities.cpp isRocmDeviceSupported.hip.cc"/>
<bin name="rocmIsEnabled" file="rocmIsEnabled.cpp isRocmDeviceSupported.hip.cc"/>
<else/>
<bin name="rocmComputeCapabilities" file="rocmComputeCapabilities_fallback.cpp"/>
<bin name="rocmIsEnabled" file="rocmIsEnabled_fallback.cpp"/>
</iftool>
6 changes: 6 additions & 0 deletions HeterogeneousCore/ROCmUtilities/bin/isRocmDeviceSupported.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#ifndef HeterogeneousCore_ROCmServices_bin_isRocmDeviceSupported_h
#define HeterogeneousCore_ROCmServices_bin_isRocmDeviceSupported_h

bool isRocmDeviceSupported(int device);

#endif // HeterogeneousCore_ROCmServices_bin_isRocmDeviceSupported_h
57 changes: 57 additions & 0 deletions HeterogeneousCore/ROCmUtilities/bin/isRocmDeviceSupported.hip.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#include <hip/hip_runtime.h>

#include "isRocmDeviceSupported.h"

namespace {
__global__ void setSupported(bool* result) { *result = true; }
} // namespace

bool isRocmDeviceSupported(int device) {
bool supported = false;
bool* supported_d;

// select the requested device - will fail if the index is invalid
hipError_t status = hipSetDevice(device);
if (status != hipSuccess)
return false;

// allocate memory for the flag on the device
status = hipMalloc(&supported_d, sizeof(bool));
if (status != hipSuccess)
return false;

// initialise the flag on the device
status = hipMemset(supported_d, 0x00, sizeof(bool));
if (status != hipSuccess)
return false;

// try to set the flag on the device
setSupported<<<1, 1>>>(supported_d);

// check for an eventual error from launching the kernel on an unsupported device
status = hipGetLastError();
if (status != hipSuccess)
return false;

// wait for the kernelto run
status = hipDeviceSynchronize();
if (status != hipSuccess)
return false;

// copy the flag back to the host
status = hipMemcpy(&supported, supported_d, sizeof(bool), hipMemcpyDeviceToHost);
if (status != hipSuccess)
return false;

// free the device memory
status = hipFree(supported_d);
if (status != hipSuccess)
return false;

// reset the device
status = hipDeviceReset();
if (status != hipSuccess)
return false;

return supported;
}
34 changes: 34 additions & 0 deletions HeterogeneousCore/ROCmUtilities/bin/rocmComputeCapabilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// C/C++ standard headers
#include <cstdlib>
#include <iomanip>
#include <iostream>

// ROCm headers
#include <hip/hip_runtime.h>

// CMSSW headers
#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"
#include "isRocmDeviceSupported.h"

int main() {
int devices = 0;
hipError_t status = hipGetDeviceCount(&devices);
if (status != hipSuccess) {
std::cerr << "rocmComputeCapabilities: " << hipGetErrorString(status) << std::endl;
return EXIT_FAILURE;
}

for (int i = 0; i < devices; ++i) {
hipDeviceProp_t properties;
hipCheck(hipGetDeviceProperties(&properties, i));
std::stringstream arch;
arch << "gfx" << properties.gcnArch;
std::cout << std::setw(4) << i << " " << std::setw(8) << arch.str() << " " << properties.name;
if (not isRocmDeviceSupported(i)) {
std::cout << " (unsupported)";
}
std::cout << std::endl;
}

return EXIT_SUCCESS;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// C/C++ headers
#include <cstdlib>
#include <iostream>

// always returns EXIT_FAILURE
int main() {
std::cerr << "rocmComputeCapabilities: ROCm is not supported on this architecture" << std::endl;
return EXIT_FAILURE;
}
26 changes: 26 additions & 0 deletions HeterogeneousCore/ROCmUtilities/bin/rocmIsEnabled.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// C/C++ headers
#include <cstdlib>

// ROCm headers
#include <hip/hip_runtime.h>

// local headers
#include "isRocmDeviceSupported.h"

// returns EXIT_SUCCESS if at least one visible ROCm device can be used, or EXIT_FAILURE otherwise
int main() {
int devices = 0;
auto status = hipGetDeviceCount(&devices);
if (status != hipSuccess) {
return EXIT_FAILURE;
}

// check that at least one visible ROCm device can be used
for (int i = 0; i < devices; ++i) {
if (isRocmDeviceSupported(i))
return EXIT_SUCCESS;
}

// no visible usable devices
return EXIT_FAILURE;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// C/C++ headers
#include <cstdlib>

// always returns EXIT_FAILURE
int main() { return EXIT_FAILURE; }
54 changes: 54 additions & 0 deletions HeterogeneousCore/ROCmUtilities/interface/hipCheck.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef HeterogeneousCore_ROCmUtilities_hipCheck_h
#define HeterogeneousCore_ROCmUtilities_hipCheck_h

// C++ standard headers
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>

// ROCm headers
#include <hip/hip_runtime.h>

// CMSSW headers
#include "FWCore/Utilities/interface/Likely.h"

namespace cms {
namespace rocm {

[[noreturn]] inline void abortOnError(const char* file,
int line,
const char* cmd,
const char* error,
const char* message,
std::string_view description = std::string_view()) {
std::ostringstream out;
out << "\n";
out << file << ", line " << line << ":\n";
out << "hipCheck(" << cmd << ");\n";
out << error << ": " << message << "\n";
if (!description.empty())
out << description << "\n";
throw std::runtime_error(out.str());
}

inline bool hipCheck_(const char* file,
int line,
const char* cmd,
hipError_t result,
std::string_view description = std::string_view()) {
if (LIKELY(result == hipSuccess))
return true;

const char* error = hipGetErrorName(result);
const char* message = hipGetErrorString(result);
abortOnError(file, line, cmd, error, message, description);
return false;
}
} // namespace rocm
} // namespace cms

#define hipCheck(ARG, ...) (cms::rocm::hipCheck_(__FILE__, __LINE__, #ARG, (ARG), ##__VA_ARGS__))

#endif // HeterogeneousCore_ROCmUtilities_hipCheck_h
19 changes: 19 additions & 0 deletions HeterogeneousCore/ROCmUtilities/interface/requireDevices.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#ifndef HeterogeneousCore_ROCmUtilities_interface_requireDevices_h
#define HeterogeneousCore_ROCmUtilities_interface_requireDevices_h

/**
* These functions are meant to be called only from unit tests.
*/
namespace cms {
namespace rocmtest {

/// In presence of ROCm devices, return true; otherwise print message and return false
bool testDevices();

/// Print message and exit if there are no ROCm devices
void requireDevices();

} // namespace rocmtest
} // namespace cms

#endif // HeterogeneousCore_ROCmUtilities_interface_requireDevices_h
30 changes: 30 additions & 0 deletions HeterogeneousCore/ROCmUtilities/src/requireDevices.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#include <cstdlib>
#include <iostream>

#include <hip/hip_runtime.h>

#include "HeterogeneousCore/ROCmUtilities/interface/requireDevices.h"

namespace cms::rocmtest {

bool testDevices() {
int devices = 0;
auto status = hipGetDeviceCount(&devices);
if (status != hipSuccess) {
std::cerr << "Failed to initialise the ROCm runtime, the test will be skipped.\n";
return false;
}
if (devices == 0) {
std::cerr << "No ROCm devices available, the test will be skipped.\n";
return false;
}
return true;
}

void requireDevices() {
if (not testDevices()) {
exit(EXIT_SUCCESS);
}
}

} // namespace cms::rocmtest
13 changes: 13 additions & 0 deletions HeterogeneousCore/ROCmUtilities/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<iftool name="rocm">
<bin file="testHipCheck.cpp" name="testHipCheck">
<use name="catch2"/>
<use name="rocm"/>
<use name="HeterogeneousCore/ROCmUtilities"/>
</bin>

<bin file="testRequireROCmDevices.cpp" name="testRequireROCmDevices">
<use name="catch2"/>
<use name="rocm"/>
<use name="HeterogeneousCore/ROCmUtilities"/>
</bin>
</iftool>
16 changes: 16 additions & 0 deletions HeterogeneousCore/ROCmUtilities/test/testHipCheck.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Catch2 headers
#define CATCH_CONFIG_MAIN
#include <catch.hpp>

// ROCm headers
#include <hip/hip_runtime.h>

// CMSSW headers
#include "HeterogeneousCore/ROCmUtilities/interface/hipCheck.h"

TEST_CASE("HeterogeneousCore/ROCmUtilities testHipCheck", "[testHipCheck]") {
SECTION("Test hipCheck() API") {
REQUIRE_NOTHROW(hipCheck(hipSuccess));
REQUIRE_THROWS(hipCheck(hipErrorUnknown));
}
}
Loading