Commit: Individual test cases and benchmarks for cuda
ngc92 committed Jan 21, 2025
1 parent e5cf813 commit 8faa2f9
Showing 15 changed files with 775 additions and 159 deletions.
407 changes: 335 additions & 72 deletions examples/identity_cuda/eval.cu

Large diffs are not rendered by default.

34 changes: 13 additions & 21 deletions examples/identity_cuda/reference.cuh
@@ -11,17 +11,15 @@

 #include "task.h"

-static input_t generate_input(int seed) {
+static input_t generate_input(int seed, int size) {
     std::mt19937 rng(seed);
     input_t data;

     std::uniform_real_distribution<float> dist(0, 1);

-    for (int i = 0; i < N_SIZES; ++i) {
-        data[i].resize(Ns[i]);
-        for (int j = 0; j < Ns[i]; ++j) {
-            data[i][j] = dist(rng);
-        }
+    data.resize(size);
+    for (int j = 0; j < size; ++j) {
+        data[j] = dist(rng);
     }

     return data;
@@ -32,28 +30,22 @@ static output_t ref_kernel(input_t data) {
     return (output_t) data;
 }

-static bool check_implementation(output_t out, output_t ref, float epsilon = 1e-5) {
+static void check_implementation(TestReporter& reporter, output_t out, output_t ref, float epsilon = 1e-5) {
     // input_t data = generate_input();
     // output_t reference_out = reference(data);

-    for (int i = 0; i < N_SIZES; ++i) {
-        auto ref_ptr = ref[i];
-        auto out_ptr = out[i];
-
-        if(out[i].size() != Ns[i]) {
-            std::cerr << "SIZE MISMATCH at " << i << ": " << Ns[i] << " " << out[i].size() << std::endl;
-            return false;
-        }
+    if(out.size() != ref.size()) {
+        if(!reporter.check_equal("size mismatch", out.size(), ref.size())) return;
+    }

-        for (int j = 0; j < Ns[i]; ++j) {
-            if (std::fabs(ref_ptr[j] - out_ptr[j]) > epsilon) {
-                std::cerr << "ERROR AT " << i << ", "<< j << ": " << ref_ptr[j] << " " << out_ptr[j] << std::endl;
-                return false;
-            }
+    for (int j = 0; j < ref.size(); ++j) {
+        if (std::fabs(ref[j] - out[j]) > epsilon) {
+            reporter.fail() << "error at " << j << ": " << ref[j] << " " << std::to_string(out[j]);
+            return;
         }
     }

-    return true;
+    reporter.pass();
 }

#endif
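
The eval.cu diff is collapsed above, so the driver side of this flow is not shown. As orientation, here is a minimal sketch, assuming only the interfaces visible in this commit, of how a single test case could be run end to end; TestCase and run_one are illustrative names, not the actual eval.cu code:

// Illustrative driver, not the actual eval.cu.
// custom_kernel comes from submission.cu; the rest is defined above.
struct TestCase { int seed; int size; };

static bool run_one(const TestCase& tc) {
    TestReporter reporter;
    input_t data = generate_input(tc.seed, tc.size);
    output_t ref = ref_kernel(data);
    output_t out = custom_kernel(data);   // submission under test
    check_implementation(reporter, out, ref);
    if (!reporter.has_passed()) {
        std::cerr << "test failed: " << reporter.message() << std::endl;
    }
    return reporter.has_passed();
}

A full driver would presumably loop over the cases listed in task.yml and exit with EXIT_TEST_FAIL from utils.h if any case fails.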
8 changes: 6 additions & 2 deletions examples/identity_cuda/submission.cu
@@ -14,8 +14,12 @@ __global__ void copy_kernel(float *input, float *output, int N)

 output_t custom_kernel(input_t data)
 {
+    if(data.size() > 256) {
+        data[0] = -1;
+    }
+    return data;
     output_t result;
-
+    /*
     for (int i = 0; i < N_SIZES; ++i)
     {
         int N = Ns[i];
@@ -41,6 +45,6 @@ output_t custom_kernel(input_t data)
         CUDA_CHECK(cudaFree(d_input));
         CUDA_CHECK(cudaFree(d_output));
     }
-
+    */
     return result;
 }
12 changes: 5 additions & 7 deletions examples/identity_cuda/task.h
@@ -1,14 +1,12 @@
-#ifndef __TASK_H__
-#define __TASK_H__
+#ifndef __POPCORN_TASK_H__
+#define __POPCORN_TASK_H__

 #include <vector>
 #include <array>

-#define N_SIZES 10
-const int Ns[N_SIZES] = {128, 256, 512, 1024, 2048,
-                         4096, 8192, 16384, 32768, 65536};
-
-using input_t = std::array<std::vector<float>, N_SIZES>;
+using input_t = std::vector<float>;
 using output_t = input_t;

+constexpr std::array<const char*, 2> ArgumentNames = {"seed", "size"};
+
 #endif
17 changes: 17 additions & 0 deletions examples/identity_cuda/task.yml
@@ -22,3 +22,20 @@ config:

   # additional include directories
   include_dirs: []
+
+# small test cases. should be cheap to run.
+tests:
+  - {"size": 127, "seed": 4242}
+  - {"size": 128, "seed": 5236}
+  - {"size": 129, "seed": 1001}
+  - {"size": 256, "seed": 5531}
+  - {"size": 512, "seed": 9173}
+
+benchmarks:
+  - {"size": 1024, "seed": 54352}
+  - {"size": 2048, "seed": 93246}
+  - {"size": 4096, "seed": 6256}
+  - {"size": 8192, "seed": 8841}
+  - {"size": 16384, "seed": 6252}
+  - {"size": 32768, "seed": 52624}
+  - {"size": 65536, "seed": 125432}
69 changes: 68 additions & 1 deletion examples/identity_cuda/utils.h
@@ -2,6 +2,16 @@
 #define POPCORN_UTILS_H

 #include <iostream>
+#include <string>
+#include <sstream>
+
+enum ExitCodes: int {
+    USAGE_ERROR = 2,      // (standard?) exit code for wrong command line
+    EXIT_CUDA_API = 110,  // cuda API call returned an error
+    EXIT_PIPE_FAIL = 111, // could not set up communication with runner
+    EXIT_TEST_FAIL = 112, // a test case failed
+    EXIT_TEST_SPEC = 113  // error when trying to construct a test case
+};

 // checks that a CUDA API call returned successfully, otherwise prints an error message and exits.
 static inline void cuda_check_(cudaError_t status, const char* expr, const char* file, int line, const char* function)
@@ -13,12 +23,69 @@ static inline void cuda_check_(cudaError_t status, const char* expr, const char*
             << line << ") in `"
             << function << "`: "
             << cudaGetErrorString(status) << std::endl;
-        std::exit(110);
+        std::exit(ExitCodes::EXIT_CUDA_API);
     }
 }

 // Convenience macro, automatically logs expression, file, line, and function name
 // of the error.
 #define CUDA_CHECK(expr) cuda_check_(expr, #expr, __FILE__, __LINE__, __FUNCTION__)

+
+struct TestVerdict {
+    bool Pass;
+    std::string Message = "";
+};
+
+class TestReporter {
+public:
+    TestReporter() = default;
+
+    void pass() {
+        if(m_State != NONE) {
+            std::cerr << "Trying to mark result of test twice."
+                         " This indicates an error in the task definition, please report."
+                      << std::endl;
+            std::exit(EXIT_TEST_SPEC);
+        }
+        m_State = PASS;
+    }
+    std::stringstream& fail() {
+        if(m_State != NONE) {
+            std::cerr << "Trying to mark result of test twice."
+                         " This indicates an error in the task definition, please report."
+                      << std::endl;
+            std::exit(EXIT_TEST_SPEC);
+        }
+        m_State = FAIL;
+        return m_Message;
+    }

+    bool has_passed() const {
+        if(m_State == NONE) {
+            std::cerr << "Trying to query result of unfinished test."
+                         " This indicates an error in the task definition, please report."
+                      << std::endl;
+            std::exit(EXIT_TEST_SPEC);
+        }
+        return m_State == PASS;
+    }
+
+    template<class T>
+    bool check_equal(const char* message, const T& value, const T& expected) {
+        if(value == expected) return true;
+        fail() << message << ": expected `" << expected << "`, got `" << value << "`";
+        return false;
+    }
+
+    std::string message() const {
+        return m_Message.str();
+    }
+private:
+    enum State {
+        NONE, PASS, FAIL
+    };
+    State m_State = NONE;
+    std::stringstream m_Message;
+};
#endif
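
The contract implied by this class: each test case gets a fresh TestReporter, the check routine records exactly one verdict via pass() or fail(), and only then does the harness query it. A small illustrative fragment, not part of the commit:

TestReporter reporter;
reporter.fail() << "expected `" << 7 << "`, got `" << 3 << "`"; // records FAIL plus a message
bool failed = !reporter.has_passed(); // safe to query once a verdict exists
std::string why = reporter.message(); // retrieves the streamed failure text
// reporter.pass();                   // a second verdict would exit(EXIT_TEST_SPEC)

Recording a verdict twice, or querying before one exists, deliberately aborts with EXIT_TEST_SPEC rather than silently misreporting a test.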
22 changes: 14 additions & 8 deletions scripts/local-test.py
@@ -1,19 +1,25 @@
+import pprint
 import sys
 from pathlib import Path

 sys.path.append("src/discord-cluster-manager")

-from leaderboard_eval import cu_eval
 from run_eval import run_cuda_script

 ref = Path("examples/identity_cuda/reference.cuh")
-sub = Path("examples/identity_cuda/submission.cuh")
+sub = Path("examples/identity_cuda/submission.cu")
+util = Path("examples/identity_cuda/utils.h")
+task = Path("examples/identity_cuda/task.h")

-cout, score = run_cuda_script(
-    {"eval.cu": cu_eval},
-    {"reference.cuh": ref.read_text(), "submission.cuh": sub.read_text()},
+result = run_cuda_script(
+    {
+        "eval.cu": Path("examples/identity_cuda/eval.cu").read_text(),
+        "submission.cu": sub.read_text(),
+    },
+    {"reference.cuh": ref.read_text(), "utils.h": util.read_text(), "task.h": task.read_text()},
+    arch=None,
+    tests="size: 128; seed: 45\nsize: 512; seed: 123",
+    mode="test",
 )
-print(cout)
-print(score)
-exit(0 if score > 0 else 1)

+pprint.pprint(result)
77 changes: 73 additions & 4 deletions src/discord-cluster-manager/cogs/leaderboard_cog.py
@@ -11,7 +11,7 @@
     GPU_SELECTION,
     AllGPU,
     GitHubGPU,
-    ModalGPU,
+    ModalGPU, SubmissionMode,
 )
 from discord import app_commands
 from discord.ext import commands, tasks
@@ -42,7 +42,8 @@ async def async_submit_cog_job(
     task: LeaderboardTask,
     submission_content,
     gpu: AllGPU,
-    runner_name: str = "GitHub",
+    runner_name: str,
+    mode: SubmissionMode,
 ):
     discord_thread, result = await command(
         interaction,
@@ -52,13 +53,24 @@
             value=gpu.value,
         ),
         task=task,
+        mode=mode,
     )

+    # no point going further if this already failed
+    if discord_thread is None:
+        return -1
+
+    if mode == SubmissionMode.LEADERBOARD:
+        # public leaderboard run
+        pass
+    elif mode == SubmissionMode.PRIVATE:
+        # private leaderboard run
+        pass
+    else:
+        return 0
+
     try:
         print(result)
         if result.success:
             score = float(result.run.result["duration.mean"]) / 1e9

@@ -176,6 +188,7 @@ async def on_submit_hook(
         command: Callable,
         GPUsEnum: Type[Enum],
         runner_name: str,
+        mode: SubmissionMode,
     ) -> int:
         """
         Called as the main body of a submission to route to the correct runner.
@@ -216,10 +229,28 @@ async def on_submit_hook(
                 submission_content,
                 AllGPU[gpu],
                 runner_name,
+                mode,
             )
             for gpu in selected_gpus
         ]

+        # also schedule secret run
+        if mode == SubmissionMode.LEADERBOARD:
+            tasks += [
+                self.async_submit_cog_job(
+                    interaction,
+                    leaderboard_name,
+                    script,
+                    command,
+                    task,
+                    submission_content,
+                    AllGPU[gpu],
+                    runner_name,
+                    SubmissionMode.PRIVATE,
+                )
+                for gpu in selected_gpus
+            ]
+
         await asyncio.gather(*tasks)
         return 0

@@ -238,6 +269,7 @@ async def submit(
         interaction: discord.Interaction,
         leaderboard_name: str,
         script: discord.Attachment,
+        mode: SubmissionMode,
     ):
         # Call Modal runner
         runner_cog = self.bot.get_cog(f"{runner_name}Cog")
@@ -256,6 +288,7 @@ async def submit(
                 runner_command,
                 GPU_SELECTION[runner_name],
                 runner_name,
+                mode,
             )
         except Exception as e:
             logger.error("Error handling leaderboard submission", exc_info=e)
@@ -280,7 +313,7 @@ async def submit_modal(
         leaderboard_name: str,
         script: discord.Attachment,
     ):
-        return await self.submit("Modal", interaction, leaderboard_name, script)
+        return await self.submit("Modal", interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD)

     @app_commands.command(name="github", description="Submit leaderboard data for GitHub")
     @app_commands.describe(
@@ -294,7 +327,43 @@ async def submit_github(
         leaderboard_name: str,
         script: discord.Attachment,
     ):
-        return await self.submit("GitHub", interaction, leaderboard_name, script)
+        return await self.submit(
+            "GitHub", interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD
+        )
+
+    @app_commands.command(name="test", description="Start a testing/debugging run")
+    @app_commands.describe(
+        leaderboard_name="Name of the competition / kernel to optimize",
+        runner="Name of the runner to run on",
+        script="The Python / CUDA script file to run",
+    )
+    @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
+    async def submit_test(
+        self,
+        interaction: discord.Interaction,
+        runner: str,
+        leaderboard_name: str,
+        script: discord.Attachment,
+    ):
+        runner = {"github": "GitHub", "modal": "Modal"}[runner.lower()]
+        return await self.submit(runner, interaction, leaderboard_name, script, mode=SubmissionMode.TEST)
+
+    @app_commands.command(name="benchmark", description="Start a benchmarking run")
+    @app_commands.describe(
+        leaderboard_name="Name of the competition / kernel to optimize",
+        runner="Name of the runner to run on",
+        script="The Python / CUDA script file to run",
+    )
+    @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete)
+    async def submit_bench(
+        self,
+        interaction: discord.Interaction,
+        runner: str,
+        leaderboard_name: str,
+        script: discord.Attachment,
+    ):
+        runner = {"github": "GitHub", "modal": "Modal"}[runner.lower()]
+        return await self.submit(runner, interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK)


 class LeaderboardCog(commands.Cog):
