Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update pytorch script runner to use pipe #131

Merged
merged 3 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 0 additions & 32 deletions .github/workflows/cuda_test.yml

This file was deleted.

65 changes: 65 additions & 0 deletions .github/workflows/runner_ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
name: Runner CI

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
check-cuda:
runs-on: [gpumode-nvidia-arc]
timeout-minutes: 10
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04
steps:
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install pytest
shell: bash
run: pip install pytest

- name: Run script
shell: bash
run: pytest scripts/ci_test_cuda.py

env:
CUDA_VISIBLE_DEVICES: 0

check-pytorch:
runs-on: [gpumode-nvidia-arc]
timeout-minutes: 10
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04
steps:
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"

- name: Setup Python environment
run: |
uv venv .venv
echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
uv pip install numpy torch setuptools ninja pytest

- name: Run script
shell: bash
run: pytest scripts/ci_test_python.py

env:
CUDA_VISIBLE_DEVICES: 0

10 changes: 6 additions & 4 deletions scripts/ci_test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

sys.path.append("src/discord-cluster-manager")

from consts import ExitCode
from leaderboard_eval import cu_eval
from run_eval import run_cuda_script

Expand All @@ -23,6 +24,7 @@ def test_does_not_compile():
assert comp.success is False
assert run.success is False
assert comp.nvcc_found is True
assert comp.exit_code != ExitCode.SUCCESS
assert comp.stdout == ""
assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
Expand Down Expand Up @@ -55,9 +57,9 @@ def test_cuda_runtime_error():
assert run.success is False
assert run.command == "./eval.out"
assert "warming up..." in run.stdout
assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in run.stderr
assert "cudaDeviceSynchronize() at eval.cu(63) in `measure_runtime`" in run.stderr
assert "an illegal memory access was encountered" in run.stderr
assert run.exit_code == 3
assert run.exit_code == ExitCode.CUDA_FAIL
assert len(run.result) == 0


Expand Down Expand Up @@ -85,7 +87,7 @@ def test_cuda_validation_fail():
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
assert "ERROR AT 0, 0" in run.stderr
assert run.exit_code == 1
assert run.exit_code == ExitCode.VALIDATE_FAIL
assert run.result["check"] == "fail"


Expand All @@ -96,5 +98,5 @@ def test_cuda_correct():
assert comp.success is True
assert run.success is True
assert "warming up..." in run.stdout
assert run.exit_code == 0
assert run.exit_code == ExitCode.SUCCESS
assert run.result["check"] == "pass"
53 changes: 53 additions & 0 deletions scripts/ci_test_python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import sys
from pathlib import Path

if Path().resolve().name == "scripts":
os.chdir("..")

sys.path.append("src/discord-cluster-manager")

from consts import ExitCode
from leaderboard_eval import py_eval
from run_eval import run_pytorch_script

ref = Path("examples/identity_py/reference.py")


def test_does_not_import():
# the submission is not valid Python source, so loading it must fail
sub = """
this is a syntax error
"""

run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
assert run.success is False
assert run.exit_code != ExitCode.SUCCESS
assert "IndentationError: unexpected indent\n" in run.stderr


def test_error():
# returns zeros for every input: runs fine, but doesn't match the reference
sub = """
import torch
def custom_kernel(input):
return [torch.zeros_like(i) for i in input]
"""
run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
assert run.success is False
assert run.command == "python eval.py"
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
assert "mismatch found! custom implementation doesnt match reference." in run.stdout
assert run.exit_code == ExitCode.VALIDATE_FAIL
assert run.result["check"] == "fail"


def test_correct():
    """The bundled identity example must run successfully and report a passing check."""
    submission_path = Path("examples/identity_py/submission.py")
    submission_code = submission_path.read_text()
    reference_code = ref.read_text()

    outcome = run_pytorch_script(py_eval, reference_code, submission_code, arch=None)

    # The process itself has to succeed ...
    assert outcome.success is True
    # ... the warm-up message has to show up on stdout ...
    assert "warming up..." in outcome.stdout
    # ... and both the exit code and the reported result must signal success.
    assert outcome.exit_code == ExitCode.SUCCESS
    assert outcome.result["check"] == "pass"
19 changes: 12 additions & 7 deletions src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import re
from pathlib import Path
from unittest.mock import AsyncMock

import discord
Expand All @@ -12,19 +13,16 @@
logger = setup_logging()


def create_mock_attachment():
def create_mock_attachment(file_name: str, content: str):
"Create an AsyncMock to simulate discord.Attachment"

mock_attachment = AsyncMock(spec=discord.Attachment)
mock_attachment.filename = "test_script.py"
mock_attachment.filename = file_name
mock_attachment.content_type = "text/plain"
mock_attachment.read = AsyncMock(return_value="print('Hello, world!')".encode("utf-8"))
mock_attachment.read = AsyncMock(return_value=content.encode("utf-8"))
return mock_attachment


script_file = create_mock_attachment()


class VerifyRunCog(commands.Cog):
"""
A Discord cog for verifying the success of training runs.
Expand All @@ -45,6 +43,7 @@ async def verify_github_run(
interaction: discord.Interaction,
) -> bool:
github_command = github_cog.run_github
script_file = create_mock_attachment("test_script.py", "print('Hello, world!')")
github_thread = await github_command.callback(github_cog, interaction, script_file, choice)

message_contents = [msg.content async for msg in github_thread.history(limit=None)]
Expand Down Expand Up @@ -86,7 +85,13 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter
t4 = app_commands.Choice(name="T4", value="t4")
modal_command = modal_cog.run_modal

modal_thread = await modal_command.callback(modal_cog, interaction, script_file, t4)
sub_code = create_mock_attachment(
"submission.py", Path("examples/identity_py/submission.py").read_text()
)
ref_code = Path("examples/identity_py/reference.py").read_text()
modal_thread = await modal_command.callback(
modal_cog, interaction, sub_code, t4, reference_code=ref_code
)

message_contents = [msg.content async for msg in modal_thread.history(limit=None)]

Expand Down
18 changes: 17 additions & 1 deletion src/discord-cluster-manager/consts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from enum import Enum
from enum import Enum, IntEnum
from typing import Type


Expand All @@ -25,6 +25,22 @@ class ModalGPU(Enum):
H100 = "H100"


class ExitCode(IntEnum):
    """
    Exit codes that our runners return on purpose.

    Only the codes we actively return are listed here; a process can still
    exit with other codes (e.g., due to a segfault, permissions, a signal, ...).
    """

    SUCCESS = 0  # program ran successfully
    CUDA_FAIL = 110  # a cuda API call failed
    PIPE_FAILED = 111  # could not set up the file descriptor for the custom pipe
    VALIDATE_FAIL = 112  # didn't crash, but tests failed


def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
combined_members = {}
for enum in enums:
Expand Down
9 changes: 4 additions & 5 deletions src/discord-cluster-manager/eval.cu
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i
<< line << ") in `"
<< function << "`: "
<< cudaGetErrorString(status) << std::endl;
// following pytest convention, exit code 3 means internal error
std::exit(3);
std::exit(110);
}
}

Expand Down Expand Up @@ -83,7 +82,7 @@ void measure_runtime(PopcornOutput& logger) {
auto reference_output = ref_kernel(copy);
if (!check_implementation(submission_output, reference_output)) {
logger.log("check", "fail");
std::exit(1);
std::exit(112);
}

}
Expand Down Expand Up @@ -122,7 +121,7 @@ int main() {
int fd = std::stoi(output_fd);
logger.File.reset(::fdopen(fd, "w"));
} else {
return 4; // pytest: usage error
return 111;
}

auto data = generate_input();
Expand All @@ -131,7 +130,7 @@ int main() {

if (!check_implementation(submission_output, reference_output)) {
logger.log("check", "fail");
return 1;
return 112;
}

measure_runtime(logger);
Expand Down
Loading
Loading