gpu-mode · ngc92 · Jan 14, 2025 · Jan 13, 2025 · Jan 13, 2025 · Jan 14, 2025
diff --git a/.github/workflows/cuda_test.yml b/.github/workflows/cuda_test.yml
diff --git a/.github/workflows/runner_ci.yml b/.github/workflows/runner_ci.yml
@@ -0,0 +1,65 @@
+name: Runner CI
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  check-cuda:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install pytest
+        shell: bash
+        run: pip install pytest
+
+      - name: Run script
+        shell: bash
+        run: pytest scripts/ci_test_cuda.py
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+
+  check-pytorch:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "latest"
+
+      - name: Setup Python environment
+        run: |
+          uv venv .venv
+          echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
+          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+          uv pip install numpy torch setuptools ninja pytest
+
+      - name: Run script
+        shell: bash
+        run: pytest scripts/ci_test_python.py
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+
diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py
@@ -7,6 +7,7 @@
 
 sys.path.append("src/discord-cluster-manager")
 
+from consts import ExitCode
 from leaderboard_eval import cu_eval
 from run_eval import run_cuda_script
 
@@ -23,6 +24,7 @@ def test_does_not_compile():
     assert comp.success is False
     assert run.success is False
     assert comp.nvcc_found is True
+    assert comp.exit_code != ExitCode.SUCCESS
     assert comp.stdout == ""
     assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
     assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
@@ -55,9 +57,9 @@ def test_cuda_runtime_error():
     assert run.success is False
     assert run.command == "./eval.out"
     assert "warming up..." in run.stdout
-    assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in run.stderr
+    assert "cudaDeviceSynchronize() at eval.cu(63) in `measure_runtime`" in run.stderr
     assert "an illegal memory access was encountered" in run.stderr
-    assert run.exit_code == 3
+    assert run.exit_code == ExitCode.CUDA_FAIL
     assert len(run.result) == 0
 
 
@@ -85,7 +87,7 @@ def test_cuda_validation_fail():
     # we never reach the benchmark part, because the test fails
     assert "warming up..." not in run.stdout
     assert "ERROR AT 0, 0" in run.stderr
-    assert run.exit_code == 1
+    assert run.exit_code == ExitCode.VALIDATE_FAIL
     assert run.result["check"] == "fail"
 
 
@@ -96,5 +98,5 @@ def test_cuda_correct():
     assert comp.success is True
     assert run.success is True
     assert "warming up..." in run.stdout
-    assert run.exit_code == 0
+    assert run.exit_code == ExitCode.SUCCESS
     assert run.result["check"] == "pass"
diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
@@ -0,0 +1,53 @@
+import os
+import sys
+from pathlib import Path
+
+if Path().resolve().name == "scripts":
+    os.chdir("..")
+
+sys.path.append("src/discord-cluster-manager")
+
+from consts import ExitCode
+from leaderboard_eval import py_eval
+from run_eval import run_pytorch_script
+
+ref = Path("examples/identity_py/reference.py")
+
+
+def test_does_not_import():
+    # the submission is not valid Python (unexpected indent), so loading it fails
+    sub = """
+    this is a syntax error
+    """
+
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is False
+    assert run.exit_code != ExitCode.SUCCESS
+    assert "IndentationError: unexpected indent\n" in run.stderr
+
+
+def test_error():
+    # returns all-zero tensors: runs fine, but the output isn't correct
+    sub = """
+import torch
+def custom_kernel(input):
+    return [torch.zeros_like(i) for i in input]
+        """
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is False
+    assert run.command == "python eval.py"
+    # we never reach the benchmark part, because the test fails
+    assert "warming up..." not in run.stdout
+    assert "mismatch found! custom implementation doesnt match reference." in run.stdout
+    assert run.exit_code == ExitCode.VALIDATE_FAIL
+    assert run.result["check"] == "fail"
+
+
+def test_correct():
+    sub = Path("examples/identity_py/submission.py").read_text()
+
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is True
+    assert "warming up..." in run.stdout
+    assert run.exit_code == ExitCode.SUCCESS
+    assert run.result["check"] == "pass"
diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py
@@ -1,5 +1,6 @@
 import asyncio
 import re
+from pathlib import Path
 from unittest.mock import AsyncMock
 
 import discord
@@ -12,19 +13,16 @@
 logger = setup_logging()
 
 
-def create_mock_attachment():
+def create_mock_attachment(file_name: str, content: str):
     "Create an AsyncMock to simulate discord.Attachment"
 
     mock_attachment = AsyncMock(spec=discord.Attachment)
-    mock_attachment.filename = "test_script.py"
+    mock_attachment.filename = file_name
     mock_attachment.content_type = "text/plain"
-    mock_attachment.read = AsyncMock(return_value="print('Hello, world!')".encode("utf-8"))
+    mock_attachment.read = AsyncMock(return_value=content.encode("utf-8"))
     return mock_attachment
 
 
-script_file = create_mock_attachment()
-
-
 class VerifyRunCog(commands.Cog):
     """
     A Discord cog for verifying the success of training runs.
@@ -45,6 +43,7 @@ async def verify_github_run(
         interaction: discord.Interaction,
     ) -> bool:
         github_command = github_cog.run_github
+        script_file = create_mock_attachment("test_script.py", "print('Hello, world!')")
         github_thread = await github_command.callback(github_cog, interaction, script_file, choice)
 
         message_contents = [msg.content async for msg in github_thread.history(limit=None)]
@@ -86,7 +85,13 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter
         t4 = app_commands.Choice(name="T4", value="t4")
         modal_command = modal_cog.run_modal
 
-        modal_thread = await modal_command.callback(modal_cog, interaction, script_file, t4)
+        sub_code = create_mock_attachment(
+            "submission.py", Path("examples/identity_py/submission.py").read_text()
+        )
+        ref_code = Path("examples/identity_py/reference.py").read_text()
+        modal_thread = await modal_command.callback(
+            modal_cog, interaction, sub_code, t4, reference_code=ref_code
+        )
 
         message_contents = [msg.content async for msg in modal_thread.history(limit=None)]
 

diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py
@@ -1,4 +1,4 @@
-from enum import Enum
+from enum import Enum, IntEnum
 from typing import Type
 
 
@@ -25,6 +25,22 @@ class ModalGPU(Enum):
     H100 = "H100"
 
 
+class ExitCode(IntEnum):
+    """
+    Exit codes for our runners. These are just the codes we actively return;
+    others are possible (e.g., exiting due to segfault, permissions, signal, ...)
+    """
+
+    # program ran successfully
+    SUCCESS = 0
+    # a cuda API call failed
+    CUDA_FAIL = 110
+    # could not setup file descriptor for custom pipe
+    PIPE_FAILED = 111
+    # didn't crash, but tests failed
+    VALIDATE_FAIL = 112
+
+
 def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
     combined_members = {}
     for enum in enums:

diff --git a/src/discord-cluster-manager/eval.cu b/src/discord-cluster-manager/eval.cu
@@ -46,8 +46,7 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i
                   << line << ") in `"
                   << function << "`: "
                   << cudaGetErrorString(status) << std::endl;
-        // following pytest convention, exit code 3 means internal error
-        std::exit(3);
+        std::exit(110);
     }
 }
 
@@ -83,7 +82,7 @@ void measure_runtime(PopcornOutput& logger) {
         auto reference_output = ref_kernel(copy);
         if (!check_implementation(submission_output, reference_output)) {
             logger.log("check", "fail");
-            std::exit(1);
+            std::exit(112);
         }
 
     }
@@ -122,7 +121,7 @@ int main() {
         int fd = std::stoi(output_fd);
         logger.File.reset(::fdopen(fd, "w"));
     } else {
-        return 4;       // pytest: usage error
+        return 111;
     }
 
     auto data = generate_input();
@@ -131,7 +130,7 @@ int main() {
 
     if (!check_implementation(submission_output, reference_output)) {
         logger.log("check", "fail");
-        return 1;
+        return 112;
     }
 
     measure_runtime(logger);