From 542d814780d7c87a4c8aefb6959a2c1a0a7fd3af Mon Sep 17 00:00:00 2001
From: Erik Schultheis
Date: Tue, 14 Jan 2025 00:02:15 +0200
Subject: [PATCH] extended test for python implementation + adjusted exit codes for consistency

---
 .github/workflows/cuda_test.yml         | 32 ------------
 .github/workflows/runner_ci.yml         | 65 +++++++++++++++++++++++++
 scripts/ci_test_python.py               | 52 ++++++++++++++++++++
 src/discord-cluster-manager/eval.cu     |  9 ++--
 src/discord-cluster-manager/eval.py     | 12 +++--
 src/discord-cluster-manager/run_eval.py |  6 +--
 6 files changed, 133 insertions(+), 43 deletions(-)
 delete mode 100644 .github/workflows/cuda_test.yml
 create mode 100644 .github/workflows/runner_ci.yml
 create mode 100644 scripts/ci_test_python.py

diff --git a/.github/workflows/cuda_test.yml b/.github/workflows/cuda_test.yml
deleted file mode 100644
index d88aa102..00000000
--- a/.github/workflows/cuda_test.yml
+++ /dev/null
@@ -1,32 +0,0 @@
-name: CUDA Test
-
-on:
-  push:
-    branches: [ main ]
-  pull_request:
-    branches: [ main ]
-
-jobs:
-  check-identity:
-    runs-on: [gpumode-nvidia-arc]
-    timeout-minutes: 10
-    container:
-      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install pytest
-        shell: bash
-        run: pip install pytest
-
-      - name: Run script
-        shell: bash
-        run: pytest scripts/ci_test_cuda.py
-
-    env:
-      CUDA_VISIBLE_DEVICES: 0
diff --git a/.github/workflows/runner_ci.yml b/.github/workflows/runner_ci.yml
new file mode 100644
index 00000000..216926f6
--- /dev/null
+++ b/.github/workflows/runner_ci.yml
@@ -0,0 +1,65 @@
+name: Runner CI
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  check-cuda:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install pytest
+        shell: bash
+        run: pip install pytest
+
+      - name: Run script
+        shell: bash
+        run: pytest scripts/ci_test_cuda.py
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+
+  check-pytorch:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "latest"
+
+      - name: Setup Python environment
+        run: |
+          uv venv .venv
+          echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
+          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+          uv pip install numpy torch setuptools ninja pytest
+
+      - name: Run script
+        shell: bash
+        run: pytest scripts/ci_test_python.py
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+
diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
new file mode 100644
index 00000000..791e2b1c
--- /dev/null
+++ b/scripts/ci_test_python.py
@@ -0,0 +1,52 @@
+import os
+import sys
+from pathlib import Path
+
+if Path().resolve().name == "scripts":
+    os.chdir("..")
+
+sys.path.append("src/discord-cluster-manager")
+
+from leaderboard_eval import py_eval
+from run_eval import run_pytorch_script
+
+ref = Path("examples/identity_py/reference.py")
+
+
+def test_does_not_import():
+    # not valid python (stray indentation), so the submission fails to import
+    sub = """
+    this is a syntax error
+    """
+
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is False
+    assert run.exit_code == 1
+    assert "IndentationError: unexpected indent\n" in run.stderr
+
+
+def test_error():
+    # runs fine, but returns zeros instead of its input, so the result is wrong
+    sub = """
+import torch
+def custom_kernel(input):
+    return [torch.zeros_like(i) for i in input]
+    """
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is True
+    assert run.command == "python eval.py"
+    # we never reach the benchmark part, because the correctness check fails
+    assert "warming up..." not in run.stdout
+    assert "mismatch found! custom implementation doesnt match reference." in run.stdout
+    assert run.exit_code == 112
+    assert run.result["check"] == "fail"
+
+
+def test_correct():
+    sub = Path("examples/identity_py/submission.py").read_text()
+
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is True
+    assert "warming up..." in run.stdout
+    assert run.exit_code == 0
+    assert run.result["check"] == "pass"
diff --git a/src/discord-cluster-manager/eval.cu b/src/discord-cluster-manager/eval.cu
index a1b663a3..de2f909b 100644
--- a/src/discord-cluster-manager/eval.cu
+++ b/src/discord-cluster-manager/eval.cu
@@ -46,8 +46,7 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i
                   << line << ") in `"
                   << function << "`: "
                   << cudaGetErrorString(status) << std::endl;
-        // following pytest convention, exit code 3 means internal error
-        std::exit(3);
+        std::exit(110);
     }
 }
 
@@ -83,7 +82,7 @@ void measure_runtime(PopcornOutput& logger) {
         auto reference_output = ref_kernel(copy);
         if (!check_implementation(submission_output, reference_output)) {
             logger.log("check", "fail");
-            std::exit(1);
+            std::exit(112);
         }
     }
 
@@ -122,7 +121,7 @@ int main() {
         int fd = std::stoi(output_fd);
         logger.File.reset(::fdopen(fd, "w"));
     } else {
-        return 4; // pytest: usage error
+        return 111;
     }
 
     auto data = generate_input();
@@ -131,7 +130,7 @@ int main() {
 
     if (!check_implementation(submission_output, reference_output)) {
         logger.log("check", "fail");
-        return 1;
+        return 112;
     }
 
     measure_runtime(logger);
diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py
index 7928ce50..f3a9a63f 100644
--- a/src/discord-cluster-manager/eval.py
+++ b/src/discord-cluster-manager/eval.py
@@ -1,5 +1,6 @@
 import math
 import os
+import sys
 import time
 
 import torch
@@ -56,7 +57,7 @@ def metric(logger: PopcornLogger):
         torch.cuda.synchronize()
         if not check_implementation(custom_output, ref_output):
             logger.log("check", "fail")
-            exit(1)
+            exit(112)
 
     total_time = sum(times)
     average_duration = total_time / timed_runs
@@ -75,10 +76,15 @@ def metric(logger: PopcornLogger):
 
 
 def main():
-    logger = PopcornLogger(int(os.environ["POPCORN_FD"]))
+    try:
+        logger = PopcornLogger(int(os.environ["POPCORN_FD"]))
+    except Exception as e:
+        print(e, file=sys.stderr)
+        exit(111)
+
     if not correctness():
         logger.log("check", "fail")
-        exit(1)
+        exit(112)
     metric(logger)
 
 
diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py
index 938827f3..41e34c6f 100644
--- a/src/discord-cluster-manager/run_eval.py
+++ b/src/discord-cluster-manager/run_eval.py
@@ -141,10 +141,10 @@ def run_program(args: list[str]) -> RunResult:
         key, _, value = line.partition(":")
         result_dict[key.strip()] = value.strip()
 
+    # exit code 0: everything was fine
+    # exit code 112: the program ran fine, but the correctness check failed
     return RunResult(
-        # TODO should we return 0 also on test failure?
-        # TODO check what return codes python uses, e.g. on uncaught exception
-        success=(run_process.returncode == 0 or run_process.returncode == 1),
+        success=(run_process.returncode == 0 or run_process.returncode == 112),
         command=_make_cmd(run_process.args),
         stdout=run_process.stdout,
        stderr=run_process.stderr,
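
Note (reviewer sketch, not part of the patch): eval.py and eval.cu now share a small exit-code protocol — 0 means the correctness check passed, 110 an internal CUDA error, 111 a usage error (e.g. POPCORN_FD not set), and 112 that the program ran but the submission's output was wrong. The snippet below is a minimal illustration of how a caller can interpret these codes; the constant names are hypothetical, since the patch itself hard-codes the numbers.

```python
# Hypothetical names for the exit codes introduced by this patch.
EXIT_SUCCESS = 0        # correctness check passed, benchmark ran
EXIT_CUDA_ERROR = 110   # internal CUDA error (eval.cu only)
EXIT_USAGE_ERROR = 111  # harness misuse, e.g. POPCORN_FD not set
EXIT_TEST_FAIL = 112    # program ran, but the submission produced wrong results


def run_succeeded(returncode: int) -> bool:
    # Mirrors the new run_eval.py logic: a wrong submission is still a
    # successful *run*; any code other than 0/112 means the harness broke.
    return returncode in (EXIT_SUCCESS, EXIT_TEST_FAIL)


assert run_succeeded(0) and run_succeeded(112)
assert not run_succeeded(1)    # e.g. uncaught exception in the submission
assert not run_succeeded(111)  # POPCORN_FD missing
```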