diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py
index 6c790019..8a0c9dd3 100644
--- a/src/discord-cluster-manager/cogs/verify_run_cog.py
+++ b/src/discord-cluster-manager/cogs/verify_run_cog.py
@@ -1,5 +1,6 @@
 import asyncio
 import re
+from pathlib import Path
 from unittest.mock import AsyncMock
 
 import discord
@@ -12,19 +13,16 @@
 logger = setup_logging()
 
 
-def create_mock_attachment():
+def create_mock_attachment(file_name: str, content: str):
     "Create an AsyncMock to simulate discord.Attachment"
 
     mock_attachment = AsyncMock(spec=discord.Attachment)
-    mock_attachment.filename = "test_script.py"
+    mock_attachment.filename = file_name
     mock_attachment.content_type = "text/plain"
-    mock_attachment.read = AsyncMock(return_value="print('Hello, world!')".encode("utf-8"))
+    mock_attachment.read = AsyncMock(return_value=content.encode("utf-8"))
     return mock_attachment
 
 
-script_file = create_mock_attachment()
-
-
 class VerifyRunCog(commands.Cog):
     """
     A Discord cog for verifying the success of training runs.
@@ -45,6 +43,7 @@ async def verify_github_run(
         interaction: discord.Interaction,
     ) -> bool:
         github_command = github_cog.run_github
+        script_file = create_mock_attachment("test_script.py", "print('Hello, world!')")
         github_thread = await github_command.callback(github_cog, interaction, script_file, choice)
 
         message_contents = [msg.content async for msg in github_thread.history(limit=None)]
@@ -86,7 +85,13 @@ async def verify_modal_run(self, modal_cog: ModalCog, interaction: discord.Inter
         t4 = app_commands.Choice(name="T4", value="t4")
         modal_command = modal_cog.run_modal
 
-        modal_thread = await modal_command.callback(modal_cog, interaction, script_file, t4)
+        sub_code = create_mock_attachment(
+            "submission.py", Path("examples/identity_py/submission.py").read_text()
+        )
+        ref_code = Path("examples/identity_py/reference.py").read_text()
+        modal_thread = await modal_command.callback(
+            modal_cog, interaction, sub_code, t4, reference_code=ref_code
+        )
 
         message_contents = [msg.content async for msg in modal_thread.history(limit=None)]
 
diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py
new file mode 100644
index 00000000..7928ce50
--- /dev/null
+++ b/src/discord-cluster-manager/eval.py
@@ -0,0 +1,86 @@
+import math
+import os
+import time
+
+import torch
+from reference import check_implementation, generate_input, ref_kernel
+from train import custom_kernel
+
+
+class PopcornLogger:
+    def __init__(self, fd):
+        self.channel = open(fd, "w")
+
+    def log(self, key: str, value):
+        print(f"{key}: {value}\n", file=self.channel)
+
+
+def correctness() -> bool:
+    for _ in range(10):  # check multiple times
+        inputs = generate_input()
+
+        custom_output = custom_kernel(inputs)
+        ref_output = ref_kernel(inputs)
+
+        if not check_implementation(custom_output, ref_output):
+            return False
+
+    print("custom implementation matches the reference implementation.")
+    return True
+
+
+def metric(logger: PopcornLogger):
+    warmup_runs = 10
+    timed_runs = 100
+
+    # Warmup Code
+    print("warming up...")
+    for _ in range(warmup_runs):
+        inputs = generate_input()
+        _ = custom_kernel(inputs)
+    torch.cuda.synchronize()
+
+    # Timing Code
+    times = []
+
+    for _ in range(timed_runs):
+        inputs = generate_input()
+
+        start_time = time.time()
+        custom_output = custom_kernel(inputs)
+        torch.cuda.synchronize()
+        end_time = time.time()
+        times.append(end_time - start_time)
+
+        ref_output = ref_kernel(inputs)
+        torch.cuda.synchronize()
+        if not check_implementation(custom_output, ref_output):
+            logger.log("check", "fail")
+            exit(1)
+
+    total_time = sum(times)
+    average_duration = total_time / timed_runs
+    variance = sum(map(lambda x: (x - average_duration) ** 2, times))  # noqa
+    standard_deviation = math.sqrt(variance / (timed_runs - 1))
+    standard_error = standard_deviation / math.sqrt(timed_runs)
+
+    logger.log("check", "pass")
+    logger.log("duration.mean", average_duration * 1e9)
+    logger.log("duration.std", standard_deviation * 1e9)
+    logger.log("duration.err", standard_error * 1e9)
+    logger.log("duration.best", min(times) * 1e9)
+    logger.log("duration.worst", max(times) * 1e9)
+
+    print(f"Submitted kernel runtime: {average_duration:.4f} ± {standard_error:.4} seconds")
+
+
+def main():
+    logger = PopcornLogger(int(os.environ["POPCORN_FD"]))
+    if not correctness():
+        logger.log("check", "fail")
+        exit(1)
+    metric(logger)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/discord-cluster-manager/leaderboard_eval.py b/src/discord-cluster-manager/leaderboard_eval.py
index d021895c..ca2aefb0 100644
--- a/src/discord-cluster-manager/leaderboard_eval.py
+++ b/src/discord-cluster-manager/leaderboard_eval.py
@@ -4,71 +4,5 @@
 
 from pathlib import Path
 
-py_eval = """
-import torch
-import time
-from reference import ref_kernel, generate_input, check_implementation
-from train import custom_kernel
-
-
-def correctness() -> bool:
-    for _ in range(10):  # check multiple times
-        inputs = generate_input()
-
-        custom_output = custom_kernel(inputs)
-        ref_output = ref_kernel(inputs)
-
-        if not check_implementation(custom_output, ref_output):
-            return False
-
-    print('custom implementation matches the reference implementation.')
-    return True
-
-
-def metric():
-    warmup_runs = 10
-    timed_runs = 100
-
-    # Warmup Code
-    print('warming up...')
-    for _ in range(warmup_runs):
-        inputs = generate_input()
-        _ = custom_kernel(inputs)
-    torch.cuda.synchronize()
-
-    # Timing Code
-    total_time = 0.0
-
-    for _ in range(timed_runs):
-        inputs = generate_input()
-
-        start_time = time.time()
-        custom_output = custom_kernel(inputs)
-        torch.cuda.synchronize()
-        end_time = time.time()
-        total_time += (end_time - start_time)
-
-        ref_output = ref_kernel(inputs)
-        torch.cuda.synchronize()
-        if not check_implementation(custom_output, ref_output):
-            return -1
-
-
-    custom_duration = total_time / timed_runs
-
-    print(f'Submitted kernel runtime: {custom_duration:.4f} seconds')
-
-    return custom_duration
-
-def main():
-    assert (correctness())
-    s = metric()
-
-    print(f'score:{s}')
-
-if __name__ == '__main__':
-    main()
-
-"""
-
+py_eval = Path.read_text(Path(__file__).parent / "eval.py")
 cu_eval = Path.read_text(Path(__file__).parent / "eval.cu")
diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py
index 97289467..1bcce3c4 100644
--- a/src/discord-cluster-manager/modal_runner.py
+++ b/src/discord-cluster-manager/modal_runner.py
@@ -79,15 +79,40 @@ def modal_run_pytorch_script(  # noqa: C901
     """Modal version of run_pytorch_script, handling timeouts"""
     try:
         with timeout(timeout_seconds):
-            return run_pytorch_script(
+            run_result = run_pytorch_script(
                 script_content=script_content,
                 reference_content=reference_content,
                 submission_content=submission_content,
                 arch=arch,
             )
 
+            if not run_result.success:
+                # exit code 1 encodes failed tests
+                if run_result.exit_code == 1:
+                    return f"check_implementation failed:\n{run_result.stderr}", 0.0
+                else:
+                    return (
+                        f"Script failed with exit code "
+                        f"({run_result.exit_code}):\n{run_result.stderr}",
+                        0.0,
+                    )
+
+            print("run process stdout:", run_result.stdout)
+            print("run process stderr:", run_result.stderr)
+
+            score = float(run_result.result.get("duration.mean", "0.0")) / 1e9
+            passed = run_result.result.get("check", "") == "pass"
+            if not passed:
+                return "check_implementation failed", 0.0
+
+            if score is None:
+                return run_result.stdout, run_result.duration
+
+            return run_result.stdout, score
     except TimeoutException as e:
         return f"Timeout Error: {str(e)}", 0.0
+    except Exception as e:
+        return f"Error executing script: {str(e)}", 0.0
 
 
 def modal_run_cuda_script(  # # noqa: C901
diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py
index 5f54fe18..938827f3 100644
--- a/src/discord-cluster-manager/run_eval.py
+++ b/src/discord-cluster-manager/run_eval.py
@@ -114,7 +114,7 @@ def compile_cuda_script(  # # noqa: C901
     )
 
 
-def run_cuda_program(args: list[str]) -> RunResult:
+def run_program(args: list[str]) -> RunResult:
     # set up a pipe so the tester can communicate its verdict with us
     env = os.environ.copy()
     pipe_read, pipe_write = os.pipe()
@@ -142,7 +142,9 @@ def run_cuda_program(args: list[str]) -> RunResult:
             result_dict[key.strip()] = value.strip()
 
     return RunResult(
-        success=run_process.returncode == 0,
+        # TODO should we return 0 also on test failure?
+        # TODO check what return codes python uses, e.g. on uncaught exception
+        success=(run_process.returncode == 0 or run_process.returncode == 1),
         command=_make_cmd(run_process.args),
         stdout=run_process.stdout,
         stderr=run_process.stderr,
@@ -206,7 +208,7 @@ def run_cuda_script(  # # noqa: C901
                 result={},
             )
 
-        run_result = run_cuda_program(["./eval.out"])
+        run_result = run_program(["./eval.out"])
 
         return compile_result, run_result
     finally:
@@ -221,9 +223,9 @@ def run_pytorch_script(  # noqa: C901
     reference_content: Optional[str] = None,
     submission_content: Optional[str] = None,
     arch: int = None,
-) -> tuple[str, float]:
+) -> RunResult:
     """
-    Executes the provided PyTorch GPU kernel in an isolated environment with a timeout
+    Executes the provided PyTorch GPU kernel in an isolated environment
 
     Args:
         script_content: The PyTorch script containing the GPU kernel to benchmark
@@ -247,33 +249,8 @@ def run_pytorch_script(  # noqa: C901
         with open("eval.py", "w") as f:
             f.write(script_content)
 
-        execution_start_time = time.perf_counter()
-        result = subprocess.run(
-            ["python", "eval.py"],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-            text=True,
-        )
-
-        if result.returncode != 0:
-            raise RuntimeError(
-                "Script execution failed with return code "
-                + f"{result.returncode}:\n{result.stderr}"
-            )
-
-        score = None
-        for line in result.stdout.splitlines():
-            if line.startswith("score:"):
-                score = float(line.split(":")[1].strip())
-                return "score", score
-
-        if score is None:
-            execution_end_time = time.perf_counter()
-            score = execution_end_time - execution_start_time
+        return run_program(["python", "eval.py"])
 
-        return result.stdout, score
-
-    except Exception as e:
-        return f"Error executing script: {str(e)}", 0.0
     finally:
         tmp_files = ["eval.py", "reference.py", "train.py"]
         for f in tmp_files: