From 0c04a1b5ce6f9a8337a05d2a9ea0d17f5149a7c0 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 12 Jan 2025 16:39:27 +0200 Subject: [PATCH] moved string generation to modal runner and updated CI --- scripts/ci_test_cuda.py | 51 +++++++++---- src/discord-cluster-manager/modal_runner.py | 45 ++++++++++- src/discord-cluster-manager/run_eval.py | 83 ++++++--------------- 3 files changed, 102 insertions(+), 77 deletions(-) diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py index 29a7cb5d..9b5ede3b 100644 --- a/scripts/ci_test_cuda.py +++ b/scripts/ci_test_cuda.py @@ -19,9 +19,15 @@ def test_does_not_compile(): output_t custom_kernel(input_tt data) { } """ - cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) - assert score == 0 - assert "CUDA compilation failed" in cout + comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) + assert comp.success is False + assert run.success is False + assert comp.nvcc_found is True + assert comp.stdout == "" + assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr + assert '1 error detected in the compilation of "eval.cu".' in comp.stderr + assert comp.command.startswith("/usr/local/cuda/bin/nvcc") + assert "nvcc: NVIDIA (R) Cuda compiler driver" in comp.nvcc_version def test_cuda_runtime_error(): @@ -44,11 +50,15 @@ def test_cuda_runtime_error(): } """ - cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) - assert score == 0 - assert "Command '['./eval.out']' returned non-zero exit status 3." in cout - assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in cout - assert "an illegal memory access was encountered" in cout + comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) + assert comp.success is True + assert run.success is False + assert run.command == "./eval.out" + assert "warming up..." in run.stdout + assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in run.stderr + assert "an illegal memory access was encountered" in run.stderr + assert run.exit_code == 3 + assert len(run.result) == 0 def test_cuda_validation_fail(): @@ -68,14 +78,23 @@ def test_cuda_validation_fail(): } """ - cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) - assert score == 0 - assert "Command '['./eval.out']' returned non-zero exit status 1." in cout - assert "ERROR AT 0, 0" in cout + comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) + assert comp.success is True + assert run.success is False + assert run.command == "./eval.out" + # we never reach the benchmark part, because the test fails + assert "warming up..." not in run.stdout + assert "ERROR AT 0, 0" in run.stderr + assert run.exit_code == 1 + assert run.result["check"] == "fail" def test_cuda_correct(): - sub = Path("examples/identity_cuda/submission.cuh") - - cout, score = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None) - assert score > 0 + sub = Path("examples/identity_cuda/submission.cuh").read_text() + + comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None) + assert comp.success is True + assert run.success is True + assert "warming up..." in run.stdout + assert run.exit_code == 0 + assert run.result["check"] == "pass" diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py index a3a246a6..97289467 100644 --- a/src/discord-cluster-manager/modal_runner.py +++ b/src/discord-cluster-manager/modal_runner.py @@ -100,12 +100,55 @@ def modal_run_cuda_script( # # noqa: C901 """Modal version of run_cuda_script, handling timeouts""" try: with timeout(timeout_seconds): - return run_cuda_script( + compile_result, run_result = run_cuda_script( script_content, reference_content=reference_content, submission_content=submission_content, arch=arch, include_dirs=MODAL_CUDA_INCLUDE_DIRS, ) + + if not compile_result.success: + if not compile_result.nvcc_found: + return ( + "Error executing script: NVCC not found:\n" + + f"command `{compile_result.command}` " + + f"failed with exit code {compile_result.exit_code}:\n" + + compile_result.stderr, + 0.0, + ) + return ( + "Error executing script: CUDA compilation failed with return code " + + f"{compile_result.exit_code}:\n{compile_result.stderr}\n" + + f"compile command: `{compile_result.command}`", + 0.0, + ) + + if not run_result.success: + # exit code 1 encodes failed tests + if run_result.exit_code == 1: + return f"check_implementation failed:\n{run_result.stderr}", 0.0 + else: + return ( + f"Script failed with exit code " + f"({run_result.exit_code}):\n{run_result.stderr}", + 0.0, + ) + + print("run process stdout:", run_result.stdout) + print("run process stderr:", run_result.stderr) + + score = float(run_result.result.get("duration.mean", "0.0")) / 1e9 + passed = run_result.result.get("check", "") == "pass" + if not passed: + return "check_implementation failed", 0.0 + + if score is None: + return run_result.stdout, run_result.duration + + return run_result.stdout, score + except TimeoutException as e: return f"Timeout Error: {str(e)}", 0.0 + except Exception as e: + return f"Error executing script: {str(e)}", 0.0 diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 7cecfbb8..5f54fe18 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -1,8 +1,8 @@ +import dataclasses import os import shlex import subprocess import time -import dataclasses from typing import Optional from consts import CUDA_FLAGS @@ -75,7 +75,7 @@ def compile_cuda_script( # # noqa: C901 return CompileResult( nvcc_found=False, success=False, - nvcc_version='', + nvcc_version="", command=_make_cmd(e.cmd), stdout=e.stdout, stderr=e.stderr, @@ -121,25 +121,14 @@ def run_cuda_program(args: list[str]) -> RunResult: env["POPCORN_FD"] = str(pipe_write) execution_start_time = time.perf_counter() - try: - run_process = subprocess.run( - args, - capture_output=True, - text=True, - check=True, - env=env, - pass_fds=[pipe_write], - ) - except subprocess.CalledProcessError as e: - return RunResult( - success=False, - command=_make_cmd(e.cmd), - stdout=e.stdout, - stderr=e.stderr, - exit_code=e.returncode, - duration=time.perf_counter() - execution_start_time, - result={} - ) + run_process = subprocess.run( + args, + capture_output=True, + text=True, + check=False, + env=env, + pass_fds=[pipe_write], + ) execution_end_time = time.perf_counter() # terminate output writing @@ -153,13 +142,13 @@ def run_cuda_program(args: list[str]) -> RunResult: result_dict[key.strip()] = value.strip() return RunResult( - success=True, + success=run_process.returncode == 0, command=_make_cmd(run_process.args), stdout=run_process.stdout, stderr=run_process.stderr, exit_code=run_process.returncode, duration=execution_end_time - execution_start_time, - result=result_dict + result=result_dict, ) @@ -169,7 +158,7 @@ def run_cuda_script( # # noqa: C901 submission_content: str = None, arch: int = None, include_dirs: list[str] = None, -) -> tuple[str, float]: +) -> tuple[CompileResult, RunResult]: """ Executes the provided CUDA kernel in an isolated environment with a timeout @@ -181,7 +170,7 @@ def run_cuda_script( # # noqa: C901 include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc Returns: - tuple[str, float]: (Kernel output, execution time in milliseconds) + tuple[CompileResult, RunResult]: CUDA compile/eval result information """ if include_dirs is None: include_dirs = [] @@ -207,45 +196,19 @@ def run_cuda_script( # # noqa: C901 ) if not compile_result.success: - if not compile_result.nvcc_found: - return ( - "Error executing script: NVCC not found:\n" - + f"command `{compile_result.command}` failed with exit code {compile_result.exit_code}:\n" - + compile_result.stderr - , 0.0 - ) - return ( - "Error executing script: CUDA compilation failed with return code " - + f"{compile_result.exit_code}:\n{compile_result.stderr}\n" - + f"compile command: `{compile_result.command}`", - 0.0 + return compile_result, RunResult( + success=False, + command="", + stdout="", + stderr="", + exit_code=-1, + duration=0.0, + result={}, ) run_result = run_cuda_program(["./eval.out"]) - if not run_result.success: - # exit code 1 encodes failed tests - if run_result.exit_code == 1: - return f"check_implementation failed:\n{run_result.stderr}", 0.0 - else: - return f"Script failed with exit code ({run_result.exit_code}):\n{run_result.stderr}", 0.0 - - print("run process stdout:", run_result.stdout) - print("run process stderr:", run_result.stderr) - - score = float(run_result.result.get("duration.mean", "0.0")) / 1e9 - passed = run_result.result.get("check", "") == "pass" - if not passed: - return "check_implementation failed", 0.0 + return compile_result, run_result - if score is None: - return run_result.stdout, run_result.duration - - return run_result.stdout, score - - except subprocess.CalledProcessError as e: - return f"Error executing script: {str(e)}\n{e.stderr}", 0.0 - except Exception as e: - return f"Error executing script: {str(e)}", 0.0 finally: tmp_files = ["reference.cuh", "train.cuh", "eval.cu", "eval.out"] for f in tmp_files: