Commit 0c04a1b

moved string generation to modal runner and updated CI

ngc92 committed Jan 13, 2025
1 parent fa92d9c commit 0c04a1b
Showing 3 changed files with 102 additions and 77 deletions.
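Note: the updated tests and the Modal runner below build on CompileResult and RunResult dataclasses from run_eval.py whose definitions sit outside the visible hunks. The following is a minimal sketch reconstructed only from the fields referenced in this diff; field types and comments are inferred, not copied from the source.

from dataclasses import dataclass, field


@dataclass
class CompileResult:
    nvcc_found: bool    # whether an nvcc binary was located at all
    success: bool       # compilation finished with exit code 0
    nvcc_version: str   # version banner reported by nvcc
    command: str        # full compile command line, e.g. starting with /usr/local/cuda/bin/nvcc
    stdout: str
    stderr: str         # compiler diagnostics on failure
    exit_code: int


@dataclass
class RunResult:
    success: bool       # the eval binary exited with code 0
    command: str        # e.g. "./eval.out"
    stdout: str
    stderr: str
    exit_code: int
    duration: float     # wall-clock runtime of the eval binary in seconds
    result: dict[str, str] = field(default_factory=dict)  # key/value pairs reported via POPCORN_FD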
51 changes: 35 additions & 16 deletions scripts/ci_test_cuda.py
@@ -19,9 +19,15 @@ def test_does_not_compile():
output_t custom_kernel(input_tt data) { }
"""

-    cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
-    assert score == 0
-    assert "CUDA compilation failed" in cout
+    comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
+    assert comp.success is False
+    assert run.success is False
+    assert comp.nvcc_found is True
+    assert comp.stdout == ""
+    assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
+    assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
+    assert comp.command.startswith("/usr/local/cuda/bin/nvcc")
+    assert "nvcc: NVIDIA (R) Cuda compiler driver" in comp.nvcc_version


def test_cuda_runtime_error():
@@ -44,11 +50,15 @@ def test_cuda_runtime_error():
}
"""
-    cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
-    assert score == 0
-    assert "Command '['./eval.out']' returned non-zero exit status 3." in cout
-    assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in cout
-    assert "an illegal memory access was encountered" in cout
+    comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
+    assert comp.success is True
+    assert run.success is False
+    assert run.command == "./eval.out"
+    assert "warming up..." in run.stdout
+    assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in run.stderr
+    assert "an illegal memory access was encountered" in run.stderr
+    assert run.exit_code == 3
+    assert len(run.result) == 0


def test_cuda_validation_fail():
@@ -68,14 +78,23 @@ def test_cuda_validation_fail():
}
"""
-    cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
-    assert score == 0
-    assert "Command '['./eval.out']' returned non-zero exit status 1." in cout
-    assert "ERROR AT 0, 0" in cout
+    comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
+    assert comp.success is True
+    assert run.success is False
+    assert run.command == "./eval.out"
+    # we never reach the benchmark part, because the test fails
+    assert "warming up..." not in run.stdout
+    assert "ERROR AT 0, 0" in run.stderr
+    assert run.exit_code == 1
+    assert run.result["check"] == "fail"


def test_cuda_correct():
-    sub = Path("examples/identity_cuda/submission.cuh")
-
-    cout, score = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)
-    assert score > 0
+    sub = Path("examples/identity_cuda/submission.cuh").read_text()
+
+    comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
+    assert comp.success is True
+    assert run.success is True
+    assert "warming up..." in run.stdout
+    assert run.exit_code == 0
+    assert run.result["check"] == "pass"
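These test functions follow standard pytest conventions, so besides the CI pipeline (whose workflow file is not part of this diff) they can also be exercised locally with something along these lines — assuming pytest is installed, the working directory is the repository root, and nvcc plus a CUDA-capable GPU are available:

# Hypothetical local driver for the updated CI tests; see the assumptions above.
import sys

import pytest

sys.exit(pytest.main(["scripts/ci_test_cuda.py", "-v"]))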
45 changes: 44 additions & 1 deletion src/discord-cluster-manager/modal_runner.py
@@ -100,12 +100,55 @@ def modal_run_cuda_script( # # noqa: C901
"""Modal version of run_cuda_script, handling timeouts"""
try:
with timeout(timeout_seconds):
-            return run_cuda_script(
+            compile_result, run_result = run_cuda_script(
script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
)

+            if not compile_result.success:
+                if not compile_result.nvcc_found:
+                    return (
+                        "Error executing script: NVCC not found:\n"
+                        + f"command `{compile_result.command}` "
+                        + f"failed with exit code {compile_result.exit_code}:\n"
+                        + compile_result.stderr,
+                        0.0,
+                    )
+                return (
+                    "Error executing script: CUDA compilation failed with return code "
+                    + f"{compile_result.exit_code}:\n{compile_result.stderr}\n"
+                    + f"compile command: `{compile_result.command}`",
+                    0.0,
+                )
+
+            if not run_result.success:
+                # exit code 1 encodes failed tests
+                if run_result.exit_code == 1:
+                    return f"check_implementation failed:\n{run_result.stderr}", 0.0
+                else:
+                    return (
+                        f"Script failed with exit code "
+                        f"({run_result.exit_code}):\n{run_result.stderr}",
+                        0.0,
+                    )
+
+            print("run process stdout:", run_result.stdout)
+            print("run process stderr:", run_result.stderr)
+
+            score = float(run_result.result.get("duration.mean", "0.0")) / 1e9
+            passed = run_result.result.get("check", "") == "pass"
+            if not passed:
+                return "check_implementation failed", 0.0
+
+            if score is None:
+                return run_result.stdout, run_result.duration
+
+            return run_result.stdout, score

except TimeoutException as e:
return f"Timeout Error: {str(e)}", 0.0
+    except Exception as e:
+        return f"Error executing script: {str(e)}", 0.0
83 changes: 23 additions & 60 deletions src/discord-cluster-manager/run_eval.py
@@ -1,8 +1,8 @@
+import dataclasses
import os
import shlex
import subprocess
import time
-import dataclasses
from typing import Optional

from consts import CUDA_FLAGS
@@ -75,7 +75,7 @@ def compile_cuda_script( # # noqa: C901
return CompileResult(
nvcc_found=False,
success=False,
-            nvcc_version='',
+            nvcc_version="",
command=_make_cmd(e.cmd),
stdout=e.stdout,
stderr=e.stderr,
@@ -121,25 +121,14 @@ def run_cuda_program(args: list[str]) -> RunResult:
env["POPCORN_FD"] = str(pipe_write)

execution_start_time = time.perf_counter()
-    try:
-        run_process = subprocess.run(
-            args,
-            capture_output=True,
-            text=True,
-            check=True,
-            env=env,
-            pass_fds=[pipe_write],
-        )
-    except subprocess.CalledProcessError as e:
-        return RunResult(
-            success=False,
-            command=_make_cmd(e.cmd),
-            stdout=e.stdout,
-            stderr=e.stderr,
-            exit_code=e.returncode,
-            duration=time.perf_counter() - execution_start_time,
-            result={}
-        )
+    run_process = subprocess.run(
+        args,
+        capture_output=True,
+        text=True,
+        check=False,
+        env=env,
+        pass_fds=[pipe_write],
+    )
execution_end_time = time.perf_counter()

# terminate output writing
@@ -153,13 +142,13 @@ def run_cuda_script( # # noqa: C901
result_dict[key.strip()] = value.strip()

return RunResult(
-        success=True,
+        success=run_process.returncode == 0,
command=_make_cmd(run_process.args),
stdout=run_process.stdout,
stderr=run_process.stderr,
exit_code=run_process.returncode,
duration=execution_end_time - execution_start_time,
-        result=result_dict
+        result=result_dict,
)


@@ -169,7 +158,7 @@ def run_cuda_script( # # noqa: C901
submission_content: str = None,
arch: int = None,
include_dirs: list[str] = None,
-) -> tuple[str, float]:
+) -> tuple[CompileResult, RunResult]:
"""
Executes the provided CUDA kernel in an isolated environment with a timeout
@@ -181,7 +170,7 @@ def run_cuda_script( # # noqa: C901
include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc
Returns:
-        tuple[str, float]: (Kernel output, execution time in milliseconds)
+        tuple[CompileResult, RunResult]: CUDA compile/eval result information
"""
if include_dirs is None:
include_dirs = []
@@ -207,45 +196,19 @@ def run_cuda_script( # # noqa: C901
)

if not compile_result.success:
-            if not compile_result.nvcc_found:
-                return (
-                    "Error executing script: NVCC not found:\n"
-                    + f"command `{compile_result.command}` failed with exit code {compile_result.exit_code}:\n"
-                    + compile_result.stderr
-                , 0.0
-                )
-            return (
-                "Error executing script: CUDA compilation failed with return code "
-                + f"{compile_result.exit_code}:\n{compile_result.stderr}\n"
-                + f"compile command: `{compile_result.command}`",
-                0.0
+            return compile_result, RunResult(
+                success=False,
+                command="",
+                stdout="",
+                stderr="",
+                exit_code=-1,
+                duration=0.0,
+                result={},
)

run_result = run_cuda_program(["./eval.out"])
-        if not run_result.success:
-            # exit code 1 encodes failed tests
-            if run_result.exit_code == 1:
-                return f"check_implementation failed:\n{run_result.stderr}", 0.0
-            else:
-                return f"Script failed with exit code ({run_result.exit_code}):\n{run_result.stderr}", 0.0
-
-        print("run process stdout:", run_result.stdout)
-        print("run process stderr:", run_result.stderr)
-
-        score = float(run_result.result.get("duration.mean", "0.0")) / 1e9
-        passed = run_result.result.get("check", "") == "pass"
-        if not passed:
-            return "check_implementation failed", 0.0
+        return compile_result, run_result

-        if score is None:
-            return run_result.stdout, run_result.duration
-
-        return run_result.stdout, score
-
-    except subprocess.CalledProcessError as e:
-        return f"Error executing script: {str(e)}\n{e.stderr}", 0.0
-    except Exception as e:
-        return f"Error executing script: {str(e)}", 0.0
finally:
tmp_files = ["reference.cuh", "train.cuh", "eval.cu", "eval.out"]
for f in tmp_files:
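For reference, run_result.result is populated by run_cuda_program: the benchmark binary inherits a pipe through the POPCORN_FD environment variable, and whatever it writes there is parsed into key/value pairs. The keys this commit relies on are "check" ("pass"/"fail") and "duration.mean", which the Modal runner divides by 1e9 and therefore treats as nanoseconds. The exact line format written by eval.cu is not visible in this diff; the writer-side sketch below assumes plain "key: value" lines purely to illustrate the shape of the data.

import os


# Hypothetical writer side of the POPCORN_FD protocol (normally implemented in eval.cu);
# the "key: value" line format is an assumption, not taken from this diff.
def report_results(check: str, mean_duration_ns: float) -> None:
    fd = int(os.environ["POPCORN_FD"])  # file descriptor passed down via pass_fds
    with os.fdopen(fd, "w") as pipe:
        pipe.write(f"check: {check}\n")                     # "pass" or "fail"
        pipe.write(f"duration.mean: {mean_duration_ns}\n")  # mean runtime in nanoseconds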
