Skip to content

Commit

Permalink
added an enum for exit codes
Browse files Browse the repository at this point in the history
  • Loading branch information
ngc92 committed Jan 14, 2025
1 parent 6d030d0 commit 348e36c
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 11 deletions.
8 changes: 5 additions & 3 deletions scripts/ci_test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

sys.path.append("src/discord-cluster-manager")

from consts import ExitCode
from leaderboard_eval import cu_eval
from run_eval import run_cuda_script

Expand All @@ -23,6 +24,7 @@ def test_does_not_compile():
assert comp.success is False
assert run.success is False
assert comp.nvcc_found is True
assert comp.exit_code != ExitCode.SUCCESS
assert comp.stdout == ""
assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
Expand Down Expand Up @@ -57,7 +59,7 @@ def test_cuda_runtime_error():
assert "warming up..." in run.stdout
assert "cudaDeviceSynchronize() at eval.cu(63) in `measure_runtime`" in run.stderr
assert "an illegal memory access was encountered" in run.stderr
assert run.exit_code == 110
assert run.exit_code == ExitCode.CUDA_FAIL
assert len(run.result) == 0


Expand Down Expand Up @@ -85,7 +87,7 @@ def test_cuda_validation_fail():
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
assert "ERROR AT 0, 0" in run.stderr
assert run.exit_code == 112
assert run.exit_code == ExitCode.VALIDATE_FAIL
assert run.result["check"] == "fail"


Expand All @@ -96,5 +98,5 @@ def test_cuda_correct():
assert comp.success is True
assert run.success is True
assert "warming up..." in run.stdout
assert run.exit_code == 0
assert run.exit_code == ExitCode.SUCCESS
assert run.result["check"] == "pass"
7 changes: 4 additions & 3 deletions scripts/ci_test_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

sys.path.append("src/discord-cluster-manager")

from consts import ExitCode
from leaderboard_eval import py_eval
from run_eval import run_pytorch_script

Expand All @@ -21,7 +22,7 @@ def test_does_not_import():

run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
assert run.success is False
assert run.exit_code == 1
assert run.exit_code != ExitCode.SUCCESS
assert "IndentationError: unexpected indent\n" in run.stderr


Expand All @@ -38,7 +39,7 @@ def custom_kernel(input):
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
assert "mismatch found! custom implementation doesnt match reference." in run.stdout
assert run.exit_code == 112
assert run.exit_code == ExitCode.VALIDATE_FAIL
assert run.result["check"] == "fail"


Expand All @@ -48,5 +49,5 @@ def test_correct():
run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
assert run.success is True
assert "warming up..." in run.stdout
assert run.exit_code == 0
assert run.exit_code == ExitCode.SUCCESS
assert run.result["check"] == "pass"
18 changes: 17 additions & 1 deletion src/discord-cluster-manager/consts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from enum import Enum
from enum import Enum, IntEnum
from typing import Type


Expand All @@ -25,6 +25,22 @@ class ModalGPU(Enum):
H100 = "H100"


class ExitCode(IntEnum):
    """
    Exit codes for our runners.

    These are only the codes that we actively return ourselves; other codes
    are possible (e.g., exiting due to a segfault, permissions, a signal, ...).
    """

    # program ran successfully
    SUCCESS = 0
    # a CUDA API call failed
    CUDA_FAIL = 110
    # could not set up the file descriptor for the custom pipe
    PIPE_FAILED = 111
    # didn't crash, but tests failed
    VALIDATE_FAIL = 112


def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
combined_members = {}
for enum in enums:
Expand Down
6 changes: 2 additions & 4 deletions src/discord-cluster-manager/run_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time
from typing import Optional

from consts import CUDA_FLAGS
from consts import CUDA_FLAGS, ExitCode


@dataclasses.dataclass
Expand Down Expand Up @@ -141,10 +141,8 @@ def run_program(args: list[str]) -> RunResult:
key, _, value = line.partition(":")
result_dict[key.strip()] = value.strip()

# 0 everything was fine
# 112 program ran fine, but we detected a test failure
return RunResult(
success=run_process.returncode == 0,
success=run_process.returncode == ExitCode.SUCCESS,
command=_make_cmd(run_process.args),
stdout=run_process.stdout,
stderr=run_process.stderr,
Expand Down

0 comments on commit 348e36c

Please sign in to comment.