Add a new benchmark ENAMEL for evaluating the efficiency of LLM-generated code #260

Open · wants to merge 29 commits into base: main

Commits (29)
4eadff5  Add a new benchmark ENAMEL (q-rz, Jun 30, 2024)
d23b938  Add a new benchmark ENAMEL (q-rz, Jul 1, 2024)
72e50d3  Add a new benchmark ENAMEL (q-rz, Jul 1, 2024)
3d4c275  Add a new benchmark ENAMEL (q-rz, Jul 1, 2024)
7847bb0  Add a new benchmark ENAMEL (q-rz, Jul 1, 2024)
48c2f1c  Add a new benchmark ENAMEL (q-rz, Jul 1, 2024)
02c43e9  Add a new benchmark ENAMEL (q-rz, Jul 1, 2024)
d1e10b9  Add a new benchmark ENAMEL (q-rz, Jul 18, 2024)
86f3902  Add a new benchmark ENAMEL (q-rz, Jul 18, 2024)
4caa5dd  Add a new benchmark ENAMEL (q-rz, Jul 18, 2024)
027afcb  Add a new benchmark ENAMEL (q-rz, Jul 18, 2024)
eb43103  Add a new benchmark ENAMEL (q-rz, Jul 18, 2024)
f3e86ac  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
5aafc0e  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
afb8471  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
8cf92a6  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
71c69f6  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
32265c8  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
bef7566  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
bf3348f  Add a new benchmark ENAMEL (q-rz, Jul 21, 2024)
93c47cc  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
6b6163d  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
fd7694f  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
cd0810c  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
de62d20  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
80f4e14  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
7109433  Add a new benchmark ENAMEL (q-rz, Jul 22, 2024)
1553acf  Merge branch 'bigcode-project:main' into main (q-rz, Jul 22, 2024)
cca02b2  Update README.md (q-rz, Jul 22, 2024)
1 change: 1 addition & 0 deletions README.md
@@ -39,6 +39,7 @@ Below are the features and tasks of this framework:
- `StarCoderFIM`: which uses the default FIM tokens `"<fim_prefix>", "<fim_middle>", "<fim_suffix>"`, and
- `SantaCoderFIM`: which uses SantaCoder FIM tokens `"<fim-prefix>", "<fim-middle>", "<fim-suffix>"`
- [Mercury](https://huggingface.co/datasets/Elfsong/Mercury) for evaluating computational efficiency of **Python** code generation.
+- [ENAMEL](https://github.com/q-rz/enamel) for evaluating the efficiency ($\textnormal{eff@}k$) of generated code against **expert-written** reference solutions on HumanEval problems.

More details about each task can be found in the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).
## Setup
3 changes: 2 additions & 1 deletion bigcode_eval/tasks/__init__.py
@@ -5,7 +5,7 @@
concode, ds1000, gsm, humaneval, humanevalplus, humanevalpack,
instruct_humaneval, instruct_wizard_humaneval, mbpp, mbppplus,
multiple, parity, python_bugs, quixbugs, recode, santacoder_fim,
-    studenteval, mercury)
+    studenteval, mercury, enamel)

TASK_REGISTRY = {
**apps.create_all_tasks(),
@@ -31,6 +31,7 @@
**santacoder_fim.create_all_tasks(),
"studenteval": studenteval.StudentEval,
"mercury": mercury.Mercury,
+    **enamel.create_all_tasks(),
}

ALL_TASKS = sorted(list(TASK_REGISTRY))
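With these lines, the harness's standard registry picks up the new tasks. As a minimal sketch of how a consumer would find them (the exact task names come from `enamel.create_all_tasks()` in `bigcode_eval/tasks/enamel.py`, which this PR adds but is not shown above, so the substring filter below is only an assumption):

from bigcode_eval.tasks import TASK_REGISTRY

# list whatever ENAMEL task names create_all_tasks() registered
enamel_tasks = sorted(name for name in TASK_REGISTRY if 'enamel' in name)
print(enamel_tasks)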
348 changes: 348 additions & 0 deletions bigcode_eval/tasks/custom_metrics/enamel_eval.py
@@ -0,0 +1,348 @@
from copy import deepcopy
import gc
import pickle
import time

import io
import os
import sys
import resource
import platform
import contextlib

import numpy as np

def calc_exec_time(ts): # Hodges--Lehmann estimator
ts = np.array(ts) / 2.
ts = ts[None, :] + ts[:, None]
ts = ts[np.tril_indices_from(ts)]
return np.median(ts)
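# A worked example with hypothetical timings: for ts = [1.0, 2.0, 4.0], the
# Walsh averages (t_i + t_j) / 2 over all pairs i <= j are
# [1.0, 1.5, 2.0, 2.5, 3.0, 4.0], and their median 2.25 is returned; the
# Hodges-Lehmann estimator is more robust to stray slow runs than the plain
# mean (~2.33 here).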

def calc_eff(elapsed, ref, timeout):
return max(0., timeout - elapsed) / (timeout - ref)
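# The per-level efficiency score is e = max(0, T - t) / (T - t_ref), where T is
# the timeout and t_ref the reference solution's time at that level: e.g. with
# T = 10 s and t_ref = 2 s, an elapsed time of 4 s scores (10 - 4) / (10 - 2)
# = 0.75, and anything at or beyond the timeout scores 0.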

def calc_eff_at_k(e, k): # numerically stable implementation
n = len(e)
lbd = [k / n]
k_ = k - 1
for r in range(n - 1, k_, -1):
lbd.append(lbd[-1] * (1 - k_ / r))
lbd = np.flip(lbd)
e = np.sort(e)[k_ :]
return (lbd * e).sum()
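# The weights above implement the unbiased estimator of eff@k,
#   eff@k = sum_{r=k}^{n} [ C(r-1, k-1) / C(n, k) ] * e_(r),
# over the sorted scores e_(1) <= ... <= e_(n): lbd starts from the top weight
# k / n and applies the ratio (1 - (k-1)/r) downward, so no binomial
# coefficient is ever materialized and large n cannot overflow.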

def calc_pass_at_k(n, c, k): # from the HumanEval paper
if n - c < k: return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
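# A quick numeric check: with n = 5 samples of which c = 2 are correct,
# pass@2 = 1 - (1 - 2/4) * (1 - 2/5) = 1 - 0.3 = 0.7, the probability that a
# random size-2 subset contains at least one correct sample.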

class Test: # a test case
def __init__(self, input = None, answer = None, ref = None):
self.input = input
self.answer = answer
self.ref = ref # reference execution time

class Refs: # references for efficiency evaluation
def __init__(self, tests, hardness):
neg_inf = float('-inf')
self.refs = [neg_inf] * len(hardness)
self.ref_max = neg_inf
        self.lid = None # level index of the slowest reference test
        self.cid = None # case index of the slowest reference test
# finds the longest reference execution time for calibration
for j, (size, tests_j) in enumerate(tests):
if hardness[j]:
for k, test in enumerate(tests_j):
if self.refs[j] < test.ref:
self.refs[j] = test.ref
if self.ref_max < test.ref:
self.ref_max = test.ref
self.lid = j
self.cid = k

class EnamUnpickler(pickle.Unpickler): # remaps classes pickled by the original enam package onto the local definitions
CLS_DICT = {'enam.evaluate.Test': Test, 'enam.evaluate.Refs': Refs}
def find_class(self, module, name):
cls_name = f'{module}.{name}'
if cls_name in self.CLS_DICT:
return self.CLS_DICT[cls_name]
else:
return super().find_class(module, name)

TPL_RUN = '''%s
%s
__t0 = time.time()
__output = %s(*__input)
__t1 = time.time()
''' # % (prompt, code, entry_point) # works whether or not the code repeats the prompt
TPL_TEST = '''%s
pass
%s
__accepted = __check(__input, __answer, __output)
''' # % (prompt, checker)
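# A hypothetical instantiation (toy problem, not from the benchmark): with
# prompt = 'def add(a, b):\n    """Return a + b."""\n', code = '    return a + b',
# and entry_point = 'add', TPL_RUN becomes a program that defines add, runs
# __output = add(*__input), and stores the wall-clock time as __t1 - __t0.
# Because a HumanEval prompt ends with a docstring, the prompt alone already
# parses, and a completion that repeats the whole function simply redefines it.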

def evaluate_one(code, problem, tests, refs, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
timeout = timeout_factor * refs.ref_max
memory_bytes = memory_giga * (1024 ** 3)
n_levels = len(tests)
zero_effs = [0. for j in range(n_levels)]
effs = []
for j, (size, tests_j) in enumerate(tests):
n_reps_j = n_reps[j]
level_elapsed = []
level_break = False
        for test in tests_j: # the index is unused; plain iteration avoids shadowing the parameter k
            elapsed = [None for rep in range(n_reps_j)]
            for rep in range(n_reps_j):
                scope = dict(time = time, input = None, print = None, __input = deepcopy(test.input)) # in case the code modifies the input
try:
unsafe_timed_execute(TPL_RUN % (problem.prompt, code, problem.entry_point), scope, memory_bytes, timeout + tolerence_sec)
scope['__input'] = test.input
scope['__answer'] = test.answer # to prevent the code reading the answer
unsafe_execute(TPL_TEST % (problem.prompt, problem.checker), scope) # assuming that the checker does not modify the input
                except (TimeoutException, MemoryError, OverflowError):
                    # time or memory budget exceeded at this input size; skip the remaining levels
                    level_break = True
                    break
                except KeyboardInterrupt:
                    raise
except BaseException as e:
return False, zero_effs
else:
if '__accepted' in scope and scope['__accepted']:
elapsed[rep] = scope['__t1'] - scope['__t0']
else:
return False, zero_effs
if level_break:
break
else:
level_elapsed.append(calc_exec_time(elapsed).item())
if level_break:
break
else:
effs.append(calc_eff(elapsed = max(level_elapsed), ref = refs.refs[j], timeout = timeout))
if j == 0 and level_break:
return False, zero_effs
for j in range(len(effs), n_levels):
effs.append(0.)
return True, effs

def compute_refs(problem, tests, n_reps, hardness): # computes the reference execution times used to calibrate efficiency scores
for j in range(len(tests)):
if hardness[j]:
for k in range(len(tests[j][-1])):
test = tests[j][-1][k]
n_reps_j = n_reps[j]
elapsed = [None for rep in range(n_reps_j)]
for rep in range(n_reps_j):
                    scope = dict(time = time, __input = deepcopy(test.input)) # in case the code modifies the input
unsafe_execute(TPL_RUN % (problem.prompt, problem.reference_solution, problem.entry_point), scope) # assuming that the reference solution is error-free
elapsed[rep] = scope['__t1'] - scope['__t0']
test.ref = calc_exec_time(elapsed).item()
return Refs(tests = tests, hardness = hardness)

def evaluate_all(problems, codes, tests, k, hardness, n_reps, memory_giga, timeout_factor, tolerence_sec):
if isinstance(k, int):
k = [k]
min_codes = min(len(codes_i) for codes_i in codes)
k = sorted({k_ for k_ in k if k_ <= min_codes})
passes = [[] for k_ in k]
effs = [[] for k_ in k]
gc.collect()
for problem, codes_i, tests_i in zip(problems, codes, tests):
refs_i = compute_refs(problem = problem, tests = tests_i, n_reps = n_reps, hardness = hardness)
n_levels = len(tests_i)
problem_passes = []
problem_effs = []
for code in codes_i:
passed, code_effs = evaluate_one(
code = code, problem = problem, tests = tests_i, refs = refs_i,
k = k, hardness = hardness, n_reps = n_reps, memory_giga = memory_giga,
timeout_factor = timeout_factor, tolerence_sec = tolerence_sec)
problem_passes.append(passed)
problem_effs.append(code_effs)
for j, k_ in enumerate(k):
passes[j].append(calc_pass_at_k(n = len(problem_passes), c = sum(problem_passes), k = k_))
effs[j].append(calc_eff_at_k(e = np.average(problem_effs, axis = 1, weights = hardness), k = k_))
metrics = dict()
for k_, pass_k in zip(k, passes):
metrics[f'pass@{k_}'] = np.mean(pass_k).item()
for k_, eff_k in zip(k, effs):
metrics[f'eff@{k_}'] = np.mean(eff_k).item()
return metrics
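# A sketch of a call (argument values illustrative, not the benchmark defaults):
#   metrics = evaluate_all(problems, codes, tests, k = [1, 10],
#                          hardness = [0., 1., 1., 1.], n_reps = [6, 6, 6, 6],
#                          memory_giga = 4., timeout_factor = 2., tolerence_sec = 0.01)
#   # -> {'pass@1': ..., 'pass@10': ..., 'eff@1': ..., 'eff@10': ...}
# A level with hardness 0 is skipped when computing references and carries zero
# weight in the per-code efficiency average.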

def might_catch_timeout_signal(generation, pattern_seq = (' while ', ' try:')):
i = 0
for pattern in pattern_seq:
        i = generation.find(pattern, i)
if i == -1:
return False
i += len(pattern)
return True
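# For example, a generation containing '    while i < n:' followed later by
# '        try:' matches (' while ' then ' try:' in order), whereas a top-level
# 'while' at column 0 has no leading space and escapes this heuristic.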

might_catch_timeout_signal.WARNING = """\
We have detected that the generated code samples use `try ... except` within a loop, which might catch \
our timeout signal and cause an infinite loop. Since resolving this rare issue via `multiprocessing` would \
significantly slow down the evaluation process for our large-scale inputs, we have decided not to resolve \
this issue. If this issue does happen, please consider removing the corresponding code samples."""

"""The following functions are adapted from code_eval (@link https://huggingface.co/spaces/evaluate-metric/code_eval)"""

def get_memory_usage():
    # rough baseline offset for the memory limit (size of this module object, not true resident memory)
    return sys.getsizeof(sys.modules[__name__])

@contextlib.contextmanager
def set_memory_limit(maximum_memory_bytes = None):
try:
if maximum_memory_bytes is not None:
            _not_darwin = (platform.uname().system != "Darwin") # RLIMIT_STACK is skipped on macOS, following code_eval
_rlimit_as = resource.getrlimit(resource.RLIMIT_AS)
_rlimit_data = resource.getrlimit(resource.RLIMIT_DATA)
if _not_darwin:
_rlimit_stack = resource.getrlimit(resource.RLIMIT_STACK)
memory_limit = int(get_memory_usage() + maximum_memory_bytes)
resource.setrlimit(resource.RLIMIT_AS, (memory_limit, _rlimit_as[-1]))
resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, _rlimit_data[-1]))
if _not_darwin:
resource.setrlimit(resource.RLIMIT_STACK, (memory_limit, _rlimit_stack[-1]))
yield
finally:
if maximum_memory_bytes is not None:
resource.setrlimit(resource.RLIMIT_AS, _rlimit_as)
resource.setrlimit(resource.RLIMIT_DATA, _rlimit_data)
if _not_darwin:
resource.setrlimit(resource.RLIMIT_STACK, _rlimit_stack)

class TimeoutException(Exception):
pass

def timeout_signal_handler(signum, frame):
raise TimeoutException("Timed out!")

@contextlib.contextmanager
def set_time_limit(seconds):
import signal
signal.setitimer(signal.ITIMER_REAL, seconds)
signal.signal(signal.SIGALRM, timeout_signal_handler)
try:
yield
finally:
signal.setitimer(signal.ITIMER_REAL, 0)

class WriteOnlyStringIO(io.StringIO):
def read(self, *args, **kwargs):
raise OSError
def readline(self, *args, **kwargs):
raise OSError
def readlines(self, *args, **kwargs):
raise OSError
def readable(self, *args, **kwargs):
return False

class redirect_stdin(contextlib._RedirectStream): # type: ignore
_stream = "stdin"

@contextlib.contextmanager
def swallow_io():
stream = WriteOnlyStringIO()
with contextlib.redirect_stdout(stream):
with contextlib.redirect_stderr(stream):
with redirect_stdin(stream):
yield

@contextlib.contextmanager
def chdir(root):
if root == ".":
yield
return
cwd = os.getcwd()
os.chdir(root)
try:
yield
except BaseException as exc:
raise exc
finally:
os.chdir(cwd)

@contextlib.contextmanager
def create_tempdir():
import tempfile
with tempfile.TemporaryDirectory() as dirname:
with chdir(dirname):
yield dirname

@contextlib.contextmanager
def reliability_guard():
"""
This disables various destructive functions and prevents the generated code
from interfering with the test (e.g. fork bomb, killing other processes,
removing filesystem files, etc.)

WARNING
    This function is NOT a security sandbox. Untrusted code, including
    model-generated code, should not be blindly executed outside of one. See
    the Codex paper for more information about OpenAI's code sandbox, and
    proceed with caution.
"""

with create_tempdir():
with swallow_io():
try:

import faulthandler

faulthandler.disable()

import builtins, os, shutil, subprocess

os.environ["OMP_NUM_THREADS"] = "1"

_keys = dict(
builtins = ('exit', 'quit'),
os = ('kill', 'system', 'putenv', 'remove', 'removedirs', 'rmdir', 'fchdir', 'setuid', 'fork', 'forkpty', 'killpg', 'rename', 'renames', 'truncate', 'replace', 'unlink', 'fchmod', 'fchown', 'chmod', 'chown', 'chroot', 'lchflags', 'lchmod', 'lchown', 'getcwd', 'chdir'),
shutil = ('rmtree', 'move', 'chown'),
subprocess = ('Popen',),
)
                _baks = dict()
                for lib, keys in _keys.items():
                    obj = locals()[lib]
                    _bak = dict()
                    for key in keys:
                        if hasattr(obj, key):
                            _bak[key] = getattr(obj, key)
                            setattr(obj, key, None) # actually disable the function, as the docstring promises; restored in the finally block
                    _baks[lib] = _bak

#__builtins__["help"] = None

yield
finally:
for lib, keys in _keys.items():
obj = locals()[lib]
for key, val in _baks[lib].items():
setattr(obj, key, val)

def unsafe_execute(program: str, exec_globals: dict):
    gc_bak = gc.isenabled()
    gc.disable() # avoid GC pauses while timing the program
    try:
        with reliability_guard():
            exec(program, exec_globals)
    finally:
        if gc_bak:
            gc.enable()

def unsafe_timed_execute(program: str, exec_globals: dict, maximum_memory_bytes: float, time_limit_seconds: float):
    gc_bak = gc.isenabled()
    gc.disable() # avoid GC pauses while timing the program
    try:
        with reliability_guard():
            with set_memory_limit(maximum_memory_bytes):
                with set_time_limit(time_limit_seconds):
                    exec(program, exec_globals)
    finally:
        if gc_bak:
            gc.enable()
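
# A minimal smoke test (illustrative only, not part of the benchmark; assumes a
# Unix-like system, since the limits rely on `resource` and SIGALRM): time a
# trivial program under a 1 s limit and a 4 GiB address-space cap, the way
# evaluate_one does for each repetition.
if __name__ == '__main__':
    demo_scope = dict(time = time, __input = (10 ** 6,))
    unsafe_timed_execute(
        '__t0 = time.time()\n__output = sum(range(__input[0]))\n__t1 = time.time()',
        demo_scope, maximum_memory_bytes = 4 * 1024 ** 3, time_limit_seconds = 1.)
    print('output:', demo_scope['__output'], 'elapsed:', demo_scope['__t1'] - demo_scope['__t0'])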